Variant Lookup Class and Methods
This module contains the abstract class
VariantLookup to facilitate retrieval of
variants that overlap a given genomic coordinate range. Concrete implementations must implement the
query() method for retrieving variants
that overlap the given range.
Two concrete implementations are provided that both take a list of VCF files to be queried:
Each class can also use minor allele frequency (MAF) to filter variants.
The helper class SimpleVariant is included to facilitate VCF querying and reporting out results.
Examples
>>> from pathlib import Path
>>> lookup = cached(vcf_paths=[Path("./tests/api/data/miniref.variants.vcf.gz")], min_maf=0.00, include_missing_mafs=True)
>>> lookup.query(refname="chr2", start=7999, end=8000)
[SimpleVariant(id='complex-variant-sv-1/1', refname='chr2', pos=8000, ref='T', alt='<DEL>', end=8000, variant_type=<VariantType.OTHER: 'OTHER'>, maf=None)]
>>> variants = lookup.query(refname="chr2", start=7999, end=9900)
>>> len(variants)
14
>>> for v in variants: print(v)
complex-variant-sv-1/1@chr2:8000[T/<DEL> NA]
rare-dbsnp-snp1-1/1@chr2:9000[A/C 0.0010]
common-dbsnp-snp1-1/1@chr2:9010[C/T 0.0100]
common-dbsnp-snp2-1/1@chr2:9020[A/C 0.0200]
rare-ac-an-snp-1/1@chr2:9030[G/A 0.0010]
rare-af-snp-1/1@chr2:9040[C/A 0.0005]
common-ac-an-snp-1/1@chr2:9050[C/G 0.0470]
common-af-snp-1/1@chr2:9060[T/C 0.0400]
common-multiallelic-1/2@chr2:9070[C/A 0.0525]
common-multiallelic-2/2@chr2:9070[C/T 0.0525]
common-insertion-1/1@chr2:9080[A/ACGT 0.0400]
common-deletion-1/1@chr2:9090[CTA/C 0.0400]
common-mixed-1/2@chr2:9100[CA/GG 0.1200]
common-mixed-2/2@chr2:9101[A/ACACA 0.1200]
>>> lookup.query(refname="ch12", start=7999, end=9900)
[]
Classes
FileBasedVariantLookup
Bases: ContextManager, VariantLookup
Implementation of VariantLookup that queries against indexed VCF files each time a query is
performed. Assumes the index is located adjacent to the VCF file and has the same base name with
either a .csi or .tbi suffix.
Example:
>>> with FileBasedVariantLookup([Path("./tests/api/data/miniref.variants.vcf.gz")], min_maf=0.0, include_missing_mafs=False) as lookup:
... lookup.query(refname="chr2", start=7999, end=8000)
[SimpleVariant(id='complex-variant-sv-1/1', refname='chr2', pos=8000, ref='T', alt='<DEL>', end=8000, variant_type=<VariantType.OTHER: 'OTHER'>, maf=None)]
Source code in prymer/api/variant_lookup.py
| class FileBasedVariantLookup(ContextManager, VariantLookup):
"""Implementation of `VariantLookup` that queries against indexed VCF files each time a query is
performed. Assumes the index is located adjacent to the VCF file and has the same base name with
either a .csi or .tbi suffix.
Example:
```python
>>> with FileBasedVariantLookup([Path("./tests/api/data/miniref.variants.vcf.gz")], min_maf=0.0, include_missing_mafs=False) as lookup:
... lookup.query(refname="chr2", start=7999, end=8000)
[SimpleVariant(id='complex-variant-sv-1/1', refname='chr2', pos=8000, ref='T', alt='<DEL>', end=8000, variant_type=<VariantType.OTHER: 'OTHER'>, maf=None)]
```
""" # noqa: E501
def __init__(self, vcf_paths: list[Path], min_maf: Optional[float], include_missing_mafs: bool):
self._readers: list[VariantFile] = []
super().__init__(
vcf_paths=vcf_paths, min_maf=min_maf, include_missing_mafs=include_missing_mafs
)
if len(vcf_paths) == 0:
raise ValueError("No VCF paths given to query.")
for path in vcf_paths:
if (
not path.with_suffix(path.suffix + ".csi").is_file()
and not path.with_suffix(path.suffix + ".tbi").is_file()
):
raise ValueError(f"Cannot perform fetch with missing index file for VCF: {path}")
open_fh = pysam.VariantFile(str(path))
self._readers.append(open_fh)
def __enter__(self) -> "FileBasedVariantLookup":
"""Enter the context manager."""
return self
def __exit__(
self,
exc_type: Optional[type[BaseException]],
exc_value: Optional[BaseException],
traceback: Optional[TracebackType],
) -> None:
"""Exit this context manager while closing the underlying VCF handles."""
self.close()
return None
def _query(self, refname: str, start: int, end: int) -> list[SimpleVariant]:
"""Queries variants from the VCFs used by this lookup and returns a `SimpleVariant`."""
simple_variants: list[SimpleVariant] = []
for fh, path in zip(self._readers, self.vcf_paths, strict=True):
if not fh.header.contigs.get(refname):
_logger.debug(f"Header in VCF file {path} does not contain chromosome {refname}.")
continue
# pysam.fetch is 0-based, half-open
variants = [variant for variant in fh.fetch(contig=refname, start=start - 1, end=end)]
simple_variants.extend(self.to_variants(variants, source_vcf=path))
return sorted(simple_variants, key=lambda x: x.pos)
def close(self) -> None:
"""Close the underlying VCF file handles."""
for handle in self._readers:
handle.close()
|
Functions
__enter__
__enter__() -> FileBasedVariantLookup
Enter the context manager.
Source code in prymer/api/variant_lookup.py
| def __enter__(self) -> "FileBasedVariantLookup":
"""Enter the context manager."""
return self
|
__exit__
__exit__(
exc_type: Optional[type[BaseException]],
exc_value: Optional[BaseException],
traceback: Optional[TracebackType],
) -> None
Exit this context manager while closing the underlying VCF handles.
Source code in prymer/api/variant_lookup.py
| def __exit__(
self,
exc_type: Optional[type[BaseException]],
exc_value: Optional[BaseException],
traceback: Optional[TracebackType],
) -> None:
"""Exit this context manager while closing the underlying VCF handles."""
self.close()
return None
|
close
Close the underlying VCF file handles.
Source code in prymer/api/variant_lookup.py
| def close(self) -> None:
"""Close the underlying VCF file handles."""
for handle in self._readers:
handle.close()
|
SimpleVariant
dataclass
Represents a variant from a given genomic range.
The reference and alternate alleles typically match the VCF from which the variant originated,
differing when the reference and alternate alleles share common leading bases. For example,
given a reference allele of CACACA and alternate allele CACA, the reference and alternate
alleles have their common leading bases removed except for a single "anchor base" (i.e. CAC is
removed), yielding the reference allele ACA and alternate allele A. This will also change
the reference variant position (pos property), which is defined as the position of the first
base in the reference allele.
The variant_type is derived from the reference and alternate alleles after the above (see
VariantType.from_alleles()).
Furthermore, the variant end position (end property) is defined for insertions as the base
after the insertion. For all other variant types, this is the last base in the reference
allele.
Attributes:
| Name |
Type |
Description |
id |
str
|
|
refname |
str
|
the reference sequence name
|
pos |
int
|
the start position of the variant (1-based inclusive)
|
end |
int
|
the end position of the variant (1-based inclusive)
|
ref |
str
|
|
alt |
str
|
|
variant_type |
VariantType
|
|
maf |
Optional[float]
|
optionally, the minor allele frequency
|
Source code in prymer/api/variant_lookup.py
| @dataclass(slots=True, frozen=True, init=True)
class SimpleVariant:
"""Represents a variant from a given genomic range.
The reference and alternate alleles typically match the VCF from which the variant originated,
differing when the reference and alternate alleles share common leading bases. For example,
given a reference allele of `CACACA` and alternate allele `CACA`, the reference and alternate
alleles have their common leading bases removed except for a single "anchor base" (i.e. `CAC` is
removed), yielding the reference allele `ACA` and alternate allele `A`. This will also change
the reference variant position (`pos` property), which is defined as the position of the first
base in the reference allele.
The `variant_type` is derived from the reference and alternate alleles after the above (see
[`VariantType.from_alleles()`][prymer.api.variant_lookup.VariantType.from_alleles]).
Furthermore, the variant end position (`end` property) is defined for insertions as the base
after the insertion. For all other variant types, this is the last base in the reference
allele.
Attributes:
id: the variant identifier
refname: the reference sequence name
pos: the start position of the variant (1-based inclusive)
end: the end position of the variant (1-based inclusive)
ref: the reference base
alt: the alternate base
variant_type: the variant type
maf: optionally, the minor allele frequency
"""
id: str
refname: str
pos: int
ref: str
alt: str
end: int = field(init=False)
variant_type: VariantType = field(init=False)
maf: Optional[float] = None
def __post_init__(self) -> None:
# Simplify the ref/alt alleles
# find the leading prefix of the ref and alternate allele
prefix_length = 0
for r, a in zip(self.ref, self.alt, strict=False):
if r != a:
break
prefix_length += 1
if prefix_length > 1: # > 1 to always keep an anchor base
offset = prefix_length - 1
object.__setattr__(self, "ref", self.ref[offset:])
object.__setattr__(self, "alt", self.alt[offset:])
object.__setattr__(self, "pos", self.pos + offset)
# Derive the end and variant_type from the ref and alt alleles.
object.__setattr__(
self, "variant_type", VariantType.from_alleles(ref=self.ref, alt=self.alt)
)
# span all reference bases (e.g. SNP -> 1bp, MNV -> len(MNV), DEL -> len(DEL),
end: int = self.pos + len(self.ref) - 1
if self.variant_type == VariantType.INSERTION:
# span the base preceding and following the insertion, so add a base at the end
end += 1
object.__setattr__(self, "end", end)
def __str__(self) -> str:
"""Compact String representation of the variant that includes all relevant info."""
maf_string = f"{self.maf:.4f}" if self.maf is not None else "NA"
return f"{self.id}@{self.refname}:{self.pos}[{self.ref}/{self.alt} {maf_string}]"
def to_span(self) -> Span:
"""Creates a Span object that represents the genomic span of this variant.
Insertions will span the base preceding and following the inserted bases."""
return Span(refname=self.refname, start=self.pos, end=self.end, strand=Strand.POSITIVE)
@staticmethod
def build(variant: VariantRecord) -> list["SimpleVariant"]:
"""Convert `pysam.VariantRecord` to `SimpleVariant`. Only the first ALT allele is used."""
maf = calc_maf_from_filter(variant)
simple_variants: list[SimpleVariant] = []
for i, alt in enumerate(variant.alts, start=1):
simple_variant = SimpleVariant(
id=f"{variant.id}-{i}/{len(variant.alts)}",
refname=variant.chrom,
pos=variant.pos,
ref=variant.ref,
alt=alt,
maf=maf,
)
simple_variants.append(simple_variant)
return simple_variants
|
Functions
__str__
Compact String representation of the variant that includes all relevant info.
Source code in prymer/api/variant_lookup.py
| def __str__(self) -> str:
"""Compact String representation of the variant that includes all relevant info."""
maf_string = f"{self.maf:.4f}" if self.maf is not None else "NA"
return f"{self.id}@{self.refname}:{self.pos}[{self.ref}/{self.alt} {maf_string}]"
|
build
staticmethod
build(variant: VariantRecord) -> list[SimpleVariant]
Convert pysam.VariantRecord to SimpleVariant. Only the first ALT allele is used.
Source code in prymer/api/variant_lookup.py
| @staticmethod
def build(variant: VariantRecord) -> list["SimpleVariant"]:
"""Convert `pysam.VariantRecord` to `SimpleVariant`. Only the first ALT allele is used."""
maf = calc_maf_from_filter(variant)
simple_variants: list[SimpleVariant] = []
for i, alt in enumerate(variant.alts, start=1):
simple_variant = SimpleVariant(
id=f"{variant.id}-{i}/{len(variant.alts)}",
refname=variant.chrom,
pos=variant.pos,
ref=variant.ref,
alt=alt,
maf=maf,
)
simple_variants.append(simple_variant)
return simple_variants
|
to_span
Creates a Span object that represents the genomic span of this variant.
Insertions will span the base preceding and following the inserted bases.
Source code in prymer/api/variant_lookup.py
| def to_span(self) -> Span:
"""Creates a Span object that represents the genomic span of this variant.
Insertions will span the base preceding and following the inserted bases."""
return Span(refname=self.refname, start=self.pos, end=self.end, strand=Strand.POSITIVE)
|
VariantLookup
Bases: ABC
Base class to represent a variant from a given genomic range.
Attributes:
| Name |
Type |
Description |
vcf_paths |
list[Path]
|
the paths to the source VCFs for the variants
|
min_maf |
Optional[float]
|
optionally, return only variants with at least this minor allele frequency
|
include_missing_mafs |
bool
|
when filtering variants with a minor allele frequency,
True to include variants with no annotated minor allele frequency, otherwise False.
If no minor allele frequency is given, then this parameter does nothing.
|
Source code in prymer/api/variant_lookup.py
| class VariantLookup(ABC):
"""Base class to represent a variant from a given genomic range.
Attributes:
vcf_paths: the paths to the source VCFs for the variants
min_maf: optionally, return only variants with at least this minor allele frequency
include_missing_mafs: when filtering variants with a minor allele frequency,
`True` to include variants with no annotated minor allele frequency, otherwise `False`.
If no minor allele frequency is given, then this parameter does nothing.
"""
def __init__(
self,
vcf_paths: list[Path],
min_maf: Optional[float],
include_missing_mafs: bool,
) -> None:
self.vcf_paths: list[Path] = vcf_paths
self.min_maf: Optional[float] = min_maf
self.include_missing_mafs: bool = include_missing_mafs
@final
def query(
self,
refname: str,
start: int,
end: int,
maf: Optional[float] = None,
include_missing_mafs: bool = None,
) -> list[SimpleVariant]:
"""Gets all variants that overlap a genomic range, optionally filter based on MAF threshold.
Args:
refname: the reference name
start: the 1-based start position
end: the 1-based end position
maf: the MAF of the variant
include_missing_mafs: whether to include variants with a missing MAF
(overrides self.include_missing_mafs)
"""
if maf is None:
maf = self.min_maf
if include_missing_mafs is None:
include_missing_mafs = self.include_missing_mafs
variants = self._query(refname=refname, start=start, end=end)
if len(variants) == 0:
_logger.debug(f"No variants extracted from region of interest: {refname}:{start}-{end}")
if maf is None or maf <= 0.0:
return variants
elif include_missing_mafs: # return variants with a MAF above threshold or missing
return [v for v in variants if (v.maf is None or v.maf >= maf)]
else:
return [v for v in variants if v.maf is not None and v.maf >= maf]
@staticmethod
def to_variants(
variants: list[VariantRecord], source_vcf: Optional[Path] = None
) -> list[SimpleVariant]:
"""Converts a list of `pysam.VariantRecords` to a list of `SimpleVariants` for ease of use.
Filters variants based on their FILTER status, and sorts by start position.
Args:
variants: the variants to convert
source_vcf: the optional path to the source VCF, used only for debug messages
Returns:
a list of `SimpleVariants`, one per alternate allele per variant.
"""
simple_vars = []
for variant in variants:
if (
"PASS" in list(variant.filter) or len(list(variant.filter)) == 0
): # if passing or empty filters
simple_variants = SimpleVariant.build(variant)
if any(v.variant_type == VariantType.OTHER for v in simple_variants):
_logger.debug(
f"Input VCF file {source_vcf} may contain complex variants: {variant}"
)
simple_vars.extend(simple_variants)
return sorted(simple_vars, key=lambda v: (v.pos, v.id))
@abstractmethod
def _query(self, refname: str, start: int, end: int) -> list[SimpleVariant]:
"""Subclasses must implement this method."""
|
Functions
query
query(
refname: str,
start: int,
end: int,
maf: Optional[float] = None,
include_missing_mafs: bool = None,
) -> list[SimpleVariant]
Gets all variants that overlap a genomic range, optionally filter based on MAF threshold.
Parameters:
| Name |
Type |
Description |
Default |
refname |
str
|
|
required
|
start |
int
|
the 1-based start position
|
required
|
end |
int
|
|
required
|
maf |
Optional[float]
|
|
None
|
include_missing_mafs |
bool
|
whether to include variants with a missing MAF
(overrides self.include_missing_mafs)
|
None
|
Source code in prymer/api/variant_lookup.py
| @final
def query(
self,
refname: str,
start: int,
end: int,
maf: Optional[float] = None,
include_missing_mafs: bool = None,
) -> list[SimpleVariant]:
"""Gets all variants that overlap a genomic range, optionally filter based on MAF threshold.
Args:
refname: the reference name
start: the 1-based start position
end: the 1-based end position
maf: the MAF of the variant
include_missing_mafs: whether to include variants with a missing MAF
(overrides self.include_missing_mafs)
"""
if maf is None:
maf = self.min_maf
if include_missing_mafs is None:
include_missing_mafs = self.include_missing_mafs
variants = self._query(refname=refname, start=start, end=end)
if len(variants) == 0:
_logger.debug(f"No variants extracted from region of interest: {refname}:{start}-{end}")
if maf is None or maf <= 0.0:
return variants
elif include_missing_mafs: # return variants with a MAF above threshold or missing
return [v for v in variants if (v.maf is None or v.maf >= maf)]
else:
return [v for v in variants if v.maf is not None and v.maf >= maf]
|
to_variants
staticmethod
to_variants(
variants: list[VariantRecord],
source_vcf: Optional[Path] = None,
) -> list[SimpleVariant]
Converts a list of pysam.VariantRecords to a list of SimpleVariants for ease of use.
Filters variants based on their FILTER status, and sorts by start position.
Parameters:
| Name |
Type |
Description |
Default |
variants |
list[VariantRecord]
|
|
required
|
source_vcf |
Optional[Path]
|
the optional path to the source VCF, used only for debug messages
|
None
|
Returns:
| Type |
Description |
list[SimpleVariant]
|
a list of SimpleVariants, one per alternate allele per variant.
|
Source code in prymer/api/variant_lookup.py
| @staticmethod
def to_variants(
variants: list[VariantRecord], source_vcf: Optional[Path] = None
) -> list[SimpleVariant]:
"""Converts a list of `pysam.VariantRecords` to a list of `SimpleVariants` for ease of use.
Filters variants based on their FILTER status, and sorts by start position.
Args:
variants: the variants to convert
source_vcf: the optional path to the source VCF, used only for debug messages
Returns:
a list of `SimpleVariants`, one per alternate allele per variant.
"""
simple_vars = []
for variant in variants:
if (
"PASS" in list(variant.filter) or len(list(variant.filter)) == 0
): # if passing or empty filters
simple_variants = SimpleVariant.build(variant)
if any(v.variant_type == VariantType.OTHER for v in simple_variants):
_logger.debug(
f"Input VCF file {source_vcf} may contain complex variants: {variant}"
)
simple_vars.extend(simple_variants)
return sorted(simple_vars, key=lambda v: (v.pos, v.id))
|
VariantOverlapDetector
Bases: VariantLookup
Implements VariantLookup by reading the entire VCF into memory and loading the resulting
Variants into an OverlapDetector.
Source code in prymer/api/variant_lookup.py
| class VariantOverlapDetector(VariantLookup):
"""Implements `VariantLookup` by reading the entire VCF into memory and loading the resulting
Variants into an `OverlapDetector`."""
def __init__(self, vcf_paths: list[Path], min_maf: Optional[float], include_missing_mafs: bool):
super().__init__(
vcf_paths=vcf_paths, min_maf=min_maf, include_missing_mafs=include_missing_mafs
)
self._overlap_detector: OverlapDetector[_VariantInterval] = OverlapDetector()
if len(vcf_paths) == 0:
raise ValueError("No VCF paths given to query.")
for path in vcf_paths:
if (
not path.with_suffix(path.suffix + ".csi").is_file()
and not path.with_suffix(path.suffix + ".tbi").is_file()
):
raise ValueError(f"Cannot perform fetch with missing index file for VCF: {path}")
with reader(path) as fh:
variant_intervals = iter(
_VariantInterval.build(simple_variant)
for variant in fh
for simple_variant in self.to_variants([variant], source_vcf=path)
)
self._overlap_detector.add_all(variant_intervals)
def _query(self, refname: str, start: int, end: int) -> list[SimpleVariant]:
"""Queries variants from the VCFs used by this lookup."""
query = Interval(
refname=refname, start=start - 1, end=end
) # interval is half-open, 0-based
overlapping_variants = (
variant_interval.variant
for variant_interval in self._overlap_detector.get_overlaps(query)
)
return sorted(overlapping_variants, key=lambda v: (v.pos, v.id))
|
VariantType
Bases: UppercaseStrEnum
Represents the type of variant.
Source code in prymer/api/variant_lookup.py
| @unique
class VariantType(UppercaseStrEnum):
"""Represents the type of variant."""
SNP = auto()
MNV = auto()
INSERTION = auto()
DELETION = auto()
OTHER = auto()
@staticmethod
def from_alleles(ref: str, alt: str) -> "VariantType":
"""Builds a variant type from the given reference and alternate allele.
Args:
ref: the reference allele
alt: the alternate allele
Returns:
the variant type
"""
variant_type: VariantType
if "<" in alt:
variant_type = VariantType.OTHER
elif (len(ref) == 1) and (len(alt) == 1):
variant_type = VariantType.SNP
elif len(ref) == len(alt):
variant_type = VariantType.MNV
elif len(ref) < len(alt):
variant_type = VariantType.INSERTION
elif len(ref) > len(alt):
variant_type = VariantType.DELETION
else:
raise ValueError(f"Could not determine variant type for ref `{ref}` and alt `{alt}`")
return variant_type
|
Functions
from_alleles
staticmethod
from_alleles(ref: str, alt: str) -> VariantType
Builds a variant type from the given reference and alternate allele.
Parameters:
| Name |
Type |
Description |
Default |
ref |
str
|
|
required
|
alt |
str
|
|
required
|
Returns:
| Type |
Description |
VariantType
|
|
Source code in prymer/api/variant_lookup.py
| @staticmethod
def from_alleles(ref: str, alt: str) -> "VariantType":
"""Builds a variant type from the given reference and alternate allele.
Args:
ref: the reference allele
alt: the alternate allele
Returns:
the variant type
"""
variant_type: VariantType
if "<" in alt:
variant_type = VariantType.OTHER
elif (len(ref) == 1) and (len(alt) == 1):
variant_type = VariantType.SNP
elif len(ref) == len(alt):
variant_type = VariantType.MNV
elif len(ref) < len(alt):
variant_type = VariantType.INSERTION
elif len(ref) > len(alt):
variant_type = VariantType.DELETION
else:
raise ValueError(f"Could not determine variant type for ref `{ref}` and alt `{alt}`")
return variant_type
|
Functions
cached
cached(
vcf_paths: list[Path],
min_maf: float,
include_missing_mafs: bool = False,
) -> VariantOverlapDetector
Constructs a VariantLookup that caches all variants in memory for fast lookup.
Appropriate for small VCFs.
Source code in prymer/api/variant_lookup.py
| def cached(
vcf_paths: list[Path], min_maf: float, include_missing_mafs: bool = False
) -> VariantOverlapDetector:
"""Constructs a `VariantLookup` that caches all variants in memory for fast lookup.
Appropriate for small VCFs."""
return VariantOverlapDetector(
vcf_paths=vcf_paths, min_maf=min_maf, include_missing_mafs=include_missing_mafs
)
|
calc_maf_from_filter
calc_maf_from_filter(
variant: VariantRecord,
) -> Optional[float]
Calculates minor allele frequency (MAF) based on whether VariantRecord denotes
CAF, AF, AC and AN in the INFO field. In none of those are found, looks in the FORMAT
field for genotypes and calculates MAF.
If the variant has multiple alternate alleles, then the returned MAF is the sum of MAFs across
all alternate alleles.
Parameters:
| Name |
Type |
Description |
Default |
variant |
VariantRecord
|
the variant from which the MAF will be computed
|
required
|
Returns:
| Type |
Description |
Optional[float]
|
the minor allele frequency (MAF)
|
Source code in prymer/api/variant_lookup.py
| def calc_maf_from_filter(variant: pysam.VariantRecord) -> Optional[float]:
"""Calculates minor allele frequency (MAF) based on whether `VariantRecord` denotes
CAF, AF, AC and AN in the INFO field. In none of those are found, looks in the FORMAT
field for genotypes and calculates MAF.
If the variant has multiple _alternate_ alleles, then the returned MAF is the sum of MAFs across
all alternate alleles.
Args:
variant: the variant from which the MAF will be computed
Returns:
the minor allele frequency (MAF)
"""
maf: Optional[float] = None
if "CAF" in variant.info:
# Assumes CAF is a list of allele frequencies, with the first being the reference allele
maf = 1 - float(variant.info["CAF"][0])
elif "AF" in variant.info:
maf = sum(float(af) for af in variant.info["AF"])
elif "AC" in variant.info and "AN" in variant.info:
ac = sum(int(ac) for ac in variant.info["AC"])
an = int(variant.info["AN"])
maf = ac / an
elif len(list(variant.samples)) > 0: # if genotypes are not empty
gts = [idx for sample in variant.samples.values() for idx in sample["GT"] if "GT" in sample]
if len(gts) > 0:
num_alt = sum(1 for idx in gts if idx != 0)
maf = num_alt / len(gts)
return maf
|
disk_based
disk_based(
vcf_paths: list[Path],
min_maf: float,
include_missing_mafs: bool = False,
) -> FileBasedVariantLookup
Constructs a VariantLookup that queries indexed VCFs on disk for each lookup.
Appropriate for large VCFs.
Example:
>>> with disk_based([Path("./tests/api/data/miniref.variants.vcf.gz")], min_maf=0.0) as lookup:
... lookup.query(refname="chr2", start=7999, end=8000)
[SimpleVariant(id='complex-variant-sv-1/1', refname='chr2', pos=8000, ref='T', alt='<DEL>', end=8000, variant_type=<VariantType.OTHER: 'OTHER'>, maf=None)]
Source code in prymer/api/variant_lookup.py
| def disk_based(
vcf_paths: list[Path], min_maf: float, include_missing_mafs: bool = False
) -> FileBasedVariantLookup:
"""Constructs a `VariantLookup` that queries indexed VCFs on disk for each lookup.
Appropriate for large VCFs.
Example:
```python
>>> with disk_based([Path("./tests/api/data/miniref.variants.vcf.gz")], min_maf=0.0) as lookup:
... lookup.query(refname="chr2", start=7999, end=8000)
[SimpleVariant(id='complex-variant-sv-1/1', refname='chr2', pos=8000, ref='T', alt='<DEL>', end=8000, variant_type=<VariantType.OTHER: 'OTHER'>, maf=None)]
```
""" # noqa: E501
return FileBasedVariantLookup(
vcf_paths=vcf_paths, min_maf=min_maf, include_missing_mafs=include_missing_mafs
)
|