Skip to content

offtarget

Classes

BwaAlnInteractive

Bases: ExecutableRunner

Wrapper around a novel mode of 'bwa aln' that allows for "interactive" use of bwa to keep the process running and be able to send it chunks of reads periodically and get alignments back without waiting for a full batch of reads to be sent.

See:

Attributes:

Name Type Description
max_hits int

the maximum number of hits to report - if more than this number of seed hits are found, report only the count and not each hit.

reverse_complement bool

reverse complement each query sequence before alignment.

include_alt_hits bool

if True include hits to references with names ending in _alt, otherwise do not include them.

header

the SAM alignment header.

Source code in prymer/offtarget/bwa.py
class BwaAlnInteractive(ExecutableRunner):
    """Wrapper around a novel mode of 'bwa aln' that allows for "interactive" use of bwa to keep
    the process running and be able to send it chunks of reads periodically and get alignments
    back without waiting for a full batch of reads to be sent.

    See:

    - [https://github.com/fulcrumgenomics/bwa-aln-interactive](https://github.com/fulcrumgenomics/bwa-aln-interactive)
    - [https://bioconda.github.io/recipes/bwa-aln-interactive/README.html](https://bioconda.github.io/recipes/bwa-aln-interactive/README.html)

    Attributes:
        max_hits: the maximum number of hits to report - if more than this number of seed hits
                  are found, report only the count and not each hit.
        reverse_complement: reverse complement each query sequence before alignment.
        include_alt_hits: if True include hits to references with names ending in _alt, otherwise
                          do not include them.
        header: the SAM alignment header.
    """

    def __init__(
        self,
        ref: Path,
        max_hits: int,
        executable: str | Path = BWA_EXECUTABLE_NAME,
        max_mismatches: int = 3,
        max_mismatches_in_seed: int = 3,
        max_gap_opens: int = 0,
        max_gap_extensions: int = -1,
        min_indel_to_end_distance: int = 3,
        seed_length: int = 20,
        reverse_complement: bool = False,
        include_alt_hits: bool = False,
        threads: Optional[int] = None,
    ):
        """
        Args:
            ref: the path to the reference FASTA, which must be indexed with bwa.
            max_hits: the maximum number of hits to report - if more than this number of seed hits
                      are found, report only the count and not each hit.
            executable: string or Path representation of the `bwa-aln-interactive` executable path
            max_mismatches: the maximum number of mismatches allowed in the full query sequence
            max_mismatches_in_seed: the maximum number of mismatches allowed in the seed region
            max_gap_opens: the maximum number of gap opens allowed in the full query sequence
            max_gap_extensions: the maximum number of gap extensions allowed in the full query
                                sequence
            min_indel_to_end_distance: do not place an indel within this many bp of the ends of
                the query sequence
            seed_length: the length of the seed region
            reverse_complement: reverse complement each query sequence before alignment
            include_alt_hits: if true include hits to references with names ending in _alt,
                              otherwise do not include them.
            threads: the number of threads to use.  If `None`, use all available CPUs.
        """
        threads = os.cpu_count() if threads is None else threads
        executable_path = ExecutableRunner.validate_executable_path(executable=executable)
        self.reverse_complement: bool = reverse_complement
        self.include_alt_hits: bool = include_alt_hits
        self.max_hits: int = max_hits

        missing_aux_paths = []
        for aux_ext in BWA_AUX_EXTENSIONS:
            aux_path = Path(f"{ref}{aux_ext}")
            if not aux_path.exists():
                missing_aux_paths.append(aux_path)
        if len(missing_aux_paths) > 0:
            message: str
            if len(missing_aux_paths) > 1:
                message = "BWA index files do not exist:\n\t"
            else:
                message = "BWA index file does not exist:\n\t"
            message += "\t\n".join(f"{p}" for p in missing_aux_paths)
            raise FileNotFoundError(f"{message}\nIndex with: `{executable_path} index {ref}`")

        # -N = non-iterative mode: search for all n-difference hits (slooow)
        # -S = output SAM (run samse)
        # -Z = interactive mode (no input buffer and force processing with empty lines between recs)
        command: list[str] = [
            f"{executable_path}",
            "aln",
            "-t",
            f"{threads}",
            "-n",
            f"{max_mismatches}",
            "-o",
            f"{max_gap_opens}",
            "-e",
            f"{max_gap_extensions}",
            "-i",
            f"{min_indel_to_end_distance}",
            "-l",
            f"{seed_length}",
            "-k",
            f"{max_mismatches_in_seed}",
            "-X",
            f"{max_hits}",
            "-N",
            "-S",
            "-Z",
            "-D",
            f"{ref}",
            "/dev/stdin",
        ]

        super().__init__(command=command, stderr=subprocess.PIPE)

        header = []
        for line in self._subprocess.stdout:
            if line.startswith("@"):
                header.append(line)
            if line.startswith("@PG"):
                break

        self.header = AlignmentHeader.from_text("".join(header))

        # NB: ExecutableRunner will default to redirecting stderr to /dev/null. However, we would
        # like to preserve stderr messages from bwa for potential debugging. To do this, we create
        # a single thread to continuously read from stderr and redirect text lines to a debug
        # logger. The close() method of this class will additionally join the stderr logging thread.
        self._logger = logging.getLogger(self.__class__.__qualname__)
        self._stderr_thread = Thread(
            daemon=True,
            target=self._stream_to_sink,
            args=(self._subprocess.stderr, self._logger.debug),
        )
        self._stderr_thread.start()

    def __signal_bwa(self) -> None:
        """Signals BWA to process the queries."""
        self._subprocess.stdin.flush()
        # NB: the executable compiled on different platforms requires a different number of newlines
        # NB: it is not understood why, but 16 newlines seems to work for all platforms tested
        self._subprocess.stdin.write("\n" * 16)
        self._subprocess.stdin.flush()

    @override
    def close(self) -> bool:
        """
        Gracefully terminates the underlying subprocess if it is still running.

        Returns:
            True: if the subprocess was terminated successfully
            False: if the subprocess failed to terminate or was not already running
        """
        safely_closed: bool = super().close()
        self._stderr_thread.join()
        return safely_closed

    def map_one(self, query: str, id: str = "unknown") -> BwaResult:
        """Maps a single query to the genome and returns the result.

        Args:
            query: the query to map with BWA

        Returns:
            a `BwaResult` based on mapping the query
        """
        return self.map_all([Query(bases=query, id=id)])[0]

    def map_all(self, queries: list[Query]) -> list[BwaResult]:
        """Maps multiple queries and returns the results.  This is more efficient than using
        `map_one` on each query one-by-one as it batches reads to BWA.

        Args:
            queries: the queries to map with BWA

        Returns:
            one `BwaResult`s for each query
        """
        if len(queries) == 0:
            return []

        # Send the reads to BWA
        for query in queries:
            fastq_str = query.to_fastq(reverse_complement=self.reverse_complement)
            self._subprocess.stdin.write(fastq_str)

        # Force the input to be sent to the underlying process.
        self.__signal_bwa()

        # Read back the results
        results: list[BwaResult] = []
        for query in queries:
            # get the next alignment and convert to a result
            line: str = next(self._subprocess.stdout).strip()
            assert not line.startswith("@"), f"SAM record must not start with '@'! {line}"
            alignment = AlignedSegment.fromstring(line, self.header)
            results.append(self._to_result(query=query, rec=alignment))

        return results

    def _to_result(self, query: Query, rec: pysam.AlignedSegment) -> BwaResult:
        """
        Convert the query and alignment to a result.

        Args:
            query: the original query
            rec: the alignment
        """
        if query.id != rec.query_name:
            raise ValueError(
                "Query and Results are out of order" f"Query=${query.id}, Result=${rec.query_name}"
            )

        num_hits: int = int(rec.get_tag("HN")) if rec.has_tag("HN") else 0
        # `to_hits()` removes artifactual hits which span the boundary between concatenated
        # reference sequences. If we are reporting a list of hits, the number of hits should match
        # the size of this list. Otherwise, we either have zero hits, or more than the maximum
        # number of hits. In both of the latter cases, we have to rely on the count reported in the
        # `HN` tag.
        hits: list[BwaHit]
        if 0 < num_hits <= self.max_hits:
            hits = self.to_hits(rec=rec)
            num_hits = len(hits)
        else:
            hits = []

        return BwaResult(query=query, hit_count=num_hits, hits=hits)

    def to_hits(self, rec: AlignedSegment) -> list[BwaHit]:
        """Extracts the hits from the given alignment.  Beyond the current alignment
        additional alignments are parsed from the XA SAM tag.

        Args:
            rec: the given alignment
        """
        negative = rec.is_reverse != self.reverse_complement
        first_hit: BwaHit = BwaHit(
            refname=rec.reference_name,
            start=rec.reference_start + 1,  # NB: pysam is 0-based, Hit is 1-based
            negative=negative,
            cigar=Cigar.from_cigartuples(rec.cigartuples),
            edits=int(rec.get_tag("NM")),
        )

        # Add the hits
        # E.g. XA:Z:chr4,-97592047,24M,3;chr8,-32368749,24M,3;
        hits: list[BwaHit] = [first_hit]
        if rec.has_tag("XA"):
            for xa in cast(str, rec.get_tag("XA")).split(";"):
                if xa == "":
                    continue
                fields = xa.split(",")

                # If the reverse-complement of the query sequence was fed to BWA, various fields
                # should be negated/reversed in the Hit
                negative = fields[1][0] == "-"
                if self.reverse_complement:
                    negative = not negative

                hit: BwaHit = BwaHit(
                    refname=fields[0],
                    start=int(fields[1][1:]),  # the XA tag is 1-based
                    negative=negative,
                    cigar=Cigar.from_cigarstring(fields[2]),
                    edits=int(fields[3]),
                )
                hits.append(hit)

        if not self.include_alt_hits:
            hits = [hit for hit in hits if not hit.refname.endswith("_alt")]

        # Remove hits that extend beyond the end of a contig - these are artifacts of `bwa aln`'s
        # alignment process, which concatenates all reference sequences and reports hits which span
        # across contigs.
        # NB: the type ignore is necessary because pysam's type hint for `get_reference_length` is
        # incorrect.
        # https://github.com/pysam-developers/pysam/pull/1313
        hits = [hit for hit in hits if hit.end <= self.header.get_reference_length(hit.refname)]  # type: ignore[arg-type]  # noqa: E501

        return hits

Functions

__init__
__init__(
    ref: Path,
    max_hits: int,
    executable: str | Path = BWA_EXECUTABLE_NAME,
    max_mismatches: int = 3,
    max_mismatches_in_seed: int = 3,
    max_gap_opens: int = 0,
    max_gap_extensions: int = -1,
    min_indel_to_end_distance: int = 3,
    seed_length: int = 20,
    reverse_complement: bool = False,
    include_alt_hits: bool = False,
    threads: Optional[int] = None,
)

Parameters:

Name Type Description Default
ref Path

the path to the reference FASTA, which must be indexed with bwa.

required
max_hits int

the maximum number of hits to report - if more than this number of seed hits are found, report only the count and not each hit.

required
executable str | Path

string or Path representation of the bwa-aln-interactive executable path

BWA_EXECUTABLE_NAME
max_mismatches int

the maximum number of mismatches allowed in the full query sequence

3
max_mismatches_in_seed int

the maximum number of mismatches allowed in the seed region

3
max_gap_opens int

the maximum number of gap opens allowed in the full query sequence

0
max_gap_extensions int

the maximum number of gap extensions allowed in the full query sequence

-1
min_indel_to_end_distance int

do not place an indel within this many bp of the ends of the query sequence

3
seed_length int

the length of the seed region

20
reverse_complement bool

reverse complement each query sequence before alignment

False
include_alt_hits bool

if true include hits to references with names ending in _alt, otherwise do not include them.

False
threads Optional[int]

the number of threads to use. If None, use all available CPUs.

None
Source code in prymer/offtarget/bwa.py
def __init__(
    self,
    ref: Path,
    max_hits: int,
    executable: str | Path = BWA_EXECUTABLE_NAME,
    max_mismatches: int = 3,
    max_mismatches_in_seed: int = 3,
    max_gap_opens: int = 0,
    max_gap_extensions: int = -1,
    min_indel_to_end_distance: int = 3,
    seed_length: int = 20,
    reverse_complement: bool = False,
    include_alt_hits: bool = False,
    threads: Optional[int] = None,
):
    """
    Args:
        ref: the path to the reference FASTA, which must be indexed with bwa.
        max_hits: the maximum number of hits to report - if more than this number of seed hits
                  are found, report only the count and not each hit.
        executable: string or Path representation of the `bwa-aln-interactive` executable path
        max_mismatches: the maximum number of mismatches allowed in the full query sequence
        max_mismatches_in_seed: the maximum number of mismatches allowed in the seed region
        max_gap_opens: the maximum number of gap opens allowed in the full query sequence
        max_gap_extensions: the maximum number of gap extensions allowed in the full query
                            sequence
        min_indel_to_end_distance: do not place an indel within this many bp of the ends of
            the query sequence
        seed_length: the length of the seed region
        reverse_complement: reverse complement each query sequence before alignment
        include_alt_hits: if true include hits to references with names ending in _alt,
                          otherwise do not include them.
        threads: the number of threads to use.  If `None`, use all available CPUs.
    """
    threads = os.cpu_count() if threads is None else threads
    executable_path = ExecutableRunner.validate_executable_path(executable=executable)
    self.reverse_complement: bool = reverse_complement
    self.include_alt_hits: bool = include_alt_hits
    self.max_hits: int = max_hits

    missing_aux_paths = []
    for aux_ext in BWA_AUX_EXTENSIONS:
        aux_path = Path(f"{ref}{aux_ext}")
        if not aux_path.exists():
            missing_aux_paths.append(aux_path)
    if len(missing_aux_paths) > 0:
        message: str
        if len(missing_aux_paths) > 1:
            message = "BWA index files do not exist:\n\t"
        else:
            message = "BWA index file does not exist:\n\t"
        message += "\t\n".join(f"{p}" for p in missing_aux_paths)
        raise FileNotFoundError(f"{message}\nIndex with: `{executable_path} index {ref}`")

    # -N = non-iterative mode: search for all n-difference hits (slooow)
    # -S = output SAM (run samse)
    # -Z = interactive mode (no input buffer and force processing with empty lines between recs)
    command: list[str] = [
        f"{executable_path}",
        "aln",
        "-t",
        f"{threads}",
        "-n",
        f"{max_mismatches}",
        "-o",
        f"{max_gap_opens}",
        "-e",
        f"{max_gap_extensions}",
        "-i",
        f"{min_indel_to_end_distance}",
        "-l",
        f"{seed_length}",
        "-k",
        f"{max_mismatches_in_seed}",
        "-X",
        f"{max_hits}",
        "-N",
        "-S",
        "-Z",
        "-D",
        f"{ref}",
        "/dev/stdin",
    ]

    super().__init__(command=command, stderr=subprocess.PIPE)

    header = []
    for line in self._subprocess.stdout:
        if line.startswith("@"):
            header.append(line)
        if line.startswith("@PG"):
            break

    self.header = AlignmentHeader.from_text("".join(header))

    # NB: ExecutableRunner will default to redirecting stderr to /dev/null. However, we would
    # like to preserve stderr messages from bwa for potential debugging. To do this, we create
    # a single thread to continuously read from stderr and redirect text lines to a debug
    # logger. The close() method of this class will additionally join the stderr logging thread.
    self._logger = logging.getLogger(self.__class__.__qualname__)
    self._stderr_thread = Thread(
        daemon=True,
        target=self._stream_to_sink,
        args=(self._subprocess.stderr, self._logger.debug),
    )
    self._stderr_thread.start()
__signal_bwa
__signal_bwa() -> None

Signals BWA to process the queries.

Source code in prymer/offtarget/bwa.py
def __signal_bwa(self) -> None:
    """Signals BWA to process the queries."""
    self._subprocess.stdin.flush()
    # NB: the executable compiled on different platforms requires a different number of newlines
    # NB: it is not understood why, but 16 newlines seems to work for all platforms tested
    self._subprocess.stdin.write("\n" * 16)
    self._subprocess.stdin.flush()
close
close() -> bool

Gracefully terminates the underlying subprocess if it is still running.

Returns:

Name Type Description
True bool

if the subprocess was terminated successfully

False bool

if the subprocess failed to terminate or was not already running

Source code in prymer/offtarget/bwa.py
@override
def close(self) -> bool:
    """
    Gracefully terminates the underlying subprocess if it is still running.

    Returns:
        True: if the subprocess was terminated successfully
        False: if the subprocess failed to terminate or was not already running
    """
    safely_closed: bool = super().close()
    self._stderr_thread.join()
    return safely_closed
map_all
map_all(queries: list[Query]) -> list[BwaResult]

Maps multiple queries and returns the results. This is more efficient than using map_one on each query one-by-one as it batches reads to BWA.

Parameters:

Name Type Description Default
queries list[Query]

the queries to map with BWA

required

Returns:

Type Description
list[BwaResult]

one BwaResults for each query

Source code in prymer/offtarget/bwa.py
def map_all(self, queries: list[Query]) -> list[BwaResult]:
    """Maps multiple queries and returns the results.  This is more efficient than using
    `map_one` on each query one-by-one as it batches reads to BWA.

    Args:
        queries: the queries to map with BWA

    Returns:
        one `BwaResult`s for each query
    """
    if len(queries) == 0:
        return []

    # Send the reads to BWA
    for query in queries:
        fastq_str = query.to_fastq(reverse_complement=self.reverse_complement)
        self._subprocess.stdin.write(fastq_str)

    # Force the input to be sent to the underlying process.
    self.__signal_bwa()

    # Read back the results
    results: list[BwaResult] = []
    for query in queries:
        # get the next alignment and convert to a result
        line: str = next(self._subprocess.stdout).strip()
        assert not line.startswith("@"), f"SAM record must not start with '@'! {line}"
        alignment = AlignedSegment.fromstring(line, self.header)
        results.append(self._to_result(query=query, rec=alignment))

    return results
map_one
map_one(query: str, id: str = 'unknown') -> BwaResult

Maps a single query to the genome and returns the result.

Parameters:

Name Type Description Default
query str

the query to map with BWA

required

Returns:

Type Description
BwaResult

a BwaResult based on mapping the query

Source code in prymer/offtarget/bwa.py
def map_one(self, query: str, id: str = "unknown") -> BwaResult:
    """Maps a single query to the genome and returns the result.

    Args:
        query: the query to map with BWA

    Returns:
        a `BwaResult` based on mapping the query
    """
    return self.map_all([Query(bases=query, id=id)])[0]
to_hits
to_hits(rec: AlignedSegment) -> list[BwaHit]

Extracts the hits from the given alignment. Beyond the current alignment additional alignments are parsed from the XA SAM tag.

Parameters:

Name Type Description Default
rec AlignedSegment

the given alignment

required
Source code in prymer/offtarget/bwa.py
def to_hits(self, rec: AlignedSegment) -> list[BwaHit]:
    """Extracts the hits from the given alignment.  Beyond the current alignment
    additional alignments are parsed from the XA SAM tag.

    Args:
        rec: the given alignment
    """
    negative = rec.is_reverse != self.reverse_complement
    first_hit: BwaHit = BwaHit(
        refname=rec.reference_name,
        start=rec.reference_start + 1,  # NB: pysam is 0-based, Hit is 1-based
        negative=negative,
        cigar=Cigar.from_cigartuples(rec.cigartuples),
        edits=int(rec.get_tag("NM")),
    )

    # Add the hits
    # E.g. XA:Z:chr4,-97592047,24M,3;chr8,-32368749,24M,3;
    hits: list[BwaHit] = [first_hit]
    if rec.has_tag("XA"):
        for xa in cast(str, rec.get_tag("XA")).split(";"):
            if xa == "":
                continue
            fields = xa.split(",")

            # If the reverse-complement of the query sequence was fed to BWA, various fields
            # should be negated/reversed in the Hit
            negative = fields[1][0] == "-"
            if self.reverse_complement:
                negative = not negative

            hit: BwaHit = BwaHit(
                refname=fields[0],
                start=int(fields[1][1:]),  # the XA tag is 1-based
                negative=negative,
                cigar=Cigar.from_cigarstring(fields[2]),
                edits=int(fields[3]),
            )
            hits.append(hit)

    if not self.include_alt_hits:
        hits = [hit for hit in hits if not hit.refname.endswith("_alt")]

    # Remove hits that extend beyond the end of a contig - these are artifacts of `bwa aln`'s
    # alignment process, which concatenates all reference sequences and reports hits which span
    # across contigs.
    # NB: the type ignore is necessary because pysam's type hint for `get_reference_length` is
    # incorrect.
    # https://github.com/pysam-developers/pysam/pull/1313
    hits = [hit for hit in hits if hit.end <= self.header.get_reference_length(hit.refname)]  # type: ignore[arg-type]  # noqa: E501

    return hits

BwaHit dataclass

Represents a single hit or alignment of a sequence to a location in the genome.

Attributes:

Name Type Description
refname str

the reference name of the hit

start int

the start position of the hit (1-based inclusive)

negative bool

whether the hit is on the negative strand

cigar Cigar

the cigar string returned by BWA

edits int

the number of edits between the read and the reference

Source code in prymer/offtarget/bwa.py
@dataclass(init=True, frozen=True)
class BwaHit:
    """Represents a single hit or alignment of a sequence to a location in the genome.

    Attributes:
        refname: the reference name of the hit
        start: the start position of the hit (1-based inclusive)
        negative: whether the hit is on the negative strand
        cigar: the cigar string returned by BWA
        edits: the number of edits between the read and the reference
    """

    refname: str
    start: int
    negative: bool
    cigar: Cigar
    edits: int

    @property
    def mismatches(self) -> int:
        """The number of mismatches for the hit"""
        indel_sum = sum(elem.length for elem in self.cigar.elements if elem.operator.is_indel)
        if indel_sum > self.edits:
            raise ValueError(
                f"indel_sum ({indel_sum}) > self.edits ({self.edits}) with cigar: {self.cigar}"
            )
        return self.edits - indel_sum

    @property
    def end(self) -> int:
        """The end position of the hit (1-based inclusive)"""
        return coordmath.get_closed_end(self.start, self.cigar.length_on_target())

    @staticmethod
    def build(
        refname: str, start: int, negative: bool, cigar: str, edits: int, revcomp: bool = False
    ) -> "BwaHit":
        """Generates a hit object.

        Args:
            refname: the reference name of the hit
            start: the start position of the hit (1-based inclusive)
            negative: whether the hit is on the negative strand
            cigar: the cigar string returned by BWA
            edits: the number of edits between the read and the reference
            revcomp: whether the reverse-complement of the query sequence was fed to BWA, in which
                case various fields should be negated/reversed in the Hit

        Returns:
            A Hit that represents the mapping of the original query sequence that was supplied
        """
        negative = negative != revcomp
        return BwaHit(
            refname=refname,
            start=start,
            negative=negative,
            cigar=Cigar.from_cigarstring(cigar),
            edits=edits,
        )

    def __str__(self) -> str:
        """Returns the string representation in bwa's XA tag format."""
        # E.g. XA:Z:chr4,-97592047,24M,3;chr8,-32368749,24M,3;
        return ",".join(
            [
                self.refname,
                ("-" if self.negative else "+") + f"{self.start}",
                f"{self.cigar}",
                f"{self.edits}",
            ]
        )

Attributes

end property
end: int

The end position of the hit (1-based inclusive)

mismatches property
mismatches: int

The number of mismatches for the hit

Functions

__str__
__str__() -> str

Returns the string representation in bwa's XA tag format.

Source code in prymer/offtarget/bwa.py
def __str__(self) -> str:
    """Returns the string representation in bwa's XA tag format."""
    # E.g. XA:Z:chr4,-97592047,24M,3;chr8,-32368749,24M,3;
    return ",".join(
        [
            self.refname,
            ("-" if self.negative else "+") + f"{self.start}",
            f"{self.cigar}",
            f"{self.edits}",
        ]
    )
build staticmethod
build(
    refname: str,
    start: int,
    negative: bool,
    cigar: str,
    edits: int,
    revcomp: bool = False,
) -> BwaHit

Generates a hit object.

Parameters:

Name Type Description Default
refname str

the reference name of the hit

required
start int

the start position of the hit (1-based inclusive)

required
negative bool

whether the hit is on the negative strand

required
cigar str

the cigar string returned by BWA

required
edits int

the number of edits between the read and the reference

required
revcomp bool

whether the reverse-complement of the query sequence was fed to BWA, in which case various fields should be negated/reversed in the Hit

False

Returns:

Type Description
BwaHit

A Hit that represents the mapping of the original query sequence that was supplied

Source code in prymer/offtarget/bwa.py
@staticmethod
def build(
    refname: str, start: int, negative: bool, cigar: str, edits: int, revcomp: bool = False
) -> "BwaHit":
    """Generates a hit object.

    Args:
        refname: the reference name of the hit
        start: the start position of the hit (1-based inclusive)
        negative: whether the hit is on the negative strand
        cigar: the cigar string returned by BWA
        edits: the number of edits between the read and the reference
        revcomp: whether the reverse-complement of the query sequence was fed to BWA, in which
            case various fields should be negated/reversed in the Hit

    Returns:
        A Hit that represents the mapping of the original query sequence that was supplied
    """
    negative = negative != revcomp
    return BwaHit(
        refname=refname,
        start=start,
        negative=negative,
        cigar=Cigar.from_cigarstring(cigar),
        edits=edits,
    )

BwaResult dataclass

Represents zero or more hits or alignments found by BWA for the given query.

The number of hits found may be more than the number of hits listed in the hits attribute.

Attributes:

Name Type Description
query Query

the query (as given, no RC'ing)

hit_count int

the total number of hits found by bwa (this may be more than len(hits))

hits list[BwaHit]

the subset of hits that were reported by bwa

Source code in prymer/offtarget/bwa.py
@dataclass(init=True, frozen=True)
class BwaResult:
    """
    Represents zero or more hits or alignments found by BWA for the given query.

    The number of hits found may be more than the number of hits listed in the `hits` attribute.

    Attributes:
        query: the query (as given, no RC'ing)
        hit_count: the total number of hits found by bwa (this may be more than len(hits))
        hits: the subset of hits that were reported by bwa
    """

    query: Query
    hit_count: int
    hits: list[BwaHit]

OffTargetDetector

Bases: AbstractContextManager

A class for detecting off-target mappings of primers and primer pairs that uses a custom version of "bwa aln" named "bwa-aln-interactive".

OffTargetDetector uses a custom, interactive version of bwa aln to perform off-target detection. This approach is faster and more sensitive than traditional isPCR and in addition can correctly detect primers that are repetitive and contain many thousands or millions of mappings to the genome.

Note that while this class invokes BWA with multiple threads, it is not itself thread-safe. Only one thread at a time should invoke methods on this class without external synchronization.

Off-target detection may be performed for individual primers and/or primer pairs.

To detect off-target hits for individual primers, use the OffTargetDetector.filters() method, which will remove any primers that have more than the specified number of maximum hits against the reference.

To detect off-target amplification of primer pairs, use the OffTargetDetector.check_one() or OffTargetDetector.check_all() methods. These methods screen the individual primers in each pair for off-target hits, and verify that the possible amplicons inferred by the primers' alignments does not exceed the specified maximum number of primer pair hits.

Source code in prymer/offtarget/offtarget_detector.py
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
class OffTargetDetector(AbstractContextManager):
    """A class for detecting off-target mappings of primers and primer pairs that uses a custom
    version of "bwa aln" named "bwa-aln-interactive".

    `OffTargetDetector` uses a [custom, interactive
    version](https://github.com/fulcrumgenomics/bwa-aln-interactive/) of `bwa aln` to perform
    off-target detection. This approach is faster and more sensitive than traditional isPCR and in
    addition can correctly detect primers that are repetitive and contain many thousands or millions
    of mappings to the genome.

    Note that while this class invokes BWA with multiple threads, it is not itself thread-safe.
    Only one thread at a time should invoke methods on this class without external synchronization.

    Off-target detection may be performed for individual primers and/or primer pairs.

    To detect off-target hits for individual primers, use the `OffTargetDetector.filters()` method,
    which will remove any primers that have more than the specified number of maximum hits against
    the reference.

    To detect off-target amplification of primer pairs, use the `OffTargetDetector.check_one()` or
    `OffTargetDetector.check_all()` methods. These methods screen the individual primers in each
    pair for off-target hits, and verify that the possible amplicons inferred by the primers'
    alignments does not exceed the specified maximum number of primer pair hits.
    """

    def __init__(  # noqa: C901
        self,
        ref: Path,
        max_primer_hits: int,
        max_primer_pair_hits: int,
        three_prime_region_length: int,
        max_mismatches_in_three_prime_region: int,
        max_mismatches: int,
        max_gap_opens: int = 0,
        max_gap_extends: int = -1,
        max_amplicon_size: int = 1000,
        min_primer_pair_hits: int = 1,
        cache_results: bool = True,
        threads: Optional[int] = None,
        keep_spans: bool = True,
        keep_primer_spans: bool = True,
        executable: str | Path = BWA_EXECUTABLE_NAME,
    ) -> None:
        """
        Initialize an [[OffTargetDetector]].

        This class accepts a large number of parameters to control the behavior of the off-target
        detection. The parameters are used in the various aspects of off-target detection as
        follows:

        1. Alignment (off-target hit detection): `ref`, `executable`, `threads`,
           `three_prime_region_length`, `max_mismatches_in_three_prime_region`, `max_mismatches`,
           and `max_primer_hits`.
        2. Filtering of individual primers: `max_primer_hits`.
        3. Checking of primer pairs: `max_primer_hits`, `min_primer_pair_hits`,
           `max_primer_pair_hits`, and `max_amplicon_size`.

        The `three_prime_region_length` parameter is used as the seed length for `bwa aln`.

        Args:
            ref: the reference genome fasta file (must be indexed with BWA)
            max_primer_hits: the maximum number of hits an individual primer can have in the genome
                before it is considered an invalid primer, and all primer pairs containing the
                primer failed. Must be greater than or equal to 0.
            max_primer_pair_hits: the maximum number of amplicons a primer pair can make and be
                considered passing. Must be greater than or equal to 0.
            min_primer_pair_hits: The minimum number of amplicons a primer pair can make and be
                considered passing. (In most cases, this is the number of amplicons a primer pair is
                expected to generate.) The default is 1, which is appropriate when the primer pair
                is being evaluated for off-target hits against the same reference genome from which
                the primers were generated. If the primer pair was generated from a different
                reference sequence, it may be appropriate to set this value to 0. Must be greater
                    than or equal to 0.
            three_prime_region_length: the number of bases at the 3' end of the primer in which the
                parameter `max_mismatches_in_three_prime_region` is evaluated. This value is used as
                the seed length (`bwa aln -l`). Must be a minimum of 8.
            max_mismatches_in_three_prime_region: the maximum number of mismatches that are
                tolerated in the three prime region of each primer defined by
                `three_prime_region_length`. Must be between 0 and `three_prime_region_length`,
                inclusive.
            max_mismatches: the maximum number of mismatches allowed in the full length primer
                (including any in the three prime region). Must be greater than or equal to 0.
            max_gap_opens: the maximum number of gaps (insertions or deletions) allowable in an
                alignment of a oligo to the reference. Must be greater than or equal to 0.
            max_gap_extends: the maximum number of gap extensions allowed; extending a gap
                beyond a single base costs 1 gap extension.  Can be set to -1 to allow
                unlimited extensions up to max diffs (aka max mismatches), while disallowing
                "long gaps". Must be greater than or equal to -1.
            max_amplicon_size: the maximum amplicon size to consider amplifiable. Must be greater
                than 0.
            cache_results: if True, cache results for faster re-querying
            threads: the number of threads to use when invoking bwa
            keep_spans: if True, [[OffTargetResult]] objects will be reported with amplicon spans
                populated, otherwise not
            keep_primer_spans: if True, [[OffTargetResult]] objects will be reported with left and
                right primer spans
            executable: string or Path representation of the `bwa` executable path

        Raises:
            ValueError: If `max_amplicon_size` is not greater than 0.
            ValueError: If any of `max_primer_hits`, `max_primer_pair_hits`, or
                `min_primer_pair_hits` are not greater than or equal to 0.
            ValueError: If `three_prime_region_length` is not greater than or equal to 8.
            ValueError: If `max_mismatches_in_three_prime_region` is outside the range 0 to
                `three_prime_region_length`, inclusive.
            ValueError: If `max_mismatches` is not greater than or equal to 0.
            ValueError: If `max_gap_opens` is not greater than or equal to 0.
            ValueError: If `max_gap_extends` is not -1 or greater than or equal to 0.
        """
        errors: list[str] = []
        if max_amplicon_size < 1:
            errors.append(f"'max_amplicon_size' must be greater than 0. Saw {max_amplicon_size}")
        if max_primer_hits < 0:
            errors.append(
                f"'max_primer_hits' must be greater than or equal to 0. Saw {max_primer_hits}"
            )
        if max_primer_pair_hits < 0:
            errors.append(
                "'max_primer_pair_hits' must be greater than or equal to 0. "
                f"Saw {max_primer_pair_hits}"
            )
        if min_primer_pair_hits < 0:
            errors.append(
                "'min_primer_pair_hits' must be greater than or equal to 0. "
                f"Saw {min_primer_pair_hits}"
            )
        if three_prime_region_length < MINIMUM_THREE_PRIME_REGION_LENGTH:
            errors.append(
                "'three_prime_region_length' must be greater than or equal to "
                f"{MINIMUM_THREE_PRIME_REGION_LENGTH}. Saw {three_prime_region_length}"
            )
        if (
            max_mismatches_in_three_prime_region < 0
            or max_mismatches_in_three_prime_region > three_prime_region_length
        ):
            errors.append(
                "'max_mismatches_in_three_prime_region' must be between 0 and "
                f"'three_prime_region_length'={three_prime_region_length} inclusive. "
                f"Saw {max_mismatches_in_three_prime_region}"
            )
        if max_mismatches < 0:
            errors.append(
                f"'max_mismatches' must be greater than or equal to 0. Saw {max_mismatches}"
            )
        if max_gap_opens < 0:
            errors.append(
                f"'max_gap_opens' must be greater than or equal to 0. Saw {max_gap_opens}"
            )
        if max_gap_extends < -1:
            errors.append(
                "'max_gap_extends' must be -1 (for unlimited extensions up to 'max_mismatches'="
                f"{max_mismatches}) or greater than or equal to 0. Saw {max_gap_extends}"
            )

        if len(errors) > 0:
            raise ValueError("\n".join(errors))

        self._primer_cache: dict[str, BwaResult] = {}
        self._primer_pair_cache: dict[PrimerPair, OffTargetResult] = {}
        self._bwa = BwaAlnInteractive(
            executable=executable,
            ref=ref,
            reverse_complement=True,
            threads=threads,
            seed_length=three_prime_region_length,
            max_mismatches_in_seed=max_mismatches_in_three_prime_region,
            max_mismatches=max_mismatches,
            max_gap_opens=max_gap_opens,
            max_gap_extensions=max_gap_extends,
            max_hits=max_primer_hits,
        )
        self._max_primer_hits = max_primer_hits
        self._max_primer_pair_hits: int = max_primer_pair_hits
        self._min_primer_pair_hits: int = min_primer_pair_hits
        self._max_amplicon_size: int = max_amplicon_size
        self._cache_results: bool = cache_results
        self._keep_spans: bool = keep_spans
        self._keep_primer_spans: bool = keep_primer_spans

    def filter(self, primers: list[PrimerType]) -> list[PrimerType]:
        """
        Remove primers that have more than `max_primer_hits` mappings to the genome.

        This method maps each primer to the specified reference with `bwa aln` to search for
        off-target hits. Note that when the reference includes the sequence from which the primers
        were designed, the on-target hit will be included in the hit count. `max_primer_hits` should
        be set to at least 1 in this case.

        Args:
            primers: A list of primers to filter.

        Returns:
            The input primers, filtered to remove any primers with more than `max_primer_hits`
            mappings to the genome.
        """
        results: dict[str, BwaResult] = self.mappings_of(primers)
        return [
            primer for primer in primers if results[primer.bases].hit_count <= self._max_primer_hits
        ]

    def check_one(self, primer_pair: PrimerPair) -> OffTargetResult:
        """
        Check an individual primer pair for off-target amplification.

        See `OffTargetDetector.check_all()` for details.

        Args:
            primer_pair: The primer pair to check.

        Returns:
            The results of the off-target detection.
            See `OffTargetResult` for details regarding the attributes included in the result.
        """
        result: dict[PrimerPair, OffTargetResult] = self.check_all([primer_pair])
        return result[primer_pair]

    def check_all(self, primer_pairs: list[PrimerPair]) -> dict[PrimerPair, OffTargetResult]:
        """
        Check a collection of primer pairs for off-target amplification.

        This method maps each primer to the specified reference with `bwa aln` to search for
        off-target hits. Possible amplicons are identified from the pairwise combinations of these
        alignments, up to the specified `max_amplicon_size`.

        Primer pairs are marked as passing if both of the following are true:
        1. Each primer has no more than `max_primer_hits` alignments to the genome.
        2. The size of the set of possible amplicons does not exceed `max_primer_pair_hits`, and is
           at least `min_primer_pair_hits`.

        Args:
            primer_pairs: The primer pairs to check.

        Returns:
            A dictionary mapping each checked primer pair to its off-target detection results.
            See `OffTargetResult` for details regarding the attributes included in each result.
        """

        primer_pair_results: dict[PrimerPair, OffTargetResult] = {}
        result: OffTargetResult

        # Get the primer pairs to map.  If the primer pair is found in the cache, use that
        primer_pairs_to_map: list[PrimerPair] = []
        if not self._cache_results:
            primer_pairs_to_map = primer_pairs
        else:
            for primer_pair in primer_pairs:
                match self._primer_pair_cache.get(primer_pair, None):
                    case None:
                        primer_pairs_to_map.append(primer_pair)  # Map it!
                    case result:
                        primer_pair_results[primer_pair] = result

        # If there are no primer pairs to map, return the results
        if len(primer_pairs_to_map) == 0:
            return primer_pair_results

        # Get mappings of all the primers
        primers = [primer for primer_pair in primer_pairs for primer in primer_pair]
        hits_by_primer = self.mappings_of(primers)

        for primer_pair in primer_pairs:
            primer_pair_results[primer_pair] = self._build_off_target_result(
                primer_pair=primer_pair, hits_by_primer=hits_by_primer
            )

        return primer_pair_results

    def _build_off_target_result(
        self, primer_pair: PrimerPair, hits_by_primer: dict[str, BwaResult]
    ) -> OffTargetResult:
        """Builds an `OffTargetResult` for the given `PrimerPair`.

        If there are too many primer hits for either the left or right primer, set `passes` to
        `False` on the returned `OffTargetResult`, otherwise generate all possible amplicons with
        size less than the given maximum to generate off target results.

        Args:
            primer_pair: the primer pair from which the off-target result is built.
            hits_by_primer: mapping of primer bases to `BwaHits`.
        """
        result: OffTargetResult

        # Get the mappings for the left primer and right primer respectively
        left_bwa_result: BwaResult = hits_by_primer[primer_pair.left_primer.bases]
        right_bwa_result: BwaResult = hits_by_primer[primer_pair.right_primer.bases]

        # If there are too many hits, this primer pair will not pass. Exit early.
        if (
            left_bwa_result.hit_count > self._max_primer_hits
            or right_bwa_result.hit_count > self._max_primer_hits
        ):
            result = OffTargetResult(primer_pair=primer_pair, passes=False)
            if self._cache_results:
                self._primer_pair_cache[primer_pair] = replace(result, cached=True)
            return result

        # Map the hits by reference name
        left_positive_hits: defaultdict[ReferenceName, list[BwaHit]] = defaultdict(list)
        left_negative_hits: defaultdict[ReferenceName, list[BwaHit]] = defaultdict(list)
        right_positive_hits: defaultdict[ReferenceName, list[BwaHit]] = defaultdict(list)
        right_negative_hits: defaultdict[ReferenceName, list[BwaHit]] = defaultdict(list)

        # Split the hits for left and right by reference name and strand
        for hit in left_bwa_result.hits:
            if hit.negative:
                left_negative_hits[hit.refname].append(hit)
            else:
                left_positive_hits[hit.refname].append(hit)

        for hit in right_bwa_result.hits:
            if hit.negative:
                right_negative_hits[hit.refname].append(hit)
            else:
                right_positive_hits[hit.refname].append(hit)

        refnames: set[ReferenceName] = {
            h.refname for h in itertools.chain(left_bwa_result.hits, right_bwa_result.hits)
        }

        # Build amplicons from hits on the same reference with valid relative orientation
        amplicons: list[Span] = []
        for refname in refnames:
            amplicons.extend(
                self._to_amplicons(
                    positive_hits=left_positive_hits[refname],
                    negative_hits=right_negative_hits[refname],
                    max_len=self._max_amplicon_size,
                    strand=Strand.POSITIVE,
                )
            )
            amplicons.extend(
                self._to_amplicons(
                    positive_hits=right_positive_hits[refname],
                    negative_hits=left_negative_hits[refname],
                    max_len=self._max_amplicon_size,
                    strand=Strand.NEGATIVE,
                )
            )

        result = OffTargetResult(
            primer_pair=primer_pair,
            passes=self._min_primer_pair_hits <= len(amplicons) <= self._max_primer_pair_hits,
            spans=amplicons if self._keep_spans else [],
            left_primer_spans=(
                [self._hit_to_span(h) for h in left_bwa_result.hits]
                if self._keep_primer_spans
                else []
            ),
            right_primer_spans=(
                [self._hit_to_span(h) for h in right_bwa_result.hits]
                if self._keep_primer_spans
                else []
            ),
        )

        if self._cache_results:
            self._primer_pair_cache[primer_pair] = replace(result, cached=True)

        return result

    def mappings_of(self, primers: list[PrimerType]) -> dict[str, BwaResult]:
        """
        Map primers to the reference genome using `bwa aln`.

        Alignment results may be optionally cached for efficiency. Set `cache_results` to `True`
        when instantiating the [[OffTargetDetector]] to enable caching.

        Any primers without cached results, or all primers when `cache_results` is `False`, will be
        mapped to the reference genome using `bwa aln` and the specified alignment parameters.

        **Note**: The reverse complement of each primer sequence is used for mapping. The `query`
        sequence in each `BwaResult` will match the input primer sequence, as will the sequences
        used as keys in the output dictionary. However, the coordinates reported in each `BwaHit`
        associated with a result will correspond to complementary sequence.

        Args:
            primers: A list of primers to map.

        Returns:
            A dictionary mapping each primer's sequence to its alignment results.
            See `BwaResult` for details regarding the attributes included in each result.
        """

        primers_to_map: list[PrimerType]
        if not self._cache_results:
            primers_to_map = primers
        else:
            primers_to_map = [
                primer for primer in list(OrderedSet(primers)) if primer not in self._primer_cache
            ]

        # Build the unique list of queries to map with BWA
        queries: list[Query] = [
            Query(id=primer.bases, bases=primer.bases) for primer in primers_to_map
        ]

        # Map the queries with BWA
        hits_by_primer: dict[str, BwaResult] = {
            result.query.id: result for result in self._bwa.map_all(queries)
        }

        # Cache the results, if desired, and get the hits by primer _for all_ primers. If not
        # caching, then hits_by_primer already contains all the primers.
        if self._cache_results:
            self._primer_cache.update(hits_by_primer)
            hits_by_primer = {primer.bases: self._primer_cache[primer.bases] for primer in primers}

        return hits_by_primer

    @staticmethod
    def _to_amplicons(
        positive_hits: list[BwaHit], negative_hits: list[BwaHit], max_len: int, strand: Strand
    ) -> list[Span]:
        """Takes lists of positive strand hits and negative strand hits and constructs amplicon
        mappings anywhere a positive strand hit and a negative strand hit occur where the end of
        the negative strand hit is no more than `max_len` from the start of the positive strand
        hit.

        Primers may not overlap.

        Args:
            positive_hits: List of hits on the positive strand for one of the primers in the pair.
            negative_hits: List of hits on the negative strand for the other primer in the pair.
            max_len: Maximum length of amplicons to consider.
            strand: The strand of the amplicon to generate. Set to Strand.POSITIVE if
                `positive_hits` are for the left primer and `negative_hits` are for the right
                primer. Set to Strand.NEGATIVE if `positive_hits` are for the right primer and
                `negative_hits` are for the left primer.

        Raises:
            ValueError: If any of the positive hits are not on the positive strand, or any of the
                negative hits are not on the negative strand. If hits are present on more than one
                reference.
        """
        if any(h.negative for h in positive_hits):
            raise ValueError("Positive hits must be on the positive strand.")
        if any(not h.negative for h in negative_hits):
            raise ValueError("Negative hits must be on the negative strand.")

        refnames: set[ReferenceName] = {
            h.refname for h in itertools.chain(positive_hits, negative_hits)
        }
        if len(refnames) > 1:
            raise ValueError(f"Hits are present on more than one reference: {refnames}")

        # Exit early if one of the hit lists is empty - this will save unnecessary sorting of the
        # other list
        if len(positive_hits) == 0 or len(negative_hits) == 0:
            return []

        # Sort the positive strand hits by start position and the negative strand hits by *end*
        # position. The `max_len` cutoff is based on negative_hit.end - positive_hit.start + 1.
        positive_hits_sorted = sorted(positive_hits, key=lambda h: h.start)
        negative_hits_sorted = sorted(negative_hits, key=lambda h: h.end)

        amplicons: list[Span] = []

        # Track the position of the previously examined negative hit.
        prev_negative_hit_index = 0
        for positive_hit in positive_hits_sorted:
            # Check only negative hits starting with the previously examined one.
            for negative_hit_index, negative_hit in enumerate(
                negative_hits_sorted[prev_negative_hit_index:],
                start=prev_negative_hit_index,
            ):
                # TODO: Consider allowing overlapping positive and negative hits.
                if (
                    negative_hit.start > positive_hit.end
                    and negative_hit.end - positive_hit.start + 1 <= max_len
                ):
                    # If the negative hit starts to the right of the positive hit, and the amplicon
                    # length is <= max_len, add it to the list of amplicon hits to be returned.
                    amplicons.append(
                        Span(
                            refname=positive_hit.refname,
                            start=positive_hit.start,
                            end=negative_hit.end,
                            strand=strand,
                        )
                    )

                if negative_hit.end - positive_hit.start + 1 > max_len:
                    # Stop searching for negative hits to pair with this positive hit.
                    # All subsequence negative hits will have amplicon length > max_len
                    break

                if negative_hit.end < positive_hit.start:
                    # This positive hit is genomically right of the current negative hit.
                    # All subsequent positive hits will also be genomically right of this negative
                    # hit, so we should start at the one after this. If this index is past the end
                    # of the list, the slice `negative_hits_sorted[prev_negative_hit_index:]` will
                    # be empty.
                    prev_negative_hit_index = negative_hit_index + 1

        return amplicons

    @staticmethod
    def _hit_to_span(hit: BwaHit) -> Span:
        """Converts a Bwa Hit object to a Span."""
        return Span(refname=hit.refname, start=hit.start, end=hit.end)

    def close(self) -> None:
        self._bwa.close()

    def __enter__(self) -> Self:
        return self

    def __exit__(
        self,
        exc_type: Optional[type[BaseException]],
        exc_value: Optional[BaseException],
        traceback: Optional[TracebackType],
    ) -> None:
        """Gracefully terminates any running subprocesses."""
        super().__exit__(exc_type, exc_value, traceback)
        self.close()

Functions

__exit__
__exit__(
    exc_type: Optional[type[BaseException]],
    exc_value: Optional[BaseException],
    traceback: Optional[TracebackType],
) -> None

Gracefully terminates any running subprocesses.

Source code in prymer/offtarget/offtarget_detector.py
def __exit__(
    self,
    exc_type: Optional[type[BaseException]],
    exc_value: Optional[BaseException],
    traceback: Optional[TracebackType],
) -> None:
    """Gracefully terminates any running subprocesses."""
    super().__exit__(exc_type, exc_value, traceback)
    self.close()
__init__
__init__(
    ref: Path,
    max_primer_hits: int,
    max_primer_pair_hits: int,
    three_prime_region_length: int,
    max_mismatches_in_three_prime_region: int,
    max_mismatches: int,
    max_gap_opens: int = 0,
    max_gap_extends: int = -1,
    max_amplicon_size: int = 1000,
    min_primer_pair_hits: int = 1,
    cache_results: bool = True,
    threads: Optional[int] = None,
    keep_spans: bool = True,
    keep_primer_spans: bool = True,
    executable: str | Path = BWA_EXECUTABLE_NAME,
) -> None

Initialize an [[OffTargetDetector]].

This class accepts a large number of parameters to control the behavior of the off-target detection. The parameters are used in the various aspects of off-target detection as follows:

  1. Alignment (off-target hit detection): ref, executable, threads, three_prime_region_length, max_mismatches_in_three_prime_region, max_mismatches, and max_primer_hits.
  2. Filtering of individual primers: max_primer_hits.
  3. Checking of primer pairs: max_primer_hits, min_primer_pair_hits, max_primer_pair_hits, and max_amplicon_size.

The three_prime_region_length parameter is used as the seed length for bwa aln.

Parameters:

Name Type Description Default
ref Path

the reference genome fasta file (must be indexed with BWA)

required
max_primer_hits int

the maximum number of hits an individual primer can have in the genome before it is considered an invalid primer, and all primer pairs containing the primer failed. Must be greater than or equal to 0.

required
max_primer_pair_hits int

the maximum number of amplicons a primer pair can make and be considered passing. Must be greater than or equal to 0.

required
min_primer_pair_hits int

The minimum number of amplicons a primer pair can make and be considered passing. (In most cases, this is the number of amplicons a primer pair is expected to generate.) The default is 1, which is appropriate when the primer pair is being evaluated for off-target hits against the same reference genome from which the primers were generated. If the primer pair was generated from a different reference sequence, it may be appropriate to set this value to 0. Must be greater than or equal to 0.

1
three_prime_region_length int

the number of bases at the 3' end of the primer in which the parameter max_mismatches_in_three_prime_region is evaluated. This value is used as the seed length (bwa aln -l). Must be a minimum of 8.

required
max_mismatches_in_three_prime_region int

the maximum number of mismatches that are tolerated in the three prime region of each primer defined by three_prime_region_length. Must be between 0 and three_prime_region_length, inclusive.

required
max_mismatches int

the maximum number of mismatches allowed in the full length primer (including any in the three prime region). Must be greater than or equal to 0.

required
max_gap_opens int

the maximum number of gaps (insertions or deletions) allowable in an alignment of a oligo to the reference. Must be greater than or equal to 0.

0
max_gap_extends int

the maximum number of gap extensions allowed; extending a gap beyond a single base costs 1 gap extension. Can be set to -1 to allow unlimited extensions up to max diffs (aka max mismatches), while disallowing "long gaps". Must be greater than or equal to -1.

-1
max_amplicon_size int

the maximum amplicon size to consider amplifiable. Must be greater than 0.

1000
cache_results bool

if True, cache results for faster re-querying

True
threads Optional[int]

the number of threads to use when invoking bwa

None
keep_spans bool

if True, [[OffTargetResult]] objects will be reported with amplicon spans populated, otherwise not

True
keep_primer_spans bool

if True, [[OffTargetResult]] objects will be reported with left and right primer spans

True
executable str | Path

string or Path representation of the bwa executable path

BWA_EXECUTABLE_NAME

Raises:

Type Description
ValueError

If max_amplicon_size is not greater than 0.

ValueError

If any of max_primer_hits, max_primer_pair_hits, or min_primer_pair_hits are not greater than or equal to 0.

ValueError

If three_prime_region_length is not greater than or equal to 8.

ValueError

If max_mismatches_in_three_prime_region is outside the range 0 to three_prime_region_length, inclusive.

ValueError

If max_mismatches is not greater than or equal to 0.

ValueError

If max_gap_opens is not greater than or equal to 0.

ValueError

If max_gap_extends is not -1 or greater than or equal to 0.

Source code in prymer/offtarget/offtarget_detector.py
def __init__(  # noqa: C901
    self,
    ref: Path,
    max_primer_hits: int,
    max_primer_pair_hits: int,
    three_prime_region_length: int,
    max_mismatches_in_three_prime_region: int,
    max_mismatches: int,
    max_gap_opens: int = 0,
    max_gap_extends: int = -1,
    max_amplicon_size: int = 1000,
    min_primer_pair_hits: int = 1,
    cache_results: bool = True,
    threads: Optional[int] = None,
    keep_spans: bool = True,
    keep_primer_spans: bool = True,
    executable: str | Path = BWA_EXECUTABLE_NAME,
) -> None:
    """
    Initialize an [[OffTargetDetector]].

    This class accepts a large number of parameters to control the behavior of the off-target
    detection. The parameters are used in the various aspects of off-target detection as
    follows:

    1. Alignment (off-target hit detection): `ref`, `executable`, `threads`,
       `three_prime_region_length`, `max_mismatches_in_three_prime_region`, `max_mismatches`,
       and `max_primer_hits`.
    2. Filtering of individual primers: `max_primer_hits`.
    3. Checking of primer pairs: `max_primer_hits`, `min_primer_pair_hits`,
       `max_primer_pair_hits`, and `max_amplicon_size`.

    The `three_prime_region_length` parameter is used as the seed length for `bwa aln`.

    Args:
        ref: the reference genome fasta file (must be indexed with BWA)
        max_primer_hits: the maximum number of hits an individual primer can have in the genome
            before it is considered an invalid primer, and all primer pairs containing the
            primer failed. Must be greater than or equal to 0.
        max_primer_pair_hits: the maximum number of amplicons a primer pair can make and be
            considered passing. Must be greater than or equal to 0.
        min_primer_pair_hits: The minimum number of amplicons a primer pair can make and be
            considered passing. (In most cases, this is the number of amplicons a primer pair is
            expected to generate.) The default is 1, which is appropriate when the primer pair
            is being evaluated for off-target hits against the same reference genome from which
            the primers were generated. If the primer pair was generated from a different
            reference sequence, it may be appropriate to set this value to 0. Must be greater
                than or equal to 0.
        three_prime_region_length: the number of bases at the 3' end of the primer in which the
            parameter `max_mismatches_in_three_prime_region` is evaluated. This value is used as
            the seed length (`bwa aln -l`). Must be a minimum of 8.
        max_mismatches_in_three_prime_region: the maximum number of mismatches that are
            tolerated in the three prime region of each primer defined by
            `three_prime_region_length`. Must be between 0 and `three_prime_region_length`,
            inclusive.
        max_mismatches: the maximum number of mismatches allowed in the full length primer
            (including any in the three prime region). Must be greater than or equal to 0.
        max_gap_opens: the maximum number of gaps (insertions or deletions) allowable in an
            alignment of a oligo to the reference. Must be greater than or equal to 0.
        max_gap_extends: the maximum number of gap extensions allowed; extending a gap
            beyond a single base costs 1 gap extension.  Can be set to -1 to allow
            unlimited extensions up to max diffs (aka max mismatches), while disallowing
            "long gaps". Must be greater than or equal to -1.
        max_amplicon_size: the maximum amplicon size to consider amplifiable. Must be greater
            than 0.
        cache_results: if True, cache results for faster re-querying
        threads: the number of threads to use when invoking bwa
        keep_spans: if True, [[OffTargetResult]] objects will be reported with amplicon spans
            populated, otherwise not
        keep_primer_spans: if True, [[OffTargetResult]] objects will be reported with left and
            right primer spans
        executable: string or Path representation of the `bwa` executable path

    Raises:
        ValueError: If `max_amplicon_size` is not greater than 0.
        ValueError: If any of `max_primer_hits`, `max_primer_pair_hits`, or
            `min_primer_pair_hits` are not greater than or equal to 0.
        ValueError: If `three_prime_region_length` is not greater than or equal to 8.
        ValueError: If `max_mismatches_in_three_prime_region` is outside the range 0 to
            `three_prime_region_length`, inclusive.
        ValueError: If `max_mismatches` is not greater than or equal to 0.
        ValueError: If `max_gap_opens` is not greater than or equal to 0.
        ValueError: If `max_gap_extends` is not -1 or greater than or equal to 0.
    """
    errors: list[str] = []
    if max_amplicon_size < 1:
        errors.append(f"'max_amplicon_size' must be greater than 0. Saw {max_amplicon_size}")
    if max_primer_hits < 0:
        errors.append(
            f"'max_primer_hits' must be greater than or equal to 0. Saw {max_primer_hits}"
        )
    if max_primer_pair_hits < 0:
        errors.append(
            "'max_primer_pair_hits' must be greater than or equal to 0. "
            f"Saw {max_primer_pair_hits}"
        )
    if min_primer_pair_hits < 0:
        errors.append(
            "'min_primer_pair_hits' must be greater than or equal to 0. "
            f"Saw {min_primer_pair_hits}"
        )
    if three_prime_region_length < MINIMUM_THREE_PRIME_REGION_LENGTH:
        errors.append(
            "'three_prime_region_length' must be greater than or equal to "
            f"{MINIMUM_THREE_PRIME_REGION_LENGTH}. Saw {three_prime_region_length}"
        )
    if (
        max_mismatches_in_three_prime_region < 0
        or max_mismatches_in_three_prime_region > three_prime_region_length
    ):
        errors.append(
            "'max_mismatches_in_three_prime_region' must be between 0 and "
            f"'three_prime_region_length'={three_prime_region_length} inclusive. "
            f"Saw {max_mismatches_in_three_prime_region}"
        )
    if max_mismatches < 0:
        errors.append(
            f"'max_mismatches' must be greater than or equal to 0. Saw {max_mismatches}"
        )
    if max_gap_opens < 0:
        errors.append(
            f"'max_gap_opens' must be greater than or equal to 0. Saw {max_gap_opens}"
        )
    if max_gap_extends < -1:
        errors.append(
            "'max_gap_extends' must be -1 (for unlimited extensions up to 'max_mismatches'="
            f"{max_mismatches}) or greater than or equal to 0. Saw {max_gap_extends}"
        )

    if len(errors) > 0:
        raise ValueError("\n".join(errors))

    self._primer_cache: dict[str, BwaResult] = {}
    self._primer_pair_cache: dict[PrimerPair, OffTargetResult] = {}
    self._bwa = BwaAlnInteractive(
        executable=executable,
        ref=ref,
        reverse_complement=True,
        threads=threads,
        seed_length=three_prime_region_length,
        max_mismatches_in_seed=max_mismatches_in_three_prime_region,
        max_mismatches=max_mismatches,
        max_gap_opens=max_gap_opens,
        max_gap_extensions=max_gap_extends,
        max_hits=max_primer_hits,
    )
    self._max_primer_hits = max_primer_hits
    self._max_primer_pair_hits: int = max_primer_pair_hits
    self._min_primer_pair_hits: int = min_primer_pair_hits
    self._max_amplicon_size: int = max_amplicon_size
    self._cache_results: bool = cache_results
    self._keep_spans: bool = keep_spans
    self._keep_primer_spans: bool = keep_primer_spans
check_all
check_all(
    primer_pairs: list[PrimerPair],
) -> dict[PrimerPair, OffTargetResult]

Check a collection of primer pairs for off-target amplification.

This method maps each primer to the specified reference with bwa aln to search for off-target hits. Possible amplicons are identified from the pairwise combinations of these alignments, up to the specified max_amplicon_size.

Primer pairs are marked as passing if both of the following are true: 1. Each primer has no more than max_primer_hits alignments to the genome. 2. The size of the set of possible amplicons does not exceed max_primer_pair_hits, and is at least min_primer_pair_hits.

Parameters:

Name Type Description Default
primer_pairs list[PrimerPair]

The primer pairs to check.

required

Returns:

Type Description
dict[PrimerPair, OffTargetResult]

A dictionary mapping each checked primer pair to its off-target detection results.

dict[PrimerPair, OffTargetResult]

See OffTargetResult for details regarding the attributes included in each result.

Source code in prymer/offtarget/offtarget_detector.py
def check_all(self, primer_pairs: list[PrimerPair]) -> dict[PrimerPair, OffTargetResult]:
    """
    Check a collection of primer pairs for off-target amplification.

    This method maps each primer to the specified reference with `bwa aln` to search for
    off-target hits. Possible amplicons are identified from the pairwise combinations of these
    alignments, up to the specified `max_amplicon_size`.

    Primer pairs are marked as passing if both of the following are true:
    1. Each primer has no more than `max_primer_hits` alignments to the genome.
    2. The size of the set of possible amplicons does not exceed `max_primer_pair_hits`, and is
       at least `min_primer_pair_hits`.

    Args:
        primer_pairs: The primer pairs to check.

    Returns:
        A dictionary mapping each checked primer pair to its off-target detection results.
        See `OffTargetResult` for details regarding the attributes included in each result.
    """

    primer_pair_results: dict[PrimerPair, OffTargetResult] = {}
    result: OffTargetResult

    # Get the primer pairs to map.  If the primer pair is found in the cache, use that
    primer_pairs_to_map: list[PrimerPair] = []
    if not self._cache_results:
        primer_pairs_to_map = primer_pairs
    else:
        for primer_pair in primer_pairs:
            match self._primer_pair_cache.get(primer_pair, None):
                case None:
                    primer_pairs_to_map.append(primer_pair)  # Map it!
                case result:
                    primer_pair_results[primer_pair] = result

    # If there are no primer pairs to map, return the results
    if len(primer_pairs_to_map) == 0:
        return primer_pair_results

    # Get mappings of all the primers
    primers = [primer for primer_pair in primer_pairs for primer in primer_pair]
    hits_by_primer = self.mappings_of(primers)

    for primer_pair in primer_pairs:
        primer_pair_results[primer_pair] = self._build_off_target_result(
            primer_pair=primer_pair, hits_by_primer=hits_by_primer
        )

    return primer_pair_results
check_one
check_one(primer_pair: PrimerPair) -> OffTargetResult

Check an individual primer pair for off-target amplification.

See OffTargetDetector.check_all() for details.

Parameters:

Name Type Description Default
primer_pair PrimerPair

The primer pair to check.

required

Returns:

Type Description
OffTargetResult

The results of the off-target detection.

OffTargetResult

See OffTargetResult for details regarding the attributes included in the result.

Source code in prymer/offtarget/offtarget_detector.py
def check_one(self, primer_pair: PrimerPair) -> OffTargetResult:
    """
    Check an individual primer pair for off-target amplification.

    See `OffTargetDetector.check_all()` for details.

    Args:
        primer_pair: The primer pair to check.

    Returns:
        The results of the off-target detection.
        See `OffTargetResult` for details regarding the attributes included in the result.
    """
    result: dict[PrimerPair, OffTargetResult] = self.check_all([primer_pair])
    return result[primer_pair]
filter
filter(primers: list[PrimerType]) -> list[PrimerType]

Remove primers that have more than max_primer_hits mappings to the genome.

This method maps each primer to the specified reference with bwa aln to search for off-target hits. Note that when the reference includes the sequence from which the primers were designed, the on-target hit will be included in the hit count. max_primer_hits should be set to at least 1 in this case.

Parameters:

Name Type Description Default
primers list[PrimerType]

A list of primers to filter.

required

Returns:

Type Description
list[PrimerType]

The input primers, filtered to remove any primers with more than max_primer_hits

list[PrimerType]

mappings to the genome.

Source code in prymer/offtarget/offtarget_detector.py
def filter(self, primers: list[PrimerType]) -> list[PrimerType]:
    """
    Remove primers that have more than `max_primer_hits` mappings to the genome.

    This method maps each primer to the specified reference with `bwa aln` to search for
    off-target hits. Note that when the reference includes the sequence from which the primers
    were designed, the on-target hit will be included in the hit count. `max_primer_hits` should
    be set to at least 1 in this case.

    Args:
        primers: A list of primers to filter.

    Returns:
        The input primers, filtered to remove any primers with more than `max_primer_hits`
        mappings to the genome.
    """
    results: dict[str, BwaResult] = self.mappings_of(primers)
    return [
        primer for primer in primers if results[primer.bases].hit_count <= self._max_primer_hits
    ]
mappings_of
mappings_of(
    primers: list[PrimerType],
) -> dict[str, BwaResult]

Map primers to the reference genome using bwa aln.

Alignment results may be optionally cached for efficiency. Set cache_results to True when instantiating the [[OffTargetDetector]] to enable caching.

Any primers without cached results, or all primers when cache_results is False, will be mapped to the reference genome using bwa aln and the specified alignment parameters.

Note: The reverse complement of each primer sequence is used for mapping. The query sequence in each BwaResult will match the input primer sequence, as will the sequences used as keys in the output dictionary. However, the coordinates reported in each BwaHit associated with a result will correspond to complementary sequence.

Parameters:

Name Type Description Default
primers list[PrimerType]

A list of primers to map.

required

Returns:

Type Description
dict[str, BwaResult]

A dictionary mapping each primer's sequence to its alignment results.

dict[str, BwaResult]

See BwaResult for details regarding the attributes included in each result.

Source code in prymer/offtarget/offtarget_detector.py
def mappings_of(self, primers: list[PrimerType]) -> dict[str, BwaResult]:
    """
    Map primers to the reference genome using `bwa aln`.

    Alignment results may be optionally cached for efficiency. Set `cache_results` to `True`
    when instantiating the [[OffTargetDetector]] to enable caching.

    Any primers without cached results, or all primers when `cache_results` is `False`, will be
    mapped to the reference genome using `bwa aln` and the specified alignment parameters.

    **Note**: The reverse complement of each primer sequence is used for mapping. The `query`
    sequence in each `BwaResult` will match the input primer sequence, as will the sequences
    used as keys in the output dictionary. However, the coordinates reported in each `BwaHit`
    associated with a result will correspond to complementary sequence.

    Args:
        primers: A list of primers to map.

    Returns:
        A dictionary mapping each primer's sequence to its alignment results.
        See `BwaResult` for details regarding the attributes included in each result.
    """

    primers_to_map: list[PrimerType]
    if not self._cache_results:
        primers_to_map = primers
    else:
        primers_to_map = [
            primer for primer in list(OrderedSet(primers)) if primer not in self._primer_cache
        ]

    # Build the unique list of queries to map with BWA
    queries: list[Query] = [
        Query(id=primer.bases, bases=primer.bases) for primer in primers_to_map
    ]

    # Map the queries with BWA
    hits_by_primer: dict[str, BwaResult] = {
        result.query.id: result for result in self._bwa.map_all(queries)
    }

    # Cache the results, if desired, and get the hits by primer _for all_ primers. If not
    # caching, then hits_by_primer already contains all the primers.
    if self._cache_results:
        self._primer_cache.update(hits_by_primer)
        hits_by_primer = {primer.bases: self._primer_cache[primer.bases] for primer in primers}

    return hits_by_primer

OffTargetResult dataclass

Information obtained by running a single primer pair through the off-target detector.

Attributes:

Name Type Description
primer_pair PrimerPair

the primer pair submitted

passes bool

True if neither primer exceeds the specified maximum number of hits, and the primer pair would generate an acceptable number of amplicons in the reference genome (i.e. min_primer_pair_hits <= num_amplicons <= max_primer_pair_hits). False otherwise.

cached bool

True if this result is part of a cache, False otherwise. This is useful for testing

spans list[Span]

The set of mappings of the primer pair to the genome (i.e., the region spanned by the inferred amplicon). This list will be empty if the generating [[OffTargetDetector]] was constructed with keep_spans=False.

left_primer_spans list[Span]

the list of mappings for the left primer, independent of the pair mappings, or an empty list

right_primer_spans list[Span]

the list of mappings for the right primer, independent of the pair mappings, or an empty list

Source code in prymer/offtarget/offtarget_detector.py
@dataclass(init=True, frozen=True)
class OffTargetResult:
    """Information obtained by running a single primer pair through the off-target detector.

    Attributes:
        primer_pair: the primer pair submitted
        passes: True if neither primer exceeds the specified maximum number of hits, and the primer
            pair would generate an acceptable number of amplicons in the reference genome (i.e.
            `min_primer_pair_hits <= num_amplicons <= max_primer_pair_hits`). False otherwise.
        cached: True if this result is part of a cache, False otherwise.  This is useful for testing
        spans: The set of mappings of the primer pair to the genome (i.e., the region spanned by the
            inferred amplicon). This list will be empty if the generating [[OffTargetDetector]] was
            constructed with `keep_spans=False`.
        left_primer_spans: the list of mappings for the left primer, independent of the pair
            mappings, or an empty list
        right_primer_spans: the list of mappings for the right primer, independent of the pair
            mappings, or an empty list
    """

    primer_pair: PrimerPair
    passes: bool
    cached: bool = False
    spans: list[Span] = field(default_factory=list)
    left_primer_spans: list[Span] = field(default_factory=list)
    right_primer_spans: list[Span] = field(default_factory=list)

Query dataclass

Represents a single query sequence for mapping.

Attributes:

Name Type Description
id str

the identifier for the query (e.g. read name)

bases str

the query bases

Source code in prymer/offtarget/bwa.py
@dataclass(init=True, frozen=True)
class Query:
    """Represents a single query sequence for mapping.

    Attributes:
        id: the identifier for the query (e.g. read name)
        bases: the query bases
    """

    id: str
    bases: str

    _DEFAULT_BASE_QUALITY: ClassVar[str] = "H"
    """The default base quality for a query"""

    def __post_init__(self) -> None:
        if self.bases is None or len(self.bases) == 0:
            raise ValueError("No bases were provided to query")

    def to_fastq(self, reverse_complement: bool = False) -> str:
        """Returns the multi-line FASTQ string representation of this query"""
        bases = sequence.reverse_complement(self.bases) if reverse_complement else self.bases
        quals = self._DEFAULT_BASE_QUALITY * len(self.bases)
        return f"@{self.id}\n{bases}\n+\n{quals}\n"

Functions

to_fastq
to_fastq(reverse_complement: bool = False) -> str

Returns the multi-line FASTQ string representation of this query

Source code in prymer/offtarget/bwa.py
def to_fastq(self, reverse_complement: bool = False) -> str:
    """Returns the multi-line FASTQ string representation of this query"""
    bases = sequence.reverse_complement(self.bases) if reverse_complement else self.bases
    quals = self._DEFAULT_BASE_QUALITY * len(self.bases)
    return f"@{self.id}\n{bases}\n+\n{quals}\n"

Modules

bwa

Methods and Classes to run and interact with BWA

The BwaAlnInteractive class is used to map a list of queries to the reference genome. A single query may be mapped using map_one(), while multiple queries may be mapped using map_all(). The latter is more efficient than mapping queries one-by-one.

The queries are provided to these methods using the Query class, which contains the unique identifier for the query and the bases to query. These methods return a BwaResult, which represents zero or more hits (or alignments) found by BWA for the given query. Each hit is represented by a BwaHit object. In some cases, BWA will write fewer (or no) hits in the "XA" tag than the total number hits reported in the "HN". This occurs when BWA finds more hits than max_hits (see bwt aln -X).

Use of this module requires installation of a custom version of BWA named bwa-aln-interactive. See:

Example
>>> from pathlib import Path
>>> ref_fasta = Path("./tests/offtarget/data/miniref.fa")
>>> query = Query(bases="TCTACTAAAAATACAAAAAATTAGCTGGGCATGATGGCATGCACCTGTAATCCCGCTACT", id="NA")
>>> bwa = BwaAlnInteractive(ref=ref_fasta, max_hits=1)
>>> result = bwa.map_one(query=query.bases, id=query.id)
>>> result.hit_count
1
>>> len(result.hits)
1
>>> result.hits[0]
BwaHit(refname='chr1', start=61, negative=False, cigar=Cigar(elements=(CigarElement(length=60, operator=<CigarOp.M: (0, 'M', True, True)>),)), edits=0)
>>> query = Query(bases="AAAAAA", id="NA")
>>> bwa.map_all(queries=[query])
[BwaResult(query=Query(id='NA', bases='AAAAAA'), hit_count=3968, hits=[])]
>>> bwa.close()
True

Attributes

BWA_AUX_EXTENSIONS module-attribute
BWA_AUX_EXTENSIONS: list[str] = [
    ".amb",
    ".ann",
    ".bwt",
    ".pac",
    ".sa",
]

The file extensions for BWA index files

BWA_EXECUTABLE_NAME module-attribute
BWA_EXECUTABLE_NAME: str = 'bwa-aln-interactive'

The executable name for the interactive build of bwa aln.

MAX_GAP_EXTENSIONS module-attribute
MAX_GAP_EXTENSIONS: int = -1

The default maximum number of gap extensions allowed in the full query sequence

MAX_GAP_OPENS module-attribute
MAX_GAP_OPENS: int = 0

The default maximum number of gap opens allowed in the full query sequence

MAX_MISMATCHES module-attribute
MAX_MISMATCHES: int = 3

The default maximum number of mismatches allowed in the full query sequence

MAX_MISMATCHES_IN_SEED module-attribute
MAX_MISMATCHES_IN_SEED: int = 3

The default maximum number of mismatches allowed in the seed region

SEED_LENGTH module-attribute
SEED_LENGTH: int = 20

The default length of the seed region

Classes

BwaAlnInteractive

Bases: ExecutableRunner

Wrapper around a novel mode of 'bwa aln' that allows for "interactive" use of bwa to keep the process running and be able to send it chunks of reads periodically and get alignments back without waiting for a full batch of reads to be sent.

See:

Attributes:

Name Type Description
max_hits int

the maximum number of hits to report - if more than this number of seed hits are found, report only the count and not each hit.

reverse_complement bool

reverse complement each query sequence before alignment.

include_alt_hits bool

if True include hits to references with names ending in _alt, otherwise do not include them.

header

the SAM alignment header.

Source code in prymer/offtarget/bwa.py
class BwaAlnInteractive(ExecutableRunner):
    """Wrapper around a novel mode of 'bwa aln' that allows for "interactive" use of bwa to keep
    the process running and be able to send it chunks of reads periodically and get alignments
    back without waiting for a full batch of reads to be sent.

    See:

    - [https://github.com/fulcrumgenomics/bwa-aln-interactive](https://github.com/fulcrumgenomics/bwa-aln-interactive)
    - [https://bioconda.github.io/recipes/bwa-aln-interactive/README.html](https://bioconda.github.io/recipes/bwa-aln-interactive/README.html)

    Attributes:
        max_hits: the maximum number of hits to report - if more than this number of seed hits
                  are found, report only the count and not each hit.
        reverse_complement: reverse complement each query sequence before alignment.
        include_alt_hits: if True include hits to references with names ending in _alt, otherwise
                          do not include them.
        header: the SAM alignment header.
    """

    def __init__(
        self,
        ref: Path,
        max_hits: int,
        executable: str | Path = BWA_EXECUTABLE_NAME,
        max_mismatches: int = 3,
        max_mismatches_in_seed: int = 3,
        max_gap_opens: int = 0,
        max_gap_extensions: int = -1,
        min_indel_to_end_distance: int = 3,
        seed_length: int = 20,
        reverse_complement: bool = False,
        include_alt_hits: bool = False,
        threads: Optional[int] = None,
    ):
        """
        Args:
            ref: the path to the reference FASTA, which must be indexed with bwa.
            max_hits: the maximum number of hits to report - if more than this number of seed hits
                      are found, report only the count and not each hit.
            executable: string or Path representation of the `bwa-aln-interactive` executable path
            max_mismatches: the maximum number of mismatches allowed in the full query sequence
            max_mismatches_in_seed: the maximum number of mismatches allowed in the seed region
            max_gap_opens: the maximum number of gap opens allowed in the full query sequence
            max_gap_extensions: the maximum number of gap extensions allowed in the full query
                                sequence
            min_indel_to_end_distance: do not place an indel within this many bp of the ends of
                the query sequence
            seed_length: the length of the seed region
            reverse_complement: reverse complement each query sequence before alignment
            include_alt_hits: if true include hits to references with names ending in _alt,
                              otherwise do not include them.
            threads: the number of threads to use.  If `None`, use all available CPUs.
        """
        threads = os.cpu_count() if threads is None else threads
        executable_path = ExecutableRunner.validate_executable_path(executable=executable)
        self.reverse_complement: bool = reverse_complement
        self.include_alt_hits: bool = include_alt_hits
        self.max_hits: int = max_hits

        missing_aux_paths = []
        for aux_ext in BWA_AUX_EXTENSIONS:
            aux_path = Path(f"{ref}{aux_ext}")
            if not aux_path.exists():
                missing_aux_paths.append(aux_path)
        if len(missing_aux_paths) > 0:
            message: str
            if len(missing_aux_paths) > 1:
                message = "BWA index files do not exist:\n\t"
            else:
                message = "BWA index file does not exist:\n\t"
            message += "\t\n".join(f"{p}" for p in missing_aux_paths)
            raise FileNotFoundError(f"{message}\nIndex with: `{executable_path} index {ref}`")

        # -N = non-iterative mode: search for all n-difference hits (slooow)
        # -S = output SAM (run samse)
        # -Z = interactive mode (no input buffer and force processing with empty lines between recs)
        command: list[str] = [
            f"{executable_path}",
            "aln",
            "-t",
            f"{threads}",
            "-n",
            f"{max_mismatches}",
            "-o",
            f"{max_gap_opens}",
            "-e",
            f"{max_gap_extensions}",
            "-i",
            f"{min_indel_to_end_distance}",
            "-l",
            f"{seed_length}",
            "-k",
            f"{max_mismatches_in_seed}",
            "-X",
            f"{max_hits}",
            "-N",
            "-S",
            "-Z",
            "-D",
            f"{ref}",
            "/dev/stdin",
        ]

        super().__init__(command=command, stderr=subprocess.PIPE)

        header = []
        for line in self._subprocess.stdout:
            if line.startswith("@"):
                header.append(line)
            if line.startswith("@PG"):
                break

        self.header = AlignmentHeader.from_text("".join(header))

        # NB: ExecutableRunner will default to redirecting stderr to /dev/null. However, we would
        # like to preserve stderr messages from bwa for potential debugging. To do this, we create
        # a single thread to continuously read from stderr and redirect text lines to a debug
        # logger. The close() method of this class will additionally join the stderr logging thread.
        self._logger = logging.getLogger(self.__class__.__qualname__)
        self._stderr_thread = Thread(
            daemon=True,
            target=self._stream_to_sink,
            args=(self._subprocess.stderr, self._logger.debug),
        )
        self._stderr_thread.start()

    def __signal_bwa(self) -> None:
        """Signals BWA to process the queries."""
        self._subprocess.stdin.flush()
        # NB: the executable compiled on different platforms requires a different number of newlines
        # NB: it is not understood why, but 16 newlines seems to work for all platforms tested
        self._subprocess.stdin.write("\n" * 16)
        self._subprocess.stdin.flush()

    @override
    def close(self) -> bool:
        """
        Gracefully terminates the underlying subprocess if it is still running.

        Returns:
            True: if the subprocess was terminated successfully
            False: if the subprocess failed to terminate or was not already running
        """
        safely_closed: bool = super().close()
        self._stderr_thread.join()
        return safely_closed

    def map_one(self, query: str, id: str = "unknown") -> BwaResult:
        """Maps a single query to the genome and returns the result.

        Args:
            query: the query to map with BWA

        Returns:
            a `BwaResult` based on mapping the query
        """
        return self.map_all([Query(bases=query, id=id)])[0]

    def map_all(self, queries: list[Query]) -> list[BwaResult]:
        """Maps multiple queries and returns the results.  This is more efficient than using
        `map_one` on each query one-by-one as it batches reads to BWA.

        Args:
            queries: the queries to map with BWA

        Returns:
            one `BwaResult`s for each query
        """
        if len(queries) == 0:
            return []

        # Send the reads to BWA
        for query in queries:
            fastq_str = query.to_fastq(reverse_complement=self.reverse_complement)
            self._subprocess.stdin.write(fastq_str)

        # Force the input to be sent to the underlying process.
        self.__signal_bwa()

        # Read back the results
        results: list[BwaResult] = []
        for query in queries:
            # get the next alignment and convert to a result
            line: str = next(self._subprocess.stdout).strip()
            assert not line.startswith("@"), f"SAM record must not start with '@'! {line}"
            alignment = AlignedSegment.fromstring(line, self.header)
            results.append(self._to_result(query=query, rec=alignment))

        return results

    def _to_result(self, query: Query, rec: pysam.AlignedSegment) -> BwaResult:
        """
        Convert the query and alignment to a result.

        Args:
            query: the original query
            rec: the alignment
        """
        if query.id != rec.query_name:
            raise ValueError(
                "Query and Results are out of order" f"Query=${query.id}, Result=${rec.query_name}"
            )

        num_hits: int = int(rec.get_tag("HN")) if rec.has_tag("HN") else 0
        # `to_hits()` removes artifactual hits which span the boundary between concatenated
        # reference sequences. If we are reporting a list of hits, the number of hits should match
        # the size of this list. Otherwise, we either have zero hits, or more than the maximum
        # number of hits. In both of the latter cases, we have to rely on the count reported in the
        # `HN` tag.
        hits: list[BwaHit]
        if 0 < num_hits <= self.max_hits:
            hits = self.to_hits(rec=rec)
            num_hits = len(hits)
        else:
            hits = []

        return BwaResult(query=query, hit_count=num_hits, hits=hits)

    def to_hits(self, rec: AlignedSegment) -> list[BwaHit]:
        """Extracts the hits from the given alignment.  Beyond the current alignment
        additional alignments are parsed from the XA SAM tag.

        Args:
            rec: the given alignment
        """
        negative = rec.is_reverse != self.reverse_complement
        first_hit: BwaHit = BwaHit(
            refname=rec.reference_name,
            start=rec.reference_start + 1,  # NB: pysam is 0-based, Hit is 1-based
            negative=negative,
            cigar=Cigar.from_cigartuples(rec.cigartuples),
            edits=int(rec.get_tag("NM")),
        )

        # Add the hits
        # E.g. XA:Z:chr4,-97592047,24M,3;chr8,-32368749,24M,3;
        hits: list[BwaHit] = [first_hit]
        if rec.has_tag("XA"):
            for xa in cast(str, rec.get_tag("XA")).split(";"):
                if xa == "":
                    continue
                fields = xa.split(",")

                # If the reverse-complement of the query sequence was fed to BWA, various fields
                # should be negated/reversed in the Hit
                negative = fields[1][0] == "-"
                if self.reverse_complement:
                    negative = not negative

                hit: BwaHit = BwaHit(
                    refname=fields[0],
                    start=int(fields[1][1:]),  # the XA tag is 1-based
                    negative=negative,
                    cigar=Cigar.from_cigarstring(fields[2]),
                    edits=int(fields[3]),
                )
                hits.append(hit)

        if not self.include_alt_hits:
            hits = [hit for hit in hits if not hit.refname.endswith("_alt")]

        # Remove hits that extend beyond the end of a contig - these are artifacts of `bwa aln`'s
        # alignment process, which concatenates all reference sequences and reports hits which span
        # across contigs.
        # NB: the type ignore is necessary because pysam's type hint for `get_reference_length` is
        # incorrect.
        # https://github.com/pysam-developers/pysam/pull/1313
        hits = [hit for hit in hits if hit.end <= self.header.get_reference_length(hit.refname)]  # type: ignore[arg-type]  # noqa: E501

        return hits
Functions
__init__
__init__(
    ref: Path,
    max_hits: int,
    executable: str | Path = BWA_EXECUTABLE_NAME,
    max_mismatches: int = 3,
    max_mismatches_in_seed: int = 3,
    max_gap_opens: int = 0,
    max_gap_extensions: int = -1,
    min_indel_to_end_distance: int = 3,
    seed_length: int = 20,
    reverse_complement: bool = False,
    include_alt_hits: bool = False,
    threads: Optional[int] = None,
)

Parameters:

Name Type Description Default
ref Path

the path to the reference FASTA, which must be indexed with bwa.

required
max_hits int

the maximum number of hits to report - if more than this number of seed hits are found, report only the count and not each hit.

required
executable str | Path

string or Path representation of the bwa-aln-interactive executable path

BWA_EXECUTABLE_NAME
max_mismatches int

the maximum number of mismatches allowed in the full query sequence

3
max_mismatches_in_seed int

the maximum number of mismatches allowed in the seed region

3
max_gap_opens int

the maximum number of gap opens allowed in the full query sequence

0
max_gap_extensions int

the maximum number of gap extensions allowed in the full query sequence

-1
min_indel_to_end_distance int

do not place an indel within this many bp of the ends of the query sequence

3
seed_length int

the length of the seed region

20
reverse_complement bool

reverse complement each query sequence before alignment

False
include_alt_hits bool

if true include hits to references with names ending in _alt, otherwise do not include them.

False
threads Optional[int]

the number of threads to use. If None, use all available CPUs.

None
Source code in prymer/offtarget/bwa.py
def __init__(
    self,
    ref: Path,
    max_hits: int,
    executable: str | Path = BWA_EXECUTABLE_NAME,
    max_mismatches: int = 3,
    max_mismatches_in_seed: int = 3,
    max_gap_opens: int = 0,
    max_gap_extensions: int = -1,
    min_indel_to_end_distance: int = 3,
    seed_length: int = 20,
    reverse_complement: bool = False,
    include_alt_hits: bool = False,
    threads: Optional[int] = None,
):
    """
    Args:
        ref: the path to the reference FASTA, which must be indexed with bwa.
        max_hits: the maximum number of hits to report - if more than this number of seed hits
                  are found, report only the count and not each hit.
        executable: string or Path representation of the `bwa-aln-interactive` executable path
        max_mismatches: the maximum number of mismatches allowed in the full query sequence
        max_mismatches_in_seed: the maximum number of mismatches allowed in the seed region
        max_gap_opens: the maximum number of gap opens allowed in the full query sequence
        max_gap_extensions: the maximum number of gap extensions allowed in the full query
                            sequence
        min_indel_to_end_distance: do not place an indel within this many bp of the ends of
            the query sequence
        seed_length: the length of the seed region
        reverse_complement: reverse complement each query sequence before alignment
        include_alt_hits: if true include hits to references with names ending in _alt,
                          otherwise do not include them.
        threads: the number of threads to use.  If `None`, use all available CPUs.
    """
    threads = os.cpu_count() if threads is None else threads
    executable_path = ExecutableRunner.validate_executable_path(executable=executable)
    self.reverse_complement: bool = reverse_complement
    self.include_alt_hits: bool = include_alt_hits
    self.max_hits: int = max_hits

    missing_aux_paths = []
    for aux_ext in BWA_AUX_EXTENSIONS:
        aux_path = Path(f"{ref}{aux_ext}")
        if not aux_path.exists():
            missing_aux_paths.append(aux_path)
    if len(missing_aux_paths) > 0:
        message: str
        if len(missing_aux_paths) > 1:
            message = "BWA index files do not exist:\n\t"
        else:
            message = "BWA index file does not exist:\n\t"
        message += "\t\n".join(f"{p}" for p in missing_aux_paths)
        raise FileNotFoundError(f"{message}\nIndex with: `{executable_path} index {ref}`")

    # -N = non-iterative mode: search for all n-difference hits (slooow)
    # -S = output SAM (run samse)
    # -Z = interactive mode (no input buffer and force processing with empty lines between recs)
    command: list[str] = [
        f"{executable_path}",
        "aln",
        "-t",
        f"{threads}",
        "-n",
        f"{max_mismatches}",
        "-o",
        f"{max_gap_opens}",
        "-e",
        f"{max_gap_extensions}",
        "-i",
        f"{min_indel_to_end_distance}",
        "-l",
        f"{seed_length}",
        "-k",
        f"{max_mismatches_in_seed}",
        "-X",
        f"{max_hits}",
        "-N",
        "-S",
        "-Z",
        "-D",
        f"{ref}",
        "/dev/stdin",
    ]

    super().__init__(command=command, stderr=subprocess.PIPE)

    header = []
    for line in self._subprocess.stdout:
        if line.startswith("@"):
            header.append(line)
        if line.startswith("@PG"):
            break

    self.header = AlignmentHeader.from_text("".join(header))

    # NB: ExecutableRunner will default to redirecting stderr to /dev/null. However, we would
    # like to preserve stderr messages from bwa for potential debugging. To do this, we create
    # a single thread to continuously read from stderr and redirect text lines to a debug
    # logger. The close() method of this class will additionally join the stderr logging thread.
    self._logger = logging.getLogger(self.__class__.__qualname__)
    self._stderr_thread = Thread(
        daemon=True,
        target=self._stream_to_sink,
        args=(self._subprocess.stderr, self._logger.debug),
    )
    self._stderr_thread.start()
__signal_bwa
__signal_bwa() -> None

Signals BWA to process the queries.

Source code in prymer/offtarget/bwa.py
def __signal_bwa(self) -> None:
    """Signals BWA to process the queries."""
    self._subprocess.stdin.flush()
    # NB: the executable compiled on different platforms requires a different number of newlines
    # NB: it is not understood why, but 16 newlines seems to work for all platforms tested
    self._subprocess.stdin.write("\n" * 16)
    self._subprocess.stdin.flush()
close
close() -> bool

Gracefully terminates the underlying subprocess if it is still running.

Returns:

Name Type Description
True bool

if the subprocess was terminated successfully

False bool

if the subprocess failed to terminate or was not already running

Source code in prymer/offtarget/bwa.py
@override
def close(self) -> bool:
    """
    Gracefully terminates the underlying subprocess if it is still running.

    Returns:
        True: if the subprocess was terminated successfully
        False: if the subprocess failed to terminate or was not already running
    """
    safely_closed: bool = super().close()
    self._stderr_thread.join()
    return safely_closed
map_all
map_all(queries: list[Query]) -> list[BwaResult]

Maps multiple queries and returns the results. This is more efficient than using map_one on each query one-by-one as it batches reads to BWA.

Parameters:

Name Type Description Default
queries list[Query]

the queries to map with BWA

required

Returns:

Type Description
list[BwaResult]

one BwaResults for each query

Source code in prymer/offtarget/bwa.py
def map_all(self, queries: list[Query]) -> list[BwaResult]:
    """Maps multiple queries and returns the results.  This is more efficient than using
    `map_one` on each query one-by-one as it batches reads to BWA.

    Args:
        queries: the queries to map with BWA

    Returns:
        one `BwaResult`s for each query
    """
    if len(queries) == 0:
        return []

    # Send the reads to BWA
    for query in queries:
        fastq_str = query.to_fastq(reverse_complement=self.reverse_complement)
        self._subprocess.stdin.write(fastq_str)

    # Force the input to be sent to the underlying process.
    self.__signal_bwa()

    # Read back the results
    results: list[BwaResult] = []
    for query in queries:
        # get the next alignment and convert to a result
        line: str = next(self._subprocess.stdout).strip()
        assert not line.startswith("@"), f"SAM record must not start with '@'! {line}"
        alignment = AlignedSegment.fromstring(line, self.header)
        results.append(self._to_result(query=query, rec=alignment))

    return results
map_one
map_one(query: str, id: str = 'unknown') -> BwaResult

Maps a single query to the genome and returns the result.

Parameters:

Name Type Description Default
query str

the query to map with BWA

required

Returns:

Type Description
BwaResult

a BwaResult based on mapping the query

Source code in prymer/offtarget/bwa.py
def map_one(self, query: str, id: str = "unknown") -> BwaResult:
    """Maps a single query to the genome and returns the result.

    Args:
        query: the query to map with BWA

    Returns:
        a `BwaResult` based on mapping the query
    """
    return self.map_all([Query(bases=query, id=id)])[0]
to_hits
to_hits(rec: AlignedSegment) -> list[BwaHit]

Extracts the hits from the given alignment. Beyond the current alignment additional alignments are parsed from the XA SAM tag.

Parameters:

Name Type Description Default
rec AlignedSegment

the given alignment

required
Source code in prymer/offtarget/bwa.py
def to_hits(self, rec: AlignedSegment) -> list[BwaHit]:
    """Extracts the hits from the given alignment.  Beyond the current alignment
    additional alignments are parsed from the XA SAM tag.

    Args:
        rec: the given alignment
    """
    negative = rec.is_reverse != self.reverse_complement
    first_hit: BwaHit = BwaHit(
        refname=rec.reference_name,
        start=rec.reference_start + 1,  # NB: pysam is 0-based, Hit is 1-based
        negative=negative,
        cigar=Cigar.from_cigartuples(rec.cigartuples),
        edits=int(rec.get_tag("NM")),
    )

    # Add the hits
    # E.g. XA:Z:chr4,-97592047,24M,3;chr8,-32368749,24M,3;
    hits: list[BwaHit] = [first_hit]
    if rec.has_tag("XA"):
        for xa in cast(str, rec.get_tag("XA")).split(";"):
            if xa == "":
                continue
            fields = xa.split(",")

            # If the reverse-complement of the query sequence was fed to BWA, various fields
            # should be negated/reversed in the Hit
            negative = fields[1][0] == "-"
            if self.reverse_complement:
                negative = not negative

            hit: BwaHit = BwaHit(
                refname=fields[0],
                start=int(fields[1][1:]),  # the XA tag is 1-based
                negative=negative,
                cigar=Cigar.from_cigarstring(fields[2]),
                edits=int(fields[3]),
            )
            hits.append(hit)

    if not self.include_alt_hits:
        hits = [hit for hit in hits if not hit.refname.endswith("_alt")]

    # Remove hits that extend beyond the end of a contig - these are artifacts of `bwa aln`'s
    # alignment process, which concatenates all reference sequences and reports hits which span
    # across contigs.
    # NB: the type ignore is necessary because pysam's type hint for `get_reference_length` is
    # incorrect.
    # https://github.com/pysam-developers/pysam/pull/1313
    hits = [hit for hit in hits if hit.end <= self.header.get_reference_length(hit.refname)]  # type: ignore[arg-type]  # noqa: E501

    return hits
BwaHit dataclass

Represents a single hit or alignment of a sequence to a location in the genome.

Attributes:

Name Type Description
refname str

the reference name of the hit

start int

the start position of the hit (1-based inclusive)

negative bool

whether the hit is on the negative strand

cigar Cigar

the cigar string returned by BWA

edits int

the number of edits between the read and the reference

Source code in prymer/offtarget/bwa.py
@dataclass(init=True, frozen=True)
class BwaHit:
    """Represents a single hit or alignment of a sequence to a location in the genome.

    Attributes:
        refname: the reference name of the hit
        start: the start position of the hit (1-based inclusive)
        negative: whether the hit is on the negative strand
        cigar: the cigar string returned by BWA
        edits: the number of edits between the read and the reference
    """

    refname: str
    start: int
    negative: bool
    cigar: Cigar
    edits: int

    @property
    def mismatches(self) -> int:
        """The number of mismatches for the hit"""
        indel_sum = sum(elem.length for elem in self.cigar.elements if elem.operator.is_indel)
        if indel_sum > self.edits:
            raise ValueError(
                f"indel_sum ({indel_sum}) > self.edits ({self.edits}) with cigar: {self.cigar}"
            )
        return self.edits - indel_sum

    @property
    def end(self) -> int:
        """The end position of the hit (1-based inclusive)"""
        return coordmath.get_closed_end(self.start, self.cigar.length_on_target())

    @staticmethod
    def build(
        refname: str, start: int, negative: bool, cigar: str, edits: int, revcomp: bool = False
    ) -> "BwaHit":
        """Generates a hit object.

        Args:
            refname: the reference name of the hit
            start: the start position of the hit (1-based inclusive)
            negative: whether the hit is on the negative strand
            cigar: the cigar string returned by BWA
            edits: the number of edits between the read and the reference
            revcomp: whether the reverse-complement of the query sequence was fed to BWA, in which
                case various fields should be negated/reversed in the Hit

        Returns:
            A Hit that represents the mapping of the original query sequence that was supplied
        """
        negative = negative != revcomp
        return BwaHit(
            refname=refname,
            start=start,
            negative=negative,
            cigar=Cigar.from_cigarstring(cigar),
            edits=edits,
        )

    def __str__(self) -> str:
        """Returns the string representation in bwa's XA tag format."""
        # E.g. XA:Z:chr4,-97592047,24M,3;chr8,-32368749,24M,3;
        return ",".join(
            [
                self.refname,
                ("-" if self.negative else "+") + f"{self.start}",
                f"{self.cigar}",
                f"{self.edits}",
            ]
        )
Attributes
end property
end: int

The end position of the hit (1-based inclusive)

mismatches property
mismatches: int

The number of mismatches for the hit

Functions
__str__
__str__() -> str

Returns the string representation in bwa's XA tag format.

Source code in prymer/offtarget/bwa.py
def __str__(self) -> str:
    """Returns the string representation in bwa's XA tag format."""
    # E.g. XA:Z:chr4,-97592047,24M,3;chr8,-32368749,24M,3;
    return ",".join(
        [
            self.refname,
            ("-" if self.negative else "+") + f"{self.start}",
            f"{self.cigar}",
            f"{self.edits}",
        ]
    )
build staticmethod
build(
    refname: str,
    start: int,
    negative: bool,
    cigar: str,
    edits: int,
    revcomp: bool = False,
) -> BwaHit

Generates a hit object.

Parameters:

Name Type Description Default
refname str

the reference name of the hit

required
start int

the start position of the hit (1-based inclusive)

required
negative bool

whether the hit is on the negative strand

required
cigar str

the cigar string returned by BWA

required
edits int

the number of edits between the read and the reference

required
revcomp bool

whether the reverse-complement of the query sequence was fed to BWA, in which case various fields should be negated/reversed in the Hit

False

Returns:

Type Description
BwaHit

A Hit that represents the mapping of the original query sequence that was supplied

Source code in prymer/offtarget/bwa.py
@staticmethod
def build(
    refname: str, start: int, negative: bool, cigar: str, edits: int, revcomp: bool = False
) -> "BwaHit":
    """Generates a hit object.

    Args:
        refname: the reference name of the hit
        start: the start position of the hit (1-based inclusive)
        negative: whether the hit is on the negative strand
        cigar: the cigar string returned by BWA
        edits: the number of edits between the read and the reference
        revcomp: whether the reverse-complement of the query sequence was fed to BWA, in which
            case various fields should be negated/reversed in the Hit

    Returns:
        A Hit that represents the mapping of the original query sequence that was supplied
    """
    negative = negative != revcomp
    return BwaHit(
        refname=refname,
        start=start,
        negative=negative,
        cigar=Cigar.from_cigarstring(cigar),
        edits=edits,
    )
BwaResult dataclass

Represents zero or more hits or alignments found by BWA for the given query.

The number of hits found may be more than the number of hits listed in the hits attribute.

Attributes:

Name Type Description
query Query

the query (as given, no RC'ing)

hit_count int

the total number of hits found by bwa (this may be more than len(hits))

hits list[BwaHit]

the subset of hits that were reported by bwa

Source code in prymer/offtarget/bwa.py
@dataclass(init=True, frozen=True)
class BwaResult:
    """
    Represents zero or more hits or alignments found by BWA for the given query.

    The number of hits found may be more than the number of hits listed in the `hits` attribute.

    Attributes:
        query: the query (as given, no RC'ing)
        hit_count: the total number of hits found by bwa (this may be more than len(hits))
        hits: the subset of hits that were reported by bwa
    """

    query: Query
    hit_count: int
    hits: list[BwaHit]
Query dataclass

Represents a single query sequence for mapping.

Attributes:

Name Type Description
id str

the identifier for the query (e.g. read name)

bases str

the query bases

Source code in prymer/offtarget/bwa.py
@dataclass(init=True, frozen=True)
class Query:
    """Represents a single query sequence for mapping.

    Attributes:
        id: the identifier for the query (e.g. read name)
        bases: the query bases
    """

    id: str
    bases: str

    _DEFAULT_BASE_QUALITY: ClassVar[str] = "H"
    """The default base quality for a query"""

    def __post_init__(self) -> None:
        if self.bases is None or len(self.bases) == 0:
            raise ValueError("No bases were provided to query")

    def to_fastq(self, reverse_complement: bool = False) -> str:
        """Returns the multi-line FASTQ string representation of this query"""
        bases = sequence.reverse_complement(self.bases) if reverse_complement else self.bases
        quals = self._DEFAULT_BASE_QUALITY * len(self.bases)
        return f"@{self.id}\n{bases}\n+\n{quals}\n"
Functions
to_fastq
to_fastq(reverse_complement: bool = False) -> str

Returns the multi-line FASTQ string representation of this query

Source code in prymer/offtarget/bwa.py
def to_fastq(self, reverse_complement: bool = False) -> str:
    """Returns the multi-line FASTQ string representation of this query"""
    bases = sequence.reverse_complement(self.bases) if reverse_complement else self.bases
    quals = self._DEFAULT_BASE_QUALITY * len(self.bases)
    return f"@{self.id}\n{bases}\n+\n{quals}\n"

Modules

offtarget_detector

Methods and classes to detect off-target mappings for primers and primer pairs

The OffTargetDetector uses BwaAlnInteractive to search for off targets for one or more primers or primer pairs.

The filter() method filters an iterable of Primers to return only those that have less than a given maximum number of off-target hits to the genome.

>>> from pathlib import Path
>>> from prymer.api.span import Strand
>>> ref_fasta = Path("./tests/offtarget/data/miniref.fa")
>>> left_primer =  Oligo(bases="AAAAA", tm=37, penalty=0, span=Span("chr1", start=67, end=71))
>>> right_primer = Oligo(bases="TTTTT", tm=37, penalty=0, span=Span("chr1", start=75, end=79, strand=Strand.NEGATIVE))
>>> detector = OffTargetDetector(ref=ref_fasta, max_primer_hits=204, max_primer_pair_hits=1, three_prime_region_length=20, max_mismatches_in_three_prime_region=0, max_mismatches=0, max_amplicon_size=250)
>>> len(detector.filter(primers=[left_primer, right_primer]))  # keep all
2
>>> detector.close()
>>> detector = OffTargetDetector(ref=ref_fasta, max_primer_hits=203, max_primer_pair_hits=1, three_prime_region_length=20, max_mismatches_in_three_prime_region=0, max_mismatches=0, max_amplicon_size=250)
>>> len(detector.filter(primers=[left_primer, right_primer]))  # keep  none
0
>>> detector.close()

The check_one() and the check_all() methods check one or multiple primer pairs respectively for off-target mappings, returning an OffTargetResult, or mapping from each PrimerPair to its corresponding OffTargetResult. This result contains information about the off-target mappings.

>>> detector = OffTargetDetector(ref=ref_fasta, max_primer_hits=1, max_primer_pair_hits=1, three_prime_region_length=20, max_mismatches_in_three_prime_region=0, max_mismatches=0, max_amplicon_size=250)
>>> primer_pair = PrimerPair(left_primer=left_primer, right_primer=right_primer, amplicon_tm=37, penalty=0.0)
>>> detector.check_one(primer_pair=primer_pair)
OffTargetResult(primer_pair=..., passes=False, cached=False, spans=[], left_primer_spans=[], right_primer_spans=[])
>>> list(detector.check_all(primer_pairs=[primer_pair]).values())
[OffTargetResult(primer_pair=..., passes=False, cached=True, spans=[], left_primer_spans=[], right_primer_spans=[])]
>>> detector = OffTargetDetector(ref=ref_fasta, max_primer_hits=204, max_primer_pair_hits=856, three_prime_region_length=20, max_mismatches_in_three_prime_region=0, max_mismatches=0, max_amplicon_size=250)
>>> result = detector.check_one(primer_pair=primer_pair)
>>> len(result.spans)
856
>>> len(result.left_primer_spans)
204
>>> len(result.right_primer_spans)
204

Finally, the mappings_of() method maps individual primers (Primers).

>>> p1: Oligo = Oligo(tm=37, penalty=0, span=Span(refname="chr1", start=1, end=30), bases="CAGGTGGATCATGAGGTCAGGAGTTCAAGA")
>>> p2: Oligo = Oligo(tm=37, penalty=0, span=Span(refname="chr1", start=61, end=93, strand=Strand.NEGATIVE), bases="CATGCCCAGCTAATTTTTTGTATTTTTAGTAGA")
>>> results_dict: dict[str, BwaResult] = detector.mappings_of(primers=[p1, p2])
>>> list(results_dict.keys())
['CAGGTGGATCATGAGGTCAGGAGTTCAAGA', 'CATGCCCAGCTAATTTTTTGTATTTTTAGTAGA']
>>> results = list(results_dict.values())
>>> results[0]
BwaResult(query=Query(id='CAGGTGGATCATGAGGTCAGGAGTTCAAGA', bases='CAGGTGGATCATGAGGTCAGGAGTTCAAGA'), hit_count=1, hits=[...])
>>> results[0].hits[0]
BwaHit(refname='chr1', start=1, negative=False, cigar=Cigar(elements=(CigarElement(length=30, operator=<CigarOp.M: (0, 'M', True, True)>),)), edits=0)
>>> results[1]
BwaResult(query=Query(id='CATGCCCAGCTAATTTTTTGTATTTTTAGTAGA', bases='CATGCCCAGCTAATTTTTTGTATTTTTAGTAGA'), hit_count=1, hits=[...])
>>> results[1].hits[0]
BwaHit(refname='chr1', start=61, negative=True, cigar=Cigar(elements=(CigarElement(length=33, operator=<CigarOp.M: (0, 'M', True, True)>),)), edits=0)

Attributes

MINIMUM_THREE_PRIME_REGION_LENGTH module-attribute
MINIMUM_THREE_PRIME_REGION_LENGTH: int = 8

Minimum allowable seed length for the 3' region.

ReferenceName module-attribute
ReferenceName: TypeAlias = str

Alias for a reference sequence name.

Classes

OffTargetDetector

Bases: AbstractContextManager

A class for detecting off-target mappings of primers and primer pairs that uses a custom version of "bwa aln" named "bwa-aln-interactive".

OffTargetDetector uses a custom, interactive version of bwa aln to perform off-target detection. This approach is faster and more sensitive than traditional isPCR and in addition can correctly detect primers that are repetitive and contain many thousands or millions of mappings to the genome.

Note that while this class invokes BWA with multiple threads, it is not itself thread-safe. Only one thread at a time should invoke methods on this class without external synchronization.

Off-target detection may be performed for individual primers and/or primer pairs.

To detect off-target hits for individual primers, use the OffTargetDetector.filters() method, which will remove any primers that have more than the specified number of maximum hits against the reference.

To detect off-target amplification of primer pairs, use the OffTargetDetector.check_one() or OffTargetDetector.check_all() methods. These methods screen the individual primers in each pair for off-target hits, and verify that the possible amplicons inferred by the primers' alignments does not exceed the specified maximum number of primer pair hits.

Source code in prymer/offtarget/offtarget_detector.py
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
class OffTargetDetector(AbstractContextManager):
    """A class for detecting off-target mappings of primers and primer pairs that uses a custom
    version of "bwa aln" named "bwa-aln-interactive".

    `OffTargetDetector` uses a [custom, interactive
    version](https://github.com/fulcrumgenomics/bwa-aln-interactive/) of `bwa aln` to perform
    off-target detection. This approach is faster and more sensitive than traditional isPCR and in
    addition can correctly detect primers that are repetitive and contain many thousands or millions
    of mappings to the genome.

    Note that while this class invokes BWA with multiple threads, it is not itself thread-safe.
    Only one thread at a time should invoke methods on this class without external synchronization.

    Off-target detection may be performed for individual primers and/or primer pairs.

    To detect off-target hits for individual primers, use the `OffTargetDetector.filters()` method,
    which will remove any primers that have more than the specified number of maximum hits against
    the reference.

    To detect off-target amplification of primer pairs, use the `OffTargetDetector.check_one()` or
    `OffTargetDetector.check_all()` methods. These methods screen the individual primers in each
    pair for off-target hits, and verify that the possible amplicons inferred by the primers'
    alignments does not exceed the specified maximum number of primer pair hits.
    """

    def __init__(  # noqa: C901
        self,
        ref: Path,
        max_primer_hits: int,
        max_primer_pair_hits: int,
        three_prime_region_length: int,
        max_mismatches_in_three_prime_region: int,
        max_mismatches: int,
        max_gap_opens: int = 0,
        max_gap_extends: int = -1,
        max_amplicon_size: int = 1000,
        min_primer_pair_hits: int = 1,
        cache_results: bool = True,
        threads: Optional[int] = None,
        keep_spans: bool = True,
        keep_primer_spans: bool = True,
        executable: str | Path = BWA_EXECUTABLE_NAME,
    ) -> None:
        """
        Initialize an [[OffTargetDetector]].

        This class accepts a large number of parameters to control the behavior of the off-target
        detection. The parameters are used in the various aspects of off-target detection as
        follows:

        1. Alignment (off-target hit detection): `ref`, `executable`, `threads`,
           `three_prime_region_length`, `max_mismatches_in_three_prime_region`, `max_mismatches`,
           and `max_primer_hits`.
        2. Filtering of individual primers: `max_primer_hits`.
        3. Checking of primer pairs: `max_primer_hits`, `min_primer_pair_hits`,
           `max_primer_pair_hits`, and `max_amplicon_size`.

        The `three_prime_region_length` parameter is used as the seed length for `bwa aln`.

        Args:
            ref: the reference genome fasta file (must be indexed with BWA)
            max_primer_hits: the maximum number of hits an individual primer can have in the genome
                before it is considered an invalid primer, and all primer pairs containing the
                primer failed. Must be greater than or equal to 0.
            max_primer_pair_hits: the maximum number of amplicons a primer pair can make and be
                considered passing. Must be greater than or equal to 0.
            min_primer_pair_hits: The minimum number of amplicons a primer pair can make and be
                considered passing. (In most cases, this is the number of amplicons a primer pair is
                expected to generate.) The default is 1, which is appropriate when the primer pair
                is being evaluated for off-target hits against the same reference genome from which
                the primers were generated. If the primer pair was generated from a different
                reference sequence, it may be appropriate to set this value to 0. Must be greater
                    than or equal to 0.
            three_prime_region_length: the number of bases at the 3' end of the primer in which the
                parameter `max_mismatches_in_three_prime_region` is evaluated. This value is used as
                the seed length (`bwa aln -l`). Must be a minimum of 8.
            max_mismatches_in_three_prime_region: the maximum number of mismatches that are
                tolerated in the three prime region of each primer defined by
                `three_prime_region_length`. Must be between 0 and `three_prime_region_length`,
                inclusive.
            max_mismatches: the maximum number of mismatches allowed in the full length primer
                (including any in the three prime region). Must be greater than or equal to 0.
            max_gap_opens: the maximum number of gaps (insertions or deletions) allowable in an
                alignment of a oligo to the reference. Must be greater than or equal to 0.
            max_gap_extends: the maximum number of gap extensions allowed; extending a gap
                beyond a single base costs 1 gap extension.  Can be set to -1 to allow
                unlimited extensions up to max diffs (aka max mismatches), while disallowing
                "long gaps". Must be greater than or equal to -1.
            max_amplicon_size: the maximum amplicon size to consider amplifiable. Must be greater
                than 0.
            cache_results: if True, cache results for faster re-querying
            threads: the number of threads to use when invoking bwa
            keep_spans: if True, [[OffTargetResult]] objects will be reported with amplicon spans
                populated, otherwise not
            keep_primer_spans: if True, [[OffTargetResult]] objects will be reported with left and
                right primer spans
            executable: string or Path representation of the `bwa` executable path

        Raises:
            ValueError: If `max_amplicon_size` is not greater than 0.
            ValueError: If any of `max_primer_hits`, `max_primer_pair_hits`, or
                `min_primer_pair_hits` are not greater than or equal to 0.
            ValueError: If `three_prime_region_length` is not greater than or equal to 8.
            ValueError: If `max_mismatches_in_three_prime_region` is outside the range 0 to
                `three_prime_region_length`, inclusive.
            ValueError: If `max_mismatches` is not greater than or equal to 0.
            ValueError: If `max_gap_opens` is not greater than or equal to 0.
            ValueError: If `max_gap_extends` is not -1 or greater than or equal to 0.
        """
        errors: list[str] = []
        if max_amplicon_size < 1:
            errors.append(f"'max_amplicon_size' must be greater than 0. Saw {max_amplicon_size}")
        if max_primer_hits < 0:
            errors.append(
                f"'max_primer_hits' must be greater than or equal to 0. Saw {max_primer_hits}"
            )
        if max_primer_pair_hits < 0:
            errors.append(
                "'max_primer_pair_hits' must be greater than or equal to 0. "
                f"Saw {max_primer_pair_hits}"
            )
        if min_primer_pair_hits < 0:
            errors.append(
                "'min_primer_pair_hits' must be greater than or equal to 0. "
                f"Saw {min_primer_pair_hits}"
            )
        if three_prime_region_length < MINIMUM_THREE_PRIME_REGION_LENGTH:
            errors.append(
                "'three_prime_region_length' must be greater than or equal to "
                f"{MINIMUM_THREE_PRIME_REGION_LENGTH}. Saw {three_prime_region_length}"
            )
        if (
            max_mismatches_in_three_prime_region < 0
            or max_mismatches_in_three_prime_region > three_prime_region_length
        ):
            errors.append(
                "'max_mismatches_in_three_prime_region' must be between 0 and "
                f"'three_prime_region_length'={three_prime_region_length} inclusive. "
                f"Saw {max_mismatches_in_three_prime_region}"
            )
        if max_mismatches < 0:
            errors.append(
                f"'max_mismatches' must be greater than or equal to 0. Saw {max_mismatches}"
            )
        if max_gap_opens < 0:
            errors.append(
                f"'max_gap_opens' must be greater than or equal to 0. Saw {max_gap_opens}"
            )
        if max_gap_extends < -1:
            errors.append(
                "'max_gap_extends' must be -1 (for unlimited extensions up to 'max_mismatches'="
                f"{max_mismatches}) or greater than or equal to 0. Saw {max_gap_extends}"
            )

        if len(errors) > 0:
            raise ValueError("\n".join(errors))

        self._primer_cache: dict[str, BwaResult] = {}
        self._primer_pair_cache: dict[PrimerPair, OffTargetResult] = {}
        self._bwa = BwaAlnInteractive(
            executable=executable,
            ref=ref,
            reverse_complement=True,
            threads=threads,
            seed_length=three_prime_region_length,
            max_mismatches_in_seed=max_mismatches_in_three_prime_region,
            max_mismatches=max_mismatches,
            max_gap_opens=max_gap_opens,
            max_gap_extensions=max_gap_extends,
            max_hits=max_primer_hits,
        )
        self._max_primer_hits = max_primer_hits
        self._max_primer_pair_hits: int = max_primer_pair_hits
        self._min_primer_pair_hits: int = min_primer_pair_hits
        self._max_amplicon_size: int = max_amplicon_size
        self._cache_results: bool = cache_results
        self._keep_spans: bool = keep_spans
        self._keep_primer_spans: bool = keep_primer_spans

    def filter(self, primers: list[PrimerType]) -> list[PrimerType]:
        """
        Remove primers that have more than `max_primer_hits` mappings to the genome.

        This method maps each primer to the specified reference with `bwa aln` to search for
        off-target hits. Note that when the reference includes the sequence from which the primers
        were designed, the on-target hit will be included in the hit count. `max_primer_hits` should
        be set to at least 1 in this case.

        Args:
            primers: A list of primers to filter.

        Returns:
            The input primers, filtered to remove any primers with more than `max_primer_hits`
            mappings to the genome.
        """
        results: dict[str, BwaResult] = self.mappings_of(primers)
        return [
            primer for primer in primers if results[primer.bases].hit_count <= self._max_primer_hits
        ]

    def check_one(self, primer_pair: PrimerPair) -> OffTargetResult:
        """
        Check an individual primer pair for off-target amplification.

        See `OffTargetDetector.check_all()` for details.

        Args:
            primer_pair: The primer pair to check.

        Returns:
            The results of the off-target detection.
            See `OffTargetResult` for details regarding the attributes included in the result.
        """
        result: dict[PrimerPair, OffTargetResult] = self.check_all([primer_pair])
        return result[primer_pair]

    def check_all(self, primer_pairs: list[PrimerPair]) -> dict[PrimerPair, OffTargetResult]:
        """
        Check a collection of primer pairs for off-target amplification.

        This method maps each primer to the specified reference with `bwa aln` to search for
        off-target hits. Possible amplicons are identified from the pairwise combinations of these
        alignments, up to the specified `max_amplicon_size`.

        Primer pairs are marked as passing if both of the following are true:
        1. Each primer has no more than `max_primer_hits` alignments to the genome.
        2. The size of the set of possible amplicons does not exceed `max_primer_pair_hits`, and is
           at least `min_primer_pair_hits`.

        Args:
            primer_pairs: The primer pairs to check.

        Returns:
            A dictionary mapping each checked primer pair to its off-target detection results.
            See `OffTargetResult` for details regarding the attributes included in each result.
        """

        primer_pair_results: dict[PrimerPair, OffTargetResult] = {}
        result: OffTargetResult

        # Get the primer pairs to map.  If the primer pair is found in the cache, use that
        primer_pairs_to_map: list[PrimerPair] = []
        if not self._cache_results:
            primer_pairs_to_map = primer_pairs
        else:
            for primer_pair in primer_pairs:
                match self._primer_pair_cache.get(primer_pair, None):
                    case None:
                        primer_pairs_to_map.append(primer_pair)  # Map it!
                    case result:
                        primer_pair_results[primer_pair] = result

        # If there are no primer pairs to map, return the results
        if len(primer_pairs_to_map) == 0:
            return primer_pair_results

        # Get mappings of all the primers
        primers = [primer for primer_pair in primer_pairs for primer in primer_pair]
        hits_by_primer = self.mappings_of(primers)

        for primer_pair in primer_pairs:
            primer_pair_results[primer_pair] = self._build_off_target_result(
                primer_pair=primer_pair, hits_by_primer=hits_by_primer
            )

        return primer_pair_results

    def _build_off_target_result(
        self, primer_pair: PrimerPair, hits_by_primer: dict[str, BwaResult]
    ) -> OffTargetResult:
        """Builds an `OffTargetResult` for the given `PrimerPair`.

        If there are too many primer hits for either the left or right primer, set `passes` to
        `False` on the returned `OffTargetResult`, otherwise generate all possible amplicons with
        size less than the given maximum to generate off target results.

        Args:
            primer_pair: the primer pair from which the off-target result is built.
            hits_by_primer: mapping of primer bases to `BwaHits`.
        """
        result: OffTargetResult

        # Get the mappings for the left primer and right primer respectively
        left_bwa_result: BwaResult = hits_by_primer[primer_pair.left_primer.bases]
        right_bwa_result: BwaResult = hits_by_primer[primer_pair.right_primer.bases]

        # If there are too many hits, this primer pair will not pass. Exit early.
        if (
            left_bwa_result.hit_count > self._max_primer_hits
            or right_bwa_result.hit_count > self._max_primer_hits
        ):
            result = OffTargetResult(primer_pair=primer_pair, passes=False)
            if self._cache_results:
                self._primer_pair_cache[primer_pair] = replace(result, cached=True)
            return result

        # Map the hits by reference name
        left_positive_hits: defaultdict[ReferenceName, list[BwaHit]] = defaultdict(list)
        left_negative_hits: defaultdict[ReferenceName, list[BwaHit]] = defaultdict(list)
        right_positive_hits: defaultdict[ReferenceName, list[BwaHit]] = defaultdict(list)
        right_negative_hits: defaultdict[ReferenceName, list[BwaHit]] = defaultdict(list)

        # Split the hits for left and right by reference name and strand
        for hit in left_bwa_result.hits:
            if hit.negative:
                left_negative_hits[hit.refname].append(hit)
            else:
                left_positive_hits[hit.refname].append(hit)

        for hit in right_bwa_result.hits:
            if hit.negative:
                right_negative_hits[hit.refname].append(hit)
            else:
                right_positive_hits[hit.refname].append(hit)

        refnames: set[ReferenceName] = {
            h.refname for h in itertools.chain(left_bwa_result.hits, right_bwa_result.hits)
        }

        # Build amplicons from hits on the same reference with valid relative orientation
        amplicons: list[Span] = []
        for refname in refnames:
            amplicons.extend(
                self._to_amplicons(
                    positive_hits=left_positive_hits[refname],
                    negative_hits=right_negative_hits[refname],
                    max_len=self._max_amplicon_size,
                    strand=Strand.POSITIVE,
                )
            )
            amplicons.extend(
                self._to_amplicons(
                    positive_hits=right_positive_hits[refname],
                    negative_hits=left_negative_hits[refname],
                    max_len=self._max_amplicon_size,
                    strand=Strand.NEGATIVE,
                )
            )

        result = OffTargetResult(
            primer_pair=primer_pair,
            passes=self._min_primer_pair_hits <= len(amplicons) <= self._max_primer_pair_hits,
            spans=amplicons if self._keep_spans else [],
            left_primer_spans=(
                [self._hit_to_span(h) for h in left_bwa_result.hits]
                if self._keep_primer_spans
                else []
            ),
            right_primer_spans=(
                [self._hit_to_span(h) for h in right_bwa_result.hits]
                if self._keep_primer_spans
                else []
            ),
        )

        if self._cache_results:
            self._primer_pair_cache[primer_pair] = replace(result, cached=True)

        return result

    def mappings_of(self, primers: list[PrimerType]) -> dict[str, BwaResult]:
        """
        Map primers to the reference genome using `bwa aln`.

        Alignment results may be optionally cached for efficiency. Set `cache_results` to `True`
        when instantiating the [[OffTargetDetector]] to enable caching.

        Any primers without cached results, or all primers when `cache_results` is `False`, will be
        mapped to the reference genome using `bwa aln` and the specified alignment parameters.

        **Note**: The reverse complement of each primer sequence is used for mapping. The `query`
        sequence in each `BwaResult` will match the input primer sequence, as will the sequences
        used as keys in the output dictionary. However, the coordinates reported in each `BwaHit`
        associated with a result will correspond to complementary sequence.

        Args:
            primers: A list of primers to map.

        Returns:
            A dictionary mapping each primer's sequence to its alignment results.
            See `BwaResult` for details regarding the attributes included in each result.
        """

        primers_to_map: list[PrimerType]
        if not self._cache_results:
            primers_to_map = primers
        else:
            primers_to_map = [
                primer for primer in list(OrderedSet(primers)) if primer not in self._primer_cache
            ]

        # Build the unique list of queries to map with BWA
        queries: list[Query] = [
            Query(id=primer.bases, bases=primer.bases) for primer in primers_to_map
        ]

        # Map the queries with BWA
        hits_by_primer: dict[str, BwaResult] = {
            result.query.id: result for result in self._bwa.map_all(queries)
        }

        # Cache the results, if desired, and get the hits by primer _for all_ primers. If not
        # caching, then hits_by_primer already contains all the primers.
        if self._cache_results:
            self._primer_cache.update(hits_by_primer)
            hits_by_primer = {primer.bases: self._primer_cache[primer.bases] for primer in primers}

        return hits_by_primer

    @staticmethod
    def _to_amplicons(
        positive_hits: list[BwaHit], negative_hits: list[BwaHit], max_len: int, strand: Strand
    ) -> list[Span]:
        """Takes lists of positive strand hits and negative strand hits and constructs amplicon
        mappings anywhere a positive strand hit and a negative strand hit occur where the end of
        the negative strand hit is no more than `max_len` from the start of the positive strand
        hit.

        Primers may not overlap.

        Args:
            positive_hits: List of hits on the positive strand for one of the primers in the pair.
            negative_hits: List of hits on the negative strand for the other primer in the pair.
            max_len: Maximum length of amplicons to consider.
            strand: The strand of the amplicon to generate. Set to Strand.POSITIVE if
                `positive_hits` are for the left primer and `negative_hits` are for the right
                primer. Set to Strand.NEGATIVE if `positive_hits` are for the right primer and
                `negative_hits` are for the left primer.

        Raises:
            ValueError: If any of the positive hits are not on the positive strand, or any of the
                negative hits are not on the negative strand. If hits are present on more than one
                reference.
        """
        if any(h.negative for h in positive_hits):
            raise ValueError("Positive hits must be on the positive strand.")
        if any(not h.negative for h in negative_hits):
            raise ValueError("Negative hits must be on the negative strand.")

        refnames: set[ReferenceName] = {
            h.refname for h in itertools.chain(positive_hits, negative_hits)
        }
        if len(refnames) > 1:
            raise ValueError(f"Hits are present on more than one reference: {refnames}")

        # Exit early if one of the hit lists is empty - this will save unnecessary sorting of the
        # other list
        if len(positive_hits) == 0 or len(negative_hits) == 0:
            return []

        # Sort the positive strand hits by start position and the negative strand hits by *end*
        # position. The `max_len` cutoff is based on negative_hit.end - positive_hit.start + 1.
        positive_hits_sorted = sorted(positive_hits, key=lambda h: h.start)
        negative_hits_sorted = sorted(negative_hits, key=lambda h: h.end)

        amplicons: list[Span] = []

        # Track the position of the previously examined negative hit.
        prev_negative_hit_index = 0
        for positive_hit in positive_hits_sorted:
            # Check only negative hits starting with the previously examined one.
            for negative_hit_index, negative_hit in enumerate(
                negative_hits_sorted[prev_negative_hit_index:],
                start=prev_negative_hit_index,
            ):
                # TODO: Consider allowing overlapping positive and negative hits.
                if (
                    negative_hit.start > positive_hit.end
                    and negative_hit.end - positive_hit.start + 1 <= max_len
                ):
                    # If the negative hit starts to the right of the positive hit, and the amplicon
                    # length is <= max_len, add it to the list of amplicon hits to be returned.
                    amplicons.append(
                        Span(
                            refname=positive_hit.refname,
                            start=positive_hit.start,
                            end=negative_hit.end,
                            strand=strand,
                        )
                    )

                if negative_hit.end - positive_hit.start + 1 > max_len:
                    # Stop searching for negative hits to pair with this positive hit.
                    # All subsequence negative hits will have amplicon length > max_len
                    break

                if negative_hit.end < positive_hit.start:
                    # This positive hit is genomically right of the current negative hit.
                    # All subsequent positive hits will also be genomically right of this negative
                    # hit, so we should start at the one after this. If this index is past the end
                    # of the list, the slice `negative_hits_sorted[prev_negative_hit_index:]` will
                    # be empty.
                    prev_negative_hit_index = negative_hit_index + 1

        return amplicons

    @staticmethod
    def _hit_to_span(hit: BwaHit) -> Span:
        """Converts a Bwa Hit object to a Span."""
        return Span(refname=hit.refname, start=hit.start, end=hit.end)

    def close(self) -> None:
        self._bwa.close()

    def __enter__(self) -> Self:
        return self

    def __exit__(
        self,
        exc_type: Optional[type[BaseException]],
        exc_value: Optional[BaseException],
        traceback: Optional[TracebackType],
    ) -> None:
        """Gracefully terminates any running subprocesses."""
        super().__exit__(exc_type, exc_value, traceback)
        self.close()
Functions
__exit__
__exit__(
    exc_type: Optional[type[BaseException]],
    exc_value: Optional[BaseException],
    traceback: Optional[TracebackType],
) -> None

Gracefully terminates any running subprocesses.

Source code in prymer/offtarget/offtarget_detector.py
def __exit__(
    self,
    exc_type: Optional[type[BaseException]],
    exc_value: Optional[BaseException],
    traceback: Optional[TracebackType],
) -> None:
    """Gracefully terminates any running subprocesses."""
    super().__exit__(exc_type, exc_value, traceback)
    self.close()
__init__
__init__(
    ref: Path,
    max_primer_hits: int,
    max_primer_pair_hits: int,
    three_prime_region_length: int,
    max_mismatches_in_three_prime_region: int,
    max_mismatches: int,
    max_gap_opens: int = 0,
    max_gap_extends: int = -1,
    max_amplicon_size: int = 1000,
    min_primer_pair_hits: int = 1,
    cache_results: bool = True,
    threads: Optional[int] = None,
    keep_spans: bool = True,
    keep_primer_spans: bool = True,
    executable: str | Path = BWA_EXECUTABLE_NAME,
) -> None

Initialize an [[OffTargetDetector]].

This class accepts a large number of parameters to control the behavior of the off-target detection. The parameters are used in the various aspects of off-target detection as follows:

  1. Alignment (off-target hit detection): ref, executable, threads, three_prime_region_length, max_mismatches_in_three_prime_region, max_mismatches, and max_primer_hits.
  2. Filtering of individual primers: max_primer_hits.
  3. Checking of primer pairs: max_primer_hits, min_primer_pair_hits, max_primer_pair_hits, and max_amplicon_size.

The three_prime_region_length parameter is used as the seed length for bwa aln.

Parameters:

Name Type Description Default
ref Path

the reference genome fasta file (must be indexed with BWA)

required
max_primer_hits int

the maximum number of hits an individual primer can have in the genome before it is considered an invalid primer, and all primer pairs containing the primer failed. Must be greater than or equal to 0.

required
max_primer_pair_hits int

the maximum number of amplicons a primer pair can make and be considered passing. Must be greater than or equal to 0.

required
min_primer_pair_hits int

The minimum number of amplicons a primer pair can make and be considered passing. (In most cases, this is the number of amplicons a primer pair is expected to generate.) The default is 1, which is appropriate when the primer pair is being evaluated for off-target hits against the same reference genome from which the primers were generated. If the primer pair was generated from a different reference sequence, it may be appropriate to set this value to 0. Must be greater than or equal to 0.

1
three_prime_region_length int

the number of bases at the 3' end of the primer in which the parameter max_mismatches_in_three_prime_region is evaluated. This value is used as the seed length (bwa aln -l). Must be a minimum of 8.

required
max_mismatches_in_three_prime_region int

the maximum number of mismatches that are tolerated in the three prime region of each primer defined by three_prime_region_length. Must be between 0 and three_prime_region_length, inclusive.

required
max_mismatches int

the maximum number of mismatches allowed in the full length primer (including any in the three prime region). Must be greater than or equal to 0.

required
max_gap_opens int

the maximum number of gaps (insertions or deletions) allowable in an alignment of a oligo to the reference. Must be greater than or equal to 0.

0
max_gap_extends int

the maximum number of gap extensions allowed; extending a gap beyond a single base costs 1 gap extension. Can be set to -1 to allow unlimited extensions up to max diffs (aka max mismatches), while disallowing "long gaps". Must be greater than or equal to -1.

-1
max_amplicon_size int

the maximum amplicon size to consider amplifiable. Must be greater than 0.

1000
cache_results bool

if True, cache results for faster re-querying

True
threads Optional[int]

the number of threads to use when invoking bwa

None
keep_spans bool

if True, [[OffTargetResult]] objects will be reported with amplicon spans populated, otherwise not

True
keep_primer_spans bool

if True, [[OffTargetResult]] objects will be reported with left and right primer spans

True
executable str | Path

string or Path representation of the bwa executable path

BWA_EXECUTABLE_NAME

Raises:

Type Description
ValueError

If max_amplicon_size is not greater than 0.

ValueError

If any of max_primer_hits, max_primer_pair_hits, or min_primer_pair_hits are not greater than or equal to 0.

ValueError

If three_prime_region_length is not greater than or equal to 8.

ValueError

If max_mismatches_in_three_prime_region is outside the range 0 to three_prime_region_length, inclusive.

ValueError

If max_mismatches is not greater than or equal to 0.

ValueError

If max_gap_opens is not greater than or equal to 0.

ValueError

If max_gap_extends is not -1 or greater than or equal to 0.

Source code in prymer/offtarget/offtarget_detector.py
def __init__(  # noqa: C901
    self,
    ref: Path,
    max_primer_hits: int,
    max_primer_pair_hits: int,
    three_prime_region_length: int,
    max_mismatches_in_three_prime_region: int,
    max_mismatches: int,
    max_gap_opens: int = 0,
    max_gap_extends: int = -1,
    max_amplicon_size: int = 1000,
    min_primer_pair_hits: int = 1,
    cache_results: bool = True,
    threads: Optional[int] = None,
    keep_spans: bool = True,
    keep_primer_spans: bool = True,
    executable: str | Path = BWA_EXECUTABLE_NAME,
) -> None:
    """
    Initialize an [[OffTargetDetector]].

    This class accepts a large number of parameters to control the behavior of the off-target
    detection. The parameters are used in the various aspects of off-target detection as
    follows:

    1. Alignment (off-target hit detection): `ref`, `executable`, `threads`,
       `three_prime_region_length`, `max_mismatches_in_three_prime_region`, `max_mismatches`,
       and `max_primer_hits`.
    2. Filtering of individual primers: `max_primer_hits`.
    3. Checking of primer pairs: `max_primer_hits`, `min_primer_pair_hits`,
       `max_primer_pair_hits`, and `max_amplicon_size`.

    The `three_prime_region_length` parameter is used as the seed length for `bwa aln`.

    Args:
        ref: the reference genome fasta file (must be indexed with BWA)
        max_primer_hits: the maximum number of hits an individual primer can have in the genome
            before it is considered an invalid primer, and all primer pairs containing the
            primer failed. Must be greater than or equal to 0.
        max_primer_pair_hits: the maximum number of amplicons a primer pair can make and be
            considered passing. Must be greater than or equal to 0.
        min_primer_pair_hits: The minimum number of amplicons a primer pair can make and be
            considered passing. (In most cases, this is the number of amplicons a primer pair is
            expected to generate.) The default is 1, which is appropriate when the primer pair
            is being evaluated for off-target hits against the same reference genome from which
            the primers were generated. If the primer pair was generated from a different
            reference sequence, it may be appropriate to set this value to 0. Must be greater
                than or equal to 0.
        three_prime_region_length: the number of bases at the 3' end of the primer in which the
            parameter `max_mismatches_in_three_prime_region` is evaluated. This value is used as
            the seed length (`bwa aln -l`). Must be a minimum of 8.
        max_mismatches_in_three_prime_region: the maximum number of mismatches that are
            tolerated in the three prime region of each primer defined by
            `three_prime_region_length`. Must be between 0 and `three_prime_region_length`,
            inclusive.
        max_mismatches: the maximum number of mismatches allowed in the full length primer
            (including any in the three prime region). Must be greater than or equal to 0.
        max_gap_opens: the maximum number of gaps (insertions or deletions) allowable in an
            alignment of a oligo to the reference. Must be greater than or equal to 0.
        max_gap_extends: the maximum number of gap extensions allowed; extending a gap
            beyond a single base costs 1 gap extension.  Can be set to -1 to allow
            unlimited extensions up to max diffs (aka max mismatches), while disallowing
            "long gaps". Must be greater than or equal to -1.
        max_amplicon_size: the maximum amplicon size to consider amplifiable. Must be greater
            than 0.
        cache_results: if True, cache results for faster re-querying
        threads: the number of threads to use when invoking bwa
        keep_spans: if True, [[OffTargetResult]] objects will be reported with amplicon spans
            populated, otherwise not
        keep_primer_spans: if True, [[OffTargetResult]] objects will be reported with left and
            right primer spans
        executable: string or Path representation of the `bwa` executable path

    Raises:
        ValueError: If `max_amplicon_size` is not greater than 0.
        ValueError: If any of `max_primer_hits`, `max_primer_pair_hits`, or
            `min_primer_pair_hits` are not greater than or equal to 0.
        ValueError: If `three_prime_region_length` is not greater than or equal to 8.
        ValueError: If `max_mismatches_in_three_prime_region` is outside the range 0 to
            `three_prime_region_length`, inclusive.
        ValueError: If `max_mismatches` is not greater than or equal to 0.
        ValueError: If `max_gap_opens` is not greater than or equal to 0.
        ValueError: If `max_gap_extends` is not -1 or greater than or equal to 0.
    """
    errors: list[str] = []
    if max_amplicon_size < 1:
        errors.append(f"'max_amplicon_size' must be greater than 0. Saw {max_amplicon_size}")
    if max_primer_hits < 0:
        errors.append(
            f"'max_primer_hits' must be greater than or equal to 0. Saw {max_primer_hits}"
        )
    if max_primer_pair_hits < 0:
        errors.append(
            "'max_primer_pair_hits' must be greater than or equal to 0. "
            f"Saw {max_primer_pair_hits}"
        )
    if min_primer_pair_hits < 0:
        errors.append(
            "'min_primer_pair_hits' must be greater than or equal to 0. "
            f"Saw {min_primer_pair_hits}"
        )
    if three_prime_region_length < MINIMUM_THREE_PRIME_REGION_LENGTH:
        errors.append(
            "'three_prime_region_length' must be greater than or equal to "
            f"{MINIMUM_THREE_PRIME_REGION_LENGTH}. Saw {three_prime_region_length}"
        )
    if (
        max_mismatches_in_three_prime_region < 0
        or max_mismatches_in_three_prime_region > three_prime_region_length
    ):
        errors.append(
            "'max_mismatches_in_three_prime_region' must be between 0 and "
            f"'three_prime_region_length'={three_prime_region_length} inclusive. "
            f"Saw {max_mismatches_in_three_prime_region}"
        )
    if max_mismatches < 0:
        errors.append(
            f"'max_mismatches' must be greater than or equal to 0. Saw {max_mismatches}"
        )
    if max_gap_opens < 0:
        errors.append(
            f"'max_gap_opens' must be greater than or equal to 0. Saw {max_gap_opens}"
        )
    if max_gap_extends < -1:
        errors.append(
            "'max_gap_extends' must be -1 (for unlimited extensions up to 'max_mismatches'="
            f"{max_mismatches}) or greater than or equal to 0. Saw {max_gap_extends}"
        )

    if len(errors) > 0:
        raise ValueError("\n".join(errors))

    self._primer_cache: dict[str, BwaResult] = {}
    self._primer_pair_cache: dict[PrimerPair, OffTargetResult] = {}
    self._bwa = BwaAlnInteractive(
        executable=executable,
        ref=ref,
        reverse_complement=True,
        threads=threads,
        seed_length=three_prime_region_length,
        max_mismatches_in_seed=max_mismatches_in_three_prime_region,
        max_mismatches=max_mismatches,
        max_gap_opens=max_gap_opens,
        max_gap_extensions=max_gap_extends,
        max_hits=max_primer_hits,
    )
    self._max_primer_hits = max_primer_hits
    self._max_primer_pair_hits: int = max_primer_pair_hits
    self._min_primer_pair_hits: int = min_primer_pair_hits
    self._max_amplicon_size: int = max_amplicon_size
    self._cache_results: bool = cache_results
    self._keep_spans: bool = keep_spans
    self._keep_primer_spans: bool = keep_primer_spans
check_all
check_all(
    primer_pairs: list[PrimerPair],
) -> dict[PrimerPair, OffTargetResult]

Check a collection of primer pairs for off-target amplification.

This method maps each primer to the specified reference with bwa aln to search for off-target hits. Possible amplicons are identified from the pairwise combinations of these alignments, up to the specified max_amplicon_size.

Primer pairs are marked as passing if both of the following are true: 1. Each primer has no more than max_primer_hits alignments to the genome. 2. The size of the set of possible amplicons does not exceed max_primer_pair_hits, and is at least min_primer_pair_hits.

Parameters:

Name Type Description Default
primer_pairs list[PrimerPair]

The primer pairs to check.

required

Returns:

Type Description
dict[PrimerPair, OffTargetResult]

A dictionary mapping each checked primer pair to its off-target detection results.

dict[PrimerPair, OffTargetResult]

See OffTargetResult for details regarding the attributes included in each result.

Source code in prymer/offtarget/offtarget_detector.py
def check_all(self, primer_pairs: list[PrimerPair]) -> dict[PrimerPair, OffTargetResult]:
    """
    Check a collection of primer pairs for off-target amplification.

    This method maps each primer to the specified reference with `bwa aln` to search for
    off-target hits. Possible amplicons are identified from the pairwise combinations of these
    alignments, up to the specified `max_amplicon_size`.

    Primer pairs are marked as passing if both of the following are true:
    1. Each primer has no more than `max_primer_hits` alignments to the genome.
    2. The size of the set of possible amplicons does not exceed `max_primer_pair_hits`, and is
       at least `min_primer_pair_hits`.

    Args:
        primer_pairs: The primer pairs to check.

    Returns:
        A dictionary mapping each checked primer pair to its off-target detection results.
        See `OffTargetResult` for details regarding the attributes included in each result.
    """

    primer_pair_results: dict[PrimerPair, OffTargetResult] = {}
    result: OffTargetResult

    # Get the primer pairs to map.  If the primer pair is found in the cache, use that
    primer_pairs_to_map: list[PrimerPair] = []
    if not self._cache_results:
        primer_pairs_to_map = primer_pairs
    else:
        for primer_pair in primer_pairs:
            match self._primer_pair_cache.get(primer_pair, None):
                case None:
                    primer_pairs_to_map.append(primer_pair)  # Map it!
                case result:
                    primer_pair_results[primer_pair] = result

    # If there are no primer pairs to map, return the results
    if len(primer_pairs_to_map) == 0:
        return primer_pair_results

    # Get mappings of all the primers
    primers = [primer for primer_pair in primer_pairs for primer in primer_pair]
    hits_by_primer = self.mappings_of(primers)

    for primer_pair in primer_pairs:
        primer_pair_results[primer_pair] = self._build_off_target_result(
            primer_pair=primer_pair, hits_by_primer=hits_by_primer
        )

    return primer_pair_results
check_one
check_one(primer_pair: PrimerPair) -> OffTargetResult

Check an individual primer pair for off-target amplification.

See OffTargetDetector.check_all() for details.

Parameters:

Name Type Description Default
primer_pair PrimerPair

The primer pair to check.

required

Returns:

Type Description
OffTargetResult

The results of the off-target detection.

OffTargetResult

See OffTargetResult for details regarding the attributes included in the result.

Source code in prymer/offtarget/offtarget_detector.py
def check_one(self, primer_pair: PrimerPair) -> OffTargetResult:
    """
    Check an individual primer pair for off-target amplification.

    See `OffTargetDetector.check_all()` for details.

    Args:
        primer_pair: The primer pair to check.

    Returns:
        The results of the off-target detection.
        See `OffTargetResult` for details regarding the attributes included in the result.
    """
    result: dict[PrimerPair, OffTargetResult] = self.check_all([primer_pair])
    return result[primer_pair]
filter
filter(primers: list[PrimerType]) -> list[PrimerType]

Remove primers that have more than max_primer_hits mappings to the genome.

This method maps each primer to the specified reference with bwa aln to search for off-target hits. Note that when the reference includes the sequence from which the primers were designed, the on-target hit will be included in the hit count. max_primer_hits should be set to at least 1 in this case.

Parameters:

Name Type Description Default
primers list[PrimerType]

A list of primers to filter.

required

Returns:

Type Description
list[PrimerType]

The input primers, filtered to remove any primers with more than max_primer_hits

list[PrimerType]

mappings to the genome.

Source code in prymer/offtarget/offtarget_detector.py
def filter(self, primers: list[PrimerType]) -> list[PrimerType]:
    """
    Remove primers that have more than `max_primer_hits` mappings to the genome.

    This method maps each primer to the specified reference with `bwa aln` to search for
    off-target hits. Note that when the reference includes the sequence from which the primers
    were designed, the on-target hit will be included in the hit count. `max_primer_hits` should
    be set to at least 1 in this case.

    Args:
        primers: A list of primers to filter.

    Returns:
        The input primers, filtered to remove any primers with more than `max_primer_hits`
        mappings to the genome.
    """
    results: dict[str, BwaResult] = self.mappings_of(primers)
    return [
        primer for primer in primers if results[primer.bases].hit_count <= self._max_primer_hits
    ]
mappings_of
mappings_of(
    primers: list[PrimerType],
) -> dict[str, BwaResult]

Map primers to the reference genome using bwa aln.

Alignment results may be optionally cached for efficiency. Set cache_results to True when instantiating the [[OffTargetDetector]] to enable caching.

Any primers without cached results, or all primers when cache_results is False, will be mapped to the reference genome using bwa aln and the specified alignment parameters.

Note: The reverse complement of each primer sequence is used for mapping. The query sequence in each BwaResult will match the input primer sequence, as will the sequences used as keys in the output dictionary. However, the coordinates reported in each BwaHit associated with a result will correspond to complementary sequence.

Parameters:

Name Type Description Default
primers list[PrimerType]

A list of primers to map.

required

Returns:

Type Description
dict[str, BwaResult]

A dictionary mapping each primer's sequence to its alignment results.

dict[str, BwaResult]

See BwaResult for details regarding the attributes included in each result.

Source code in prymer/offtarget/offtarget_detector.py
def mappings_of(self, primers: list[PrimerType]) -> dict[str, BwaResult]:
    """
    Map primers to the reference genome using `bwa aln`.

    Alignment results may be optionally cached for efficiency. Set `cache_results` to `True`
    when instantiating the [[OffTargetDetector]] to enable caching.

    Any primers without cached results, or all primers when `cache_results` is `False`, will be
    mapped to the reference genome using `bwa aln` and the specified alignment parameters.

    **Note**: The reverse complement of each primer sequence is used for mapping. The `query`
    sequence in each `BwaResult` will match the input primer sequence, as will the sequences
    used as keys in the output dictionary. However, the coordinates reported in each `BwaHit`
    associated with a result will correspond to complementary sequence.

    Args:
        primers: A list of primers to map.

    Returns:
        A dictionary mapping each primer's sequence to its alignment results.
        See `BwaResult` for details regarding the attributes included in each result.
    """

    primers_to_map: list[PrimerType]
    if not self._cache_results:
        primers_to_map = primers
    else:
        primers_to_map = [
            primer for primer in list(OrderedSet(primers)) if primer not in self._primer_cache
        ]

    # Build the unique list of queries to map with BWA
    queries: list[Query] = [
        Query(id=primer.bases, bases=primer.bases) for primer in primers_to_map
    ]

    # Map the queries with BWA
    hits_by_primer: dict[str, BwaResult] = {
        result.query.id: result for result in self._bwa.map_all(queries)
    }

    # Cache the results, if desired, and get the hits by primer _for all_ primers. If not
    # caching, then hits_by_primer already contains all the primers.
    if self._cache_results:
        self._primer_cache.update(hits_by_primer)
        hits_by_primer = {primer.bases: self._primer_cache[primer.bases] for primer in primers}

    return hits_by_primer
OffTargetResult dataclass

Information obtained by running a single primer pair through the off-target detector.

Attributes:

Name Type Description
primer_pair PrimerPair

the primer pair submitted

passes bool

True if neither primer exceeds the specified maximum number of hits, and the primer pair would generate an acceptable number of amplicons in the reference genome (i.e. min_primer_pair_hits <= num_amplicons <= max_primer_pair_hits). False otherwise.

cached bool

True if this result is part of a cache, False otherwise. This is useful for testing

spans list[Span]

The set of mappings of the primer pair to the genome (i.e., the region spanned by the inferred amplicon). This list will be empty if the generating [[OffTargetDetector]] was constructed with keep_spans=False.

left_primer_spans list[Span]

the list of mappings for the left primer, independent of the pair mappings, or an empty list

right_primer_spans list[Span]

the list of mappings for the right primer, independent of the pair mappings, or an empty list

Source code in prymer/offtarget/offtarget_detector.py
@dataclass(init=True, frozen=True)
class OffTargetResult:
    """Information obtained by running a single primer pair through the off-target detector.

    Attributes:
        primer_pair: the primer pair submitted
        passes: True if neither primer exceeds the specified maximum number of hits, and the primer
            pair would generate an acceptable number of amplicons in the reference genome (i.e.
            `min_primer_pair_hits <= num_amplicons <= max_primer_pair_hits`). False otherwise.
        cached: True if this result is part of a cache, False otherwise.  This is useful for testing
        spans: The set of mappings of the primer pair to the genome (i.e., the region spanned by the
            inferred amplicon). This list will be empty if the generating [[OffTargetDetector]] was
            constructed with `keep_spans=False`.
        left_primer_spans: the list of mappings for the left primer, independent of the pair
            mappings, or an empty list
        right_primer_spans: the list of mappings for the right primer, independent of the pair
            mappings, or an empty list
    """

    primer_pair: PrimerPair
    passes: bool
    cached: bool = False
    spans: list[Span] = field(default_factory=list)
    left_primer_spans: list[Span] = field(default_factory=list)
    right_primer_spans: list[Span] = field(default_factory=list)