Source code for seguid._chksum

#!/usr/bin/env python3
# -*- coding: utf-8 -*-


import hashlib
import base64

from seguid._manip import reverse
from seguid._manip import rotate
from seguid._manip import rotate_to_min

from seguid._tables import tablefactory
from seguid._asserts import assert_in_alphabet
from seguid._asserts import assert_complementary

# Prefixes identifying the checksum type. Each public checksum function in
# this module hands its prefix to ``_form()``, which prepends it to the
# checksum when the long form is requested.
seguid_prefix: str = "seguid="
lsseguid_prefix: str = "lsseguid="
csseguid_prefix: str = "csseguid="
ldseguid_prefix: str = "ldseguid="
cdseguid_prefix: str = "cdseguid="
# Characters a finished checksum may consist of; ``_seguid()`` asserts its
# result against this set. It holds letters and digits plus the symbols of
# both the standard ("+", "/") and the URL-safe ("_", "-") Base64 variants.
b64alphabet = set("ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789+/_-")
# Number of leading checksum characters that make up the short form
# (see ``_form()``).
short = 6


def _seguid(
    seq: str,
    alphabet: str = "{DNA}",
    encoding: callable = base64.standard_b64encode,
) -> str:
    assert callable(encoding)
    assert seq, "A sequence must not be empty"
    assert_in_alphabet(seq, alphabet=set(tablefactory(alphabet).keys()))
    m = hashlib.sha1()
    m.update(seq.encode("ASCII"))
    hs = encoding(m.digest())
    csum = f"{hs.decode('ASCII').rstrip('=')}"
    assert len(csum) == 27
    assert set(csum).issubset(b64alphabet)
    return csum


def _form(prefix, csum, form):
    longform = ""
    shortform = ""
    if form == "both":
        return csum[:short], prefix + csum
    elif form == "long":
        return prefix + csum
    if form == "short":
        return csum[:short]


# def _form(prefix, csum, form):
#     longform = ""
#     shortform = ""
#     if form != "short":
#         longform = prefix + csum
#     if form != "long":
#         shortform = csum[:short]
#     return " ".join((shortform, longform)).strip()



[docs]
def seguid(seq: str, alphabet: str = "{DNA}", form: str = "long") -> str:
    """SEGUID v1 checksum for linear protein or single-stranded DNA.

    .. warning::
        ``seguid()`` (obsolete) is superseded by :func:`lsseguid()` (recommended).

    For a nucleotide or amino-acid sequence ``seq`` given in uppercase, this
    function returns the **SE**quence **G**lobally **U**nique **ID**entifier
    (**SEGUID**): the Base64 encoded SHA1 checksum of the sequence, with the
    trailing padding symbol (``=``) removed.

    The original SEGUID v1 algorithm (Babnigg & Giometti, 2006) converted the
    sequence to uppercase before calculating the checksum. ``seguid()`` does
    *not* coerce its input to upper case; apply :meth:`str.upper` first if
    your sequence has lower-case symbols and you want the original behaviour.
    Only symbols specified by the `alphabet` argument are accepted. This more
    conservative approach lowers the risk of passing an incorrect sequence by
    mistake.

    The checksum may contain forward slash (``/``) and plus-sign (``+``)
    symbols, which cannot be part of a Uniform Resource Locator (URL) or of a
    filename on some operating systems. The SEGUID v2 checksum produced by
    :func:`lsseguid()` is similar, but uses the Base64url encoding, which
    does not produce these characters.

    The checksum is prefixed with ``seguid=``.

    Parameters
    ----------
    seq
        The sequence to calculate the checksum for. Must not be empty.
    alphabet
        Specification of the set of symbols allowed in ``seq``.
    form
        ``"long"`` returns the prefixed checksum, ``"short"`` returns only
        the first six characters of the checksum (without prefix), and
        ``"both"`` returns the tuple ``(short, long)``.

    Examples
    --------
    >>> seguid("AT")
    'seguid=Ax/RG6hzSrMEEWoCO1IWMGska+4'

    """
    # SEGUID v1 uses the standard Base64 alphabet (with "+" and "/").
    checksum = _seguid(seq, alphabet=alphabet, encoding=base64.standard_b64encode)
    return _form(seguid_prefix, checksum, form)




[docs]
def lsseguid(seq: str, alphabet: str = "{DNA}", form: str = "long") -> str:
    """SEGUID checksum for linear single-stranded DNA.

    Identical to :func:`seguid()` except that forward slashes (``/``) and
    plus signs (``+``) in the resulting checksum are replaced by underscores
    (``_``) and minus signs (``-``), respectively, following the Base64url
    standard in RFC 4648.

    The checksum applies to linear single-stranded DNA sequences and protein
    sequences, among other sequences. When protein sequences are analyzed,
    the alphabet argument should be ``"{protein}"`` or
    ``"{protein-extended}"``.

    The checksum is prefixed with ``lsseguid=``.

    Parameters
    ----------
    seq
        The sequence to calculate the checksum for. Must not be empty.
    alphabet
        Specification of the set of symbols allowed in ``seq``.
    form
        ``"long"`` returns the prefixed checksum, ``"short"`` returns only
        the first six characters of the checksum (without prefix), and
        ``"both"`` returns the tuple ``(short, long)``.

    Examples
    --------
    >>> lsseguid("AT")
    'lsseguid=Ax_RG6hzSrMEEWoCO1IWMGska-4'

    """
    # URL-safe Base64 is what distinguishes this from the v1 ``seguid()``.
    checksum = _seguid(seq, alphabet=alphabet, encoding=base64.urlsafe_b64encode)
    return _form(lsseguid_prefix, checksum, form)




[docs]
def csseguid(seq: str, alphabet: str = "{DNA}", form: str = "long") -> str:
    r"""SEGUID checksum for circular single-stranded DNA.

    ``csseguid()`` is the :func:`lsseguid()` checksum calculated for the
    lexicographically smallest string rotation of ``seq``. Consequently all
    rotations of the same circular sequence share one checksum. This
    checksum is only defined for circular single-stranded sequences.

    The checksum is prefixed with ``csseguid=``.

    Parameters
    ----------
    seq
        The circular sequence, given at any rotation. Must not be empty.
    alphabet
        Specification of the set of symbols allowed in ``seq``.
    form
        ``"long"`` returns the prefixed checksum, ``"short"`` returns only
        the first six characters of the checksum (without prefix), and
        ``"both"`` returns the tuple ``(short, long)``.

    Examples
    --------
    >>> csseguid("ATTT")
    'csseguid=ot6JPLeAeMmfztW1736Kc6DAqlo'
    >>> lsseguid("ATTT")
    'lsseguid=ot6JPLeAeMmfztW1736Kc6DAqlo'
    >>> csseguid("TTTA")
    'csseguid=ot6JPLeAeMmfztW1736Kc6DAqlo'
    >>> lsseguid("TTTA")
    'lsseguid=8zCvKwyQAEsbPtC4yTV-pY0H93Q'

    """
    # Normalise the rotation first so every rotation hashes identically.
    canonical = rotate_to_min(seq)
    checksum = _seguid(
        canonical, alphabet=alphabet, encoding=base64.urlsafe_b64encode
    )
    return _form(csseguid_prefix, checksum, form)




[docs]
def ldseguid(
    watson: str, crick: str, alphabet: str = "{DNA}", form: str = "long"
) -> str:
    r"""SEGUID checksum for linear double-stranded DNA.

    Calculates the SEGUID checksum for a double-stranded DNA (dsDNA)
    sequence defined by two strings: the upper (Watson) strand and the
    complementary (Crick) strand. Both strands must be of equal length.
    Optional single-stranded regions at the ends are indicated by a dash
    (``-``) in either strand.

    The algorithm first selects the lexicographically smallest of the Watson
    and Crick strands. The two strings are then joined 5'-3', separated by a
    semicolon (``;``), and the :func:`lsseguid()` algorithm is applied to the
    resulting string.

    For example, consider the linear dsDNA sequence defined by ``watson="-TATGCC"`` and ``crick="-GCATAC"`` as in:

    ::

        dsDNA    SEGUID checksum

        -TATGCC  ldseguid=rr65d6AYuP-CdMaVmdw3L9FPt6I
         |||||
        CATACG-

        -GCATAC  ldseguid=rr65d6AYuP-CdMaVmdw3L9FPt6I
         |||||
        CCGTAT-

    The SEGUID algorithm identifies the ``"-GCATAC"`` strand as the lexicographically smallest of the two. It then concatenates the two as:
    ::

        "-GCATAC" + ";" + "-TATGCC"

    and calculates the final checksum based on that sequence.

    The checksum is prefixed with ``ldseguid=``.

    Parameters
    ----------
    watson
        The upper strand. Must not be empty.
    crick
        The complementary strand. Must not be empty and must have the same
        length as ``watson``.
    alphabet
        Specification of the symbols and their complements; it must describe
        a double-stranded alphabet.
    form
        ``"long"`` returns the prefixed checksum, ``"short"`` returns only
        the first six characters of the checksum (without prefix), and
        ``"both"`` returns the tuple ``(short, long)``.

    Examples
    --------
    >>> ldseguid("-TATGCC", "-GCATAC")
    'ldseguid=rr65d6AYuP-CdMaVmdw3L9FPt6I'
    >>> ldseguid("-GCATAC", "-TATGCC")
    'ldseguid=rr65d6AYuP-CdMaVmdw3L9FPt6I'

    """
    assert watson, "Watson sequence must not be empty"
    assert crick, "Crick sequence must not be empty"
    assert len(watson) == len(crick)
    assert_complementary(watson, crick, alphabet=alphabet)

    table = tablefactory(alphabet)
    distinct_targets = set(table.values())
    assert len(distinct_targets) > 1, "Was a single-stranded alphabet used by mistake?"

    # Extend the alphabet specification so the "-" and ";" symbols used in
    # the combined string pass the symbol check in _seguid() (presumably
    # parsed by tablefactory as extra symbol pairs -- confirm there).
    exalphabet = alphabet + ",--,;;"

    # The strictly smaller strand goes first; on a tie crick leads, which
    # yields the same string either way.
    first, second = (watson, crick) if watson < crick else (crick, watson)
    spec = ";".join((first, second))

    checksum = _seguid(spec, alphabet=exalphabet, encoding=base64.urlsafe_b64encode)
    return _form(ldseguid_prefix, checksum, form)




[docs]
def cdseguid(
    watson: str, crick: str, alphabet: str = "{DNA}", form: str = "long"
) -> str:
    """SEGUID checksum for circular double-stranded DNA.

    ``cdseguid()`` is the :func:`lsseguid()` checksum calculated for the
    lexicographically smallest string rotation of a double-stranded DNA
    sequence. It is only defined for circular sequences.

    Each strand is rotated to its lexicographically smallest rotation; the
    smaller of the two results becomes the leading strand, the other strand
    is rotated by the opposite amount, and the pair is passed to
    :func:`ldseguid()`.

    The checksum is prefixed with ``cdseguid=``.

    Parameters
    ----------
    watson
        The upper strand. Must not be empty.
    crick
        The complementary strand. Must not be empty and must have the same
        length as ``watson``.
    alphabet
        Specification of the symbols and their complements.
    form
        ``"long"`` returns the prefixed checksum, ``"short"`` returns only
        the first six characters of the checksum (without prefix), and
        ``"both"`` returns the tuple ``(short, long)``.
    """
    from seguid._config import _min_rotation

    assert watson, "Watson sequence must not be empty"
    assert crick, "Crick sequence must not be empty"
    assert len(watson) == len(crick)
    assert_complementary(watson, crick, alphabet=alphabet)

    shift_w = _min_rotation(watson)
    smallest_w = rotate(watson, amount=shift_w)
    shift_c = _min_rotation(crick)
    smallest_c = rotate(crick, amount=shift_c)

    # Lead with whichever minimal rotation is strictly smaller (crick wins a
    # tie). The partner strand is rotated by the negated amount -- presumably
    # to keep the strands aligned, since each strand is written in its own
    # 5'-3' direction.
    if smallest_w < smallest_c:
        leading = smallest_w
        partner = rotate(crick, amount=-shift_w)
    else:
        leading = smallest_c
        partner = rotate(watson, amount=-shift_c)

    # Reuse the linear algorithm, then swap its prefix for the circular one.
    prefixed = ldseguid(watson=leading, crick=partner, alphabet=alphabet, form="long")
    bare_checksum = prefixed[len(ldseguid_prefix) :]
    return _form(cdseguid_prefix, bare_checksum, form)
Source code for seguid._chksum

seguid

Navigation

Related Topics