#!/usr/bin/env python3
# -*- coding: utf-8 -*-
import hashlib
import base64
from seguid._manip import reverse
from seguid._manip import rotate
from seguid._manip import rotate_to_min
from seguid._tables import tablefactory
from seguid._asserts import assert_in_alphabet
from seguid._asserts import assert_complementary
# Prefixes prepended to the long-form checksum by the public function of the
# same name (seguid(), lsseguid(), csseguid(), ldseguid(), cdseguid()).
seguid_prefix: str = "seguid="
lsseguid_prefix: str = "lsseguid="
csseguid_prefix: str = "csseguid="
ldseguid_prefix: str = "ldseguid="
cdseguid_prefix: str = "cdseguid="
# Every character that may appear in a checksum: the standard Base64 alphabet
# ("+" and "/") together with the Base64url replacements ("-" and "_").
# The padding symbol "=" is deliberately absent; _seguid() strips it and then
# checks its result against this set.
b64alphabet = set("ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789+/_-")
# Number of leading checksum characters kept in the short form (see _form()).
short = 6
def _seguid(
    seq: str,
    alphabet: str = "{DNA}",
    encoding: callable = base64.standard_b64encode,
) -> str:
    """Return the bare (un-prefixed) SEGUID checksum of ``seq``.

    The checksum is the SHA-1 digest of the ASCII-encoded sequence, passed
    through ``encoding`` and stripped of trailing ``=`` padding.

    Parameters
    ----------
    seq : str
        Non-empty sequence to checksum. Every symbol must be a key of the
        table produced by ``tablefactory(alphabet)``.
    alphabet : str
        Alphabet specification handed to ``tablefactory``.
    encoding : callable
        Bytes-to-bytes encoder applied to the digest, e.g.
        ``base64.standard_b64encode`` or ``base64.urlsafe_b64encode``.

    Returns
    -------
    str
        The 27-character checksum without prefix and without padding.

    Raises
    ------
    AssertionError
        If ``encoding`` is not callable, ``seq`` is empty, or the resulting
        checksum fails the length/alphabet sanity checks.
    """
    assert callable(encoding)
    assert seq, "A sequence must not be empty"
    allowed_symbols = set(tablefactory(alphabet).keys())
    assert_in_alphabet(seq, alphabet=allowed_symbols)

    digest = hashlib.sha1(seq.encode("ASCII")).digest()
    checksum = encoding(digest).decode("ASCII").rstrip("=")

    # Sanity checks: a 20-byte SHA-1 digest always encodes to 27 Base64
    # characters once the single padding symbol has been removed.
    assert len(checksum) == 27
    assert set(checksum).issubset(b64alphabet)
    return checksum
def _form(prefix, csum, form):
    """Format a bare checksum as its long form, short form, or both.

    Parameters
    ----------
    prefix : str
        Checksum-type prefix, e.g. ``"lsseguid="``.
    csum : str
        The bare checksum, without prefix.
    form : str
        One of ``"long"``, ``"short"`` or ``"both"``.

    Returns
    -------
    str or tuple of str
        * ``"long"``  -- ``prefix + csum``
        * ``"short"`` -- the first ``short`` characters of ``csum``
        * ``"both"``  -- the tuple ``(short form, long form)``

    Raises
    ------
    ValueError
        If ``form`` is not one of the supported values. An unsupported value
        used to fall through and silently return ``None``.
    """
    if form == "long":
        return prefix + csum
    if form == "short":
        return csum[:short]
    if form == "both":
        return csum[:short], prefix + csum
    raise ValueError(
        f"form must be one of 'long', 'short' or 'both', not {form!r}"
    )
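# NOTE: Alternative implementation of _form(), kept for reference only and not
# used. Unlike the active version it always returns a single string: for
# form="both" (or any value other than "long"/"short") the short and long
# forms are joined by one space instead of being returned as a tuple.
#
# def _form(prefix, csum, form):
#     longform = ""
#     shortform = ""
#     if form != "short":
#         longform = prefix + csum
#     if form != "long":
#         shortform = csum[:short]
#     return " ".join((shortform, longform)).strip()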
[docs]
def seguid(seq: str, alphabet: str = "{DNA}", form: str = "long") -> str:
    r"""SEGUID v1 checksum for linear protein or single-stranded DNA.

    .. warning::
        ``seguid()`` (obsolete) is superseded by :func:`lsseguid()` (recommended).

    Given a nucleotide or amino-acid sequence ``seq`` in uppercase, the function returns
    a string containing the **SE**\ quence **G**\ lobally **U**\ nique **ID**\ entifier (**SEGUID**\ ).
    The SEGUID is defined as the Base64 encoded SHA1 checksum calculated for the sequence
    in uppercase with the trailing padding symbol (``=``) removed.

    The original definition of the SEGUID v1 checksum algorithm (Babnigg & Giometti, 2006)
    included transformation to uppercase before calculating the checksum.
    Here, ``seguid()`` does *not* coerce the input sequence to upper case. If your input sequence
    has lower-case symbols, you can use :meth:`str.upper` to achieve what the original method does.

    ``seguid()`` only accepts symbols as specified by the `alphabet` argument.
    Thus, our implementation is more conservative, which has the benefit of
    lowering the risk of passing the incorrect sequence by mistake.

    The resulting checksum string may contain forward slash (``/``) and plus-sign (``+``) symbols.
    These characters cannot be a part of a Uniform Resource Locator (URL) or a filename on
    some operating systems. The SEGUID v2 checksum produced by :func:`lsseguid()` is similar to the
    SEGUID v1 checksum by ``seguid()``, but uses the Base64url encoding that does not produce
    these characters.

    The checksum is prefixed with ``seguid=``.

    Parameters
    ----------
    seq : str
        The sequence to checksum. Must not be empty.
    alphabet : str
        Alphabet specification that the symbols of ``seq`` are validated
        against, e.g. ``"{DNA}"`` (default) or ``"{protein}"``.
    form : str
        ``"long"`` (default) returns the prefixed checksum, ``"short"``
        returns only the first six characters of the checksum (no prefix),
        and ``"both"`` returns the tuple ``(short, long)``.

    Returns
    -------
    str
        The checksum in the requested form (a tuple of two strings when
        ``form="both"``).

    Examples
    --------
    >>> seguid("AT")
    'seguid=Ax/RG6hzSrMEEWoCO1IWMGska+4'
    """
    # The docstring is a raw string because the reST markup uses "\ "
    # (backslash-space), which is an invalid escape in a normal string literal.
    return _form(
        seguid_prefix,
        _seguid(seq, alphabet=alphabet, encoding=base64.standard_b64encode),
        form,
    )
[docs]
def lsseguid(seq: str, alphabet: str = "{DNA}", form: str = "long") -> str:
    """SEGUID checksum for linear single-stranded DNA.

    Computed exactly like :func:`seguid()`, but the digest is encoded with
    the Base64url alphabet of RFC 4648: underscores (``_``) take the place of
    forward slashes (``/``) and minus signs (``-``) take the place of plus
    signs (``+``).

    The checksum applies to linear single-stranded DNA sequences and to
    protein sequences, among others. For protein sequences, pass
    ``"{protein}"`` or ``"{protein-extended}"`` as the alphabet argument.

    The checksum is prefixed with ``lsseguid=``.

    Examples
    --------
    >>> lsseguid("AT")
    'lsseguid=Ax_RG6hzSrMEEWoCO1IWMGska-4'
    """
    checksum = _seguid(seq, alphabet=alphabet, encoding=base64.urlsafe_b64encode)
    return _form(lsseguid_prefix, checksum, form)
[docs]
def csseguid(seq: str, alphabet: str = "{DNA}", form: str = "long") -> str:
    r"""SEGUID checksum for circular single-stranded DNA.

    ``csseguid()`` is the :func:`lsseguid()` checksum of the
    lexicographically smallest string rotation of ``seq``, so every rotation
    of the same circular sequence yields the same checksum.

    This checksum is only defined for circular single-stranded sequences.

    The checksum is prefixed with ``csseguid=``.

    Examples
    --------
    >>> csseguid("ATTT")
    'csseguid=ot6JPLeAeMmfztW1736Kc6DAqlo'

    >>> lsseguid("ATTT")
    'lsseguid=ot6JPLeAeMmfztW1736Kc6DAqlo'

    >>> csseguid("TTTA")
    'csseguid=ot6JPLeAeMmfztW1736Kc6DAqlo'

    >>> lsseguid("TTTA")
    'lsseguid=8zCvKwyQAEsbPtC4yTV-pY0H93Q'
    """
    smallest_rotation = rotate_to_min(seq)
    checksum = _seguid(
        smallest_rotation, alphabet=alphabet, encoding=base64.urlsafe_b64encode
    )
    return _form(csseguid_prefix, checksum, form)
[docs]
def ldseguid(
    watson: str, crick: str, alphabet: str = "{DNA}", form: str = "long"
) -> str:
    r"""SEGUID checksum for linear double-stranded DNA.

    Calculates the SEGUID checksum for a double-stranded DNA (dsDNA)
    sequence given as two strings: the upper (Watson) strand and the
    complementary (Crick) strand. Both strands must have the same length.

    Optional single-stranded regions at the ends are indicated by a dash
    (``-``) in either strand.

    The algorithm picks the lexicographically smaller of the Watson and
    Crick strands, joins the two strings 5'-3' with the smaller one first and
    a semicolon (``;``) in between, and applies the :func:`lsseguid()`
    calculation to the resulting string.

    For example, consider the linear dsDNA sequence defined by
    ``watson="-TATGCC"`` and ``crick="-GCATAC"`` as in:

    ::

        dsDNA        SEGUID checksum
        -TATGCC      ldseguid=rr65d6AYuP-CdMaVmdw3L9FPt6I
         |||||
        CATACG-

        -GCATAC      ldseguid=rr65d6AYuP-CdMaVmdw3L9FPt6I
         |||||
        CCGTAT-

    The SEGUID algorithm identifies ``"-GCATAC"`` as the lexicographically
    smaller strand and concatenates the two as:

    ::

        "-GCATAC" + ";" + "-TATGCC"

    The final checksum is calculated from that string.

    The checksum is prefixed with ``ldseguid=``.

    Examples
    --------
    >>> ldseguid("-TATGCC", "-GCATAC")
    'ldseguid=rr65d6AYuP-CdMaVmdw3L9FPt6I'

    >>> ldseguid("-GCATAC", "-TATGCC")
    'ldseguid=rr65d6AYuP-CdMaVmdw3L9FPt6I'
    """
    assert watson, "Watson sequence must not be empty"
    assert crick, "Crick sequence must not be empty"
    assert len(watson) == len(crick)
    assert_complementary(watson, crick, alphabet=alphabet)

    # A table whose values are all identical cannot describe base pairing.
    symbol_table = tablefactory(alphabet)
    distinct_values = set(symbol_table.values())
    assert len(distinct_values) > 1, "Was a single-stranded alphabet used by mistake?"

    # The joined string also contains "-" and ";", so extend the alphabet
    # specification to let those two symbols pass validation in _seguid().
    exalphabet = alphabet + ",--,;;"

    # Smaller strand first, so swapping the two arguments gives the same result.
    first, second = (watson, crick) if watson < crick else (crick, watson)
    spec = ";".join((first, second))

    checksum = _seguid(spec, alphabet=exalphabet, encoding=base64.urlsafe_b64encode)
    return _form(ldseguid_prefix, checksum, form)
[docs]
def cdseguid(
    watson: str, crick: str, alphabet: str = "{DNA}", form: str = "long"
) -> str:
    """SEGUID checksum for circular double-stranded DNA.

    ``cdseguid()`` is the :func:`lsseguid()` checksum calculated for the
    lexicographically smallest string rotation of a double-stranded DNA
    sequence. It is only defined for circular sequences.

    Each strand is rotated to its minimum; the smaller of the two rotated
    strands becomes the reference strand, the other strand is rotated by the
    opposite amount, and the pair is then checksummed with
    :func:`ldseguid()`.

    The checksum is prefixed with ``cdseguid=``.
    """
    from seguid._config import _min_rotation

    assert watson, "Watson sequence must not be empty"
    assert crick, "Crick sequence must not be empty"
    assert len(watson) == len(crick)
    assert_complementary(watson, crick, alphabet=alphabet)

    # NOTE(review): _min_rotation() presumably returns the rotation amount that
    # yields the lexicographically smallest rotation -- confirm in _config.
    watson_shift = _min_rotation(watson)
    watson_rotated = rotate(watson, amount=watson_shift)
    crick_shift = _min_rotation(crick)
    crick_rotated = rotate(crick, amount=crick_shift)

    # Keep the "minimum" of the two variants; the partner strand is rotated
    # by the negated amount of the strand that was chosen.
    if watson_rotated < crick_rotated:
        top = watson_rotated
        bottom = rotate(crick, amount=-watson_shift)
    else:
        top = crick_rotated
        bottom = rotate(watson, amount=-crick_shift)

    # Reuse ldseguid() for the checksum itself, then swap its prefix for ours.
    linear = ldseguid(watson=top, crick=bottom, alphabet=alphabet, form="long")
    bare_checksum = linear[len(ldseguid_prefix) :]
    return _form(cdseguid_prefix, bare_checksum, form)