Spaces:
No application file
No application file
| # Copyright 2002 by Yves Bastide and Brad Chapman. | |
| # Copyright 2007 by Sebastian Bassi | |
| # All rights reserved. | |
| # This file is part of the Biopython distribution and governed by your | |
| # choice of the "Biopython License Agreement" or the "BSD 3-Clause License". | |
| # Please see the LICENSE file that should have been included as part of this | |
| # package. | |
| """Functions to calculate assorted sequence checksums.""" | |
| # crc32, crc64, gcg, and seguid | |
| # crc64 is adapted from BioPerl | |
| import binascii | |
def crc32(seq):
    """Return the crc32 checksum for a sequence (string or Seq object).

    The input is converted to bytes and handed to ``binascii.crc32``.

    Note that the case is important:

    >>> crc32("ACGTACGTACGT")
    20049947
    >>> crc32("acgtACGTacgt")
    1688586483

    """
    try:
        # Works for Seq objects and anything else bytes() can convert
        data = bytes(seq)
    except TypeError:
        # bytes() refuses a plain string without an encoding; encode it
        data = seq.encode()
    checksum = binascii.crc32(data)
    return checksum
| def _init_table_h(): | |
| _table_h = [] | |
| for i in range(256): | |
| part_l = i | |
| part_h = 0 | |
| for j in range(8): | |
| rflag = part_l & 1 | |
| part_l >>= 1 | |
| if part_h & 1: | |
| part_l |= 1 << 31 | |
| part_h >>= 1 | |
| if rflag: | |
| part_h ^= 0xD8000000 | |
| _table_h.append(part_h) | |
| return _table_h | |
# Initialisation: build the lookup table once at import time; crc64() indexes
# into it for every character it processes.
_table_h = _init_table_h()
def crc64(s):
    """Return the crc64 checksum for a sequence (string, bytes or Seq object).

    The checksum is kept as two 32-bit halves that are updated once per
    character using the module-level ``_table_h`` lookup table, and is
    returned as the string ``"CRC-"`` followed by 16 upper-case hex digits.

    Items that are already integers (as produced by iterating over bytes
    or bytearray objects) are used directly; anything else is passed
    through ``ord``, so strings and Seq objects behave as before.

    Note that the case is important:

    >>> crc64("ACGTACGTACGT")
    'CRC-C4FBB762C4A87EBD'
    >>> crc64("acgtACGTacgt")
    'CRC-DA4509DC64A87EBD'
    >>> crc64(b"ACGTACGTACGT")
    'CRC-C4FBB762C4A87EBD'

    """
    crcl = 0  # low 32 bits of the running checksum
    crch = 0  # high 32 bits of the running checksum
    for c in s:
        # Iterating over bytes/bytearray yields ints, which ord() rejects
        code = c if isinstance(c, int) else ord(c)
        # The low byte of the high half moves into the top byte of the low half
        shr = (crch & 0xFF) << 24
        temp1h = crch >> 8
        temp1l = (crcl >> 8) | shr
        # Table index uses the current (pre-shift) low byte
        idx = (crcl ^ code) & 0xFF
        crch = temp1h ^ _table_h[idx]
        crcl = temp1l
    return f"CRC-{crch:08X}{crcl:08X}"
def gcg(seq):
    """Return the GCG checksum (int) for a sequence (string, bytes or Seq object).

    Given a nucleotide or amino-acid sequence (or any string),
    returns the GCG checksum (int). Checksum used by GCG program.

    Each character code is multiplied by a position weight that runs from
    1 to 57 and then starts again at 1; the sum of these products modulo
    10000 is returned. An empty sequence gives 0.

    Items that are already integers (as produced by iterating over bytes
    or bytearray objects) have ASCII lower-case letters folded to upper
    case; all other items are upper-cased with ``str.upper``.

    Based on BioPerl GCG_checksum. Adapted by Sebastian Bassi
    with the help of John Lenton, Pablo Ziliani, and Gabriel Genellina.

    All sequences are converted to uppercase.

    >>> gcg("ACGTACGTACGT")
    5688
    >>> gcg("acgtACGTacgt")
    5688
    >>> gcg(b"acgtACGTacgt")
    5688

    """
    checksum = 0
    for position, char in enumerate(seq):
        if isinstance(char, int):
            # bytes-like input yields ints: fold ASCII a-z onto A-Z
            code = char - 32 if 97 <= char <= 122 else char
        else:
            code = ord(char.upper())
        # Weights cycle 1, 2, ..., 57, 1, 2, ...
        weight = position % 57 + 1
        checksum += weight * code
    return checksum % 10000
def seguid(seq):
    """Return the SEGUID (string) for a sequence (string or Seq object).

    Given a nucleotide or amino-acid sequence (or any string),
    returns the SEGUID string (A SEquence Globally Unique IDentifier).

    The sequence is converted to bytes, upper-cased, hashed with SHA-1,
    and the digest is base64 encoded with the trailing ``=`` padding
    removed.

    Note that the case is not important:

    >>> seguid("ACGTACGTACGT")
    'If6HIvcnRSQDVNiAoefAzySc6i4'
    >>> seguid("acgtACGTacgt")
    'If6HIvcnRSQDVNiAoefAzySc6i4'

    For more information about SEGUID, see:
    http://bioinformatics.anl.gov/seguid/
    https://doi.org/10.1002/pmic.200600032

    """
    import base64
    import hashlib

    try:
        # Works for Seq objects and anything else bytes() can convert
        raw = bytes(seq)
    except TypeError:
        # bytes() refuses a plain string without an encoding; encode it
        raw = seq.encode()
    digest = hashlib.sha1(raw.upper()).digest()
    # b64encode emits no line breaks, so only the padding needs stripping
    encoded = base64.b64encode(digest).decode()
    return encoded.rstrip("=")
if __name__ == "__main__":
    # Only when this file is executed as a script: hand over to the shared
    # Biopython helper (presumably it runs the docstring examples above as
    # doctests -- see Bio._utils.run_doctest to confirm).
    from Bio._utils import run_doctest

    run_doctest()