File size: 2,457 Bytes
0ed74db
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
"""Compositional features: k-mer frequencies and codon usage.

These supplement the v0 amino-acid-composition features in `genome.py`. They are
computed on the same predicted-CDS set, so adding them to a v1 featurize run is
~free in network/CPU terms.

Two feature groups:
  - tetranucleotide frequencies (256 dims) — well-known signal for thermophily,
    halophily, and phylum-level taxonomy
  - codon usage frequencies (64 dims) — informs translation efficiency, GC bias,
    and growth rate phenotype

We use them as relative frequencies (sum to 1 across each group) rather than
counts, so they're scale-invariant across genome sizes.
"""
from __future__ import annotations

from collections import Counter
from collections.abc import Iterable

# Canonical DNA alphabet. Its character order determines the ordering of the
# two derived lists below (first position varies slowest, last varies fastest).
NUCLEOTIDES = "ACGT"
# All 64 triplets over NUCLEOTIDES.
CODONS = [x + y + z for x in NUCLEOTIDES for y in NUCLEOTIDES for z in NUCLEOTIDES]
# All 256 4-mers over NUCLEOTIDES. Extending every triplet by one trailing base
# enumerates them in the same order as four nested loops over the alphabet.
TETRA_KMERS = [triplet + base for triplet in CODONS for base in NUCLEOTIDES]


def tetranucleotide_freqs(contigs: Iterable[tuple[str, str]]) -> dict[str, float]:
    """Compute each ACGT 4-mer's share of all valid 4-mers found in `contigs`.

    Every sequence is upper-cased and scanned with a sliding window of width 4
    and step 1. A window that is not one of the 256 pure-ACGT 4-mers (for
    example, any window touching an ``N``) is excluded from both the per-k-mer
    counts and the denominator.

    Args:
        contigs: two-element tuples whose second element is the nucleotide
            sequence. The first element is ignored.

    Returns:
        A dict with one ``tetra_<KMER>`` key per entry of ``TETRA_KMERS``, in
        that order. Values are relative frequencies (summing to 1 up to
        floating-point rounding) when at least one valid 4-mer was seen, and
        all ``0.0`` otherwise.
    """
    tally: Counter[str] = Counter()
    for _, sequence in contigs:
        upper = sequence.upper()
        # Sequences shorter than 4 give an empty range, hence no windows.
        windows = (upper[start : start + 4] for start in range(len(upper) - 3))
        tally.update(window for window in windows if window in TETRA_KMERS_SET)

    # Only valid windows were tallied, so the tally's sum is the denominator.
    n_valid = sum(tally.values())
    if not n_valid:
        return dict.fromkeys((f"tetra_{kmer}" for kmer in TETRA_KMERS), 0.0)
    return {f"tetra_{kmer}": tally[kmer] / n_valid for kmer in TETRA_KMERS}


def codon_freqs(cds_nucleotides: Iterable[str]) -> dict[str, float]:
    """Compute each codon's share of all valid codons across the given CDS.

    Each CDS string is upper-cased and read in non-overlapping triplets
    starting at offset 0. Leftover bases at the end that do not form a full
    triplet are ignored. A triplet that is not one of the 64 pure-ACGT codons
    (for example, one containing ``N``) is excluded from both the per-codon
    counts and the denominator.

    Args:
        cds_nucleotides: nucleotide CDS strings. Callers are expected to pass
            in-frame sequences (length a multiple of 3, ATG start); neither
            property is checked here.

    Returns:
        A dict with one ``codon_<CODON>`` key per entry of ``CODONS``, in that
        order. Values are relative frequencies (summing to 1 up to
        floating-point rounding) when at least one valid codon was seen, and
        all ``0.0`` otherwise.
    """
    tally: Counter[str] = Counter()
    for cds in cds_nucleotides:
        upper = cds.upper()
        # Stopping at len - 2 guarantees every slice holds three characters.
        triplets = (upper[pos : pos + 3] for pos in range(0, len(upper) - 2, 3))
        tally.update(triplet for triplet in triplets if triplet in CODONS_SET)

    # Only valid triplets were tallied, so the tally's sum is the denominator.
    n_valid = sum(tally.values())
    if not n_valid:
        return dict.fromkeys((f"codon_{codon}" for codon in CODONS), 0.0)
    return {f"codon_{codon}": tally[codon] / n_valid for codon in CODONS}


# Set copies of the k-mer and codon lists, so the per-window membership tests
# in tetranucleotide_freqs / codon_freqs are hash lookups instead of list scans.
# Defining them below those functions is safe: the names are resolved when the
# functions are called, not when they are defined.
CODONS_SET = {*CODONS}
TETRA_KMERS_SET = {*TETRA_KMERS}