File size: 1,571 Bytes
6d2a502
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
"""Tests for tetranucleotide + codon-frequency features."""
from __future__ import annotations

from microbe_model.features.composition import codon_freqs, tetranucleotide_freqs


def test_tetranucleotide_freqs_sum_to_one() -> None:
    """Check that one contig yields 256 frequency entries summing to 1."""
    sequence = "ACGT" * 100  # 400 nt, i.e. 397 overlapping 4-mer windows
    freqs = tetranucleotide_freqs([("c1", sequence)])
    assert len(freqs) == 256
    # Compare with a tolerance: the values are floats accumulated by summation.
    assert abs(sum(freqs.values()) - 1.0) < 1e-6


def test_tetranucleotide_freqs_handles_n() -> None:
    """Check that no 4-mer with a nonzero frequency contains an ``N``."""
    freqs = tetranucleotide_freqs([("c1", "ACGNACGTACGT")])
    # Windows overlapping the N are expected to be skipped, leaving only
    # ACGT, CGTA, GTAC and TACG with a nonzero frequency.
    counted = [key for key, value in freqs.items() if value > 0]
    # NOTE(review): if the function only ever emits A/C/G/T keys this check
    # cannot fail; consider asserting the exact set of nonzero keys instead.
    assert not any("N" in key.removeprefix("tetra_") for key in counted)
    assert counted  # at least one N-free k-mer must have been counted


def test_tetranucleotide_freqs_empty() -> None:
    """Check that an empty contig list still yields 256 entries, all zero."""
    freqs = tetranucleotide_freqs([])
    assert len(freqs) == 256
    for value in freqs.values():
        assert value == 0.0


def test_codon_freqs_sum_to_one() -> None:
    """Check that codon frequencies have 64 entries, sum to 1, and ATG is 30/31."""
    n_atg = 30
    cds = "ATG" * n_atg + "TAA"  # 30 ATG codons followed by a single stop codon
    freqs = codon_freqs([cds])
    assert len(freqs) == 64
    assert abs(sum(freqs.values()) - 1.0) < 1e-6
    # ATG accounts for 30 of the 31 codons in the sequence.
    expected_atg = n_atg / (n_atg + 1)
    assert abs(freqs["codon_ATG"] - expected_atg) < 1e-6


def test_codon_freqs_skips_non_acgt() -> None:
    """Check that codons containing non-ACGT characters are not counted.

    The input reads as the codons ATG, NNN, ATG. With NNN skipped, ATG is the
    only codon counted and is expected to carry the whole frequency mass.
    """
    cds_list = ["ATGNNNATG"]  # the middle codon NNN should be skipped
    out = codon_freqs(cds_list)
    # Use a tolerance rather than ``==``: the frequencies are computed floats,
    # and the other tests in this module use the same 1e-6 tolerance.
    assert abs(out["codon_ATG"] - 1.0) < 1e-6  # both counted codons are ATG
    assert abs(sum(out.values()) - 1.0) < 1e-6