File size: 5,984 Bytes
"""Tests for the sequence analysis core."""
import math
import pytest
from core.analysis.gc_content import gc_percent, gc_sliding_window, gc_by_codon_position
from core.analysis.cai import calculate_cai, codon_usage_report
from core.analysis.homopolymers import detect_homopolymers
from core.analysis.restriction_sites import scan_restriction_sites, sites_present
from core.analysis.kozak import check_kozak, find_all_kozak_contexts
# ── GC content ────────────────────────────────────────────────────────────────
class TestGCContent:
    """Tests for ``gc_percent``, ``gc_sliding_window`` and ``gc_by_codon_position``.

    All inputs are short, hand-checkable sequences.
    """

    def test_pure_gc(self):
        """A sequence of only G and C gives 100.0."""
        assert gc_percent("GCGCGC") == pytest.approx(100.0)

    def test_pure_at(self):
        """A sequence of only A and T gives 0.0."""
        assert gc_percent("ATATAT") == pytest.approx(0.0)

    def test_fifty_percent(self):
        """Two of the four bases in ATGC are G/C, so the result is 50.0."""
        assert gc_percent("ATGC") == pytest.approx(50.0)

    def test_empty_sequence(self):
        """The empty string must give exactly 0.0."""
        assert gc_percent("") == 0.0

    def test_sliding_window_shape(self):
        """Positions and values are parallel, one entry per window."""
        seq = "ATGC" * 50  # 200 nt
        positions, values = gc_sliding_window(seq, window=100, step=1)
        assert len(positions) == len(values)
        assert len(positions) == 200 - 100 + 1  # 101 windows

    def test_sliding_window_all_gc(self):
        """Every window over a pure-GC sequence is 100.0."""
        seq = "GC" * 100
        _, values = gc_sliding_window(seq, window=50, step=10)
        # all() is True for an empty iterable, so without this guard the test
        # would pass even if no windows were produced at all.
        assert len(values) > 0
        assert all(v == pytest.approx(100.0) for v in values)

    def test_gc_by_codon_position(self):
        """The result exposes one entry per codon position."""
        # ATG CCC TAA, read by codon position:
        #   position 1: A, C, T
        #   position 2: T, C, A
        #   position 3: G, C, A
        cds = "ATGCCCTAA"
        result = gc_by_codon_position(cds)
        assert "GC1" in result
        assert "GC2" in result
        assert "GC3" in result

    def test_gc_by_codon_not_divisible(self):
        """A length that is not a multiple of 3 raises ValueError."""
        with pytest.raises(ValueError):
            gc_by_codon_position("ATGC")
# ── CAI ───────────────────────────────────────────────────────────────────────
class TestCAI:
    """Tests for ``calculate_cai`` and ``codon_usage_report``."""

    # Short CDS meant to use the preferred human codon at every position:
    # ATG (Met=1.0) + TTC (Phe=1.0) + CTG (Leu=1.0) + TAA (stop, skipped)
    _GOOD_CDS = "ATGTTCCTGTAA"

    def test_cai_range(self):
        """The human CAI lies in the closed interval [0, 1]."""
        score = calculate_cai(self._GOOD_CDS, organism="human")
        assert 0.0 <= score <= 1.0

    def test_cai_ecoli(self):
        """The E. coli CAI lies in the closed interval [0, 1]."""
        score = calculate_cai(self._GOOD_CDS, organism="ecoli")
        assert 0.0 <= score <= 1.0

    def test_cai_unknown_organism(self):
        """An unrecognised organism name raises ValueError."""
        with pytest.raises(ValueError):
            calculate_cai(self._GOOD_CDS, organism="martian")

    def test_cai_non_divisible(self):
        """A CDS whose length is not a multiple of 3 raises ValueError."""
        truncated_cds = "ATGTTCA"  # 7 nt, not divisible by 3
        with pytest.raises(ValueError):
            calculate_cai(truncated_cds, organism="human")

    def test_codon_usage_report(self):
        """Each of the three distinct codons is counted once."""
        counts = codon_usage_report("ATGTTCCTG")
        for codon in ("ATG", "TTC", "CTG"):
            assert counts[codon] == 1
# ── Homopolymers ──────────────────────────────────────────────────────────────
class TestHomopolymers:
    """Tests for ``detect_homopolymers``."""

    def test_detect_poly_a(self):
        """A single run of six A's is found when min_run=5."""
        # ATGC + AAAAAA + TGC: the A run occupies 0-based positions 4-9.
        found = detect_homopolymers("ATGCAAAAAATGC", min_run=5)
        assert len(found) == 1
        run = found[0]
        assert run.nucleotide == "A"
        assert run.length == 6

    def test_below_threshold(self):
        """A run of four A's is not reported when min_run=5."""
        found = detect_homopolymers("ATGCAAAATGC", min_run=5)
        assert len(found) == 0

    def test_multiple_runs(self):
        """Two adjacent runs of different bases are both reported."""
        found = detect_homopolymers("AAAAAGGGGG", min_run=5)
        assert len(found) == 2
        assert {run.nucleotide for run in found} == {"A", "G"}

    def test_empty_sequence(self):
        """An empty sequence gives an empty list."""
        found = detect_homopolymers("", min_run=5)
        assert found == []
# ── Restriction sites ─────────────────────────────────────────────────────────
class TestRestrictionSites:
    """Tests for ``scan_restriction_sites`` and ``sites_present``."""

    def test_ecori_present(self):
        """An embedded GAATTC is reported for EcoRI with a '+' strand hit."""
        template = "NNNNGAATTCNNNN"  # EcoRI site flanked by N's
        found = scan_restriction_sites(template, ["EcoRI"])
        assert "EcoRI" in found
        ecori_hits = found["EcoRI"]
        assert any(hit.strand == "+" for hit in ecori_hits)

    def test_site_absent(self):
        """An enzyme with no match in the sequence has no key in the result."""
        found = scan_restriction_sites("ATGCATGCATGC", ["EcoRI"])
        assert "EcoRI" not in found

    def test_reverse_complement(self):
        """EcoRI is still reported for its palindromic site."""
        # GAATTC is its own reverse complement, so the same input is used as
        # for the forward-strand case.
        template = "NNNNGAATTCNNNN"
        found = scan_restriction_sites(template, ["EcoRI"])
        assert "EcoRI" in found

    def test_sites_present_list(self):
        """Only the enzymes whose sites occur are listed as present."""
        template = "GAATTCGGATCC"  # EcoRI + BamHI
        enzymes = ["EcoRI", "BamHI", "NotI"]
        detected = sites_present(template, enzymes)
        assert "EcoRI" in detected
        assert "BamHI" in detected
        assert "NotI" not in detected
# ── Kozak ─────────────────────────────────────────────────────────────────────
class TestKozak:
    """Tests for ``check_kozak`` and ``find_all_kozak_contexts``."""

    def test_strong_kozak(self):
        """The ideal context is rated strong or adequate, with an optimal -3."""
        # Ideal context: GCC ACC ATG G
        context = check_kozak("NNNNGCCACCATGGCCC")
        acceptable = ("strong", "adequate")
        assert context.strength in acceptable
        assert context.has_optimal_r3  # A at -3

    def test_no_atg(self):
        """A sequence with no ATG raises ValueError."""
        with pytest.raises(ValueError):
            check_kozak("GCGCGCGCGC")

    def test_score_range(self):
        """The score lies in the closed interval [0, 1]."""
        context = check_kozak("ATGCCCATG")
        assert 0.0 <= context.score <= 1.0

    def test_find_all_kozak(self):
        """One context is returned per ATG in the sequence."""
        contexts = find_all_kozak_contexts("ATGNNATGATGN")
        assert len(contexts) == 3  # ATGs at 0-based offsets 0, 5 and 8
|