File size: 1,531 Bytes
52cf5ab
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
"""Smoke test the feature extractor on a tiny synthetic genome."""
from __future__ import annotations

import gzip
from pathlib import Path

from microbe_model.features.genome import extract_features


def _write_fake_genome(path: Path) -> None:
    """Write a tiny FASTA with two contigs of synthetic GC-balanced sequence."""
    contigs = [
        (">contig_1\n" + ("ATGCGTACGTAGCTAGCTAGCATGCGTACG" * 200) + "\n"),
        (">contig_2\n" + ("CGTACGATCGATCGTACGTAGCTACGATGC" * 200) + "\n"),
    ]
    with gzip.open(path, "wt") as fh:
        fh.write("".join(contigs))


def test_extract_features_runs(tmp_path: Path) -> None:
    """Run ``extract_features`` on a synthetic genome and sanity-check its output."""
    genome_path = tmp_path / "fake.fna.gz"
    _write_fake_genome(genome_path)

    features = extract_features(genome_path)

    assert features["genome_size_nt"] > 0
    gc_fraction = features["gc_content"]
    assert 0 <= gc_fraction <= 1
    assert features["n_contigs"] == 2
    # The synthetic sequence may have no real ORFs, so zero CDS is acceptable.
    assert features["n_predicted_cds"] >= 0

    # Amino-acid fractions must total ~1 when proteins were found, else 0.
    aa_fractions = [
        value for key, value in features.items() if key.startswith("aa_frac_")
    ]
    aa_total = sum(aa_fractions)
    assert aa_total == 0.0 or abs(aa_total - 1.0) < 1e-6


def test_isoelectric_point_in_range() -> None:
    """Check ``_isoelectric_point`` stays within 0-14 and ranks acidic below basic."""
    from microbe_model.features.genome import _isoelectric_point

    # Every homopolymer peptide must yield a value on the 0-14 scale.
    for peptide in ("AAAAA", "DDDDD", "KKKKK"):
        assert 0 <= _isoelectric_point(peptide) <= 14

    # An acidic peptide (poly-D) must have a lower pI than a basic one (poly-K).
    acidic_pi = _isoelectric_point("DDDDD")
    basic_pi = _isoelectric_point("KKKKK")
    assert acidic_pi < basic_pi