offtargeteffect's picture
Add liability/QC, cluster & tree, and experiment tracking
bdd3f19 verified
Raw
History Blame Contribute Delete
12.1 kB
"""
SequenceAnalyzer — main analysis entry point.
Runs all analysis modules against an mRNASequence and returns a
structured AnalysisReport. Results are cached on the sequence object
so repeated calls are cheap.
"""
from __future__ import annotations
from dataclasses import dataclass, field
from typing import Any, Dict, List, Optional
import numpy as np
from core.models.sequence import mRNASequence
from core.analysis.gc_content import (
gc_percent,
gc_sliding_window,
gc_by_codon_position,
)
from core.analysis.cai import calculate_cai, codon_usage_report, CODON_TABLES
from core.analysis.homopolymers import detect_homopolymers, HomopolymerRun
from core.analysis.restriction_sites import (
scan_restriction_sites,
RestrictionSiteHit,
COMMON_ENZYMES,
)
from core.analysis.kozak import check_kozak, KozakResult
from core.analysis.structure import predict_structure, StructureResult
from core.analysis.uridine import analyze_uridine, UridineReport
from core.analysis.motifs import scan_motifs, MotifHit
from core.analysis.liability import assess_liabilities, LiabilityReport
@dataclass
class AnalysisReport:
    """Container for every result produced while analysing one mRNASequence.

    The three identity fields are required; every other field starts out at
    an "empty" default and is populated by the analyzer as each module runs.
    """

    # --- Identity of the analysed sequence ---
    sequence_id: str
    sequence_name: str
    sequence_length: int

    # --- GC content ---
    gc_percent_global: float = 0.0
    gc_sliding_positions: Optional[np.ndarray] = None  # window centre positions
    gc_sliding_values: Optional[np.ndarray] = None  # GC% of each window
    gc_by_codon_position: Optional[Dict[str, float]] = None

    # --- Codon Adaptation Index ---
    cai: Optional[float] = None
    cai_organism: Optional[str] = None
    codon_usage: Optional[Dict[str, int]] = None

    # --- Homopolymer runs ---
    homopolymer_runs: List[HomopolymerRun] = field(default_factory=list)
    homopolymer_count: int = 0
    longest_homopolymer: int = 0

    # --- Restriction sites ---
    restriction_hits: Dict[str, List[RestrictionSiteHit]] = field(default_factory=dict)
    restriction_enzymes_present: List[str] = field(default_factory=list)

    # --- Start / stop codon validation of the CDS ---
    has_start_codon: Optional[bool] = None
    has_stop_codon: Optional[bool] = None
    stop_codon: Optional[str] = None
    in_frame: Optional[bool] = None

    # --- Kozak context ---
    kozak: Optional[KozakResult] = None

    # --- Secondary structure (ViennaRNA) ---
    structure: Optional[StructureResult] = None

    # --- Uridine content (immunogenicity proxy) ---
    uridine: Optional[UridineReport] = None

    # --- Sequence-liability motifs (uORF, premature polyA, ARE, splice donor) ---
    motif_hits: List[MotifHit] = field(default_factory=list)

    # --- Aggregated liability / QC assessment ---
    liability: Optional[LiabilityReport] = None

    # --- Errors / warnings collected while the analysis ran ---
    warnings: List[str] = field(default_factory=list)

    def to_dict(self) -> Dict[str, Any]:
        """Return a flat summary dictionary of the report.

        Only scalar/summary values are exported; the sliding-window arrays,
        codon usage table, individual homopolymer runs, restriction hits and
        motif hits are not part of the output. Values taken from the optional
        sub-reports (kozak, structure, uridine, liability) are ``None`` when
        the corresponding sub-report is missing. The global GC percentage is
        exported under both ``"gc_content"`` and ``"gc_percent_global"``.
        """

        def pick(component: Any, attribute: str) -> Any:
            # A missing (falsy) sub-report contributes None for its attribute.
            if component:
                return getattr(component, attribute)
            return None

        enzymes = self.restriction_enzymes_present

        summary: Dict[str, Any] = {}

        # Identity
        summary["sequence_id"] = self.sequence_id
        summary["sequence_name"] = self.sequence_name
        summary["sequence_length"] = self.sequence_length

        # GC content (same value under two keys)
        summary["gc_content"] = self.gc_percent_global
        summary["gc_percent_global"] = self.gc_percent_global
        summary["gc_by_codon_position"] = self.gc_by_codon_position

        # Codon adaptation
        summary["cai"] = self.cai
        summary["cai_organism"] = self.cai_organism

        # Homopolymers
        summary["homopolymer_count"] = self.homopolymer_count
        summary["longest_homopolymer"] = self.longest_homopolymer

        # Restriction sites: the count is the number of enzymes present
        summary["restriction_site_count"] = len(enzymes)
        summary["restriction_enzymes_present"] = enzymes

        # CDS validation
        summary["has_start_codon"] = self.has_start_codon
        summary["has_stop_codon"] = self.has_stop_codon
        summary["stop_codon"] = self.stop_codon
        summary["in_frame"] = self.in_frame

        # Optional sub-reports
        summary["kozak_score"] = pick(self.kozak, "score")
        summary["kozak_strength"] = pick(self.kozak, "strength")
        summary["mfe"] = pick(self.structure, "mfe")
        summary["uridine_percent"] = pick(self.uridine, "u_percent")
        summary["liability_score"] = pick(self.liability, "score")
        summary["liability_verdict"] = pick(self.liability, "verdict")
        summary["liability_critical"] = pick(self.liability, "n_critical")
        summary["liability_warning"] = pick(self.liability, "n_warning")
        summary["liability_flag_count"] = pick(self.liability, "flag_count")

        summary["warnings"] = self.warnings
        return summary
class SequenceAnalyzer:
    """
    Runs analysis modules against mRNASequence objects.

    Results are cached inside the sequence's _analysis_cache dict
    (keyed by analysis type) so re-running is a cache lookup.
    """

    def __init__(
        self,
        gc_window: int = 100,
        gc_step: int = 1,
        homopolymer_min_run: int = 5,
        restriction_enzymes: Optional[List[str]] = None,
        cai_organism: str = "human",
        cai_custom_table: Optional[Dict[str, float]] = None,
    ) -> None:
        """Store the configuration shared by all analysis methods.

        Args:
            gc_window: Window size passed to ``gc_sliding_window``.
            gc_step: Step size passed to ``gc_sliding_window``.
            homopolymer_min_run: Minimum run length passed to
                ``detect_homopolymers``.
            restriction_enzymes: Enzyme names to scan for. When omitted
                (or empty) every key of ``COMMON_ENZYMES`` is used.
            cai_organism: Organism name passed to ``calculate_cai``.
            cai_custom_table: Optional custom table passed to
                ``calculate_cai``.
        """
        self.gc_window = gc_window
        self.gc_step = gc_step
        self.homopolymer_min_run = homopolymer_min_run
        self.restriction_enzymes = restriction_enzymes or list(COMMON_ENZYMES.keys())
        self.cai_organism = cai_organism
        self.cai_custom_table = cai_custom_table

    # ── Individual analysis methods ─────────────────────────────────────────

    def analyze_gc(self, sequence: str) -> Dict[str, Any]:
        """Return the global GC percentage and the sliding-window GC profile.

        The result has the keys ``gc_percent_global``,
        ``gc_sliding_positions`` and ``gc_sliding_values``.
        """
        positions, values = gc_sliding_window(sequence, self.gc_window, self.gc_step)
        return {
            "gc_percent_global": gc_percent(sequence),
            "gc_sliding_positions": positions,
            "gc_sliding_values": values,
        }

    def analyze_cai(self, cds: str) -> Dict[str, Any]:
        """Compute the CAI and codon usage report for a CDS.

        Never raises: if either helper fails, the returned dict has ``None``
        for ``cai``, ``cai_organism`` and ``codon_usage`` plus an ``error``
        key holding the exception message.
        """
        try:
            cai_value = calculate_cai(cds, self.cai_organism, self.cai_custom_table)
            usage = codon_usage_report(cds)
            # A non-empty custom table takes precedence over the organism name.
            organism_label = "custom" if self.cai_custom_table else self.cai_organism
            return {
                "cai": cai_value,
                "cai_organism": organism_label,
                "codon_usage": usage,
            }
        except Exception as e:
            # Deliberate best-effort: a CAI failure is reported to the caller
            # instead of aborting the whole analysis.
            return {"cai": None, "cai_organism": None, "codon_usage": None, "error": str(e)}

    def analyze_homopolymers(self, sequence: str) -> Dict[str, Any]:
        """Detect homopolymer runs and summarise their count and maximum length."""
        runs = detect_homopolymers(sequence, self.homopolymer_min_run)
        return {
            "homopolymer_runs": runs,
            "homopolymer_count": len(runs),
            # default=0 covers the case where no run was found
            "longest_homopolymer": max((r.length for r in runs), default=0),
        }

    def analyze_restriction_sites(self, sequence: str) -> Dict[str, Any]:
        """Scan for the configured restriction enzymes.

        ``restriction_enzymes_present`` lists the keys of the hits mapping
        returned by ``scan_restriction_sites``.
        """
        hits = scan_restriction_sites(sequence, self.restriction_enzymes)
        return {
            "restriction_hits": hits,
            "restriction_enzymes_present": list(hits.keys()),
        }

    def validate_cds(self, cds: str) -> Dict[str, Any]:
        """Check start codon, stop codon and reading frame of a CDS.

        The input is upper-cased and ``U`` is mapped to ``T`` before the
        checks, so RNA and DNA alphabets (in any case) are both accepted.
        A missing CDS (``None``) is treated like an empty string.

        Returns:
            Dict with ``has_start_codon`` (first three bases are ATG),
            ``has_stop_codon`` (last three bases are TAA, TAG or TGA),
            ``stop_codon`` (the stop codon found, else ``None``) and
            ``in_frame`` (length divisible by 3).
        """
        seq = (cds or "").upper().replace("U", "T")
        start_codons = {"ATG"}
        stop_codons = {"TAA", "TAG", "TGA"}

        # Sequences shorter than one codon can have neither start nor stop.
        long_enough = len(seq) >= 3
        has_start = seq[:3] in start_codons if long_enough else False
        stop = seq[-3:] if long_enough else ""
        has_stop = stop in stop_codons
        in_frame = len(seq) % 3 == 0
        return {
            "has_start_codon": has_start,
            "has_stop_codon": has_stop,
            "stop_codon": stop if has_stop else None,
            "in_frame": in_frame,
        }

    def analyze_kozak(self, sequence: str) -> Dict[str, Any]:
        """Evaluate the Kozak context via ``check_kozak``.

        A ``ValueError`` from ``check_kozak`` is converted into a
        ``kozak_warning`` entry with ``kozak`` set to ``None``.
        """
        try:
            result = check_kozak(sequence)
            return {"kozak": result}
        except ValueError as e:
            return {"kozak": None, "kozak_warning": str(e)}

    def analyze_structure(self, sequence: str) -> Dict[str, Any]:
        """Run ``predict_structure`` and wrap the result under ``structure``."""
        result = predict_structure(sequence)
        return {"structure": result}

    # ── Full report ─────────────────────────────────────────────────────────

    def run_full_analysis(
        self,
        seq: mRNASequence,
        include_structure: bool = True,
        force_rerun: bool = False,
    ) -> AnalysisReport:
        """
        Run all analysis modules against seq and return an AnalysisReport.

        Results are cached in seq._analysis_cache. Set force_rerun=True
        to bypass the cache. A cached report that was produced without
        secondary structure is not reused when include_structure=True;
        the analysis is re-run so the returned report carries a structure
        result (and a liability assessment computed with it).
        """
        cache_key = "full_analysis"
        if not force_rerun and cache_key in seq._analysis_cache:
            cached = seq._analysis_cache[cache_key]
            # The cache key does not encode include_structure, so a report
            # cached without structure must not satisfy a request for it.
            if not include_structure or getattr(cached, "structure", None) is not None:
                return cached  # type: ignore[return-value]

        full_seq = seq.assembled_sequence
        issues: List[str] = []
        report = AnalysisReport(
            sequence_id=seq.id,
            sequence_name=seq.name,
            sequence_length=len(full_seq),
        )

        # GC content — run on full sequence
        gc_data = self.analyze_gc(full_seq)
        report.gc_percent_global = gc_data["gc_percent_global"]
        report.gc_sliding_positions = gc_data["gc_sliding_positions"]
        report.gc_sliding_values = gc_data["gc_sliding_values"]

        # GC by codon position — only if an in-frame CDS is available
        if seq.cds and len(seq.cds) % 3 == 0:
            try:
                report.gc_by_codon_position = gc_by_codon_position(seq.cds)
            except Exception as e:
                issues.append(f"GC by codon position failed: {e}")

        # CAI and CDS validation — only on CDS
        if seq.cds:
            cai_data = self.analyze_cai(seq.cds)
            report.cai = cai_data.get("cai")
            report.cai_organism = cai_data.get("cai_organism")
            report.codon_usage = cai_data.get("codon_usage")
            if "error" in cai_data:
                issues.append(f"CAI error: {cai_data['error']}")

            # CDS validation
            cds_data = self.validate_cds(seq.cds)
            report.has_start_codon = cds_data["has_start_codon"]
            report.has_stop_codon = cds_data["has_stop_codon"]
            report.stop_codon = cds_data["stop_codon"]
            report.in_frame = cds_data["in_frame"]
            if not report.has_start_codon:
                issues.append("CDS does not begin with ATG.")
            if not report.has_stop_codon:
                issues.append("CDS does not end with a stop codon.")
            if not report.in_frame:
                issues.append("CDS length is not divisible by 3.")

        # Homopolymers — full assembled sequence
        hp_data = self.analyze_homopolymers(full_seq)
        report.homopolymer_runs = hp_data["homopolymer_runs"]
        report.homopolymer_count = hp_data["homopolymer_count"]
        report.longest_homopolymer = hp_data["longest_homopolymer"]

        # Restriction sites
        rs_data = self.analyze_restriction_sites(full_seq)
        report.restriction_hits = rs_data["restriction_hits"]
        report.restriction_enzymes_present = rs_data["restriction_enzymes_present"]

        # Kozak — try on kozak component, then fall back to full sequence
        kozak_seq = seq.kozak or full_seq
        kozak_data = self.analyze_kozak(kozak_seq)
        report.kozak = kozak_data.get("kozak")
        if "kozak_warning" in kozak_data:
            issues.append(kozak_data["kozak_warning"])

        # Secondary structure
        if include_structure:
            struct_data = self.analyze_structure(full_seq)
            report.structure = struct_data["structure"]

        # Uridine content (immunogenicity proxy)
        report.uridine = analyze_uridine(full_seq)

        # Sequence-liability motifs (region-aware when components are available)
        report.motif_hits = scan_motifs(
            five_prime_utr=seq.five_prime_utr,
            cds=seq.cds,
            three_prime_utr=seq.three_prime_utr,
            full_seq=full_seq,
        )

        # Warnings are attached before the liability step, which receives
        # the whole report.
        report.warnings = issues

        # Aggregate everything into the liability / QC assessment
        report.liability = assess_liabilities(report, seq)

        # Cache result
        seq._analysis_cache[cache_key] = report
        return report