| """ |
| SequenceAnalyzer — main analysis entry point. |
| |
| Runs all analysis modules against an mRNASequence and returns a |
| structured AnalysisReport. Results are cached on the sequence object |
| so repeated calls are cheap. |
| """ |
| from __future__ import annotations |
|
|
| from dataclasses import dataclass, field |
| from typing import Any, Dict, List, Optional |
|
|
| import numpy as np |
|
|
| from core.models.sequence import mRNASequence |
| from core.analysis.gc_content import ( |
| gc_percent, |
| gc_sliding_window, |
| gc_by_codon_position, |
| ) |
| from core.analysis.cai import calculate_cai, codon_usage_report, CODON_TABLES |
| from core.analysis.homopolymers import detect_homopolymers, HomopolymerRun |
| from core.analysis.restriction_sites import ( |
| scan_restriction_sites, |
| RestrictionSiteHit, |
| COMMON_ENZYMES, |
| ) |
| from core.analysis.kozak import check_kozak, KozakResult |
| from core.analysis.structure import predict_structure, StructureResult |
| from core.analysis.uridine import analyze_uridine, UridineReport |
| from core.analysis.motifs import scan_motifs, MotifHit |
| from core.analysis.liability import assess_liabilities, LiabilityReport |
|
|
|
|
@dataclass
class AnalysisReport:
    """Container for the results of one analysis pass over an mRNASequence.

    Only the three identity fields are required. Every other field has a
    neutral default (``None``, ``0``/``0.0`` or an empty container), so a
    report can be created first and filled in section by section.
    """

    # Identity of the analysed sequence.
    sequence_id: str
    sequence_name: str
    sequence_length: int

    # GC content: global value, sliding-window trace, per-codon-position split.
    gc_percent_global: float = 0.0
    gc_sliding_positions: Optional[np.ndarray] = None
    gc_sliding_values: Optional[np.ndarray] = None
    gc_by_codon_position: Optional[Dict[str, float]] = None

    # Codon adaptation index and codon usage.
    cai: Optional[float] = None
    cai_organism: Optional[str] = None
    codon_usage: Optional[Dict[str, int]] = None

    # Homopolymer runs.
    homopolymer_runs: List[HomopolymerRun] = field(default_factory=list)
    homopolymer_count: int = 0
    longest_homopolymer: int = 0

    # Restriction sites.
    restriction_hits: Dict[str, List[RestrictionSiteHit]] = field(default_factory=dict)
    restriction_enzymes_present: List[str] = field(default_factory=list)

    # CDS validation flags.
    has_start_codon: Optional[bool] = None
    has_stop_codon: Optional[bool] = None
    stop_codon: Optional[str] = None
    in_frame: Optional[bool] = None

    # Kozak check result.
    kozak: Optional[KozakResult] = None

    # Structure prediction result.
    structure: Optional[StructureResult] = None

    # Uridine analysis result.
    uridine: Optional[UridineReport] = None

    # Motif scan hits.
    motif_hits: List[MotifHit] = field(default_factory=list)

    # Liability assessment.
    liability: Optional[LiabilityReport] = None

    # Free-text warnings collected while the report was built.
    warnings: List[str] = field(default_factory=list)

    def to_dict(self) -> Dict[str, Any]:
        """Return a flat summary of the report as a plain dict.

        The sliding-window arrays, codon usage, homopolymer runs, restriction
        hits and motif hits are not included. ``gc_content`` carries the same
        value as ``gc_percent_global``, and ``restriction_site_count`` is the
        number of entries in ``restriction_enzymes_present``. Values derived
        from ``kozak``, ``structure``, ``uridine`` and ``liability`` are
        ``None`` when the corresponding result object is unset.
        """

        def pick(result: Any, attribute: str) -> Any:
            # Read one attribute from a nested result, or None if it is unset.
            return getattr(result, attribute) if result else None

        kozak = self.kozak
        liability = self.liability
        enzymes = self.restriction_enzymes_present
        gc_global = self.gc_percent_global

        return dict(
            sequence_id=self.sequence_id,
            sequence_name=self.sequence_name,
            sequence_length=self.sequence_length,
            gc_content=gc_global,
            gc_percent_global=gc_global,
            gc_by_codon_position=self.gc_by_codon_position,
            cai=self.cai,
            cai_organism=self.cai_organism,
            homopolymer_count=self.homopolymer_count,
            longest_homopolymer=self.longest_homopolymer,
            restriction_site_count=len(enzymes),
            restriction_enzymes_present=enzymes,
            has_start_codon=self.has_start_codon,
            has_stop_codon=self.has_stop_codon,
            stop_codon=self.stop_codon,
            in_frame=self.in_frame,
            kozak_score=pick(kozak, "score"),
            kozak_strength=pick(kozak, "strength"),
            mfe=pick(self.structure, "mfe"),
            uridine_percent=pick(self.uridine, "u_percent"),
            liability_score=pick(liability, "score"),
            liability_verdict=pick(liability, "verdict"),
            liability_critical=pick(liability, "n_critical"),
            liability_warning=pick(liability, "n_warning"),
            liability_flag_count=pick(liability, "flag_count"),
            warnings=self.warnings,
        )
|
|
|
|
class SequenceAnalyzer:
    """
    Runs analysis modules against mRNASequence objects.

    The ``analyze_*`` methods and ``validate_cds`` each wrap one analysis and
    return a plain dict. ``run_full_analysis`` merges them into an
    AnalysisReport and stores it in the sequence's ``_analysis_cache`` dict
    under the ``"full_analysis"`` key, so re-running is a cache lookup.
    """

    # Key under which the combined report is stored in seq._analysis_cache.
    _CACHE_KEY = "full_analysis"
    # Codons are compared in DNA alphabet (U is mapped to T before the check).
    _START_CODONS = frozenset({"ATG"})
    _STOP_CODONS = frozenset({"TAA", "TAG", "TGA"})

    def __init__(
        self,
        gc_window: int = 100,
        gc_step: int = 1,
        homopolymer_min_run: int = 5,
        restriction_enzymes: Optional[List[str]] = None,
        cai_organism: str = "human",
        cai_custom_table: Optional[Dict[str, float]] = None,
    ) -> None:
        """
        Store the settings used by the individual analyses.

        Args:
            gc_window: Window argument passed to ``gc_sliding_window``.
            gc_step: Step argument passed to ``gc_sliding_window``.
            homopolymer_min_run: Minimum-run argument passed to
                ``detect_homopolymers``.
            restriction_enzymes: Enzyme names to scan for. ``None`` or an
                empty list selects every key of ``COMMON_ENZYMES``.
            cai_organism: Organism argument passed to ``calculate_cai``.
            cai_custom_table: Optional custom table passed to
                ``calculate_cai``.
        """
        self.gc_window = gc_window
        self.gc_step = gc_step
        self.homopolymer_min_run = homopolymer_min_run
        self.restriction_enzymes = restriction_enzymes or list(COMMON_ENZYMES)
        self.cai_organism = cai_organism
        self.cai_custom_table = cai_custom_table

    # ------------------------------------------------------------------
    # Individual analyses
    # ------------------------------------------------------------------

    def analyze_gc(self, sequence: str) -> Dict[str, Any]:
        """Return the global GC percentage and the sliding-window GC trace."""
        positions, values = gc_sliding_window(sequence, self.gc_window, self.gc_step)
        return {
            "gc_percent_global": gc_percent(sequence),
            "gc_sliding_positions": positions,
            "gc_sliding_values": values,
        }

    def analyze_cai(self, cds: str) -> Dict[str, Any]:
        """
        Compute the CAI and codon usage for ``cds``.

        Never raises: if either underlying call fails, every result value is
        ``None`` and the message is returned under an extra ``"error"`` key.
        The reported organism is ``"custom"`` when a non-empty custom table is
        configured, otherwise the configured organism name.
        """
        try:
            cai_value = calculate_cai(cds, self.cai_organism, self.cai_custom_table)
            usage = codon_usage_report(cds)
        except Exception as exc:
            # Deliberate best-effort: a CAI failure must not abort the rest of
            # the analysis, so it is reported to the caller instead.
            return {"cai": None, "cai_organism": None, "codon_usage": None, "error": str(exc)}
        return {
            "cai": cai_value,
            "cai_organism": "custom" if self.cai_custom_table else self.cai_organism,
            "codon_usage": usage,
        }

    def analyze_homopolymers(self, sequence: str) -> Dict[str, Any]:
        """Return the detected homopolymer runs, their count and the longest length."""
        runs = detect_homopolymers(sequence, self.homopolymer_min_run)
        return {
            "homopolymer_runs": runs,
            "homopolymer_count": len(runs),
            "longest_homopolymer": max((run.length for run in runs), default=0),
        }

    def analyze_restriction_sites(self, sequence: str) -> Dict[str, Any]:
        """Scan for the configured enzymes; return the hits and the hit-dict keys."""
        hits = scan_restriction_sites(sequence, self.restriction_enzymes)
        return {
            "restriction_hits": hits,
            "restriction_enzymes_present": list(hits),
        }

    def validate_cds(self, cds: str) -> Dict[str, Any]:
        """
        Check the start codon, stop codon and reading frame of ``cds``.

        The input is upper-cased and U is mapped to T before the checks. The
        stop codon is taken from the last three characters regardless of
        frame. ``stop_codon`` is ``None`` when no stop codon is found.
        """
        seq = cds.upper().replace("U", "T")
        # A slice of a sequence shorter than 3 nt can never equal a codon, so
        # short or empty input falls through to False without a length check.
        has_start = seq[:3] in self._START_CODONS
        tail = seq[-3:]
        has_stop = tail in self._STOP_CODONS
        return {
            "has_start_codon": has_start,
            "has_stop_codon": has_stop,
            "stop_codon": tail if has_stop else None,
            "in_frame": len(seq) % 3 == 0,
        }

    def analyze_kozak(self, sequence: str) -> Dict[str, Any]:
        """
        Run the Kozak check.

        A ``ValueError`` from ``check_kozak`` is converted into a
        ``"kozak_warning"`` message with ``"kozak"`` set to ``None``.
        """
        try:
            result = check_kozak(sequence)
        except ValueError as exc:
            return {"kozak": None, "kozak_warning": str(exc)}
        return {"kozak": result}

    def analyze_structure(self, sequence: str) -> Dict[str, Any]:
        """Return the ``predict_structure`` result under the ``"structure"`` key."""
        return {"structure": predict_structure(sequence)}

    # ------------------------------------------------------------------
    # Full analysis
    # ------------------------------------------------------------------

    def run_full_analysis(
        self,
        seq: mRNASequence,
        include_structure: bool = True,
        force_rerun: bool = False,
    ) -> AnalysisReport:
        """
        Run all analysis modules against seq and return an AnalysisReport.

        Results are cached in seq._analysis_cache. Set force_rerun=True
        to bypass the cache. A cached report that has no structure result is
        not reused when include_structure=True; the analysis is re-run so the
        caller gets the structure it asked for.

        NOTE: the cache key does not encode this analyzer's settings, so a
        report cached by an analyzer with different settings is returned
        as-is unless force_rerun=True.
        """
        cache = seq._analysis_cache
        if not force_rerun and self._CACHE_KEY in cache:
            cached = cache[self._CACHE_KEY]
            # Only a report that satisfies the structure request is a hit.
            if not include_structure or cached.structure is not None:
                return cached

        full_seq = seq.assembled_sequence
        notes: List[str] = []

        report = AnalysisReport(
            sequence_id=seq.id,
            sequence_name=seq.name,
            sequence_length=len(full_seq),
        )

        # GC content over the assembled sequence.
        gc_data = self.analyze_gc(full_seq)
        report.gc_percent_global = gc_data["gc_percent_global"]
        report.gc_sliding_positions = gc_data["gc_sliding_positions"]
        report.gc_sliding_values = gc_data["gc_sliding_values"]

        # GC by codon position needs a CDS made of whole codons.
        if seq.cds and len(seq.cds) % 3 == 0:
            try:
                report.gc_by_codon_position = gc_by_codon_position(seq.cds)
            except Exception as exc:
                notes.append(f"GC by codon position failed: {exc}")

        # Codon adaptation index.
        if seq.cds:
            cai_data = self.analyze_cai(seq.cds)
            report.cai = cai_data.get("cai")
            report.cai_organism = cai_data.get("cai_organism")
            report.codon_usage = cai_data.get("codon_usage")
            if "error" in cai_data:
                notes.append(f"CAI error: {cai_data['error']}")

        # CDS validation. `or ""` keeps a missing (None) CDS from raising in
        # validate_cds; an empty CDS is reported as lacking start and stop.
        cds_data = self.validate_cds(seq.cds or "")
        report.has_start_codon = cds_data["has_start_codon"]
        report.has_stop_codon = cds_data["has_stop_codon"]
        report.stop_codon = cds_data["stop_codon"]
        report.in_frame = cds_data["in_frame"]

        if not report.has_start_codon:
            notes.append("CDS does not begin with ATG.")
        if not report.has_stop_codon:
            notes.append("CDS does not end with a stop codon.")
        if not report.in_frame:
            notes.append("CDS length is not divisible by 3.")

        # Homopolymers.
        hp_data = self.analyze_homopolymers(full_seq)
        report.homopolymer_runs = hp_data["homopolymer_runs"]
        report.homopolymer_count = hp_data["homopolymer_count"]
        report.longest_homopolymer = hp_data["longest_homopolymer"]

        # Restriction sites.
        rs_data = self.analyze_restriction_sites(full_seq)
        report.restriction_hits = rs_data["restriction_hits"]
        report.restriction_enzymes_present = rs_data["restriction_enzymes_present"]

        # Kozak: use the sequence's own kozak attribute when set, otherwise
        # the assembled sequence.
        kozak_data = self.analyze_kozak(seq.kozak or full_seq)
        report.kozak = kozak_data.get("kozak")
        if "kozak_warning" in kozak_data:
            notes.append(kozak_data["kozak_warning"])

        # Structure prediction (optional).
        if include_structure:
            report.structure = self.analyze_structure(full_seq)["structure"]

        # Uridine analysis.
        report.uridine = analyze_uridine(full_seq)

        # Motif scan.
        report.motif_hits = scan_motifs(
            five_prime_utr=seq.five_prime_utr,
            cds=seq.cds,
            three_prime_utr=seq.three_prime_utr,
            full_seq=full_seq,
        )

        # Warnings are attached before the liability assessment, which
        # receives the report as input.
        report.warnings = notes
        report.liability = assess_liabilities(report, seq)

        cache[self._CACHE_KEY] = report
        return report
|
|