Spaces:

offtargeteffect
/

mrna-design-studio

Running

File size: 12,113 Bytes

"""
SequenceAnalyzer — main analysis entry point.

Runs all analysis modules against an mRNASequence and returns a
structured AnalysisReport. Results are cached on the sequence object
so repeated calls are cheap.
"""
from __future__ import annotations

from dataclasses import dataclass, field
from typing import Any, Dict, List, Optional

import numpy as np

from core.models.sequence import mRNASequence
from core.analysis.gc_content import (
    gc_percent,
    gc_sliding_window,
    gc_by_codon_position,
)
from core.analysis.cai import calculate_cai, codon_usage_report, CODON_TABLES
from core.analysis.homopolymers import detect_homopolymers, HomopolymerRun
from core.analysis.restriction_sites import (
    scan_restriction_sites,
    RestrictionSiteHit,
    COMMON_ENZYMES,
)
from core.analysis.kozak import check_kozak, KozakResult
from core.analysis.structure import predict_structure, StructureResult
from core.analysis.uridine import analyze_uridine, UridineReport
from core.analysis.motifs import scan_motifs, MotifHit
from core.analysis.liability import assess_liabilities, LiabilityReport


@dataclass
class AnalysisReport:
    """
    All analysis results for a single mRNASequence.

    Only the three identity fields are required; every analysis field has
    a neutral default (``None``, ``0`` or an empty container) so a report
    can be created first and filled in section by section.
    """
    # Identity of the analysed sequence
    sequence_id: str
    sequence_name: str
    sequence_length: int

    # GC content
    gc_percent_global: float = 0.0
    gc_sliding_positions: Optional[np.ndarray] = None  # centre positions
    gc_sliding_values: Optional[np.ndarray] = None     # GC% per window
    gc_by_codon_position: Optional[Dict[str, float]] = None

    # Codon Adaptation Index
    cai: Optional[float] = None
    cai_organism: Optional[str] = None
    codon_usage: Optional[Dict[str, int]] = None

    # Homopolymers
    homopolymer_runs: List[HomopolymerRun] = field(default_factory=list)
    homopolymer_count: int = 0
    longest_homopolymer: int = 0

    # Restriction sites
    restriction_hits: Dict[str, List[RestrictionSiteHit]] = field(default_factory=dict)
    restriction_enzymes_present: List[str] = field(default_factory=list)

    # Start / stop codon validation
    has_start_codon: Optional[bool] = None
    has_stop_codon: Optional[bool] = None
    stop_codon: Optional[str] = None
    in_frame: Optional[bool] = None

    # Kozak context
    kozak: Optional[KozakResult] = None

    # Secondary structure (ViennaRNA)
    structure: Optional[StructureResult] = None

    # Uridine content (immunogenicity proxy)
    uridine: Optional[UridineReport] = None

    # Sequence-liability motifs (uORF, premature polyA, ARE, splice donor)
    motif_hits: List[MotifHit] = field(default_factory=list)

    # Aggregated liability / QC assessment
    liability: Optional[LiabilityReport] = None

    # Errors / warnings generated during analysis
    warnings: List[str] = field(default_factory=list)

    def to_dict(self) -> Dict[str, Any]:
        """
        Return a flat summary of the report as a plain dict.

        Only scalar summaries and short lists are exported; the sliding
        window arrays, codon usage table, homopolymer runs, per-enzyme
        hits and motif hits are not included. Values derived from the
        optional kozak / structure / uridine / liability sub-results are
        ``None`` when the corresponding sub-result is unset (falsy).

        Mutable containers are shallow-copied so that editing the returned
        dict cannot alter this report object.
        """
        kozak = self.kozak
        structure = self.structure
        uridine = self.uridine
        liability = self.liability

        # Copy containers instead of exposing the report's own objects.
        gc_codon = self.gc_by_codon_position
        if isinstance(gc_codon, dict):
            gc_codon = dict(gc_codon)
        enzymes_present = list(self.restriction_enzymes_present)

        return {
            "sequence_id": self.sequence_id,
            "sequence_name": self.sequence_name,
            "sequence_length": self.sequence_length,
            # "gc_content" and "gc_percent_global" carry the same value.
            "gc_content": self.gc_percent_global,
            "gc_percent_global": self.gc_percent_global,
            "gc_by_codon_position": gc_codon,
            "cai": self.cai,
            "cai_organism": self.cai_organism,
            "homopolymer_count": self.homopolymer_count,
            "longest_homopolymer": self.longest_homopolymer,
            # Number of entries in restriction_enzymes_present, not the
            # number of individual hits stored in restriction_hits.
            "restriction_site_count": len(enzymes_present),
            "restriction_enzymes_present": enzymes_present,
            "has_start_codon": self.has_start_codon,
            "has_stop_codon": self.has_stop_codon,
            "stop_codon": self.stop_codon,
            "in_frame": self.in_frame,
            "kozak_score": kozak.score if kozak else None,
            "kozak_strength": kozak.strength if kozak else None,
            "mfe": structure.mfe if structure else None,
            "uridine_percent": uridine.u_percent if uridine else None,
            "liability_score": liability.score if liability else None,
            "liability_verdict": liability.verdict if liability else None,
            "liability_critical": liability.n_critical if liability else None,
            "liability_warning": liability.n_warning if liability else None,
            "liability_flag_count": liability.flag_count if liability else None,
            "warnings": list(self.warnings),
        }


class SequenceAnalyzer:
    """
    Runs analysis modules against mRNASequence objects.

    Each ``analyze_*`` / ``validate_*`` method wraps one analysis module
    and returns a plain dict. ``run_full_analysis`` combines them into an
    AnalysisReport and stores it in the sequence's ``_analysis_cache``
    dict under the key ``"full_analysis"`` so re-running is a cache lookup.
    """

    def __init__(
        self,
        gc_window: int = 100,
        gc_step: int = 1,
        homopolymer_min_run: int = 5,
        restriction_enzymes: Optional[List[str]] = None,
        cai_organism: str = "human",
        cai_custom_table: Optional[Dict[str, float]] = None,
    ) -> None:
        """
        Store the settings used by the individual analysis methods.

        Args:
            gc_window: Window size passed to ``gc_sliding_window``.
            gc_step: Step size passed to ``gc_sliding_window``.
            homopolymer_min_run: Minimum run length passed to
                ``detect_homopolymers``.
            restriction_enzymes: Enzyme names to scan for. ``None`` or an
                empty list selects every key of ``COMMON_ENZYMES``.
            cai_organism: Organism name passed to ``calculate_cai``.
            cai_custom_table: Optional custom table passed to
                ``calculate_cai``; when non-empty the CAI result is
                labelled ``"custom"`` instead of ``cai_organism``.
        """
        self.gc_window = gc_window
        self.gc_step = gc_step
        self.homopolymer_min_run = homopolymer_min_run
        # NOTE: ``or`` means an explicitly empty list also falls back to
        # the default enzyme panel.
        self.restriction_enzymes = restriction_enzymes or list(COMMON_ENZYMES.keys())
        self.cai_organism = cai_organism
        self.cai_custom_table = cai_custom_table

    # ── Individual analysis methods ─────────────────────────────────────────

    def analyze_gc(self, sequence: str) -> Dict[str, Any]:
        """Return the global GC value plus the sliding-window positions and values."""
        positions, values = gc_sliding_window(sequence, self.gc_window, self.gc_step)
        return {
            "gc_percent_global": gc_percent(sequence),
            "gc_sliding_positions": positions,
            "gc_sliding_values": values,
        }

    def analyze_cai(self, cds: str) -> Dict[str, Any]:
        """
        Compute the CAI and the codon usage report for ``cds``.

        Never raises: if either computation fails, the three result keys
        are ``None`` and an extra ``"error"`` key holds the exception text.
        """
        try:
            cai_value = calculate_cai(cds, self.cai_organism, self.cai_custom_table)
            usage = codon_usage_report(cds)
        except Exception as e:
            # Deliberate best-effort: a CAI failure is reported to the
            # caller as data rather than aborting the whole analysis.
            return {"cai": None, "cai_organism": None, "codon_usage": None, "error": str(e)}
        return {
            "cai": cai_value,
            "cai_organism": "custom" if self.cai_custom_table else self.cai_organism,
            "codon_usage": usage,
        }

    def analyze_homopolymers(self, sequence: str) -> Dict[str, Any]:
        """Return the detected homopolymer runs, their count and the longest run length."""
        runs = detect_homopolymers(sequence, self.homopolymer_min_run)
        return {
            "homopolymer_runs": runs,
            "homopolymer_count": len(runs),
            # default=0 covers the case where no run was detected.
            "longest_homopolymer": max((r.length for r in runs), default=0),
        }

    def analyze_restriction_sites(self, sequence: str) -> Dict[str, Any]:
        """Scan ``sequence`` for the configured enzymes and list the keys of the hit mapping."""
        hits = scan_restriction_sites(sequence, self.restriction_enzymes)
        return {
            "restriction_hits": hits,
            "restriction_enzymes_present": list(hits.keys()),
        }

    def validate_cds(self, cds: str) -> Dict[str, Any]:
        """
        Check the first codon, last codon and length of ``cds``.

        The sequence is upper-cased and U is replaced by T before checking,
        so RNA and DNA spellings are treated alike. The first three and the
        last three characters are tested independently of the reading
        frame; sequences shorter than three characters have neither a
        start nor a stop codon.

        Returns:
            Dict with ``has_start_codon``, ``has_stop_codon``,
            ``stop_codon`` (the stop triplet in DNA spelling, or ``None``)
            and ``in_frame`` (length divisible by 3).
        """
        start_codons = {"ATG"}
        stop_codons = {"TAA", "TAG", "TGA"}

        seq = cds.upper().replace("U", "T")
        long_enough = len(seq) >= 3
        first_codon = seq[:3] if long_enough else ""
        last_codon = seq[-3:] if long_enough else ""
        has_stop = last_codon in stop_codons

        return {
            "has_start_codon": first_codon in start_codons,
            "has_stop_codon": has_stop,
            "stop_codon": last_codon if has_stop else None,
            "in_frame": len(seq) % 3 == 0,
        }

    def analyze_kozak(self, sequence: str) -> Dict[str, Any]:
        """
        Run the Kozak check on ``sequence``.

        A ``ValueError`` from ``check_kozak`` is converted into a
        ``"kozak_warning"`` entry with ``"kozak"`` set to ``None``; other
        exceptions propagate.
        """
        try:
            result = check_kozak(sequence)
        except ValueError as e:
            return {"kozak": None, "kozak_warning": str(e)}
        return {"kozak": result}

    def analyze_structure(self, sequence: str) -> Dict[str, Any]:
        """Return the structure prediction for ``sequence``; errors propagate to the caller."""
        return {"structure": predict_structure(sequence)}

    # ── Full report ─────────────────────────────────────────────────────────

    def run_full_analysis(
        self,
        seq: mRNASequence,
        include_structure: bool = True,
        force_rerun: bool = False,
    ) -> AnalysisReport:
        """
        Run all analysis modules against seq and return an AnalysisReport.

        Results are cached in seq._analysis_cache under "full_analysis".
        Set force_rerun=True to bypass the cache. A cached report that has
        no structure result is not reused when include_structure is True;
        the analysis is rerun so the caller actually gets a structure.

        Args:
            seq: Sequence to analyse. Its assembled sequence is used for
                the whole-sequence analyses; its cds, kozak and UTR
                components are used where available.
            include_structure: Whether to run structure prediction.
            force_rerun: Ignore any cached report and recompute.

        Returns:
            The freshly computed (or cached) AnalysisReport.
        """
        cache_key = "full_analysis"
        if not force_rerun and cache_key in seq._analysis_cache:
            cached = seq._analysis_cache[cache_key]
            # A report cached by an include_structure=False run carries no
            # structure and must not satisfy a request that wants one.
            if not include_structure or cached.structure is not None:
                return cached  # type: ignore[return-value]

        full_seq = seq.assembled_sequence
        warnings: List[str] = []

        report = AnalysisReport(
            sequence_id=seq.id,
            sequence_name=seq.name,
            sequence_length=len(full_seq),
        )

        # GC content — run on full sequence
        gc_data = self.analyze_gc(full_seq)
        report.gc_percent_global = gc_data["gc_percent_global"]
        report.gc_sliding_positions = gc_data["gc_sliding_positions"]
        report.gc_sliding_values = gc_data["gc_sliding_values"]

        cds = seq.cds

        # GC by codon position — only if a CDS of whole codons is available
        if cds and len(cds) % 3 == 0:
            try:
                report.gc_by_codon_position = gc_by_codon_position(cds)
            except Exception as e:
                # Best-effort: record the failure and keep analysing.
                warnings.append(f"GC by codon position failed: {e}")

        if cds:
            # CAI — only on CDS
            cai_data = self.analyze_cai(cds)
            report.cai = cai_data.get("cai")
            report.cai_organism = cai_data.get("cai_organism")
            report.codon_usage = cai_data.get("codon_usage")
            if "error" in cai_data:
                warnings.append(f"CAI error: {cai_data['error']}")

            # CDS validation
            cds_data = self.validate_cds(cds)
            report.has_start_codon = cds_data["has_start_codon"]
            report.has_stop_codon = cds_data["has_stop_codon"]
            report.stop_codon = cds_data["stop_codon"]
            report.in_frame = cds_data["in_frame"]

            if not report.has_start_codon:
                warnings.append("CDS does not begin with ATG.")
            if not report.has_stop_codon:
                warnings.append("CDS does not end with a stop codon.")
            if not report.in_frame:
                warnings.append("CDS length is not divisible by 3.")

        # Homopolymers — full assembled sequence
        hp_data = self.analyze_homopolymers(full_seq)
        report.homopolymer_runs = hp_data["homopolymer_runs"]
        report.homopolymer_count = hp_data["homopolymer_count"]
        report.longest_homopolymer = hp_data["longest_homopolymer"]

        # Restriction sites — full assembled sequence
        rs_data = self.analyze_restriction_sites(full_seq)
        report.restriction_hits = rs_data["restriction_hits"]
        report.restriction_enzymes_present = rs_data["restriction_enzymes_present"]

        # Kozak — use the kozak component, falling back to the full sequence
        kozak_data = self.analyze_kozak(seq.kozak or full_seq)
        report.kozak = kozak_data.get("kozak")
        if "kozak_warning" in kozak_data:
            warnings.append(kozak_data["kozak_warning"])

        # Secondary structure
        if include_structure:
            report.structure = self.analyze_structure(full_seq)["structure"]

        # Uridine content (immunogenicity proxy)
        report.uridine = analyze_uridine(full_seq)

        # Sequence-liability motifs (region-aware when components are available)
        report.motif_hits = scan_motifs(
            five_prime_utr=seq.five_prime_utr,
            cds=cds,
            three_prime_utr=seq.three_prime_utr,
            full_seq=full_seq,
        )

        # Warnings are attached before the liability step, which receives
        # the whole report.
        report.warnings = warnings

        # Aggregate everything into the liability / QC assessment
        report.liability = assess_liabilities(report, seq)

        # Cache result
        seq._analysis_cache[cache_key] = report
        return report