Spaces:

offtargeteffect
/

mrna-design-studio

Running

App Files Files Community

mrna-design-studio / core /analysis /analyzer.py

offtargeteffect

Add liability/QC, cluster & tree, and experiment tracking

bdd3f19 verified 2 days ago

Raw

History Blame Contribute Delete

12.1 kB

	"""
	SequenceAnalyzer — main analysis entry point.

	Runs all analysis modules against an mRNASequence and returns a
	structured AnalysisReport. Results are cached on the sequence object
	so repeated calls are cheap.
	"""
	from __future__ import annotations

	from dataclasses import dataclass, field
	from typing import Any, Dict, List, Optional

	import numpy as np

	from core.models.sequence import mRNASequence
	from core.analysis.gc_content import (
	gc_percent,
	gc_sliding_window,
	gc_by_codon_position,
	)
	from core.analysis.cai import calculate_cai, codon_usage_report, CODON_TABLES
	from core.analysis.homopolymers import detect_homopolymers, HomopolymerRun
	from core.analysis.restriction_sites import (
	scan_restriction_sites,
	RestrictionSiteHit,
	COMMON_ENZYMES,
	)
	from core.analysis.kozak import check_kozak, KozakResult
	from core.analysis.structure import predict_structure, StructureResult
	from core.analysis.uridine import analyze_uridine, UridineReport
	from core.analysis.motifs import scan_motifs, MotifHit
	from core.analysis.liability import assess_liabilities, LiabilityReport


	@dataclass
	class AnalysisReport:
	    """All analysis results for a single mRNASequence.

	    Only the three identifying fields are required. Every analysis field
	    has a default, so a report can be created first and filled in one
	    section at a time. Optional fields default to ``None`` and list/dict
	    fields default to empty containers.
	    """

	    sequence_id: str
	    sequence_name: str
	    sequence_length: int

	    # GC content
	    gc_percent_global: float = 0.0
	    gc_sliding_positions: Optional[np.ndarray] = None  # centre positions
	    gc_sliding_values: Optional[np.ndarray] = None  # GC% per window
	    gc_by_codon_position: Optional[Dict[str, float]] = None

	    # Codon Adaptation Index
	    cai: Optional[float] = None
	    cai_organism: Optional[str] = None
	    codon_usage: Optional[Dict[str, int]] = None

	    # Homopolymers
	    homopolymer_runs: List[HomopolymerRun] = field(default_factory=list)
	    homopolymer_count: int = 0
	    longest_homopolymer: int = 0

	    # Restriction sites
	    restriction_hits: Dict[str, List[RestrictionSiteHit]] = field(default_factory=dict)
	    restriction_enzymes_present: List[str] = field(default_factory=list)

	    # Start / stop codon validation
	    has_start_codon: Optional[bool] = None
	    has_stop_codon: Optional[bool] = None
	    stop_codon: Optional[str] = None
	    in_frame: Optional[bool] = None

	    # Kozak context
	    kozak: Optional[KozakResult] = None

	    # Secondary structure (ViennaRNA)
	    structure: Optional[StructureResult] = None

	    # Uridine content (immunogenicity proxy)
	    uridine: Optional[UridineReport] = None

	    # Sequence-liability motifs (uORF, premature polyA, ARE, splice donor)
	    motif_hits: List[MotifHit] = field(default_factory=list)

	    # Aggregated liability / QC assessment
	    liability: Optional[LiabilityReport] = None

	    # Errors / warnings generated during analysis
	    warnings: List[str] = field(default_factory=list)

	    def to_dict(self) -> Dict[str, Any]:
	        """Return a flat summary of the report as a plain dict.

	        Only scalar summaries, the list of enzymes present and the
	        warnings list are included. The sliding-window GC arrays, codon
	        usage table, homopolymer runs, per-enzyme restriction hits and
	        motif hits are left out.

	        Notes:
	            * ``gc_content`` and ``gc_percent_global`` carry the same value.
	            * ``restriction_site_count`` is the number of entries in
	              ``restriction_enzymes_present``, not the number of
	              individual hits.
	            * Keys derived from an optional sub-result (``kozak_*``,
	              ``mfe``, ``uridine_percent``, ``liability_*``) are ``None``
	              when that sub-result is ``None``.
	            * ``warnings`` is the report's own list, not a copy.
	        """
	        kozak = self.kozak
	        structure = self.structure
	        uridine = self.uridine
	        liability = self.liability

	        # Presence is tested with ``is not None`` rather than truthiness so
	        # that a result object which happens to evaluate falsy is still
	        # summarised instead of being reported as missing.
	        has_liability = liability is not None

	        return {
	            "sequence_id": self.sequence_id,
	            "sequence_name": self.sequence_name,
	            "sequence_length": self.sequence_length,
	            "gc_content": self.gc_percent_global,
	            "gc_percent_global": self.gc_percent_global,
	            "gc_by_codon_position": self.gc_by_codon_position,
	            "cai": self.cai,
	            "cai_organism": self.cai_organism,
	            "homopolymer_count": self.homopolymer_count,
	            "longest_homopolymer": self.longest_homopolymer,
	            "restriction_site_count": len(self.restriction_enzymes_present),
	            "restriction_enzymes_present": self.restriction_enzymes_present,
	            "has_start_codon": self.has_start_codon,
	            "has_stop_codon": self.has_stop_codon,
	            "stop_codon": self.stop_codon,
	            "in_frame": self.in_frame,
	            "kozak_score": kozak.score if kozak is not None else None,
	            "kozak_strength": kozak.strength if kozak is not None else None,
	            "mfe": structure.mfe if structure is not None else None,
	            "uridine_percent": uridine.u_percent if uridine is not None else None,
	            "liability_score": liability.score if has_liability else None,
	            "liability_verdict": liability.verdict if has_liability else None,
	            "liability_critical": liability.n_critical if has_liability else None,
	            "liability_warning": liability.n_warning if has_liability else None,
	            "liability_flag_count": liability.flag_count if has_liability else None,
	            "warnings": self.warnings,
	        }


	class SequenceAnalyzer:
	    """
	    Runs analysis modules against mRNASequence objects.

	    The individual ``analyze_*`` / ``validate_cds`` methods take plain
	    sequence strings and return small result dicts. ``run_full_analysis``
	    combines them into an AnalysisReport and caches that report inside
	    the sequence's _analysis_cache dict under the key "full_analysis",
	    so re-running is a cache lookup.
	    """

	    def __init__(
	        self,
	        gc_window: int = 100,
	        gc_step: int = 1,
	        homopolymer_min_run: int = 5,
	        restriction_enzymes: Optional[List[str]] = None,
	        cai_organism: str = "human",
	        cai_custom_table: Optional[Dict[str, float]] = None,
	    ) -> None:
	        """
	        Store the settings used by the individual analyses.

	        Args:
	            gc_window: Window argument passed to gc_sliding_window.
	            gc_step: Step argument passed to gc_sliding_window.
	            homopolymer_min_run: Threshold passed to detect_homopolymers.
	            restriction_enzymes: Enzyme names passed to
	                scan_restriction_sites. ``None`` or an empty list selects
	                every key of COMMON_ENZYMES.
	            cai_organism: Organism argument passed to calculate_cai.
	            cai_custom_table: Optional custom table passed to
	                calculate_cai.
	        """
	        self.gc_window = gc_window
	        self.gc_step = gc_step
	        self.homopolymer_min_run = homopolymer_min_run
	        self.restriction_enzymes = restriction_enzymes or list(COMMON_ENZYMES.keys())
	        self.cai_organism = cai_organism
	        self.cai_custom_table = cai_custom_table

	    # ── Individual analysis methods ─────────────────────────────────────────

	    def analyze_gc(self, sequence: str) -> Dict[str, Any]:
	        """Return the global GC% and the sliding-window positions/values.

	        The window and step come from ``self.gc_window`` / ``self.gc_step``.
	        """
	        positions, values = gc_sliding_window(sequence, self.gc_window, self.gc_step)
	        return {
	            "gc_percent_global": gc_percent(sequence),
	            "gc_sliding_positions": positions,
	            "gc_sliding_values": values,
	        }

	    def analyze_cai(self, cds: str) -> Dict[str, Any]:
	        """Compute the CAI and codon usage for a CDS.

	        Best-effort: any exception raised by the CAI or codon-usage helpers
	        is caught and reported through an extra ``"error"`` key, with the
	        three result keys set to ``None``.

	        Returns:
	            A dict with ``cai``, ``cai_organism`` and ``codon_usage``.
	            ``cai_organism`` is ``"custom"`` when a non-empty custom table
	            is configured, otherwise the configured organism name.
	        """
	        try:
	            cai_value = calculate_cai(cds, self.cai_organism, self.cai_custom_table)
	            usage = codon_usage_report(cds)
	        except Exception as e:
	            return {"cai": None, "cai_organism": None, "codon_usage": None, "error": str(e)}
	        organism_label = "custom" if self.cai_custom_table else self.cai_organism
	        return {
	            "cai": cai_value,
	            "cai_organism": organism_label,
	            "codon_usage": usage,
	        }

	    def analyze_homopolymers(self, sequence: str) -> Dict[str, Any]:
	        """Return the detected homopolymer runs, their count and the longest length.

	        ``longest_homopolymer`` is 0 when no run is detected.
	        """
	        runs = detect_homopolymers(sequence, self.homopolymer_min_run)
	        return {
	            "homopolymer_runs": runs,
	            "homopolymer_count": len(runs),
	            "longest_homopolymer": max((r.length for r in runs), default=0),
	        }

	    def analyze_restriction_sites(self, sequence: str) -> Dict[str, Any]:
	        """Scan for the configured restriction enzymes.

	        ``restriction_enzymes_present`` lists the keys of the hit mapping
	        returned by scan_restriction_sites.
	        """
	        hits = scan_restriction_sites(sequence, self.restriction_enzymes)
	        return {
	            "restriction_hits": hits,
	            "restriction_enzymes_present": list(hits.keys()),
	        }

	    def validate_cds(self, cds: str) -> Dict[str, Any]:
	        """Check a CDS for a start codon, a stop codon and reading frame.

	        The input is upper-cased and ``U`` is replaced with ``T`` before
	        checking, so RNA and DNA spellings are treated alike.

	        Returns:
	            A dict with:
	            ``has_start_codon`` - the first three bases are ``ATG``;
	            ``has_stop_codon`` - the last three bases are TAA, TAG or TGA;
	            ``stop_codon`` - that trailing codon (in DNA spelling) or None;
	            ``in_frame`` - the length is a multiple of three (an empty
	            string therefore counts as in frame).
	        """
	        seq = cds.upper().replace("U", "T")
	        start_codons = {"ATG"}
	        stop_codons = {"TAA", "TAG", "TGA"}

	        # Sequences shorter than one codon can have neither a start nor a stop.
	        long_enough = len(seq) >= 3
	        has_start = long_enough and seq[:3] in start_codons
	        stop = seq[-3:] if long_enough else ""
	        has_stop = stop in stop_codons
	        return {
	            "has_start_codon": has_start,
	            "has_stop_codon": has_stop,
	            "stop_codon": stop if has_stop else None,
	            "in_frame": len(seq) % 3 == 0,
	        }

	    def analyze_kozak(self, sequence: str) -> Dict[str, Any]:
	        """Run the Kozak check.

	        A ValueError from check_kozak is turned into a ``kozak_warning``
	        message with ``kozak`` set to None; other exceptions propagate.
	        """
	        try:
	            result = check_kozak(sequence)
	        except ValueError as e:
	            return {"kozak": None, "kozak_warning": str(e)}
	        return {"kozak": result}

	    def analyze_structure(self, sequence: str) -> Dict[str, Any]:
	        """Return the predict_structure result under the ``structure`` key."""
	        return {"structure": predict_structure(sequence)}

	    # ── Full report ─────────────────────────────────────────────────────────

	    def run_full_analysis(
	        self,
	        seq: mRNASequence,
	        include_structure: bool = True,
	        force_rerun: bool = False,
	    ) -> AnalysisReport:
	        """
	        Run all analysis modules against seq and return an AnalysisReport.

	        Results are cached in seq._analysis_cache under "full_analysis".
	        Set force_rerun=True to bypass the cache. A cached report that has
	        no structure result is not reused when include_structure is True;
	        the analysis is re-run so the caller gets the structure it asked
	        for, and the cache entry is replaced.

	        Args:
	            seq: Sequence to analyse. Its assembled sequence is used for
	                the whole-sequence analyses; its cds, kozak and UTR
	                components are used where available.
	            include_structure: Whether to run structure prediction.
	            force_rerun: Ignore any cached report.

	        Returns:
	            The (possibly cached) AnalysisReport.
	        """
	        cache_key = "full_analysis"
	        if not force_rerun and cache_key in seq._analysis_cache:
	            cached = seq._analysis_cache[cache_key]
	            # A report cached by an include_structure=False call carries no
	            # structure; only reuse it if it can satisfy this request.
	            if not include_structure or cached.structure is not None:
	                return cached  # type: ignore[return-value]

	        full_seq = seq.assembled_sequence
	        warnings: List[str] = []

	        report = AnalysisReport(
	            sequence_id=seq.id,
	            sequence_name=seq.name,
	            sequence_length=len(full_seq),
	        )

	        # GC content — run on full sequence
	        gc_data = self.analyze_gc(full_seq)
	        report.gc_percent_global = gc_data["gc_percent_global"]
	        report.gc_sliding_positions = gc_data["gc_sliding_positions"]
	        report.gc_sliding_values = gc_data["gc_sliding_values"]

	        # GC by codon position — only if an in-frame CDS is available
	        if seq.cds and len(seq.cds) % 3 == 0:
	            try:
	                report.gc_by_codon_position = gc_by_codon_position(seq.cds)
	            except Exception as e:
	                warnings.append(f"GC by codon position failed: {e}")

	        if seq.cds:
	            # CAI — only on CDS
	            cai_data = self.analyze_cai(seq.cds)
	            report.cai = cai_data.get("cai")
	            report.cai_organism = cai_data.get("cai_organism")
	            report.codon_usage = cai_data.get("codon_usage")
	            if "error" in cai_data:
	                warnings.append(f"CAI error: {cai_data['error']}")

	            # CDS validation — kept under the same guard so a sequence
	            # without a CDS leaves these fields as None instead of failing.
	            cds_data = self.validate_cds(seq.cds)
	            report.has_start_codon = cds_data["has_start_codon"]
	            report.has_stop_codon = cds_data["has_stop_codon"]
	            report.stop_codon = cds_data["stop_codon"]
	            report.in_frame = cds_data["in_frame"]

	            if not report.has_start_codon:
	                warnings.append("CDS does not begin with ATG.")
	            if not report.has_stop_codon:
	                warnings.append("CDS does not end with a stop codon.")
	            if not report.in_frame:
	                warnings.append("CDS length is not divisible by 3.")

	        # Homopolymers — full assembled sequence
	        hp_data = self.analyze_homopolymers(full_seq)
	        report.homopolymer_runs = hp_data["homopolymer_runs"]
	        report.homopolymer_count = hp_data["homopolymer_count"]
	        report.longest_homopolymer = hp_data["longest_homopolymer"]

	        # Restriction sites
	        rs_data = self.analyze_restriction_sites(full_seq)
	        report.restriction_hits = rs_data["restriction_hits"]
	        report.restriction_enzymes_present = rs_data["restriction_enzymes_present"]

	        # Kozak — use the kozak component if set, else fall back to full sequence
	        kozak_seq = seq.kozak or full_seq
	        kozak_data = self.analyze_kozak(kozak_seq)
	        report.kozak = kozak_data.get("kozak")
	        if "kozak_warning" in kozak_data:
	            warnings.append(kozak_data["kozak_warning"])

	        # Secondary structure
	        if include_structure:
	            struct_data = self.analyze_structure(full_seq)
	            report.structure = struct_data["structure"]

	        # Uridine content (immunogenicity proxy)
	        report.uridine = analyze_uridine(full_seq)

	        # Sequence-liability motifs (region-aware when components are available)
	        report.motif_hits = scan_motifs(
	            five_prime_utr=seq.five_prime_utr,
	            cds=seq.cds,
	            three_prime_utr=seq.three_prime_utr,
	            full_seq=full_seq,
	        )

	        # Warnings are attached before the liability step, which receives
	        # the whole report.
	        report.warnings = warnings

	        # Aggregate everything into the liability / QC assessment
	        report.liability = assess_liabilities(report, seq)

	        # Cache result
	        seq._analysis_cache[cache_key] = report
	        return report