mrna-design-studio / core /analysis /liability.py
offtargeteffect's picture
Add liability/QC, cluster & tree, and experiment tracking
bdd3f19 verified
Raw
History Blame Contribute Delete
10.9 kB
"""
Liability / QC aggregator.
Rolls the individual analysis results (GC, homopolymers, restriction sites,
uridine, CDS validation, Kozak, secondary structure, sequence motifs) into a
single severity-ranked liability report with an overall QC score (0–100) and a
pass / review / fail verdict — analogous to a developability/liability overlay.
This module is a pure aggregator: it reads attributes off an already-computed
analysis report (duck-typed) and the sequence object, so it imports neither the
analyzer nor the Panel UI. It only depends on the homopolymer detector (to
re-scan the construct *body*, excluding the legitimate poly-A tail).
"""
from __future__ import annotations
from dataclasses import dataclass, field
from typing import Any, List, Optional
from core.analysis.homopolymers import detect_homopolymers
# Severity labels, listed from most to least severe.
CRITICAL = "critical"
WARNING = "warning"
INFO = "info"

# Sort rank for each severity label; a lower rank sorts earlier.
_SEVERITY_ORDER = {
    CRITICAL: 0,
    WARNING: 1,
    INFO: 2,
}

# Points charged against the 0-100 QC score for each flag of a given severity.
_PENALTY = {
    CRITICAL: 25,
    WARNING: 10,
    INFO: 3,
}

# Thresholds
# Global GC percent: values outside [_GC_LOW, _GC_HIGH] fall in the warning
# band, values outside [_GC_LOW_CRIT, _GC_HIGH_CRIT] fall in the critical band.
_GC_LOW = 40.0
_GC_HIGH = 65.0
_GC_LOW_CRIT = 30.0
_GC_HIGH_CRIT = 70.0

_HOMOPOLYMER_WARN = 10  # body run length → warning
_HOMOPOLYMER_CRIT = 15  # body run length → critical

_URIDINE_WARN_PCT = 40.0  # uridine percent at or above this → warning

_MFE_PER_NT_INFO = -0.45  # very negative → highly structured
@dataclass
class LiabilityFlag:
    """One liability finding raised against a construct.

    Fields are declared in constructor order. The first five are required;
    ``location`` and ``recommendation`` default to the empty string.
    """

    # Identifier string for this finding.
    id: str
    # Check family: "GC", "Homopolymer", "Restriction", "Uridine",
    # "CDS", "Kozak", "Structure" or "Motif".
    category: str
    # One of the severity labels: CRITICAL | WARNING | INFO.
    severity: str
    # Short headline for the finding.
    title: str
    # Longer explanation of what was observed.
    detail: str
    # Human-readable location, e.g. "CDS pos 123".
    location: str = ""
    # Suggested remediation text.
    recommendation: str = ""
@dataclass
class LiabilityReport:
    """Roll-up of every liability flag raised for a single sequence."""

    # All findings, in the order they were raised.
    flags: List[LiabilityFlag] = field(default_factory=list)
    # QC score on a 0-100 scale; higher is cleaner.
    score: int = 100
    # Overall outcome: "pass" | "review" | "fail".
    verdict: str = "pass"
    # Per-severity flag tallies.
    n_critical: int = 0
    n_warning: int = 0
    n_info: int = 0
    # Number of checks that were counted as run.
    checks_run: int = 0

    @property
    def flag_count(self) -> int:
        """Total number of flags held, regardless of severity."""
        return len(self.flags)

    def sorted_flags(self) -> List[LiabilityFlag]:
        """Return a new list of the flags ordered by severity rank.

        Rank comes from ``_SEVERITY_ORDER``; a severity missing from that
        table is ranked 9, so it sorts after every known severity. The sort
        is stable, so flags of equal rank keep their original relative order.
        ``self.flags`` itself is not modified.
        """
        def rank(flag: LiabilityFlag) -> int:
            return _SEVERITY_ORDER.get(flag.severity, 9)

        ordered = list(self.flags)
        ordered.sort(key=rank)
        return ordered
def _body_sequence(seq: Any) -> str:
    """Return the construct body as one upper-case DNA string.

    The body is the concatenation, in this order, of the ``five_prime_utr``,
    ``kozak``, ``cds`` and ``three_prime_utr`` attributes of ``seq``. An
    attribute that is missing or falsy (``None``, empty string) is skipped.
    No other attribute is read, so a poly-A tail stored elsewhere on ``seq``
    is left out. The result is upper-cased and every ``U`` becomes ``T``.
    """
    region_names = ("five_prime_utr", "kozak", "cds", "three_prime_utr")
    pieces = []
    for region in region_names:
        fragment = getattr(seq, region, None)
        if fragment:
            pieces.append(fragment)
    joined = "".join(pieces)
    return joined.upper().replace("U", "T")
def assess_liabilities(report: Any, seq: Any) -> LiabilityReport:
    """
    Build a LiabilityReport from an analysis ``report`` and its ``seq``.

    ``report`` is duck-typed: it is expected to expose the attributes set by
    SequenceAnalyzer (gc_percent_global, restriction_enzymes_present, uridine,
    has_start_codon/has_stop_codon/in_frame, kozak, structure, motif_hits).
    Every attribute is read with ``getattr`` and a fallback, so a report that
    lacks one of them simply produces no flag for that check.

    Args:
        report: Already-computed analysis result (see above).
        seq: Sequence object; its 5' UTR, Kozak, CDS and 3' UTR attributes are
            joined by ``_body_sequence`` and re-scanned for homopolymer runs.

    Returns:
        A LiabilityReport holding the flags in the order they were raised,
        the per-severity counts, the number of checks counted as run, a score
        and a verdict. The score is 100 minus the summed ``_PENALTY`` of all
        flags, clamped to 0-100. The verdict is "fail" if any flag is
        critical, otherwise "review" if any flag is a warning, otherwise
        "pass".
    """
    flags: List[LiabilityFlag] = []
    checks = 0

    def add(category, severity, title, detail, location="", recommendation=""):
        # The id suffix is the flag's index in the list, so ids are unique
        # within one report.
        flags.append(LiabilityFlag(
            id=f"{category.lower()}-{len(flags)}",
            category=category, severity=severity, title=title,
            detail=detail, location=location, recommendation=recommendation,
        ))

    # ── GC content ───────────────────────────────────────────────────────
    checks += 1
    gc = getattr(report, "gc_percent_global", None)
    # NOTE(review): a GC of exactly 0 is skipped like None - presumably 0
    # means "not computed"; confirm against the analyzer.
    if gc is not None and gc > 0:
        if gc < _GC_LOW_CRIT or gc > _GC_HIGH_CRIT:
            add("GC", CRITICAL, "GC content far outside optimal range",
                f"Global GC is {gc:.1f}% (optimal {_GC_LOW:.0f}–{_GC_HIGH:.0f}%).",
                "global",
                "Re-balance GC; extremes hurt synthesis, translation, and stability.")
        elif gc < _GC_LOW or gc > _GC_HIGH:
            add("GC", WARNING, "GC content outside optimal range",
                f"Global GC is {gc:.1f}% (optimal {_GC_LOW:.0f}–{_GC_HIGH:.0f}%).",
                "global",
                "Nudge GC toward 40–65% during codon optimisation.")

    # ── Homopolymers in the body (exclude poly-A tail) ───────────────────
    checks += 1
    body = _body_sequence(seq)
    if body:
        body_runs = detect_homopolymers(body, min_run=_HOMOPOLYMER_WARN)
        if body_runs:
            # One flag summarises all runs; severity follows the longest one.
            longest = max(body_runs, key=lambda r: r.length)
            sev = CRITICAL if longest.length >= _HOMOPOLYMER_CRIT else WARNING
            add("Homopolymer", sev, "Homopolymer run in construct body",
                f"{len(body_runs)} run(s) ≥{_HOMOPOLYMER_WARN} nt; longest "
                f"{longest.nucleotide}×{longest.length}.",
                f"body pos {longest.start}",
                "Break up long single-base runs to avoid synthesis errors and "
                "polymerase slippage.")

    # ── Restriction sites ────────────────────────────────────────────────
    checks += 1
    enzymes = getattr(report, "restriction_enzymes_present", None) or []
    if enzymes:
        add("Restriction", WARNING, "Internal restriction sites present",
            f"Sites for: {', '.join(sorted(enzymes))}.",
            "construct",
            "Remove internal sites or pick a cloning strategy that avoids them.")

    # ── Uridine content / high-U stretches ───────────────────────────────
    checks += 1
    uri = getattr(report, "uridine", None)
    if uri is not None:
        stretches = getattr(uri, "high_u_stretches", []) or []
        u_pct = getattr(uri, "u_percent", 0.0)
        if u_pct >= _URIDINE_WARN_PCT or stretches:
            detail = f"Uridine {u_pct:.1f}%"
            if stretches:
                detail += f"; {len(stretches)} high-U stretch(es)"
            add("Uridine", WARNING, "Elevated uridine content",
                detail + ".",
                "construct",
                "High U is immunostimulatory — optimise sequence and/or use "
                "modified nucleotides (e.g. N1-methylpseudouridine).")

    # ── CDS integrity ────────────────────────────────────────────────────
    # The CDS check is counted only when has_start_codon is populated; the
    # stop-codon and frame findings below are still reported on their own.
    if getattr(report, "has_start_codon", None) is not None:
        checks += 1
        if report.has_start_codon is False:
            add("CDS", CRITICAL, "CDS missing start codon",
                "CDS does not begin with ATG.", "CDS 5' end",
                "Ensure the CDS starts with ATG.")
    if getattr(report, "has_stop_codon", None) is False:
        add("CDS", CRITICAL, "CDS missing stop codon",
            "CDS does not end with a stop codon.", "CDS 3' end",
            "Append a stop codon (TAA/TAG/TGA).")
    if getattr(report, "in_frame", None) is False:
        add("CDS", CRITICAL, "CDS not in frame",
            "CDS length is not divisible by 3.", "CDS",
            "Fix indels so the CDS length is a multiple of 3.")

    # ── Kozak context ────────────────────────────────────────────────────
    kz = getattr(report, "kozak", None)
    if kz is not None:
        checks += 1
        strength = getattr(kz, "strength", None)
        if strength == "weak":
            add("Kozak", WARNING, "Weak Kozak context",
                f"Kozak score {getattr(kz, 'score', 0):.2f} (weak).",
                "around start codon",
                "Strengthen Kozak: purine (A/G) at -3 and G at +4.")
        elif strength == "adequate":
            add("Kozak", INFO, "Sub-optimal Kozak context",
                f"Kozak score {getattr(kz, 'score', 0):.2f} (adequate).",
                "around start codon",
                "Optional: optimise -3/+4 positions for stronger initiation.")

    # ── Secondary structure (if computed) ────────────────────────────────
    struct = getattr(report, "structure", None)
    # A missing is_stub attribute is treated as a stub, i.e. not computed.
    if struct is not None and not getattr(struct, "is_stub", True):
        checks += 1
        # Clamp the length to 1 so an empty sequence cannot divide by zero.
        length = max(len(getattr(struct, "sequence", "") or ""), 1)
        per_nt = getattr(struct, "mfe", 0.0) / length
        if per_nt < _MFE_PER_NT_INFO:
            add("Structure", INFO, "Highly structured mRNA",
                f"MFE {struct.mfe:.1f} kcal/mol ({per_nt:.2f}/nt).",
                "global",
                "Strong structure (esp. near the 5' cap/start) can impede "
                "translation initiation.")

    # ── Sequence motifs ──────────────────────────────────────────────────
    motif_hits = getattr(report, "motif_hits", None)
    if motif_hits is not None:
        # Count the scan whenever the attribute is populated, including an
        # empty result, so a clean sequence reports the same number of checks
        # as one that has hits.
        checks += 1
        # Group hits by motif name; one flag is raised per motif.
        by_name: dict = {}
        for h in motif_hits:
            by_name.setdefault(h.name, []).append(h)
        for name, group in by_name.items():
            first = group[0]
            # The flag takes the most severe level found among the hits.
            sev = min((h.severity for h in group),
                      key=lambda s: _SEVERITY_ORDER.get(s, 9))
            # List at most five positions and summarise the remainder.
            positions = ", ".join(f"{h.region}:{h.start}" for h in group[:5])
            if len(group) > 5:
                positions += f" (+{len(group) - 5} more)"
            add("Motif", sev, first.label,
                f"{len(group)} occurrence(s). {first.description}",
                positions,
                first.recommendation)

    # ── Score & verdict ──────────────────────────────────────────────────
    n_crit = sum(1 for f in flags if f.severity == CRITICAL)
    n_warn = sum(1 for f in flags if f.severity == WARNING)
    n_info = sum(1 for f in flags if f.severity == INFO)
    # A severity missing from _PENALTY costs nothing.
    penalty = sum(_PENALTY.get(f.severity, 0) for f in flags)
    score = max(0, min(100, 100 - penalty))
    if n_crit > 0:
        verdict = "fail"
    elif n_warn > 0:
        verdict = "review"
    else:
        verdict = "pass"

    return LiabilityReport(
        flags=flags, score=score, verdict=verdict,
        n_critical=n_crit, n_warning=n_warn, n_info=n_info,
        checks_run=checks,
    )