| """ |
| Liability / QC aggregator. |
| |
| Rolls the individual analysis results (GC, homopolymers, restriction sites, |
| uridine, CDS validation, Kozak, secondary structure, sequence motifs) into a |
| single severity-ranked liability report with an overall QC score (0–100) and a |
| pass / review / fail verdict — analogous to a developability/liability overlay. |
| |
| This module is a pure aggregator: it reads attributes off an already-computed |
| analysis report (duck-typed) and the sequence object, so it imports neither the |
| analyzer nor the Panel UI. It only depends on the homopolymer detector (to |
| re-scan the construct *body*, excluding the legitimate poly-A tail). |
| """ |
| from __future__ import annotations |
|
|
| from dataclasses import dataclass, field |
| from typing import Any, List, Optional |
|
|
| from core.analysis.homopolymers import detect_homopolymers |
|
|
| |
# Severity labels carried on every flag.
CRITICAL = "critical"
WARNING = "warning"
INFO = "info"

# Sort rank per severity (lower sorts first) and the number of points each
# flag of that severity removes from the 100-point QC score.
_SEVERITY_ORDER = {
    CRITICAL: 0,
    WARNING: 1,
    INFO: 2,
}
_PENALTY = {
    CRITICAL: 25,
    WARNING: 10,
    INFO: 3,
}

# Global GC %: outside LOW..HIGH is a warning, outside the *_CRIT pair is critical.
_GC_LOW = 40.0
_GC_HIGH = 65.0
_GC_LOW_CRIT = 30.0
_GC_HIGH_CRIT = 70.0
# Homopolymer run length (nt) in the construct body: warn / critical.
_HOMOPOLYMER_WARN = 10
_HOMOPOLYMER_CRIT = 15
# Uridine percentage at or above which a warning is raised.
_URIDINE_WARN_PCT = 40.0
# MFE per nucleotide (kcal/mol/nt) below which an info flag is raised.
_MFE_PER_NT_INFO = -0.45
|
|
|
|
@dataclass
class LiabilityFlag:
    """One finding raised by a liability check.

    ``id``, ``category``, ``severity``, ``title`` and ``detail`` are required;
    ``location`` and ``recommendation`` default to empty strings.
    """

    # Identifier of the flag within a report.
    id: str
    # Check family that produced the flag (e.g. "GC", "CDS", "Motif").
    category: str
    # Severity label: "critical", "warning" or "info".
    severity: str
    # Short headline for the finding.
    title: str
    # Longer human-readable explanation.
    detail: str
    # Where in the construct the finding applies; empty when not given.
    location: str = ""
    # Suggested remediation; empty when not given.
    recommendation: str = ""
|
|
|
|
@dataclass
class LiabilityReport:
    """Outcome of a liability assessment for a single sequence.

    A freshly constructed report is empty: no flags, a score of 100, a
    "pass" verdict and all counters at zero.
    """

    # Individual findings, in the order they were raised.
    flags: List[LiabilityFlag] = field(default_factory=list)
    # Overall QC score; defaults to 100.
    score: int = 100
    # Overall verdict string; defaults to "pass".
    verdict: str = "pass"
    # Number of flags per severity.
    n_critical: int = 0
    n_warning: int = 0
    n_info: int = 0
    # Number of checks counted as run.
    checks_run: int = 0

    @property
    def flag_count(self) -> int:
        """Total number of flags held by this report."""
        return len(self.flags)

    def sorted_flags(self) -> List[LiabilityFlag]:
        """Return a new list of the flags ordered by severity rank.

        Flags whose severity is not in ``_SEVERITY_ORDER`` get rank 9 and so
        sort last; the sort is stable, so ties keep their original order.
        """
        def severity_rank(flag):
            return _SEVERITY_ORDER.get(flag.severity, 9)

        ordered = list(self.flags)
        ordered.sort(key=severity_rank)
        return ordered
|
|
|
|
def _body_sequence(seq: Any) -> str:
    """Return the construct body as one upper-case string with U mapped to T.

    The body is the concatenation, in order, of ``five_prime_utr``, ``kozak``,
    ``cds`` and ``three_prime_utr`` read off ``seq``. Attributes that are
    missing, ``None`` or empty are skipped. No other attribute is read, so a
    poly-A tail stored elsewhere on ``seq`` is not part of the result.
    """
    segments = []
    for attr_name in ("five_prime_utr", "kozak", "cds", "three_prime_utr"):
        segment = getattr(seq, attr_name, None)
        if segment:
            segments.append(segment)
    joined = "".join(segments)
    return joined.upper().replace("U", "T")
|
|
|
|
def assess_liabilities(report: Any, seq: Any) -> LiabilityReport:
    """
    Build a LiabilityReport from an analysis ``report`` and its ``seq``.

    ``report`` is duck-typed: it is expected to expose the attributes set by
    SequenceAnalyzer (gc_percent_global, restriction_enzymes_present, uridine,
    has_start_codon/has_stop_codon/in_frame, kozak, structure, motif_hits).
    Every attribute is read with ``getattr`` and a default, so a missing
    attribute simply means the corresponding check raises no flag.

    ``seq`` supplies the construct regions (five_prime_utr, kozak, cds,
    three_prime_utr) used for the body homopolymer re-scan.

    Scoring: each flag subtracts the ``_PENALTY`` of its severity from 100 and
    the result is clamped to 0..100. The verdict is "fail" if any flag is
    critical, otherwise "review" if any flag is a warning, otherwise "pass".

    ``checks_run`` always counts the GC, homopolymer, restriction-site and
    uridine checks; the CDS, Kozak, structure and motif checks are counted
    only when the report carries data for them.
    """
    flags: List[LiabilityFlag] = []
    checks = 0

    def add(category, severity, title, detail, location="", recommendation=""):
        # The id is "<category>-<n>", n being the flag's index in the list.
        flags.append(LiabilityFlag(
            id=f"{category.lower()}-{len(flags)}",
            category=category, severity=severity, title=title,
            detail=detail, location=location, recommendation=recommendation,
        ))

    # --- GC content -------------------------------------------------------
    checks += 1
    gc = getattr(report, "gc_percent_global", None)
    # None or a non-positive value is treated as "no GC data": no flag.
    if gc is not None and gc > 0:
        optimal = f"{_GC_LOW:.0f}–{_GC_HIGH:.0f}%"
        gc_detail = f"Global GC is {gc:.1f}% (optimal {optimal})."
        if gc < _GC_LOW_CRIT or gc > _GC_HIGH_CRIT:
            add("GC", CRITICAL, "GC content far outside optimal range",
                gc_detail,
                "global",
                "Re-balance GC; extremes hurt synthesis, translation, and stability.")
        elif gc < _GC_LOW or gc > _GC_HIGH:
            add("GC", WARNING, "GC content outside optimal range",
                gc_detail,
                "global",
                # Built from the thresholds so it cannot drift from the detail text.
                f"Nudge GC toward {optimal} during codon optimisation.")

    # --- Homopolymers in the construct body (poly-A tail excluded) --------
    checks += 1
    body = _body_sequence(seq)
    if body:
        body_runs = detect_homopolymers(body, min_run=_HOMOPOLYMER_WARN)
        if body_runs:
            # Severity is driven by the single longest run.
            longest = max(body_runs, key=lambda r: r.length)
            sev = CRITICAL if longest.length >= _HOMOPOLYMER_CRIT else WARNING
            add("Homopolymer", sev, "Homopolymer run in construct body",
                f"{len(body_runs)} run(s) ≥{_HOMOPOLYMER_WARN} nt; longest "
                f"{longest.nucleotide}×{longest.length}.",
                f"body pos {longest.start}",
                "Break up long single-base runs to avoid synthesis errors and "
                "polymerase slippage.")

    # --- Restriction sites ------------------------------------------------
    checks += 1
    enzymes = getattr(report, "restriction_enzymes_present", None) or []
    if enzymes:
        add("Restriction", WARNING, "Internal restriction sites present",
            f"Sites for: {', '.join(sorted(enzymes))}.",
            "construct",
            "Remove internal sites or pick a cloning strategy that avoids them.")

    # --- Uridine content --------------------------------------------------
    checks += 1
    uri = getattr(report, "uridine", None)
    if uri is not None:
        stretches = getattr(uri, "high_u_stretches", []) or []
        # A None percentage is treated as 0 so the comparison and the format
        # spec below cannot raise TypeError.
        u_pct = getattr(uri, "u_percent", 0.0) or 0.0
        if u_pct >= _URIDINE_WARN_PCT or stretches:
            detail = f"Uridine {u_pct:.1f}%"
            if stretches:
                detail += f"; {len(stretches)} high-U stretch(es)"
            add("Uridine", WARNING, "Elevated uridine content",
                detail + ".",
                "construct",
                "High U is immunostimulatory — optimise sequence and/or use "
                "modified nucleotides (e.g. N1-methylpseudouridine).")

    # --- CDS validation ---------------------------------------------------
    # The whole CDS group is gated on has_start_codon being set on the report.
    if getattr(report, "has_start_codon", None) is not None:
        checks += 1
        # "is False" so that None (unknown) never raises a flag.
        if report.has_start_codon is False:
            add("CDS", CRITICAL, "CDS missing start codon",
                "CDS does not begin with ATG.", "CDS 5' end",
                "Ensure the CDS starts with ATG.")
        if getattr(report, "has_stop_codon", None) is False:
            add("CDS", CRITICAL, "CDS missing stop codon",
                "CDS does not end with a stop codon.", "CDS 3' end",
                "Append a stop codon (TAA/TAG/TGA).")
        if getattr(report, "in_frame", None) is False:
            add("CDS", CRITICAL, "CDS not in frame",
                "CDS length is not divisible by 3.", "CDS",
                "Fix indels so the CDS length is a multiple of 3.")

    # --- Kozak context ----------------------------------------------------
    kz = getattr(report, "kozak", None)
    if kz is not None:
        checks += 1
        strength = getattr(kz, "strength", None)
        # A None score is formatted as 0 instead of raising TypeError.
        kozak_score = getattr(kz, "score", 0) or 0
        if strength == "weak":
            add("Kozak", WARNING, "Weak Kozak context",
                f"Kozak score {kozak_score:.2f} (weak).",
                "around start codon",
                "Strengthen Kozak: purine (A/G) at -3 and G at +4.")
        elif strength == "adequate":
            add("Kozak", INFO, "Sub-optimal Kozak context",
                f"Kozak score {kozak_score:.2f} (adequate).",
                "around start codon",
                "Optional: optimise -3/+4 positions for stronger initiation.")

    # --- Secondary structure ----------------------------------------------
    struct = getattr(report, "structure", None)
    # A structure result without an is_stub attribute is treated as a stub.
    if struct is not None and not getattr(struct, "is_stub", True):
        checks += 1
        # Clamp to 1 so a missing/empty sequence cannot divide by zero.
        length = max(len(getattr(struct, "sequence", "") or ""), 1)
        # A None MFE is treated as 0.0 (no flag) instead of raising TypeError.
        mfe = getattr(struct, "mfe", 0.0) or 0.0
        per_nt = mfe / length
        if per_nt < _MFE_PER_NT_INFO:
            add("Structure", INFO, "Highly structured mRNA",
                f"MFE {mfe:.1f} kcal/mol ({per_nt:.2f}/nt).",
                "global",
                "Strong structure (esp. near the 5' cap/start) can impede "
                "translation initiation.")

    # --- Sequence motifs --------------------------------------------------
    motif_hits = getattr(report, "motif_hits", None) or []
    if motif_hits:
        checks += 1
        # One flag per motif name, carrying the worst severity in its group.
        by_name: dict = {}
        for h in motif_hits:
            by_name.setdefault(h.name, []).append(h)
        for name, group in by_name.items():
            first = group[0]
            sev = min((h.severity for h in group),
                      key=lambda s: _SEVERITY_ORDER.get(s, 9))
            # List at most five positions; summarise the remainder.
            positions = ", ".join(f"{h.region}:{h.start}" for h in group[:5])
            if len(group) > 5:
                positions += f" (+{len(group) - 5} more)"
            add("Motif", sev, first.label,
                f"{len(group)} occurrence(s). {first.description}",
                positions,
                first.recommendation)

    # --- Tally, score and verdict -----------------------------------------
    n_crit = sum(1 for f in flags if f.severity == CRITICAL)
    n_warn = sum(1 for f in flags if f.severity == WARNING)
    n_info = sum(1 for f in flags if f.severity == INFO)

    # Flags with a severity outside _PENALTY contribute no penalty.
    penalty = sum(_PENALTY.get(f.severity, 0) for f in flags)
    score = max(0, min(100, 100 - penalty))

    if n_crit > 0:
        verdict = "fail"
    elif n_warn > 0:
        verdict = "review"
    else:
        verdict = "pass"

    return LiabilityReport(
        flags=flags, score=score, verdict=verdict,
        n_critical=n_crit, n_warning=n_warn, n_info=n_info,
        checks_run=checks,
    )
|
|