# filter.py
"""
FunGO — Smart Tier Filtering
==============================
Drops generic/root GO terms (blacklist) and labels every remaining
prediction with an evidence tier.

Changes from original:
  1. Tier names updated:
       GOLD   → STRONG     (Strong Evidence)
       GOOD   → MODERATE   (Moderate Evidence)
       SILVER → INDICATIVE
  2. Combined score = ia_weight × confidence.
     Used for ranking.
  3. filter_predictions() returns a dict with two keys:
       "display" — top 20 by combined score (for UI screen)
       "all"     — full filtered list (for CSV download)
  4. summarise() updated to use the new tier keys.
  5. Blacklist + IA/confidence thresholds → completely unchanged.
"""

import logging

from config import (
    BLACKLIST_TERMS,
    TIER_GOLD_IA, TIER_GOLD_CONF,
    TIER_GOOD_IA, TIER_GOOD_CONF,
    TIER_SILVER_IA, TIER_SILVER_CONF,
)

logger = logging.getLogger(__name__)

# Ontology code → human-readable name.
ONT_LABELS = {
    "MFO": "Molecular Function",
    "BPO": "Biological Process",
    "CCO": "Cellular Component",
}

# Tier key → label attached to each kept prediction as "tier_label".
TIER_LABELS = {
    "STRONG": "Strong Evidence",
    "MODERATE": "Moderate Evidence",
    "INDICATIVE": "Indicative",
}

# Lower rank = stronger tier; used as the sort tiebreaker.
TIER_RANK = {"STRONG": 0, "MODERATE": 1, "INDICATIVE": 2}

# Max predictions shown on screen per protein
TOP_N_DISPLAY = 20


def assign_tier(go_term: str, ia: float, confidence: float) -> str:
    """
    Map one prediction to its evidence tier.

    A blacklisted term is always "NOISE". Otherwise the tiers are tried in
    the order STRONG, MODERATE, INDICATIVE and the first one matches whose
    IA threshold is strictly exceeded and whose confidence threshold is met
    or exceeded. A prediction matching no tier is "NOISE" as well.

    Returns:
        "STRONG" | "MODERATE" | "INDICATIVE" | "NOISE"
    """
    if go_term in BLACKLIST_TERMS:
        return "NOISE"

    # Built per call so the thresholds are read when the function runs.
    ladder = (
        ("STRONG", TIER_GOLD_IA, TIER_GOLD_CONF),
        ("MODERATE", TIER_GOOD_IA, TIER_GOOD_CONF),
        ("INDICATIVE", TIER_SILVER_IA, TIER_SILVER_CONF),
    )
    for label, ia_floor, conf_floor in ladder:
        if ia > ia_floor and confidence >= conf_floor:
            return label
    return "NOISE"


def combined_score(ia: float, confidence: float) -> float:
    """
    Ranking score: ia × confidence, rounded to 6 decimal places.
    """
    product = ia * confidence
    return round(product, 6)


def filter_predictions(raw_predictions: list, ia_weights: dict) -> dict:
    """
    Drop NOISE predictions, annotate the rest, and rank them.

    Each entry of ``raw_predictions`` is read through its "go_term",
    "confidence" and "ontology" keys. The IA weight of a term comes from
    ``ia_weights``; a term without an entry gets 0.0.

    Returns
    -------
    {
        "display": first TOP_N_DISPLAY entries of the ranked list,
        "all":     the full ranked list,
    }

    Every kept prediction is a copy of the input dict extended with:
        ia_weight (rounded to 4 places), combined_score, tier, tier_rank,
        tier_label, ontology_label

    Ranking is by combined_score descending, then tier_rank ascending.
    """
    kept = []
    for raw in raw_predictions:
        term = raw["go_term"]
        conf = raw["confidence"]
        weight = float(ia_weights.get(term, 0.0))

        tier = assign_tier(term, weight, conf)
        if tier == "NOISE":
            continue
        if tier not in TIER_RANK:
            # Guards against assign_tier and TIER_RANK drifting apart.
            logger.warning("Unknown tier %r for %s — skipping", tier, term)
            continue

        # Scored with the unrounded weight; only the stored ia_weight is
        # rounded.
        score = combined_score(weight, conf)

        # Copy first so the derived fields are appended after (or
        # overwrite) the fields of the raw prediction.
        entry = dict(raw)
        entry["ia_weight"] = round(weight, 4)
        entry["combined_score"] = score
        entry["tier"] = tier
        entry["tier_rank"] = TIER_RANK[tier]
        entry["tier_label"] = TIER_LABELS[tier]
        ontology_code = raw["ontology"]
        # Unknown ontology codes are shown as-is.
        entry["ontology_label"] = ONT_LABELS.get(ontology_code, ontology_code)
        kept.append(entry)

    def ranking_key(entry):
        # Negated score gives descending order; tier_rank breaks ties.
        return (-entry["combined_score"], entry["tier_rank"])

    ranked = sorted(kept, key=ranking_key)
    return {
        "display": ranked[:TOP_N_DISPLAY],
        "all": ranked,
    }


def summarise(filtered_display: list, all_filtered: list, protein_id: str) -> dict:
    """
    Build the per-protein summary.

    Counts and averages are taken over ``all_filtered`` (not just the
    displayed subset); ``filtered_display`` only contributes its length.
    Averages are rounded to 4 decimal places and are 0.0 when
    ``all_filtered`` is empty.
    """
    by_ontology = {"MFO": 0, "BPO": 0, "CCO": 0}
    by_tier = {"STRONG": 0, "MODERATE": 0, "INDICATIVE": 0}

    for item in all_filtered:
        # Values outside the known keys are simply not counted.
        ontology = item.get("ontology", "")
        if ontology in by_ontology:
            by_ontology[ontology] += 1
        tier = item.get("tier", "")
        if tier in by_tier:
            by_tier[tier] += 1

    total = len(all_filtered)

    def mean_of(field):
        # Average of one numeric field across all filtered predictions.
        if not total:
            return 0.0
        return round(sum(item[field] for item in all_filtered) / total, 4)

    return {
        "protein_id": protein_id,
        "total_filtered": total,
        "displayed": len(filtered_display),
        "by_ontology": by_ontology,
        "by_tier": by_tier,
        "has_strong_evidence": by_tier["STRONG"] > 0,
        "avg_confidence": mean_of("confidence"),
        "avg_ia": mean_of("ia_weight"),
        "avg_combined_score": mean_of("combined_score"),
    }