VyLala commited on
Commit
9d45ea6
·
verified ·
1 Parent(s): ec32cea

Upload confidence_score.py

Browse files
Files changed (1) hide show
  1. confidence_score.py +263 -0
confidence_score.py ADDED
@@ -0,0 +1,263 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from typing import Dict, Any, Tuple, List, Optional
2
+ import standardize_location
3
+
4
def set_rules() -> Dict[str, Any]:
    """
    Build the rulebook of weights, penalties and tier cutoffs for the
    confidence score.

    V1 design principles:
    - Interpretability > mathematical purity
    - Conservative > aggressive
    - Explainable > comprehensive
    """
    direct_evidence = {
        # Accession explicitly linked to country in paper/supplement
        "explicit_geo_pubmed_text": 40,
        # PubMed ID exists AND geo_loc_name exists
        "geo_and_pubmed": 30,
        # geo_loc_name exists (GenBank only)
        "geo_only": 20,
        # accession appears in external text but no structured geo_loc_name
        "accession_in_text_only": 10,
    }

    consistency = {
        # Predicted country matches GenBank field
        "match": 20,
        # No contradiction detected across sources (when some evidence exists)
        "no_contradiction": 10,
        # Clear contradiction detected between prediction and GenBank
        "contradiction": -30,
    }

    evidence_density = {
        # >= 2 linked publications
        "two_or_more_pubs": 20,
        # exactly 1 linked publication
        "one_pub": 10,
        # 0 publications
        "none": 0,
    }

    risk_penalties = {
        # Missing key metadata fields (geo, host, collection_date, etc.)
        "missing_key_fields": -10,
        # Known failure accession pattern (from the existing bug list)
        "known_failure_pattern": -20,
    }

    tiers = {
        # Confidence tiers (researchers think in categories, not decimals)
        "high_min": 70,
        "medium_min": 40,  # < high_min and >= medium_min = medium; rest = low
    }

    return {
        "direct_evidence": direct_evidence,
        "consistency": consistency,
        "evidence_density": evidence_density,
        "risk_penalties": risk_penalties,
        "tiers": tiers,
    }
53
+
54
+
55
def normalize_country(name: Optional[str]) -> Optional[str]:
    """
    Map common country-name variants onto one canonical lowercase form.

    Intentionally simple and rule-based; extend the variant table as new
    spellings appear in real-world data. Unknown names pass through after
    stripping and lowercasing; falsy input (None / empty string) yields None.
    """
    if not name:
        return None

    cleaned = name.strip().lower()

    variants = {
        "usa": "united states",
        "u.s.a.": "united states",
        "u.s.": "united states",
        "us": "united states",
        "united states of america": "united states",
        "uk": "united kingdom",
        "u.k.": "united kingdom",
        "england": "united kingdom",
        # Add more mappings here when encountered in real data
    }

    if cleaned in variants:
        return variants[cleaned]
    return cleaned
79
+
80
+
81
def _resolve_country(raw: Optional[str]) -> Optional[str]:
    """
    Resolve a raw country string to a canonical country name, None-safely.

    Tries standardize_location.smart_country_lookup first; if that reports
    "not found", falls back to the local normalize_country() table.
    Returns None for None/empty input instead of raising.
    """
    if not raw:
        return None
    resolved = standardize_location.smart_country_lookup(raw.lower())
    if resolved == "not found":
        resolved = normalize_country(raw)
    return resolved


def compute_confidence_score_and_tier(
    signals: Dict[str, Any],
    rules: Optional[Dict[str, Any]] = None,
) -> Tuple[int, str, List[str]]:
    """
    Compute confidence score and tier for a single accession row.

    Input `signals` dict is expected to contain:

        has_geo_loc_name: bool
        has_pubmed: bool
        accession_found_in_text: bool  # accession present in extracted external text
        predicted_country: str | None  # final model label / country prediction
        genbank_country: str | None    # from NCBI / GenBank metadata
        num_publications: int
        missing_key_fields: bool
        known_failure_pattern: bool

    Returns:
        score (0-100), tier ("high"/"medium"/"low"),
        explanations (list of at most 3 short human-readable reasons)
    """
    if rules is None:
        rules = set_rules()

    score = 0
    explanations: List[str] = []

    # ---------- Signal 1: Direct evidence strength ----------
    has_geo = bool(signals.get("has_geo_loc_name"))
    has_pubmed = bool(signals.get("has_pubmed"))
    accession_in_text = bool(signals.get("accession_found_in_text"))

    direct_cfg = rules["direct_evidence"]

    # Pick only the strongest applicable case (branches are mutually exclusive).
    if has_geo and has_pubmed and accession_in_text:
        score += direct_cfg["explicit_geo_pubmed_text"]
        explanations.append(
            "Accession linked to a country in GenBank and associated publication text."
        )
    elif has_geo and has_pubmed:
        score += direct_cfg["geo_and_pubmed"]
        explanations.append(
            "GenBank geo_loc_name and linked publication found."
        )
    elif has_geo:
        score += direct_cfg["geo_only"]
        explanations.append("GenBank geo_loc_name present.")
    elif accession_in_text:
        score += direct_cfg["accession_in_text_only"]
        explanations.append("Accession keyword found in extracted external text.")

    # ---------- Signal 2: Cross-source consistency ----------
    # BUG FIX: the previous version called .lower() directly on
    # signals.get("predicted_country") / signals.get("genbank_country"),
    # which raised AttributeError whenever either field was None or missing
    # (a common, legitimate case — see the "Medium, sparse but okay" smoke
    # test below, which sets genbank_country=None). _resolve_country() is
    # None-safe and also removes the duplicated lookup/fallback logic.
    pred_country = _resolve_country(signals.get("predicted_country"))
    gb_country = _resolve_country(signals.get("genbank_country"))

    cons_cfg = rules["consistency"]

    if gb_country is not None and pred_country is not None:
        if gb_country == pred_country:
            score += cons_cfg["match"]
            explanations.append(
                "Predicted country matches GenBank country metadata."
            )
        else:
            score += cons_cfg["contradiction"]
            explanations.append(
                "Conflict between predicted country and GenBank country metadata."
            )
    else:
        # Only give "no contradiction" bonus if there is at least some evidence
        if has_geo or has_pubmed or accession_in_text:
            score += cons_cfg["no_contradiction"]
            explanations.append(
                "No contradiction detected across available sources."
            )

    # ---------- Signal 3: Evidence density ----------
    num_pubs = int(signals.get("num_publications", 0))
    dens_cfg = rules["evidence_density"]

    if num_pubs >= 2:
        score += dens_cfg["two_or_more_pubs"]
        explanations.append("Multiple linked publications available.")
    elif num_pubs == 1:
        score += dens_cfg["one_pub"]
        explanations.append("One linked publication available.")
    # else: 0 publications -> no extra score

    # ---------- Signal 4: Risk flags ----------
    risk_cfg = rules["risk_penalties"]

    if signals.get("missing_key_fields"):
        score += risk_cfg["missing_key_fields"]
        explanations.append(
            "Missing key metadata fields (higher uncertainty)."
        )

    if signals.get("known_failure_pattern"):
        score += risk_cfg["known_failure_pattern"]
        explanations.append(
            "Accession matches a known risky/failure pattern."
        )

    # ---------- Clamp score and determine tier ----------
    score = max(0, min(100, score))

    tiers = rules["tiers"]
    if score >= tiers["high_min"]:
        tier = "high"
    elif score >= tiers["medium_min"]:
        tier = "medium"
    else:
        tier = "low"

    # Keep explanations short and readable
    if len(explanations) > 3:
        explanations = explanations[:3]

    return score, tier, explanations
206
+
207
+
208
if __name__ == "__main__":
    # Quick local sanity-check examples (manual smoke tests)
    rules = set_rules()

    examples = [
        {
            "name": "Strong, clean case",
            "signals": {
                "has_geo_loc_name": True,
                "has_pubmed": True,
                "accession_found_in_text": True,
                "predicted_country": "USA",
                "genbank_country": "United States of America",
                "num_publications": 3,
                "missing_key_fields": False,
                "known_failure_pattern": False,
            },
        },
        {
            "name": "Weak, conflicting case",
            "signals": {
                "has_geo_loc_name": True,
                "has_pubmed": False,
                "accession_found_in_text": False,
                "predicted_country": "Japan",
                "genbank_country": "France",
                "num_publications": 0,
                "missing_key_fields": True,
                "known_failure_pattern": True,
            },
        },
        {
            "name": "Medium, sparse but okay",
            "signals": {
                "has_geo_loc_name": False,
                "has_pubmed": True,
                "accession_found_in_text": False,
                "predicted_country": "United Kingdom",
                "genbank_country": None,
                "num_publications": 1,
                "missing_key_fields": False,
                "known_failure_pattern": False,
            },
        },
    ]

    for example in examples:
        score, tier, reasons = compute_confidence_score_and_tier(
            example["signals"], rules
        )
        print(f"==== {example['name']} ====")
        print(f"Score: {score} | Tier: {tier}")
        print("Reasons:")
        for reason in reasons:
            print(f" - {reason}")
        print()