from typing import Any, Dict, List, Optional, Tuple

import standardize_location  # provides smart_country_lookup(), used for country matching below

def set_rules() -> Dict[str, Any]:
    """
    Define weights, penalties and thresholds for the confidence score.

    V1 principles:
    - Interpretability > mathematical purity
    - Conservative > aggressive
    - Explainable > comprehensive
    """
    return {
        "direct_evidence": {
            # Accession explicitly linked to a country in the paper/supplement
            "explicit_geo_pubmed_text": 40,
            # PubMed ID exists AND geo_loc_name exists
            "geo_and_pubmed": 30,
            # geo_loc_name exists (GenBank only)
            "geo_only": 20,
            # accession appears in external text but no structured geo_loc_name
            "accession_in_text_only": 10,
        },
        "consistency": {
            # Predicted country matches GenBank field
            "match": 20,
            # No contradiction detected across sources (when some evidence exists)
            "no_contradiction": 10,
            # Clear contradiction detected between prediction and GenBank
            "contradiction": -30,
        },
        "evidence_density": {
            # ≥2 linked publications
            "two_or_more_pubs": 20,
            # 1 linked publication
            "one_pub": 10,
            # 0 publications
            "none": 0,
        },
        "risk_penalties": {
            # Missing key metadata fields (geo, host, collection_date, etc.)
            "missing_key_fields": -10,
            # Accession matches a known failure pattern (from the existing bug list)
            "known_failure_pattern": -20,
        },
        "tiers": {
            # Confidence tiers (researchers think in categories, not decimals)
            "high_min": 70,
            "medium_min": 40,  # < high_min and >= medium_min = medium; rest = low
        },
    }
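

# Illustrative sanity check (a sketch, not part of the scoring pipeline):
# under the default weights above, the strongest possible path is
# explicit_geo_pubmed_text (40) + match (20) + two_or_more_pubs (20) = 80,
# so the 0-100 clamp in compute_confidence_score_and_tier only ever binds
# on the low end.
def _max_positive_score(rules: Dict[str, Any]) -> int:
    """Best-case score reachable under a given rule set (illustrative)."""
    return (
        rules["direct_evidence"]["explicit_geo_pubmed_text"]
        + rules["consistency"]["match"]
        + rules["evidence_density"]["two_or_more_pubs"]
    )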


def normalize_country(name: Optional[str]) -> Optional[str]:
    """
    Normalize country names to improve simple equality checks.

    This is intentionally simple and rule-based.
    Extend the mapping as you encounter real-world variants.
    """
    if not name:
        return None
    name = name.strip().lower()

    mapping = {
        "usa": "united states",
        "u.s.a.": "united states",
        "u.s.": "united states",
        "us": "united states",
        "united states of america": "united states",
        "uk": "united kingdom",
        "u.k.": "united kingdom",
        "england": "united kingdom",
        # Add more mappings here as you encounter them in real data
    }

    return mapping.get(name, name)
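
# Example behavior of this standalone helper (the scorer below uses
# standardize_location.smart_country_lookup instead):
#     normalize_country("U.S.A.")  -> "united states"
#     normalize_country("England") -> "united kingdom"
#     normalize_country("Kenya")   -> "kenya"  (unmapped names pass through lowercased)
#     normalize_country(None)      -> None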


def compute_confidence_score_and_tier(
    signals: Dict[str, Any],
    rules: Optional[Dict[str, Any]] = None,
) -> Tuple[int, str, List[str]]:
    """
    Compute confidence score and tier for a single accession row.

    Input `signals` dict is expected to contain:

        has_geo_loc_name: bool
        has_pubmed: bool
        accession_found_in_text: bool  # accession present in extracted external text
        predicted_country: str | None  # final model label / country prediction
        genbank_country: str | None    # from NCBI / GenBank metadata
        num_publications: int
        missing_key_fields: bool
        known_failure_pattern: bool

    Returns:
        score (0–100), tier ("high"/"medium"/"low"),
        explanations (list of short human-readable reasons)
    """
    if rules is None:
        rules = set_rules()

    score = 0
    explanations: List[str] = []

    # ---------- Signal 1: Direct evidence strength ----------
    has_geo = bool(signals.get("has_geo_loc_name"))
    has_pubmed = bool(signals.get("has_pubmed"))
    accession_in_text = bool(signals.get("accession_found_in_text"))

    direct_cfg = rules["direct_evidence"]

    # We pick the strongest applicable case.
    if has_geo and has_pubmed and accession_in_text:
        score += direct_cfg["explicit_geo_pubmed_text"]
        explanations.append(
            "Accession linked to a country in GenBank and associated publication text."
        )
    elif has_geo and has_pubmed:
        score += direct_cfg["geo_and_pubmed"]
        explanations.append(
            "GenBank geo_loc_name and linked publication found."
        )
    elif has_geo:
        score += direct_cfg["geo_only"]
        explanations.append("GenBank geo_loc_name present.")
    elif accession_in_text:
        score += direct_cfg["accession_in_text_only"]
        explanations.append("Accession keyword found in extracted external text.")

    # ---------- Signal 2: Cross-source consistency ----------
    print("standardize pre_country")
    pred_country = signals.get("predicted_country")
    if pred_country:
      pred_country = standardize_location.smart_country_lookup(signals.get("predicted_country").lower())
    gb_country = signals.get("genbank_country")
    if gb_country:  
      gb_country = standardize_location.smart_country_lookup(signals.get("genbank_country").lower()) 
    print("both pred_country and gb_country after standardizing: ", pred_country, gb_country)
    cons_cfg = rules["consistency"]
    print("start compare gb country and pre country")
    if gb_country is not None and pred_country is not None:
        print("inside comparison")
        if gb_country.lower() == pred_country.lower():
            score += cons_cfg["match"]
            explanations.append(
                "Predicted country matches GenBank country metadata."
            )
        else:
            score += cons_cfg["contradiction"]
            explanations.append(
                "Conflict between predicted country and GenBank country metadata."
            )
        print("done comparison")    
    else:
        # Only give "no contradiction" bonus if there is at least some evidence
        if has_geo or has_pubmed or accession_in_text:
            score += cons_cfg["no_contradiction"]
            explanations.append(
                "No contradiction detected across available sources."
            )
    print("start evidence density")
    # ---------- Signal 3: Evidence density ----------
    num_pubs = int(signals.get("num_publications", 0))
    dens_cfg = rules["evidence_density"]

    if num_pubs >= 2:
        score += dens_cfg["two_or_more_pubs"]
        explanations.append("Multiple linked publications available.")
    elif num_pubs == 1:
        score += dens_cfg["one_pub"]
        explanations.append("One linked publication available.")
    # else: 0 publications → no extra score

    # ---------- Signal 4: Risk flags ----------
    risk_cfg = rules["risk_penalties"]

    if signals.get("missing_key_fields"):
        score += risk_cfg["missing_key_fields"]
        explanations.append(
            "Missing key metadata fields (higher uncertainty)."
        )

    if signals.get("known_failure_pattern"):
        score += risk_cfg["known_failure_pattern"]
        explanations.append(
            "Accession matches a known risky/failure pattern."
        )

    # ---------- Clamp score and determine tier ----------
    score = max(0, min(100, score))

    tiers = rules["tiers"]
    if score >= tiers["high_min"]:
        tier = "high"
    elif score >= tiers["medium_min"]:
        tier = "medium"
    else:
        tier = "low"

    # Keep explanations short and readable
    if len(explanations) > 3:
        explanations = explanations[:3]
    print("done all")
    return score, tier, explanations
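
# Worked example under the default rules: geo_loc_name plus a linked PubMed
# record but no accession-in-text scores 30 (geo_and_pubmed); a country match
# adds 20; one publication adds 10. Total: 60 -> "medium" (40 <= 60 < 70).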


if __name__ == "__main__":
    # Quick local sanity-check examples (manual smoke tests).
    rules = set_rules()

    examples = [
        {
            "name": "Strong, clean case",
            "signals": {
                "has_geo_loc_name": True,
                "has_pubmed": True,
                "accession_found_in_text": True,
                "predicted_country": "USA",
                "genbank_country": "United States of America",
                "num_publications": 3,
                "missing_key_fields": False,
                "known_failure_pattern": False,
            },
        },
        {
            "name": "Weak, conflicting case",
            "signals": {
                "has_geo_loc_name": True,
                "has_pubmed": False,
                "accession_found_in_text": False,
                "predicted_country": "Japan",
                "genbank_country": "France",
                "num_publications": 0,
                "missing_key_fields": True,
                "known_failure_pattern": True,
            },
        },
        {
            "name": "Medium, sparse but okay",
            "signals": {
                "has_geo_loc_name": False,
                "has_pubmed": True,
                "accession_found_in_text": False,
                "predicted_country": "United Kingdom",
                "genbank_country": None,
                "num_publications": 1,
                "missing_key_fields": False,
                "known_failure_pattern": False,
            },
        },
    ]

    for ex in examples:
        score, tier, expl = compute_confidence_score_and_tier(
            ex["signals"], rules
        )
        print("====", ex["name"], "====")
        print("Score:", score, "| Tier:", tier)
        print("Reasons:")
        for e in expl:
            print(" -", e)
        print()