Spaces:

Muteeba
/

FunGO

Running

App Files Files Community

Muteeba committed on Apr 16

Commit

5c389ab

1 Parent(s): 4e8a676

FunGO v2.0 backend

Browse files

Files changed (8) hide show

app.py +278 -0
config.py +104 -0
embedder.py +187 -0
filter.py +152 -0
hf_README.md +41 -0
predictor.py +216 -0
requirements.txt +9 -0
taxonomy.py +200 -0

app.py ADDED Viewed

	@@ -0,0 +1,278 @@

+# app.py — FunGO HuggingFace Space
+"""
+FunGO v2.0 — HuggingFace Spaces Deployment
+=============================================
+Flask API running on port 7860.
+Model files loaded from /data/ (HF persistent storage).
+To upload model files:
+  pip install huggingface_hub
+  huggingface-cli login
+  huggingface-cli upload Muteeba/FunGO ./pipeline_outputs/models        /data/models        --repo-type=space
+  huggingface-cli upload Muteeba/FunGO ./pipeline_outputs/labels        /data/labels        --repo-type=space
+  huggingface-cli upload Muteeba/FunGO ./pipeline_outputs/go_data       /data/go_data       --repo-type=space
+  huggingface-cli upload Muteeba/FunGO ./pipeline_outputs/features      /data/features      --repo-type=space
+  huggingface-cli upload Muteeba/FunGO /mnt/e/repeat/embeddings/model_cache /data/esm2_cache --repo-type=space
+"""
+import csv
+import io
+import logging
+import os
+import re as _re
+import sys
+import time
+from collections import OrderedDict
+# ── HuggingFace paths ─────────────────────────────────────────
+os.environ.setdefault("FUNGO_PKL_DIR",    "/data/models")
+os.environ.setdefault("FUNGO_VOCAB_PKL",  "/data/labels/vocabularies.pkl")
+os.environ.setdefault("FUNGO_IA_PKL",     "/data/go_data/ia_weights.pkl")
+os.environ.setdefault("FUNGO_FEAT_META",  "/data/features/feature_metadata.json")
+os.environ.setdefault("FUNGO_MODEL_CACHE","/data/esm2_cache")
+os.environ.setdefault("FUNGO_EMB_CACHE",  "/data/embedding_cache")
+os.environ.setdefault("FUNGO_OFFLINE",    "1")
+os.environ.setdefault("FUNGO_DEBUG",      "0")
+os.environ.setdefault("FUNGO_PORT",       "7860")
+from flask import Flask, jsonify, request, Response
+from flask_cors import CORS
+import config
+import predictor
+import embedder
+import filter as flt
+import taxonomy
+logging.basicConfig(
+    level=logging.INFO,
+    format="%(asctime)s [%(levelname)s] %(name)s — %(message)s",
+    datefmt="%H:%M:%S",
+)
+log = logging.getLogger("fungo.app")
+app = Flask(__name__)
+CORS(app)
+app.config["MAX_CONTENT_LENGTH"] = 2 * 1024 * 1024
# In-memory store of recent prediction jobs (oldest first) so that the CSV
# endpoint can serve the full result set after /predict has returned.
_csv_store: OrderedDict = OrderedDict()
_CSV_MAX = 50  # number of jobs retained before the oldest ones are evicted
def _store_csv(job_id, predictions):
    """Remember the predictions of a job, evicting the oldest jobs when full.

    Args:
        job_id: Key under which the job is later looked up.
        predictions: Per-protein prediction data; stored as given.
    """
    # Re-storing a known job must not evict an unrelated job, and should make
    # the job the most recent entry again.
    _csv_store.pop(job_id, None)
    while len(_csv_store) >= _CSV_MAX:
        _csv_store.popitem(last=False)
    _csv_store[job_id] = {"predictions": predictions, "ts": time.time()}
def _make_csv(predictions):
    """Serialise every stored prediction of a job to CSV text.

    Args:
        predictions: Mapping of protein ID to a dict whose optional ``"all"``
            entry is a list of prediction dicts.

    Returns:
        The CSV document as a string: one header row, then one row per
        prediction. Fields missing from a prediction are written empty.
    """
    columns = ["protein_id", "go_term", "ontology", "ontology_label",
               "tier", "tier_label", "confidence", "ia_weight",
               "combined_score", "threshold"]
    buffer = io.StringIO()
    writer = csv.writer(buffer)
    writer.writerow(columns)
    for protein_id, entry in predictions.items():
        for row in entry.get("all", []):
            # The first column is the protein itself; the rest come from the row.
            writer.writerow([protein_id] + [row.get(col, "") for col in columns[1:]])
    return buffer.getvalue()
# UniProt-style headers carry the NCBI taxon as "OX=<digits>".
_OX_RE = _re.compile(r"OX=(\d+)")
def _parse_taxon_id(header):
    """Return the taxon ID from an ``OX=<digits>`` header field, or None."""
    m = _OX_RE.search(header or "")
    return int(m.group(1)) if m else None
def _fasta_record_id(header):
    """Derive the protein ID from a header line (leading '>' already removed)."""
    parts = header.split("|")
    if len(parts) >= 3:
        # e.g. "sp|P12345|NAME_HUMAN ..." → the accession is the second field.
        return parts[1]
    tokens = header.split()
    if not tokens:
        # A bare ">" would otherwise raise IndexError and surface as HTTP 500.
        raise ValueError("FASTA header line is empty ('>' with no identifier).")
    return tokens[0]
def _fasta_record(record_id, header, chunks):
    """Build one protein record, or return None when no residues were collected."""
    seq = "".join(chunks).upper()
    if not seq:
        return None
    return {"id": record_id, "seq": seq,
            "header": header, "taxon_id": _parse_taxon_id(header)}
def parse_fasta(fasta_text):
    """Parse FASTA text into a list of protein records.

    Args:
        fasta_text: FASTA-formatted text with one or more records.

    Returns:
        A list of dicts with keys ``id``, ``seq`` (upper-cased, whitespace
        removed), ``header`` (header line without '>') and ``taxon_id``
        (int or None). Records without any sequence lines are dropped.

    Raises:
        ValueError: If a header has no identifier, or no record with a
            sequence is found.
    """
    proteins = []
    current_id, current_hdr, current_seq = None, None, []
    for raw_line in fasta_text.splitlines():
        line = raw_line.strip()
        if not line:
            continue
        if line.startswith(">"):
            if current_id is not None:
                record = _fasta_record(current_id, current_hdr, current_seq)
                if record is not None:
                    proteins.append(record)
            current_hdr = line[1:].strip()
            current_id = _fasta_record_id(current_hdr)
            current_seq = []
        else:
            # Drop whitespace inside the line so "MKV LLA" does not leak a
            # space into the sequence.
            current_seq.append("".join(line.split()))
    if current_id is not None:
        record = _fasta_record(current_id, current_hdr, current_seq)
        if record is not None:
            proteins.append(record)
    if not proteins:
        raise ValueError("No valid protein sequences found in FASTA input.")
    return proteins
def _run_prediction(fasta_text, taxon_id_override):
    """Parse the FASTA input, embed the sequences and run the classifiers.

    Args:
        fasta_text: FASTA-formatted text.
        taxon_id_override: Taxon ID applied to every protein, or None to use
            the ID parsed from each header.

    Returns:
        Tuple ``(proteins, raw_preds, ia_weights, elapsed_seconds)``; every
        raw prediction has an ``ia_weight`` entry added.

    Raises:
        ValueError: On unparsable input, too many sequences, or duplicate
            protein IDs.
    """
    proteins = parse_fasta(fasta_text)
    if len(proteins) > config.MAX_SEQUENCES:
        raise ValueError(f"Too many sequences. Max: {config.MAX_SEQUENCES}.")
    protein_ids = [p["id"] for p in proteins]
    # The route handlers group predictions by protein ID, so two records
    # sharing an ID would have their predictions mixed together.
    seen, duplicates = set(), []
    for pid in protein_ids:
        if pid in seen and pid not in duplicates:
            duplicates.append(pid)
        seen.add(pid)
    if duplicates:
        raise ValueError(
            f"Duplicate protein IDs in FASTA input: {', '.join(duplicates)}.")
    sequences   = [p["seq"] for p in proteins]
    taxon_ids   = [taxon_id_override if taxon_id_override is not None
                   else p["taxon_id"] for p in proteins]
    log.info("Proteins: %s | Taxon IDs: %s", protein_ids, taxon_ids)
    t0      = time.perf_counter()
    X_esm   = embedder.extract(sequences)
    top50   = predictor.get_top50_taxa()
    X_final = embedder.build_features(X_esm, taxon_ids, top50)
    raw_preds  = predictor.predict(X_final, protein_ids)
    ia_weights = predictor.get_ia_weights()
    for p in raw_preds:
        # Terms without a known weight get 0.0.
        p["ia_weight"] = round(float(ia_weights.get(p["go_term"], 0.0)), 4)
    return proteins, raw_preds, ia_weights, round(time.perf_counter() - t0, 2)
@app.route("/health", methods=["GET"])
def health():
    """Liveness probe: report status, compute device, FP16 flag and version."""
    payload = {
        "status": "ok",
        "device": config.DEVICE,
        "fp16": config.USE_FP16,
        "version": "2.0.0",
    }
    return jsonify(payload)
@app.route("/model/info", methods=["GET"])
def model_info():
    """Describe the loaded models, tier thresholds and display limit.

    Responds 503 with the error text when the model statistics call raises
    RuntimeError.
    """
    try:
        stats = predictor.get_model_stats()
    except RuntimeError as e:
        return jsonify({"error": str(e)}), 503
    payload = {
        "device": config.DEVICE,
        "fp16": config.USE_FP16,
        "model_name": config.MODEL_NAME,
        "ontologies": stats,
        "top50_taxa_count": len(predictor.get_top50_taxa()),
        "thresholds": {
            "STRONG": {"min_ia": config.TIER_GOLD_IA,
                       "min_conf": config.TIER_GOLD_CONF},
            "MODERATE": {"min_ia": config.TIER_GOOD_IA,
                         "min_conf": config.TIER_GOOD_CONF},
            "INDICATIVE": {"min_ia": config.TIER_SILVER_IA,
                           "min_conf": config.TIER_SILVER_CONF},
        },
        "display_limit": flt.TOP_N_DISPLAY,
    }
    return jsonify(payload)
@app.route("/taxonomy/search", methods=["GET"])
def taxonomy_search():
    """Search species by name.

    Query parameters:
        q: Search text, at least 2 characters (400 otherwise).
        max_results: Optional result cap; defaults to 8 when missing or not
            an integer, and is clamped to the range 1..20.
    """
    q = request.args.get("q", "").strip()
    if len(q) < 2:
        return jsonify({"error": "Query must be at least 2 characters."}), 400
    try:
        max_r = int(request.args.get("max_results", 8))
    except (TypeError, ValueError):
        max_r = 8
    # Clamp both ends: zero or negative values previously passed through.
    max_r = max(1, min(max_r, 20))
    return jsonify({"query": q,
                    "results": taxonomy.search_species(q, max_results=max_r)})
@app.route("/taxonomy/verify", methods=["GET"])
def taxonomy_verify():
    """Resolve a taxon ID against the model's known taxa.

    Query parameters:
        taxon_id: Integer taxon ID (400 when missing or not an integer).

    Responds 503 when the models are not loaded, matching the other
    model-dependent endpoints.
    """
    raw = request.args.get("taxon_id", "")
    if not raw:
        return jsonify({"error": "taxon_id required."}), 400
    try:
        taxon_id = int(raw)
    except ValueError:
        return jsonify({"error": f"Invalid taxon_id: '{raw}'"}), 400
    try:
        top50 = predictor.get_top50_taxa()
    except RuntimeError as e:
        return jsonify({"error": str(e)}), 503
    return jsonify(taxonomy.resolve_taxon(taxon_id, top50))
@app.route("/predict", methods=["POST"])
def predict():
    """Run a prediction for the FASTA text in the JSON request body.

    Request body (JSON object):
        fasta: FASTA text (required, non-empty string).
        taxon_id: Optional integer applied to all proteins; null is treated
            as absent.

    Responses:
        200 with ``job_id``, ``metadata`` and per-protein ``predictions``;
        400 for invalid input, 415 for a non-JSON request, 503 for
        RuntimeError, 500 for any other failure.
    """
    if not request.is_json:
        return jsonify({"error": "Content-Type must be application/json."}), 415
    body = request.get_json(silent=True)
    if not isinstance(body, dict):
        # A JSON array or scalar has no fields; treat it like an empty object
        # rather than failing on .get().
        body = {}
    fasta_raw = body.get("fasta", "")
    fasta_text = fasta_raw.strip() if isinstance(fasta_raw, str) else ""
    if not fasta_text:
        return jsonify({"error": "'fasta' field is required."}), 400
    taxon_id_override = None
    if body.get("taxon_id") is not None:
        try:
            taxon_id_override = int(body["taxon_id"])
        except (TypeError, ValueError):
            return jsonify({"error": "Invalid taxon_id"}), 400
    try:
        proteins, raw_preds, ia_weights, elapsed = _run_prediction(
            fasta_text, taxon_id_override)
    except ValueError as e:
        return jsonify({"error": str(e)}), 400
    except RuntimeError as e:
        return jsonify({"error": str(e)}), 503
    except Exception as e:
        log.exception("Prediction error")
        return jsonify({"error": str(e)}), 500
    protein_ids = [p["id"] for p in proteins]
    raw_by_pid  = {pid: [] for pid in protein_ids}
    for pred in raw_preds:
        raw_by_pid[pred["protein_id"]].append(pred)
    predictions, csv_data, total_display, total_all = {}, {}, 0, 0
    for prot in proteins:
        pid = prot["id"]
        res = flt.filter_predictions(raw_by_pid[pid], ia_weights)
        display, all_f = res["display"], res["all"]
        total_display += len(display)
        total_all += len(all_f)
        predictions[pid] = {"taxon_id": prot["taxon_id"],
                            "summary": flt.summarise(display, all_f, pid),
                            "display": display, "total_all": len(all_f)}
        # Only the full list is kept for the CSV download.
        csv_data[pid] = {"all": all_f}
    # Millisecond timestamps can repeat under concurrent requests; bump the
    # value until it is unused so one job never overwrites another.
    job_stamp = int(time.time() * 1000)
    while str(job_stamp) in _csv_store:
        job_stamp += 1
    job_id = str(job_stamp)
    _store_csv(job_id, csv_data)
    return jsonify({"job_id": job_id,
        "metadata": {"n_proteins": len(protein_ids), "device": config.DEVICE,
            "total_raw_predictions": len(raw_preds), "total_filtered": total_all,
            "total_displayed": total_display, "display_limit": flt.TOP_N_DISPLAY,
            "elapsed_seconds": elapsed},
        "predictions": predictions})
@app.route("/predict/csv", methods=["GET"])
def download_csv():
    """Return the stored predictions of a job as a CSV attachment.

    Query parameters:
        job_id: ID of the job (400 when missing, 404 when not in the store).
    """
    job_id = request.args.get("job_id", "").strip()
    if job_id:
        job = _csv_store.get(job_id)
        if job:
            disposition = f"attachment; filename=fungo_{job_id}.csv"
            return Response(
                _make_csv(job["predictions"]),
                mimetype="text/csv",
                headers={"Content-Disposition": disposition},
            )
        return jsonify({"error": f"Job '{job_id}' not found."}), 404
    return jsonify({"error": "job_id required."}), 400
@app.route("/predict/debug", methods=["POST"])
def predict_debug():
    """Run a prediction and also report why each rejected term was filtered out.

    Request body (JSON object):
        fasta: FASTA text (required, non-empty string).
        taxon_id: Optional integer applied to all proteins; null is treated
            as absent.

    Responses:
        200 with ``metadata`` and per-protein ``predictions`` (displayed,
        all filtered, filtered-out with reasons, thresholds used);
        400 for invalid input, 415 for a non-JSON request, 503 for
        RuntimeError, 500 for any other failure.
    """
    if not request.is_json:
        return jsonify({"error": "Content-Type must be application/json."}), 415
    body = request.get_json(silent=True)
    if not isinstance(body, dict):
        # A JSON array or scalar has no fields; treat it like an empty object.
        body = {}
    fasta_raw = body.get("fasta", "")
    fasta_text = fasta_raw.strip() if isinstance(fasta_raw, str) else ""
    if not fasta_text:
        return jsonify({"error": "'fasta' required."}), 400
    taxon_id_override = None
    if body.get("taxon_id") is not None:
        try:
            taxon_id_override = int(body["taxon_id"])
        except (TypeError, ValueError):
            return jsonify({"error": "Invalid taxon_id"}), 400
    try:
        proteins, raw_preds, ia_weights, elapsed = _run_prediction(
            fasta_text, taxon_id_override)
    except ValueError as e:
        return jsonify({"error": str(e)}), 400
    except RuntimeError as e:
        return jsonify({"error": str(e)}), 503
    except Exception as e:
        log.exception("Debug error")
        return jsonify({"error": str(e)}), 500
    protein_ids = [p["id"] for p in proteins]
    raw_by_pid  = {pid: [] for pid in protein_ids}
    for pred in raw_preds:
        raw_by_pid[pred["protein_id"]].append(pred)
    thr = {"STRONG": {"min_ia": config.TIER_GOLD_IA, "min_conf": config.TIER_GOLD_CONF},
           "MODERATE": {"min_ia": config.TIER_GOOD_IA, "min_conf": config.TIER_GOOD_CONF},
           "INDICATIVE": {"min_ia": config.TIER_SILVER_IA, "min_conf": config.TIER_SILVER_CONF}}
    predictions = {}
    for prot in proteins:
        pid = prot["id"]
        res = flt.filter_predictions(raw_by_pid[pid], ia_weights)
        display, all_f = res["display"], res["all"]
        accepted = {p["go_term"] for p in all_f}
        fo = []
        for pred in raw_by_pid[pid]:
            go = pred["go_term"]
            if go in accepted:
                continue
            ia = pred.get("ia_weight", float(ia_weights.get(go, 0.0)))
            conf = pred["confidence"]
            # Report the first rule that explains the rejection.
            if go in config.BLACKLIST_TERMS:
                reason = "blacklisted"
            elif ia <= config.TIER_SILVER_IA:
                reason = f"ia_too_low (ia={ia:.4f})"
            elif conf < config.TIER_SILVER_CONF:
                reason = f"conf_too_low (conf={conf:.4f})"
            else:
                reason = "below_all_tiers"
            fo.append({"go_term": go, "ontology": pred["ontology"], "confidence": conf,
                       "ia_weight": ia, "reason": reason})
        fo.sort(key=lambda x: -x["ia_weight"])
        predictions[pid] = {"taxon_id": prot["taxon_id"],
            "summary": flt.summarise(display, all_f, pid),
            "display": display, "all_filtered": all_f,
            "filtered_out": fo, "thresholds_used": thr}
    return jsonify({"metadata": {"n_proteins": len(protein_ids), "device": config.DEVICE,
        "total_raw": len(raw_preds), "elapsed_seconds": elapsed}, "predictions": predictions})
@app.errorhandler(404)
def not_found(e):
    """Answer unknown routes with a JSON body instead of the default page."""
    payload = {"error": "Not found."}
    return jsonify(payload), 404
@app.errorhandler(413)
def too_large(e):
    """Answer oversized requests with a JSON body."""
    payload = {"error": "Request too large."}
    return jsonify(payload), 413
@app.errorhandler(500)
def internal(e):
    """Log the failure and answer with a generic JSON error body."""
    log.exception("Unhandled error")
    payload = {"error": "Internal server error."}
    return jsonify(payload), 500
if __name__ == "__main__":
    # Script entry point: verify the model files, load the models, then serve.
    log.info("FunGO v2.0 — HuggingFace Space starting …")
    config.ensure_dirs()
    if not config.validate_paths():
        log.error("Model paths missing!")
        sys.exit(1)
    predictor.load_all()
    # Honour FUNGO_PORT (defaulted to 7860 at the top of this file and read
    # into config.PORT) instead of repeating the literal here.
    log.info("Models loaded. Serving on port %d …", config.PORT)
    # debug stays off regardless of FUNGO_DEBUG: this process binds to all
    # interfaces.
    app.run(host="0.0.0.0", port=config.PORT, debug=False)

config.py ADDED Viewed

	@@ -0,0 +1,104 @@

+# config.py
+"""
+FunGO Backend — Central Configuration
+======================================
+ONLY change paths in this file. Nothing else needs editing.
+How to use:
+  - Update PKL_DIR, VOCAB_PKL, IA_PKL, FEAT_META to point to your model files
+  - Update MODEL_CACHE_DIR to point to your ESM2 weights cache
+  - All other settings work as-is
+"""
+import logging
+import os
+from pathlib import Path
+import torch
+logger = logging.getLogger("config")
+# ── DEVICE (auto-detected) ────────────────────────────────────
+DEVICE   = "cuda" if torch.cuda.is_available() else "cpu"
+USE_FP16 = DEVICE == "cuda"
+# ── MODEL PATHS — UPDATE THESE TO MATCH YOUR SYSTEM ──────────
+PKL_DIR   = Path(os.environ.get("FUNGO_PKL_DIR",   "/mnt/f/research/thesis/pipeline_outputs/models"))
+VOCAB_PKL = Path(os.environ.get("FUNGO_VOCAB_PKL", "/mnt/f/research/thesis/pipeline_outputs/labels/vocabularies.pkl"))
+IA_PKL    = Path(os.environ.get("FUNGO_IA_PKL",    "/mnt/f/research/thesis/pipeline_outputs/go_data/ia_weights.pkl"))
+FEAT_META = Path(os.environ.get("FUNGO_FEAT_META", "/mnt/f/research/thesis/pipeline_outputs/features/feature_metadata.json"))
+# ── ESM2 SETTINGS ─────────────────────────────────────────────
+MODEL_CACHE_DIR     = Path(os.environ.get("FUNGO_MODEL_CACHE", "/mnt/e/repeat/embeddings/model_cache"))
+MODEL_NAME          = "facebook/esm2_t36_3B_UR50D"
+LAYERS_TO_USE       = [30, 31, 32, 33, 34, 35]
+MAX_SEQ_LENGTH      = 1400
+BATCH_SIZE          = 4 if DEVICE == "cpu" else 16
+TRANSFORMERS_OFFLINE = os.environ.get("FUNGO_OFFLINE", "1")
+# ── EMBEDDING CACHE ───────────────────────────────────────────
+EMB_CACHE_DIR = Path(os.environ.get("FUNGO_EMB_CACHE", "./embedding_cache"))
+# ── FILTER THRESHOLDS (do not change) ────────────────────────
+BLACKLIST_TERMS = {
+    "GO:0003674","GO:0008150","GO:0005575","GO:0005488",
+    "GO:0043226","GO:0043229","GO:0043227","GO:0043231",
+    "GO:0110165","GO:0005622","GO:0005623","GO:0044464",
+    "GO:0043232","GO:0044424","GO:0009987","GO:0065007",
+    "GO:0050794","GO:0019222","GO:0060255","GO:0080090",
+    "GO:0050789",
+}
# Evidence tiers. filter.assign_tier() tests them strongest first and accepts
# a term for a tier when  ia > *_IA  and  confidence >= *_CONF, so the IA
# bound is exclusive and the confidence bound is inclusive.
# Strong Evidence  (was GOLD)
TIER_GOLD_IA    = 5.0
TIER_GOLD_CONF  = 0.30
# Moderate Evidence (was GOOD)
TIER_GOOD_IA    = 2.0
TIER_GOOD_CONF  = 0.50
# Indicative        (was SILVER)
TIER_SILVER_IA   = 1.0
TIER_SILVER_CONF = 0.65
+# ── NCBI TAXONOMY API ─────────────────────────────────────────
+NCBI_SEARCH_URL  = "https://eutils.ncbi.nlm.nih.gov/entrez/eutils/esearch.fcgi"
+NCBI_SUMMARY_URL = "https://eutils.ncbi.nlm.nih.gov/entrez/eutils/esummary.fcgi"
+NCBI_FETCH_URL   = "https://eutils.ncbi.nlm.nih.gov/entrez/eutils/efetch.fcgi"
+NCBI_TOOL        = "FunGO"
+NCBI_EMAIL       = "fungo@research.com"
+# ── FLASK ─────────────────────────────────────────────────────
+PORT          = int(os.environ.get("FUNGO_PORT", 5000))
+DEBUG         = os.environ.get("FUNGO_DEBUG", "0") == "1"
+MAX_SEQUENCES = int(os.environ.get("FUNGO_MAX_SEQ", 10))
+# ── Runtime helpers ───────────────────────────────────────────
def ensure_dirs():
    """Make sure the embedding cache directory exists (parents included)."""
    cache_dir = EMB_CACHE_DIR
    cache_dir.mkdir(parents=True, exist_ok=True)
    logger.info("[config] EMB_CACHE_DIR ready → %s", cache_dir)
def validate_paths() -> bool:
    """Report whether every required model path exists.

    Each path is logged: at INFO level when present, at ERROR level when
    missing. All paths are checked even after the first miss.

    Returns:
        True when nothing is missing, otherwise False.
    """
    required = (
        ("PKL_DIR", PKL_DIR),
        ("VOCAB_PKL", VOCAB_PKL),
        ("IA_PKL", IA_PKL),
        ("FEAT_META", FEAT_META),
        ("MODEL_CACHE_DIR", MODEL_CACHE_DIR),
    )
    missing = 0
    for label, location in required:
        if not location.exists():
            logger.error("[config] ✗ %-18s → %s  (NOT FOUND)", label, location)
            missing += 1
            continue
        logger.info("[config] ✓ %-18s → %s", label, location)
    return missing == 0

embedder.py ADDED Viewed

	@@ -0,0 +1,187 @@

+# embedder.py
+"""
+FunGO — ESM2 Embedding Extractor
+==================================
+Extracts layers 30–35 from ESM2-t36-3B.
+- Auto-detects CPU vs GPU
+- Caches embeddings per session to avoid re-extraction
+- Lazy model loading (loaded only on first request)
+"""
+import os
+import hashlib
+import numpy as np
+import torch
+from pathlib import Path
+from config import (
+    MODEL_CACHE_DIR, MODEL_NAME, LAYERS_TO_USE,
+    MAX_SEQ_LENGTH, BATCH_SIZE, DEVICE, USE_FP16,
+    EMB_CACHE_DIR,
+)
# Force offline operation and point the HuggingFace caches at the local model
# directory. These are set at import time, before _load_model() imports
# transformers.
# NOTE(review): the offline flags are hard-coded to "1" here, while
# config.TRANSFORMERS_OFFLINE (read from FUNGO_OFFLINE) is not consulted —
# confirm whether that setting is meant to control this.
os.environ["TRANSFORMERS_OFFLINE"]  = "1"
os.environ["HF_DATASETS_OFFLINE"]   = "1"
os.environ["TRANSFORMERS_CACHE"]    = str(MODEL_CACHE_DIR)
os.environ["HF_HOME"]               = str(MODEL_CACHE_DIR)
# Width of one embedding row: one 2560-wide vector per selected layer.
# NOTE(review): 2560 is presumably the hidden size of MODEL_NAME — confirm
# before switching to a different checkpoint.
N_ESM_DIMS = len(LAYERS_TO_USE) * 2560   # 6 × 2560 = 15,360
+# ── Lazy globals ──────────────────────────────────────────────────────────
_tokenizer = None
_model     = None
def _load_model():
    """Return ``(tokenizer, model)``, loading both on the first call only.

    The model is loaded from local files, moved to DEVICE (and converted to
    half precision when USE_FP16 is set), put into eval mode, and has
    gradients disabled on all parameters.
    """
    global _tokenizer, _model
    already_loaded = _tokenizer is not None and _model is not None
    if not already_loaded:
        print(f"[embedder] Loading ESM2 from local cache → {MODEL_CACHE_DIR}")
        print(f"[embedder] Device: {DEVICE}  |  FP16: {USE_FP16}")
        # Imported lazily so the module can be imported without transformers
        # being loaded.
        from transformers import EsmTokenizer, EsmModel
        local_only = dict(cache_dir=MODEL_CACHE_DIR, local_files_only=True)
        _tokenizer = EsmTokenizer.from_pretrained(MODEL_NAME, **local_only)
        _model = EsmModel.from_pretrained(
            MODEL_NAME, output_hidden_states=True, **local_only)
        _model = _model.to(DEVICE)
        if USE_FP16:
            _model = _model.half()
        _model.eval()
        for weight in _model.parameters():
            weight.requires_grad = False
        print(f"[embedder] Model ready on {DEVICE}")
    return _tokenizer, _model
def _seq_cache_key(sequences: list) -> str:
    """Return a 16-hex-character cache key for an ordered list of sequences.

    The key covers every residue of every sequence. Hashing only a prefix
    plus the length would let two different proteins share a key and be
    served each other's cached embeddings.
    """
    digest = hashlib.sha256()
    for seq in sequences:
        data = seq.encode()
        # The length prefix keeps ["AB", "C"] distinct from ["A", "BC"].
        digest.update(f"{len(data)}:".encode())
        digest.update(data)
    return digest.hexdigest()[:16]
def _load_cache(key: str):
    """Return the cached array for *key*, or None when there is no usable file.

    A missing file and an unreadable one (truncated, corrupt, wrong format)
    are both treated as a cache miss so the caller recomputes the embeddings
    instead of failing the request.
    """
    path = EMB_CACHE_DIR / f"{key}.npy"
    if not path.exists():
        return None
    try:
        return np.load(str(path))
    except (OSError, ValueError, EOFError):
        return None
def _save_cache(key: str, arr: np.ndarray):
    """Write *arr* to ``<key>.npy`` inside the embedding cache directory."""
    target = EMB_CACHE_DIR / f"{key}.npy"
    np.save(str(target), arr)
def extract(sequences: list) -> np.ndarray:
    """
    Extract ESM2 embeddings for a list of sequences.

    Each sequence is cut to MAX_SEQ_LENGTH characters. For every layer in
    LAYERS_TO_USE the hidden states of the sequence positions are averaged,
    and the per-layer means are concatenated into one row.

    Args:
        sequences: List of sequence strings.

    Returns:
        np.ndarray of shape (N, N_ESM_DIMS), dtype float32. NaN and infinite
        values are replaced by 0.0.

    Raises:
        RuntimeError: Re-raised from the model call unless it is an
            out-of-memory error that can be retried with a smaller batch.

    The result is written to the embedding cache; a later call with the same
    cache key and a matching shape returns the cached array instead.
    """
    # Truncate sequences
    seqs_truncated = [s[:MAX_SEQ_LENGTH] for s in sequences]
    N = len(seqs_truncated)
    # Check cache
    cache_key  = _seq_cache_key(seqs_truncated)
    cached_emb = _load_cache(cache_key)
    # The shape check guards against a cached file from a different request size.
    if cached_emb is not None and cached_emb.shape == (N, N_ESM_DIMS):
        print(f"[embedder] Cache hit — skipping extraction for {N} sequences")
        return cached_emb.astype(np.float32)
    print(f"[embedder] Extracting embeddings: {N} sequences on {DEVICE}")
    tokenizer, model = _load_model()
    X = np.zeros((N, N_ESM_DIMS), dtype=np.float32)
    # Working batch size; halved on out-of-memory errors (see below).
    current_batch = BATCH_SIZE
    with torch.no_grad():
        i = 0
        while i < N:
            batch_end  = min(i + current_batch, N)
            batch_seqs = seqs_truncated[i:batch_end]
            try:
                # +2 leaves room for two extra tokens around the sequence —
                # presumably the tokenizer's start/end tokens; TODO confirm.
                inputs = tokenizer(
                    batch_seqs,
                    return_tensors="pt",
                    padding=True,
                    truncation=True,
                    max_length=MAX_SEQ_LENGTH + 2,
                )
                inputs = {k: v.to(DEVICE) for k, v in inputs.items()}
                outputs       = model(**inputs)
                hidden_states = outputs.hidden_states
                for j, seq in enumerate(batch_seqs):
                    seq_len    = len(seq)
                    layer_vecs = []
                    for layer_idx in LAYERS_TO_USE:
                        # Positions 1..seq_len: skips index 0 and any padding
                        # after the sequence.
                        h = hidden_states[layer_idx][j, 1:seq_len + 1, :]
                        v = h.mean(dim=0)
                        if DEVICE == "cuda":
                            v = v.float().cpu().numpy()
                        else:
                            v = v.numpy()
                        layer_vecs.append(v)
                    X[i + j] = np.concatenate(layer_vecs)
                # Advance only after the whole batch succeeded, so a failed
                # batch is retried from the same start index.
                i += len(batch_seqs)
                print(f"[embedder] {i}/{N} done")
            except RuntimeError as e:
                # Retry the same batch with half the size; give up once the
                # batch size is already 1 or the error is not memory-related.
                if "out of memory" in str(e).lower() and current_batch > 1:
                    current_batch = max(1, current_batch // 2)
                    print(f"[embedder] OOM — batch size reduced to {current_batch}")
                    if DEVICE == "cuda":
                        torch.cuda.empty_cache()
                else:
                    raise
    # Sanitise
    bad = np.isnan(X).sum() + np.isinf(X).sum()
    if bad > 0:
        X = np.nan_to_num(X, nan=0.0, posinf=0.0, neginf=0.0)
    # Save cache
    _save_cache(cache_key, X)
    print(f"[embedder] Saved to cache: {cache_key}")
    return X
def build_features(X_esm: np.ndarray, taxon_ids: list,
                   top50_taxa: list) -> np.ndarray:
    """
    Append 51-dim taxonomy features to ESM embeddings.

    Columns 0..49 one-hot encode the position of the protein's taxon in
    *top50_taxa*; column 50 flags a missing or unlisted taxon.

    Args:
        X_esm: Embedding matrix with one row per protein.
        taxon_ids: One taxon ID (or None) per row of *X_esm*.
        top50_taxa: Known taxon IDs, at most 50.

    Returns:
        Matrix of shape (N, X_esm.shape[1] + 51).

    Raises:
        ValueError: If *taxon_ids* does not have one entry per row, or
            *top50_taxa* has more than 50 entries.
    """
    n_known = 50           # one-hot columns for listed taxa
    unknown_col = n_known  # last column: unknown species flag
    N = X_esm.shape[0]
    if len(taxon_ids) != N:
        # A shorter list would silently leave rows with no taxonomy flag set.
        raise ValueError(
            f"Expected {N} taxon IDs (one per sequence), got {len(taxon_ids)}.")
    if len(top50_taxa) > n_known:
        # Extra taxa would collide with the unknown flag or overflow the block.
        raise ValueError(
            f"At most {n_known} known taxa are supported, got {len(top50_taxa)}.")
    taxon_to_i = {t: i for i, t in enumerate(top50_taxa)}
    X_tax = np.zeros((N, n_known + 1), dtype=np.float32)
    for i, tx in enumerate(taxon_ids):
        col = taxon_to_i.get(tx, unknown_col) if tx is not None else unknown_col
        X_tax[i, col] = 1.0
    return np.hstack([X_esm, X_tax])

filter.py ADDED Viewed

	@@ -0,0 +1,152 @@

+# filter.py
+"""
+FunGO — Smart Tier Filtering
+==============================
+Removes generic/root GO terms and assigns evidence tiers
+to remaining predictions.
+Changes from original:
+  1. Tier names updated:
+       GOLD   → STRONG     (Strong Evidence)
+       GOOD   → MODERATE   (Moderate Evidence)
+       SILVER → INDICATIVE
+  2. Combined score = ia_weight × confidence
+     Used for ranking — more scientifically sound.
+  3. filter_predictions() returns a dict with two keys:
+       "display" — top 20 by combined score (for UI screen)
+       "all"     — full filtered list (for CSV download)
+  4. summarise() updated to use new tier keys.
+  5. Blacklist + IA/confidence thresholds → completely unchanged.
+"""
+import logging
+from config import (
+    BLACKLIST_TERMS,
+    TIER_GOLD_IA,   TIER_GOLD_CONF,
+    TIER_GOOD_IA,   TIER_GOOD_CONF,
+    TIER_SILVER_IA, TIER_SILVER_CONF,
+)
+logger = logging.getLogger(__name__)
+ONT_LABELS = {
+    "MFO": "Molecular Function",
+    "BPO": "Biological Process",
+    "CCO": "Cellular Component",
+}
+TIER_LABELS = {
+    "STRONG":     "Strong Evidence",
+    "MODERATE":   "Moderate Evidence",
+    "INDICATIVE": "Indicative",
+}
+TIER_RANK = {"STRONG": 0, "MODERATE": 1, "INDICATIVE": 2}
+# Max predictions shown on screen per protein
+TOP_N_DISPLAY = 20
def assign_tier(go_term: str, ia: float, confidence: float) -> str:
    """
    Pick the evidence tier for one prediction.

    Blacklisted terms are always "NOISE". Otherwise the tiers are tried from
    strongest to weakest and the first one whose IA bound is exceeded and
    whose confidence bound is met wins.

    Returns: "STRONG" | "MODERATE" | "INDICATIVE" | "NOISE"
    """
    if go_term in BLACKLIST_TERMS:
        return "NOISE"
    ladder = (
        ("STRONG",     TIER_GOLD_IA,   TIER_GOLD_CONF),
        ("MODERATE",   TIER_GOOD_IA,   TIER_GOOD_CONF),
        ("INDICATIVE", TIER_SILVER_IA, TIER_SILVER_CONF),
    )
    for name, min_ia, min_conf in ladder:
        if ia > min_ia and confidence >= min_conf:
            return name
    return "NOISE"
def combined_score(ia: float, confidence: float) -> float:
    """
    Return the ranking score: IA weight times confidence, to 6 decimals.
    """
    product = ia * confidence
    return round(product, 6)
def filter_predictions(raw_predictions: list, ia_weights: dict) -> dict:
    """
    Drop noise predictions, annotate the rest and rank them.

    Parameters
    ----------
    raw_predictions : list of dicts with at least ``go_term``,
        ``confidence`` and ``ontology``.
    ia_weights : mapping of GO term to IA weight; missing terms count as 0.0.

    Returns
    -------
    {
      "display": the first TOP_N_DISPLAY ranked predictions,
      "all":     every ranked prediction
    }
    Each kept prediction keeps its original fields and gains
    ia_weight, combined_score, tier, tier_rank, tier_label, ontology_label.
    Ranking is by combined_score descending, then tier_rank ascending.
    """
    kept = []
    for raw in raw_predictions:
        term = raw["go_term"]
        conf = raw["confidence"]
        weight = float(ia_weights.get(term, 0.0))
        tier = assign_tier(term, weight, conf)
        if tier == "NOISE":
            continue
        if tier not in TIER_RANK:
            logger.warning("Unknown tier %r for %s — skipping", tier, term)
            continue
        score = combined_score(weight, conf)
        entry = dict(raw)
        entry.update(
            ia_weight=round(weight, 4),
            combined_score=score,
            tier=tier,
            tier_rank=TIER_RANK[tier],
            tier_label=TIER_LABELS[tier],
            ontology_label=ONT_LABELS.get(raw["ontology"], raw["ontology"]),
        )
        kept.append(entry)
    kept.sort(key=lambda item: (-item["combined_score"], item["tier_rank"]))
    top = kept[:TOP_N_DISPLAY]
    return {"display": top, "all": kept}
def summarise(filtered_display: list, all_filtered: list, protein_id: str) -> dict:
    """
    Build the per-protein summary.

    Counts and averages cover every entry of *all_filtered*; only the
    ``displayed`` field looks at *filtered_display*. Ontologies and tiers
    outside the known sets are not counted. Averages are 0.0 for an empty list.
    """
    by_ontology = {"MFO": 0, "BPO": 0, "CCO": 0}
    by_tier = {"STRONG": 0, "MODERATE": 0, "INDICATIVE": 0}
    for row in all_filtered:
        ontology = row.get("ontology", "")
        if ontology in by_ontology:
            by_ontology[ontology] += 1
        tier = row.get("tier", "")
        if tier in by_tier:
            by_tier[tier] += 1
    total = len(all_filtered)

    def _mean(field):
        # Mean of one numeric field over all filtered rows, to 4 decimals.
        if not total:
            return 0.0
        return round(sum(row[field] for row in all_filtered) / total, 4)

    summary = {
        "protein_id": protein_id,
        "total_filtered": total,
        "displayed": len(filtered_display),
        "by_ontology": by_ontology,
        "by_tier": by_tier,
        "has_strong_evidence": by_tier["STRONG"] > 0,
    }
    summary["avg_confidence"] = _mean("confidence")
    summary["avg_ia"] = _mean("ia_weight")
    summary["avg_combined_score"] = _mean("combined_score")
    return summary

hf_README.md ADDED Viewed

	@@ -0,0 +1,41 @@

+---
+title: FunGO
+emoji: 🧬
+colorFrom: blue
+colorTo: green
+sdk: gradio
+sdk_version: 4.44.0
+app_file: app.py
+pinned: false
+license: mit
+short_description: Protein Function Prediction using ESM2 + XGBoost
+---
+# FunGO — Protein Function Prediction
+**Beyond Prediction — Understanding Function.**
+FunGO predicts Gene Ontology (GO) terms for protein sequences using:
+- **ESM2-t36-3B** — protein language model embeddings (layers 30–35)
+- **XGBoost classifiers** — 4,133 GO-term specific models
+- **Evidence-tiered filtering** — Strong / Moderate / Indicative
+## Evidence Tiers
+| Tier | IA Weight | Confidence | Description |
+|------|-----------|------------|-------------|
+| Strong Evidence | > 5.0 | ≥ 0.30 | Highly specific GO term |
+| Moderate Evidence | > 2.0 | ≥ 0.50 | Moderately specific term |
+| Indicative | > 1.0 | ≥ 0.65 | Lower specificity, high confidence |
+## Ontologies Covered
+- **MFO** — Molecular Function
+- **BPO** — Biological Process
+- **CCO** — Cellular Component
+## Development Team
+- **Dr. Beenish Maqsood** — Principal Investigator, School of Biochemistry and Biotechnology, University of the Punjab
+- **Dr. Naeem Mahmood** — Co-Supervisor, School of Biochemistry and Biotechnology, University of the Punjab
+- **Muteeba Azhar** — Lead Developer, MS Researcher, University of the Punjab

predictor.py ADDED Viewed

	@@ -0,0 +1,216 @@

+# predictor.py
+"""
+FunGO — Prediction Engine
+===========================
+Loads XGBoost models once at startup.
+Runs inference across all 3 ontologies (MFO, BPO, CCO).
+Changes from original:
+  1. Added get_model_stats() — returns classifier counts per ontology
+     (used by /model/info endpoint).
+  2. Fixed open() to use context managers (file handles now closed).
+  3. tempfile.mktemp() replaced with NamedTemporaryFile (WSL fix).
+  4. Failed classifiers are counted and logged instead of silent pass.
+  5. Input shape validation in predict().
+"""
+import json
+import logging
+import pickle
+import shutil
+import subprocess
+import tempfile
+import numpy as np
+from pathlib import Path
+from config import PKL_DIR, VOCAB_PKL, IA_PKL, FEAT_META
+logger = logging.getLogger(__name__)
+ONTS   = ["MFO", "BPO", "CCO"]
+# ── Globals ───────────────────────────────────────────────────
+_models_dict     = None
+_thresholds_dict = None
+_ia_weights      = None
+_vocabularies    = None
+_top50_taxa      = None
+# ── Helpers ───────────────────────────────────────────────────
def _wsl_copy(src: Path) -> Path:
    """Copy *src* to a fresh ``.pkl`` temp file and return the new path.

    The temp file is not removed automatically; the caller deletes it.
    """
    handle = tempfile.NamedTemporaryFile(suffix=".pkl", delete=False)
    try:
        destination = Path(handle.name)
    finally:
        # Close first so the copy below can overwrite the placeholder file.
        handle.close()
    shutil.copy2(str(src), str(destination))
    return destination
def _safe_load(path: Path) -> object:
    """Load a pickle, falling back to a temp copy when direct access is denied.

    Args:
        path: Pickle file to load.

    Returns:
        The unpickled object.

    Raises:
        OSError: If the file cannot be read directly or through the copy.

    SECURITY: pickle executes code from the file while loading. Only use this
    on model files from a trusted source.
    """
    # Best effort: make the file readable. Done in-process instead of
    # spawning a `chmod` subprocess; failures are ignored on purpose.
    try:
        Path(path).chmod(0o644)
    except OSError:
        pass
    try:
        with open(path, "rb") as fh:
            return pickle.load(fh)
    except PermissionError:
        pass
    # Direct read was denied: read a private copy instead.
    tmp_path = None
    try:
        tmp_path = _wsl_copy(path)
        with open(tmp_path, "rb") as fh:
            return pickle.load(fh)
    finally:
        if tmp_path and tmp_path.exists():
            tmp_path.unlink()
def _safe_read_json(path: Path) -> dict:
    """Read a JSON file as UTF-8, falling back to `cat` when access is denied.

    Args:
        path: JSON file to read.

    Returns:
        The decoded JSON document.

    Raises:
        OSError: If the file cannot be read directly.
        subprocess.CalledProcessError: If the `cat` fallback fails.
        json.JSONDecodeError: If the content is not valid JSON.
    """
    # Best effort: make the file readable; failures are ignored on purpose.
    try:
        Path(path).chmod(0o644)
    except OSError:
        pass
    try:
        # Read bytes and decode explicitly so the result does not depend on
        # the process locale.
        raw = Path(path).read_bytes()
    except PermissionError:
        result = subprocess.run(["cat", str(path)], capture_output=True, check=True)
        raw = result.stdout
    return json.loads(raw.decode("utf-8", errors="replace"))
+# ── Public API ────────────────────────────────────────────────
def load_all():
    """
    Load all models and supporting data into memory.
    Call once at Flask startup (~30–120 s depending on hardware).

    Two pickle layouts are accepted per ontology: a dict keyed by GO term
    (every threshold defaults to 0.5), or a dict with "models",
    "selected_terms" and optional "thresholds" (list or per-term dict).

    The module-level state is only replaced after everything has loaded, so
    a failed load does not leave it half-populated.

    Raises:
        ValueError: If a model file is empty, or its models, terms and
            thresholds differ in length.
    """
    global _models_dict, _thresholds_dict, _ia_weights, _vocabularies, _top50_taxa
    logger.info("[predictor] Loading vocabularies …")
    vocabularies = _safe_load(VOCAB_PKL)
    logger.info("[predictor] Loading IA weights …")
    ia_weights = _safe_load(IA_PKL)
    logger.info("[predictor] IA weights: %d terms", len(ia_weights))
    meta       = _safe_read_json(FEAT_META)
    top50_taxa = [int(t) for t in meta["taxonomy_info"]["top50_taxa"]]
    logger.info("[predictor] Top-50 taxa loaded (%d)", len(top50_taxa))
    models_dict     = {}
    thresholds_dict = {}
    for ont in ONTS:
        pkl_path = PKL_DIR / f"models_{ont}.pkl"
        size_mb  = pkl_path.stat().st_size / 1e6
        logger.info("[predictor] Loading %s (%.0f MB) …", pkl_path.name, size_mb)
        raw = _safe_load(pkl_path)
        if not raw:
            # next(iter(raw)) below would otherwise raise a bare StopIteration.
            raise ValueError(f"{pkl_path.name} contains no classifiers.")
        first_key = next(iter(raw))
        if isinstance(first_key, str) and first_key.startswith("GO:"):
            models_d     = raw
            thresholds_d = {t: 0.5 for t in raw}
        else:
            clf_list  = raw["models"]
            term_list = raw["selected_terms"]
            thr_raw   = raw.get("thresholds", [0.5] * len(clf_list))
            thr_list  = (list(thr_raw) if not isinstance(thr_raw, dict)
                         else [thr_raw.get(t, 0.5) for t in term_list])
            # zip() would silently drop the surplus entries of the longer list.
            if not (len(clf_list) == len(term_list) == len(thr_list)):
                raise ValueError(
                    f"{pkl_path.name}: {len(clf_list)} models, "
                    f"{len(term_list)} terms and {len(thr_list)} thresholds "
                    "do not line up.")
            models_d     = dict(zip(term_list, clf_list))
            thresholds_d = dict(zip(term_list, thr_list))
        models_dict[ont]     = models_d
        thresholds_dict[ont] = thresholds_d
        logger.info("[predictor] %s: %d classifiers ready", ont, len(models_d))
    _vocabularies    = vocabularies
    _ia_weights      = ia_weights
    _top50_taxa      = top50_taxa
    _models_dict     = models_dict
    _thresholds_dict = thresholds_dict
    logger.info("[predictor] All models loaded successfully.")
+def get_top50_taxa() -> list:
+    if _top50_taxa is None:
+        raise RuntimeError("Models not loaded — call load_all() first.")
+    return _top50_taxa
+def get_ia_weights() -> dict:
+    if _ia_weights is None:
+        raise RuntimeError("Models not loaded — call load_all() first.")
+    return _ia_weights
+def get_model_stats() -> dict:
+    """
+    Return classifier counts per ontology.
+    Used by GET /model/info endpoint.
+    Returns: {"MFO": 1500, "BPO": 1500, "CCO": 1133}
+    """
+    if _models_dict is None:
+        raise RuntimeError("Models not loaded — call load_all() first.")
+    return {ont: len(models) for ont, models in _models_dict.items()}
+def predict(X_final: np.ndarray, protein_ids: list) -> list:
+    """
+    Run inference for all proteins across all 3 ontologies.
+    Parameters
+    ----------
+    X_final     : (N, 15411) float32 feature matrix
+    protein_ids : list of N protein ID strings
+    Returns
+    -------
+    List of raw prediction dicts:
+        [{protein_id, go_term, ontology, confidence, threshold}, …]
+    """
+    if _models_dict is None:
+        raise RuntimeError("Models not loaded — call load_all() first.")
+    N = X_final.shape[0]
+    if N != len(protein_ids):
+        raise ValueError(
+            f"X_final has {N} rows but protein_ids has {len(protein_ids)} entries."
+        )
+    all_preds    = []
+    failed_terms = 0
+    for ont in ONTS:
+        ont_models     = _models_dict[ont]
+        ont_thresholds = _thresholds_dict[ont]
+        n_terms        = len(ont_models)
+        logger.info("[predictor] %s — scoring %d terms × %d proteins …", ont, n_terms, N)
+        for go_term, clf in ont_models.items():
+            threshold = float(ont_thresholds.get(go_term, 0.5))
+            try:
+                proba = clf.predict_proba(X_final)[:, 1]
+                for i, pid in enumerate(protein_ids):
+                    conf = float(proba[i])
+                    if conf >= threshold:
+                        all_preds.append({
+                            "protein_id": pid,
+                            "go_term":    go_term,
+                            "ontology":   ont,
+                            "confidence": round(conf, 4),
+                            "threshold":  round(threshold, 4),
+                        })
+            except Exception as exc:
+                failed_terms += 1
+                logger.warning("[predictor] Classifier failed %s/%s: %s", ont, go_term, exc)
+    if failed_terms:
+        logger.warning("[predictor] Total failed classifiers: %d", failed_terms)
+    logger.info("[predictor] Inference complete — %d raw predictions", len(all_preds))
+    return all_preds

requirements.txt ADDED Viewed

	@@ -0,0 +1,9 @@

+flask>=3.0.0
+flask-cors>=4.0.0
+numpy>=1.24.0
+torch>=2.0.0
+transformers>=4.35.0
+xgboost>=2.0.0
+requests>=2.31.0
+gunicorn>=21.0.0
+gradio>=4.0.0

taxonomy.py ADDED Viewed

	@@ -0,0 +1,200 @@

+# taxonomy.py
+"""
+FunGO — NCBI Taxonomy Service
+===============================
+Species name → taxon ID lookup and reverse lookup.
+Fixes applied:
+  1. UID string/int consistency — result_map keys are always strings,
+     now explicitly uses str(uid) so 9606 never resolves to {}.
+  2. Species-rank preference — results sorted so "species" rank
+     appears before "genus". Prevents 9605 (Homo genus) showing
+     before 9606 (Homo sapiens species).
+  3. Exact-name boost — exact query match moved to position 0.
+  4. Cache key includes max_results to prevent stale smaller lists.
+  5. xml.etree.ElementTree replaces fragile regex XML parsing.
+  6. Retry logic — 3 attempts with 2s gap on connection errors.
+"""
+import logging
+import time
+import xml.etree.ElementTree as ET
+import requests
+from config import (
+    NCBI_SEARCH_URL, NCBI_SUMMARY_URL, NCBI_FETCH_URL,
+    NCBI_TOOL, NCBI_EMAIL,
+)
+# Module-level logger; messages in this file are prefixed with "[taxonomy]".
+logger  = logging.getLogger(__name__)
+# HTTP headers sent with every NCBI request made by _ncbi_get().
+HEADERS = {"User-Agent": f"FunGO/1.0 ({NCBI_EMAIL})"}
+# Passed as ``timeout=`` to requests.get() (seconds).
+TIMEOUT = 10
+# Total number of attempts _ncbi_get() makes before giving up.
+RETRIES = 3
+# Seconds slept between failed attempts.
+RETRY_DELAY = 2
+# Sort weight per taxonomic rank — lower sorts first, so species-level ranks
+# come before genus and everything above it. Ranks missing from this table
+# get 99 from _rank_priority().
+_RANK_PRIORITY = {
+    "species": 0, "subspecies": 1, "varietas": 2,
+    "forma": 3, "strain": 4, "no rank": 5,
+    "genus": 6, "family": 7, "order": 8,
+    "class": 9, "phylum": 10, "kingdom": 11, "superkingdom": 12,
+}
+def _rank_priority(rank: str) -> int:
+    return _RANK_PRIORITY.get(rank.lower().strip(), 99)
+# search_species() results, keyed by (lower-cased query, max_results).
+_search_cache: dict = {}
+# get_taxon_info() results, keyed by taxon_id; only successful lookups are stored.
+_id_to_info_cache: dict = {}
+def _ncbi_get(url: str, params: dict) -> requests.Response:
+    last_exc = None
+    for attempt in range(1, RETRIES + 1):
+        try:
+            resp = requests.get(url, params=params, timeout=TIMEOUT, headers=HEADERS)
+            resp.raise_for_status()
+            return resp
+        except requests.RequestException as exc:
+            last_exc = exc
+            if attempt < RETRIES:
+                logger.warning("[taxonomy] Request error (attempt %d/%d): %s — retrying in %ds",
+                               attempt, RETRIES, exc, RETRY_DELAY)
+                time.sleep(RETRY_DELAY)
+    raise last_exc
+def search_species(query: str, max_results: int = 8) -> list:
+    """
+    Search NCBI taxonomy by species name.
+    Returns [{taxon_id, scientific_name, common_name, rank, division}]
+    Sorted: species rank first, exact name match at position 0.
+    """
+    query = query.strip()
+    if len(query) < 2:
+        return []
+    cache_key = (query.lower(), max_results)
+    if cache_key in _search_cache:
+        return _search_cache[cache_key]
+    try:
+        search_resp = _ncbi_get(NCBI_SEARCH_URL, {
+            "db": "taxonomy", "term": query,
+            "retmax": max_results, "retmode": "json",
+            "tool": NCBI_TOOL, "email": NCBI_EMAIL,
+        })
+        ids = search_resp.json().get("esearchresult", {}).get("idlist", [])
+        if not ids:
+            _search_cache[cache_key] = []
+            return []
+        summary_resp = _ncbi_get(NCBI_SUMMARY_URL, {
+            "db": "taxonomy", "id": ",".join(ids),
+            "retmode": "json", "tool": NCBI_TOOL, "email": NCBI_EMAIL,
+        })
+        result_map = summary_resp.json().get("result", {})
+        uids = result_map.get("uids", ids)
+        results = []
+        for uid in uids:
+            item = result_map.get(str(uid), {})   # FIX: explicit str()
+            if not item:
+                continue
+            results.append({
+                "taxon_id":        int(uid),
+                "scientific_name": item.get("scientificname", ""),
+                "common_name":     item.get("commonname", ""),
+                "rank":            item.get("rank", ""),
+                "division":        item.get("division", ""),
+            })
+        # FIX: sort by rank — species before genus
+        results.sort(key=lambda r: _rank_priority(r.get("rank", "")))
+        # FIX: exact name match → front of list
+        q_lower = query.lower()
+        exact = [r for r in results if r["scientific_name"].lower() == q_lower]
+        rest  = [r for r in results if r["scientific_name"].lower() != q_lower]
+        results = exact + rest
+        _search_cache[cache_key] = results
+        logger.info("[taxonomy] search %r → %d results", query, len(results))
+        return results
+    except Exception as exc:
+        logger.error("[taxonomy] search_species(%r) failed: %s", query, exc)
+        return [{"error": str(exc)}]
+def get_taxon_info(taxon_id: int) -> dict:
+    """
+    Reverse lookup: taxon ID → full species info with lineage.
+    Uses xml.etree.ElementTree — handles multi-line XML correctly.
+    """
+    if taxon_id in _id_to_info_cache:
+        return _id_to_info_cache[taxon_id]
+    base = {
+        "taxon_id": taxon_id, "scientific_name": "",
+        "common_name": "", "rank": "", "division": "",
+        "lineage": "", "verified": False,
+    }
+    try:
+        resp = _ncbi_get(NCBI_FETCH_URL, {
+            "db": "taxonomy", "id": taxon_id,
+            "retmode": "xml", "tool": NCBI_TOOL, "email": NCBI_EMAIL,
+        })
+        root     = ET.fromstring(resp.text)
+        taxon_el = root.find("Taxon")
+        if taxon_el is None:
+            base["error"] = "Taxon element not found in NCBI XML"
+            return base
+        def txt(tag: str) -> str:
+            el = taxon_el.find(tag)
+            return (el.text or "").strip() if el is not None else ""
+        lineage_parts = [
+            (a.findtext("ScientificName") or "").strip()
+            for a in taxon_el.findall("./LineageEx/Taxon")
+        ]
+        common = (taxon_el.findtext("OtherNames/CommonName") or
+                  taxon_el.findtext("CommonName") or "")
+        info = {
+            **base,
+            "scientific_name": txt("ScientificName"),
+            "common_name":     common.strip(),
+            "rank":            txt("Rank"),
+            "division":        txt("Division"),
+            "lineage":         " > ".join(p for p in lineage_parts if p),
+            "verified":        True,
+        }
+        _id_to_info_cache[taxon_id] = info
+        logger.info("[taxonomy] Resolved taxon %d → %s", taxon_id, info["scientific_name"])
+        return info
+    except ET.ParseError as exc:
+        logger.error("[taxonomy] XML parse error for taxon %d: %s", taxon_id, exc)
+        base["error"] = f"XML parse error: {exc}"
+        return base
+    except Exception as exc:
+        logger.error("[taxonomy] get_taxon_info(%d) failed: %s", taxon_id, exc)
+        base["error"] = str(exc)
+        return base
+def resolve_taxon(taxon_id: int, top50_taxa: list) -> dict:
+    """Check training-set membership for a taxon ID."""
+    info        = get_taxon_info(taxon_id)
+    in_training = taxon_id in top50_taxa
+    return {
+        **info,
+        "in_training":     in_training,
+        "training_status": "in_training_data" if in_training else "unknown_species_fallback",
+    }