Spaces:

Muteeba
/

FunGO

Sleeping

App Files Community

Muteeba committed on Apr 16

Commit

c557a60

1 Parent(s): e22292f

fix: graceful start without model files

Browse files

Files changed (2) hide show

app.py +58 -79
hf-space +1 -0

app.py CHANGED Viewed

@@ -1,30 +1,7 @@
-# app.py — FunGO HuggingFace Space
-"""
-FunGO v2.0 — HuggingFace Spaces Deployment
-=============================================
-Flask API running on port 7860.
-Model files loaded from /data/ (HF persistent storage).
-To upload model files:
-  pip install huggingface_hub
-  huggingface-cli login
-  huggingface-cli upload Muteeba/FunGO ./pipeline_outputs/models        /data/models        --repo-type=space
-  huggingface-cli upload Muteeba/FunGO ./pipeline_outputs/labels        /data/labels        --repo-type=space
-  huggingface-cli upload Muteeba/FunGO ./pipeline_outputs/go_data       /data/go_data       --repo-type=space
-  huggingface-cli upload Muteeba/FunGO ./pipeline_outputs/features      /data/features      --repo-type=space
-  huggingface-cli upload Muteeba/FunGO /mnt/e/repeat/embeddings/model_cache /data/esm2_cache --repo-type=space
-"""
-import csv
-import io
-import logging
-import os
-import re as _re
-import sys
-import time
 from collections import OrderedDict
-# ── HuggingFace paths ─────────────────────────────────────────
 os.environ.setdefault("FUNGO_PKL_DIR",    "/data/models")
 os.environ.setdefault("FUNGO_VOCAB_PKL",  "/data/labels/vocabularies.pkl")
 os.environ.setdefault("FUNGO_IA_PKL",     "/data/go_data/ia_weights.pkl")
@@ -37,18 +14,14 @@ os.environ.setdefault("FUNGO_PORT",       "7860")
 from flask import Flask, jsonify, request, Response
 from flask_cors import CORS
 import config
 import predictor
 import embedder
 import filter as flt
 import taxonomy
-logging.basicConfig(
-    level=logging.INFO,
-    format="%(asctime)s [%(levelname)s] %(name)s — %(message)s",
-    datefmt="%H:%M:%S",
-)
 log = logging.getLogger("fungo.app")
 app = Flask(__name__)
@@ -57,10 +30,10 @@ app.config["MAX_CONTENT_LENGTH"] = 2 * 1024 * 1024
 _csv_store: OrderedDict = OrderedDict()
 _CSV_MAX = 50
 def _store_csv(job_id, predictions):
-    if len(_csv_store) >= _CSV_MAX:
-        _csv_store.popitem(last=False)
     _csv_store[job_id] = {"predictions": predictions, "ts": time.time()}
 def _make_csv(predictions):
@@ -70,14 +43,13 @@ def _make_csv(predictions):
                 "tier","tier_label","confidence","ia_weight","combined_score","threshold"])
     for pid, data in predictions.items():
         for p in data.get("all", []):
-            w.writerow([pid, p.get("go_term",""), p.get("ontology",""),
-                p.get("ontology_label",""), p.get("tier",""), p.get("tier_label",""),
-                p.get("confidence",""), p.get("ia_weight",""),
-                p.get("combined_score",""), p.get("threshold","")])
     return out.getvalue()
 _OX_RE = _re.compile(r"OX=(\d+)")
 def _parse_taxon_id(header):
     m = _OX_RE.search(header or "")
     return int(m.group(1)) if m else None
@@ -91,21 +63,19 @@ def parse_fasta(fasta_text):
             if current_id is not None:
                 seq = "".join(current_seq).upper()
                 if seq:
-                    proteins.append({"id": current_id, "seq": seq,
-                        "header": current_hdr, "taxon_id": _parse_taxon_id(current_hdr)})
             current_hdr = line[1:].strip()
             parts = current_hdr.split("|")
-            current_id = parts[1] if len(parts) >= 3 else current_hdr.split()[0]
             current_seq = []
-        else:
-            current_seq.append(line)
     if current_id is not None:
         seq = "".join(current_seq).upper()
         if seq:
-            proteins.append({"id": current_id, "seq": seq,
-                "header": current_hdr, "taxon_id": _parse_taxon_id(current_hdr)})
-    if not proteins:
-        raise ValueError("No valid protein sequences found in FASTA input.")
     return proteins
 def _run_prediction(fasta_text, taxon_id_override):
@@ -117,37 +87,40 @@ def _run_prediction(fasta_text, taxon_id_override):
     taxon_ids   = [taxon_id_override if taxon_id_override is not None
                    else p["taxon_id"] for p in proteins]
     log.info("Proteins: %s | Taxon IDs: %s", protein_ids, taxon_ids)
-    t0      = time.perf_counter()
     X_esm   = embedder.extract(sequences)
     top50   = predictor.get_top50_taxa()
     X_final = embedder.build_features(X_esm, taxon_ids, top50)
     raw_preds  = predictor.predict(X_final, protein_ids)
     ia_weights = predictor.get_ia_weights()
     for p in raw_preds:
-        p["ia_weight"] = round(float(ia_weights.get(p["go_term"], 0.0)), 4)
-    return proteins, raw_preds, ia_weights, round(time.perf_counter() - t0, 2)
 @app.route("/health", methods=["GET"])
 def health():
-    return jsonify({"status":"ok","device":config.DEVICE,"fp16":config.USE_FP16,"version":"2.0.0"})
 @app.route("/model/info", methods=["GET"])
 def model_info():
     try: stats = predictor.get_model_stats()
-    except RuntimeError as e: return jsonify({"error": str(e)}), 503
     return jsonify({"device":config.DEVICE,"fp16":config.USE_FP16,
         "model_name":config.MODEL_NAME,"ontologies":stats,
         "top50_taxa_count":len(predictor.get_top50_taxa()),
         "thresholds":{
-            "STRONG":    {"min_ia":config.TIER_GOLD_IA,   "min_conf":config.TIER_GOLD_CONF},
-            "MODERATE":  {"min_ia":config.TIER_GOOD_IA,   "min_conf":config.TIER_GOOD_CONF},
-            "INDICATIVE":{"min_ia":config.TIER_SILVER_IA, "min_conf":config.TIER_SILVER_CONF},
         },"display_limit":flt.TOP_N_DISPLAY})
 @app.route("/taxonomy/search", methods=["GET"])
 def taxonomy_search():
     q = request.args.get("q","").strip()
-    if len(q) < 2: return jsonify({"error":"Query must be at least 2 characters."}), 400
     try: max_r = min(int(request.args.get("max_results",8)),20)
     except: max_r = 8
     return jsonify({"query":q,"results":taxonomy.search_species(q,max_results=max_r)})
@@ -162,6 +135,8 @@ def taxonomy_verify():
 @app.route("/predict", methods=["POST"])
 def predict():
     if not request.is_json: return jsonify({"error":"Content-Type must be application/json."}), 415
     body = request.get_json(silent=True) or {}
     fasta_text = body.get("fasta","").strip()
@@ -169,19 +144,17 @@ def predict():
     taxon_id_override = None
     if "taxon_id" in body:
         try: taxon_id_override = int(body["taxon_id"])
-        except: return jsonify({"error":f"Invalid taxon_id"}), 400
     try:
         proteins, raw_preds, ia_weights, elapsed = _run_prediction(fasta_text, taxon_id_override)
-    except ValueError as e: return jsonify({"error": str(e)}), 400
-    except RuntimeError as e: return jsonify({"error": str(e)}), 503
     except Exception as e:
-        log.exception("Prediction error"); return jsonify({"error": str(e)}), 500
     protein_ids = [p["id"] for p in proteins]
     raw_by_pid  = {pid:[] for pid in protein_ids}
     for pred in raw_preds: raw_by_pid[pred["protein_id"]].append(pred)
-    predictions, csv_data, total_display, total_all = {}, {}, 0, 0
     for prot in proteins:
         pid = prot["id"]
         res = flt.filter_predictions(raw_by_pid[pid], ia_weights)
@@ -191,7 +164,6 @@ def predict():
             "summary":flt.summarise(display,all_f,pid),
             "display":display,"total_all":len(all_f)}
         csv_data[pid] = {"all":all_f}
     job_id = str(int(time.time()*1000))
     _store_csv(job_id, csv_data)
     return jsonify({"job_id":job_id,
@@ -207,11 +179,13 @@ def download_csv():
     if not job_id: return jsonify({"error":"job_id required."}), 400
     job = _csv_store.get(job_id)
     if not job: return jsonify({"error":f"Job '{job_id}' not found."}), 404
-    return Response(_make_csv(job["predictions"]), mimetype="text/csv",
         headers={"Content-Disposition":f"attachment; filename=fungo_{job_id}.csv"})
 @app.route("/predict/debug", methods=["POST"])
 def predict_debug():
     if not request.is_json: return jsonify({"error":"Content-Type must be application/json."}), 415
     body = request.get_json(silent=True) or {}
     fasta_text = body.get("fasta","").strip()
@@ -222,15 +196,11 @@ def predict_debug():
         except: return jsonify({"error":"Invalid taxon_id"}), 400
     try:
         proteins, raw_preds, ia_weights, elapsed = _run_prediction(fasta_text, taxon_id_override)
-    except ValueError as e: return jsonify({"error":str(e)}), 400
-    except RuntimeError as e: return jsonify({"error":str(e)}), 503
     except Exception as e:
         log.exception("Debug error"); return jsonify({"error":str(e)}), 500
     protein_ids = [p["id"] for p in proteins]
     raw_by_pid  = {pid:[] for pid in protein_ids}
     for pred in raw_preds: raw_by_pid[pred["protein_id"]].append(pred)
     thr = {"STRONG":{"min_ia":config.TIER_GOLD_IA,"min_conf":config.TIER_GOLD_CONF},
            "MODERATE":{"min_ia":config.TIER_GOOD_IA,"min_conf":config.TIER_GOOD_CONF},
            "INDICATIVE":{"min_ia":config.TIER_SILVER_IA,"min_conf":config.TIER_SILVER_CONF}}
@@ -244,13 +214,13 @@ def predict_debug():
         for pred in raw_by_pid[pid]:
             go = pred["go_term"]
             if go in accepted: continue
-            ia, conf = pred.get("ia_weight", float(ia_weights.get(go,0.0))), pred["confidence"]
             if go in config.BLACKLIST_TERMS: reason="blacklisted"
-            elif ia <= config.TIER_SILVER_IA: reason=f"ia_too_low (ia={ia:.4f})"
-            elif conf < config.TIER_SILVER_CONF: reason=f"conf_too_low (conf={conf:.4f})"
             else: reason="below_all_tiers"
-            fo.append({"go_term":go,"ontology":pred["ontology"],"confidence":conf,
-                       "ia_weight":ia,"reason":reason})
         fo.sort(key=lambda x:-x["ia_weight"])
         predictions[pid] = {"taxon_id":prot["taxon_id"],
             "summary":flt.summarise(display,all_f,pid),
@@ -268,11 +238,20 @@ def internal(e):
     log.exception("Unhandled error"); return jsonify({"error":"Internal server error."}), 500
 if __name__ == "__main__":
     log.info("FunGO v2.0 — HuggingFace Space starting …")
     config.ensure_dirs()
-    if not config.validate_paths():
-        log.error("Model paths missing!")
-        sys.exit(1)
-    predictor.load_all()
-    log.info("Models loaded. Serving on port 7860 …")
     app.run(host="0.0.0.0", port=7860, debug=False)

+# app.py — FunGO HuggingFace Space v2
+import csv, io, logging, os, re as _re, sys, time
 from collections import OrderedDict
 os.environ.setdefault("FUNGO_PKL_DIR",    "/data/models")
 os.environ.setdefault("FUNGO_VOCAB_PKL",  "/data/labels/vocabularies.pkl")
 os.environ.setdefault("FUNGO_IA_PKL",     "/data/go_data/ia_weights.pkl")
 from flask import Flask, jsonify, request, Response
 from flask_cors import CORS
 import config
 import predictor
 import embedder
 import filter as flt
 import taxonomy
+logging.basicConfig(level=logging.INFO,
+    format="%(asctime)s [%(levelname)s] %(name)s — %(message)s", datefmt="%H:%M:%S")
 log = logging.getLogger("fungo.app")
 app = Flask(__name__)
 _csv_store: OrderedDict = OrderedDict()
 _CSV_MAX = 50
+_models_ready = False   # flag — True only after successful load
 def _store_csv(job_id, predictions):
+    if len(_csv_store) >= _CSV_MAX: _csv_store.popitem(last=False)
     _csv_store[job_id] = {"predictions": predictions, "ts": time.time()}
 def _make_csv(predictions):
                 "tier","tier_label","confidence","ia_weight","combined_score","threshold"])
     for pid, data in predictions.items():
         for p in data.get("all", []):
+            w.writerow([pid,p.get("go_term",""),p.get("ontology",""),
+                p.get("ontology_label",""),p.get("tier",""),p.get("tier_label",""),
+                p.get("confidence",""),p.get("ia_weight",""),
+                p.get("combined_score",""),p.get("threshold","")])
     return out.getvalue()
 _OX_RE = _re.compile(r"OX=(\d+)")
 def _parse_taxon_id(header):
     m = _OX_RE.search(header or "")
     return int(m.group(1)) if m else None
             if current_id is not None:
                 seq = "".join(current_seq).upper()
                 if seq:
+                    proteins.append({"id":current_id,"seq":seq,
+                        "header":current_hdr,"taxon_id":_parse_taxon_id(current_hdr)})
             current_hdr = line[1:].strip()
             parts = current_hdr.split("|")
+            current_id = parts[1] if len(parts)>=3 else current_hdr.split()[0]
             current_seq = []
+        else: current_seq.append(line)
     if current_id is not None:
         seq = "".join(current_seq).upper()
         if seq:
+            proteins.append({"id":current_id,"seq":seq,
+                "header":current_hdr,"taxon_id":_parse_taxon_id(current_hdr)})
+    if not proteins: raise ValueError("No valid protein sequences found.")
     return proteins
 def _run_prediction(fasta_text, taxon_id_override):
     taxon_ids   = [taxon_id_override if taxon_id_override is not None
                    else p["taxon_id"] for p in proteins]
     log.info("Proteins: %s | Taxon IDs: %s", protein_ids, taxon_ids)
+    t0 = time.perf_counter()
     X_esm   = embedder.extract(sequences)
     top50   = predictor.get_top50_taxa()
     X_final = embedder.build_features(X_esm, taxon_ids, top50)
     raw_preds  = predictor.predict(X_final, protein_ids)
     ia_weights = predictor.get_ia_weights()
     for p in raw_preds:
+        p["ia_weight"] = round(float(ia_weights.get(p["go_term"],0.0)),4)
+    return proteins, raw_preds, ia_weights, round(time.perf_counter()-t0,2)
 @app.route("/health", methods=["GET"])
 def health():
+    return jsonify({"status":"ok","device":config.DEVICE,
+        "fp16":config.USE_FP16,"version":"2.0.0","models_ready":_models_ready})
 @app.route("/model/info", methods=["GET"])
 def model_info():
+    if not _models_ready:
+        return jsonify({"error":"Models not loaded yet. Upload model files to /data/ first."}), 503
     try: stats = predictor.get_model_stats()
+    except RuntimeError as e: return jsonify({"error":str(e)}), 503
     return jsonify({"device":config.DEVICE,"fp16":config.USE_FP16,
         "model_name":config.MODEL_NAME,"ontologies":stats,
         "top50_taxa_count":len(predictor.get_top50_taxa()),
         "thresholds":{
+            "STRONG":    {"min_ia":config.TIER_GOLD_IA,  "min_conf":config.TIER_GOLD_CONF},
+            "MODERATE":  {"min_ia":config.TIER_GOOD_IA,  "min_conf":config.TIER_GOOD_CONF},
+            "INDICATIVE":{"min_ia":config.TIER_SILVER_IA,"min_conf":config.TIER_SILVER_CONF},
         },"display_limit":flt.TOP_N_DISPLAY})
 @app.route("/taxonomy/search", methods=["GET"])
 def taxonomy_search():
     q = request.args.get("q","").strip()
+    if len(q)<2: return jsonify({"error":"Query must be at least 2 characters."}), 400
     try: max_r = min(int(request.args.get("max_results",8)),20)
     except: max_r = 8
     return jsonify({"query":q,"results":taxonomy.search_species(q,max_results=max_r)})
 @app.route("/predict", methods=["POST"])
 def predict():
+    if not _models_ready:
+        return jsonify({"error":"Models not loaded. Upload model files to /data/ first."}), 503
     if not request.is_json: return jsonify({"error":"Content-Type must be application/json."}), 415
     body = request.get_json(silent=True) or {}
     fasta_text = body.get("fasta","").strip()
     taxon_id_override = None
     if "taxon_id" in body:
         try: taxon_id_override = int(body["taxon_id"])
+        except: return jsonify({"error":"Invalid taxon_id"}), 400
     try:
         proteins, raw_preds, ia_weights, elapsed = _run_prediction(fasta_text, taxon_id_override)
+    except ValueError as e: return jsonify({"error":str(e)}), 400
+    except RuntimeError as e: return jsonify({"error":str(e)}), 503
     except Exception as e:
+        log.exception("Prediction error"); return jsonify({"error":str(e)}), 500
     protein_ids = [p["id"] for p in proteins]
     raw_by_pid  = {pid:[] for pid in protein_ids}
     for pred in raw_preds: raw_by_pid[pred["protein_id"]].append(pred)
+    predictions, csv_data, total_display, total_all = {},{},0,0
     for prot in proteins:
         pid = prot["id"]
         res = flt.filter_predictions(raw_by_pid[pid], ia_weights)
             "summary":flt.summarise(display,all_f,pid),
             "display":display,"total_all":len(all_f)}
         csv_data[pid] = {"all":all_f}
     job_id = str(int(time.time()*1000))
     _store_csv(job_id, csv_data)
     return jsonify({"job_id":job_id,
     if not job_id: return jsonify({"error":"job_id required."}), 400
     job = _csv_store.get(job_id)
     if not job: return jsonify({"error":f"Job '{job_id}' not found."}), 404
+    return Response(_make_csv(job["predictions"]),mimetype="text/csv",
         headers={"Content-Disposition":f"attachment; filename=fungo_{job_id}.csv"})
 @app.route("/predict/debug", methods=["POST"])
 def predict_debug():
+    if not _models_ready:
+        return jsonify({"error":"Models not loaded."}), 503
     if not request.is_json: return jsonify({"error":"Content-Type must be application/json."}), 415
     body = request.get_json(silent=True) or {}
     fasta_text = body.get("fasta","").strip()
         except: return jsonify({"error":"Invalid taxon_id"}), 400
     try:
         proteins, raw_preds, ia_weights, elapsed = _run_prediction(fasta_text, taxon_id_override)
     except Exception as e:
         log.exception("Debug error"); return jsonify({"error":str(e)}), 500
     protein_ids = [p["id"] for p in proteins]
     raw_by_pid  = {pid:[] for pid in protein_ids}
     for pred in raw_preds: raw_by_pid[pred["protein_id"]].append(pred)
     thr = {"STRONG":{"min_ia":config.TIER_GOLD_IA,"min_conf":config.TIER_GOLD_CONF},
            "MODERATE":{"min_ia":config.TIER_GOOD_IA,"min_conf":config.TIER_GOOD_CONF},
            "INDICATIVE":{"min_ia":config.TIER_SILVER_IA,"min_conf":config.TIER_SILVER_CONF}}
         for pred in raw_by_pid[pid]:
             go = pred["go_term"]
             if go in accepted: continue
+            ia,conf = pred.get("ia_weight",float(ia_weights.get(go,0.0))),pred["confidence"]
             if go in config.BLACKLIST_TERMS: reason="blacklisted"
+            elif ia<=config.TIER_SILVER_IA: reason=f"ia_too_low (ia={ia:.4f})"
+            elif conf<config.TIER_SILVER_CONF: reason=f"conf_too_low (conf={conf:.4f})"
             else: reason="below_all_tiers"
+            fo.append({"go_term":go,"ontology":pred["ontology"],
+                       "confidence":conf,"ia_weight":ia,"reason":reason})
         fo.sort(key=lambda x:-x["ia_weight"])
         predictions[pid] = {"taxon_id":prot["taxon_id"],
             "summary":flt.summarise(display,all_f,pid),
     log.exception("Unhandled error"); return jsonify({"error":"Internal server error."}), 500
 if __name__ == "__main__":
+    # Script entry point: attempt to load the models, then start Flask
+    # whether or not loading succeeded.
+    # No `global` statement here: this block already runs at module scope, so
+    # the assignment below rebinds the module-level `_models_ready` directly.
+    # Declaring `global _models_ready` after the earlier module-level
+    # assignment is a compile-time SyntaxError ("name '_models_ready' is
+    # assigned to before global declaration").
     log.info("FunGO v2.0 — HuggingFace Space starting …")
     config.ensure_dirs()
+    paths_ok = config.validate_paths()
+    if paths_ok:
+        try:
+            predictor.load_all()
+            _models_ready = True
+            log.info("Models loaded successfully!")
+        except Exception as e:
+            log.error("Model loading failed: %s", e)
+            log.warning("Starting without models — upload files to /data/ to enable predictions")
+    else:
+        log.warning("Model files not found in /data/ — Space will run but predictions disabled")
+        log.warning("Upload model files using: huggingface-cli upload Muteeba/FunGO")
+    log.info("Serving on port 7860 …")
     app.run(host="0.0.0.0", port=7860, debug=False)

hf-space ADDED Viewed

	@@ -0,0 +1 @@


1	+ Subproject commit 4e8a67686e6a35864a4b7c8810505d81811f8efa