Muteeba committed on
Commit
c557a60
·
1 Parent(s): e22292f

fix: graceful start without model files

Browse files
Files changed (2) hide show
  1. app.py +58 -79
  2. hf-space +1 -0
app.py CHANGED
@@ -1,30 +1,7 @@
1
- # app.py — FunGO HuggingFace Space
2
- """
3
- FunGO v2.0 — HuggingFace Spaces Deployment
4
- =============================================
5
- Flask API running on port 7860.
6
- Model files loaded from /data/ (HF persistent storage).
7
-
8
- To upload model files:
9
- pip install huggingface_hub
10
- huggingface-cli login
11
- huggingface-cli upload Muteeba/FunGO ./pipeline_outputs/models /data/models --repo-type=space
12
- huggingface-cli upload Muteeba/FunGO ./pipeline_outputs/labels /data/labels --repo-type=space
13
- huggingface-cli upload Muteeba/FunGO ./pipeline_outputs/go_data /data/go_data --repo-type=space
14
- huggingface-cli upload Muteeba/FunGO ./pipeline_outputs/features /data/features --repo-type=space
15
- huggingface-cli upload Muteeba/FunGO /mnt/e/repeat/embeddings/model_cache /data/esm2_cache --repo-type=space
16
- """
17
-
18
- import csv
19
- import io
20
- import logging
21
- import os
22
- import re as _re
23
- import sys
24
- import time
25
  from collections import OrderedDict
26
 
27
- # ── HuggingFace paths ─────────────────────────────────────────
28
  os.environ.setdefault("FUNGO_PKL_DIR", "/data/models")
29
  os.environ.setdefault("FUNGO_VOCAB_PKL", "/data/labels/vocabularies.pkl")
30
  os.environ.setdefault("FUNGO_IA_PKL", "/data/go_data/ia_weights.pkl")
@@ -37,18 +14,14 @@ os.environ.setdefault("FUNGO_PORT", "7860")
37
 
38
  from flask import Flask, jsonify, request, Response
39
  from flask_cors import CORS
40
-
41
  import config
42
  import predictor
43
  import embedder
44
  import filter as flt
45
  import taxonomy
46
 
47
- logging.basicConfig(
48
- level=logging.INFO,
49
- format="%(asctime)s [%(levelname)s] %(name)s — %(message)s",
50
- datefmt="%H:%M:%S",
51
- )
52
  log = logging.getLogger("fungo.app")
53
 
54
  app = Flask(__name__)
@@ -57,10 +30,10 @@ app.config["MAX_CONTENT_LENGTH"] = 2 * 1024 * 1024
57
 
58
  _csv_store: OrderedDict = OrderedDict()
59
  _CSV_MAX = 50
 
60
 
61
  def _store_csv(job_id, predictions):
62
- if len(_csv_store) >= _CSV_MAX:
63
- _csv_store.popitem(last=False)
64
  _csv_store[job_id] = {"predictions": predictions, "ts": time.time()}
65
 
66
  def _make_csv(predictions):
@@ -70,14 +43,13 @@ def _make_csv(predictions):
70
  "tier","tier_label","confidence","ia_weight","combined_score","threshold"])
71
  for pid, data in predictions.items():
72
  for p in data.get("all", []):
73
- w.writerow([pid, p.get("go_term",""), p.get("ontology",""),
74
- p.get("ontology_label",""), p.get("tier",""), p.get("tier_label",""),
75
- p.get("confidence",""), p.get("ia_weight",""),
76
- p.get("combined_score",""), p.get("threshold","")])
77
  return out.getvalue()
78
 
79
  _OX_RE = _re.compile(r"OX=(\d+)")
80
-
81
  def _parse_taxon_id(header):
82
  m = _OX_RE.search(header or "")
83
  return int(m.group(1)) if m else None
@@ -91,21 +63,19 @@ def parse_fasta(fasta_text):
91
  if current_id is not None:
92
  seq = "".join(current_seq).upper()
93
  if seq:
94
- proteins.append({"id": current_id, "seq": seq,
95
- "header": current_hdr, "taxon_id": _parse_taxon_id(current_hdr)})
96
  current_hdr = line[1:].strip()
97
  parts = current_hdr.split("|")
98
- current_id = parts[1] if len(parts) >= 3 else current_hdr.split()[0]
99
  current_seq = []
100
- else:
101
- current_seq.append(line)
102
  if current_id is not None:
103
  seq = "".join(current_seq).upper()
104
  if seq:
105
- proteins.append({"id": current_id, "seq": seq,
106
- "header": current_hdr, "taxon_id": _parse_taxon_id(current_hdr)})
107
- if not proteins:
108
- raise ValueError("No valid protein sequences found in FASTA input.")
109
  return proteins
110
 
111
  def _run_prediction(fasta_text, taxon_id_override):
@@ -117,37 +87,40 @@ def _run_prediction(fasta_text, taxon_id_override):
117
  taxon_ids = [taxon_id_override if taxon_id_override is not None
118
  else p["taxon_id"] for p in proteins]
119
  log.info("Proteins: %s | Taxon IDs: %s", protein_ids, taxon_ids)
120
- t0 = time.perf_counter()
121
  X_esm = embedder.extract(sequences)
122
  top50 = predictor.get_top50_taxa()
123
  X_final = embedder.build_features(X_esm, taxon_ids, top50)
124
  raw_preds = predictor.predict(X_final, protein_ids)
125
  ia_weights = predictor.get_ia_weights()
126
  for p in raw_preds:
127
- p["ia_weight"] = round(float(ia_weights.get(p["go_term"], 0.0)), 4)
128
- return proteins, raw_preds, ia_weights, round(time.perf_counter() - t0, 2)
129
 
130
  @app.route("/health", methods=["GET"])
131
  def health():
132
- return jsonify({"status":"ok","device":config.DEVICE,"fp16":config.USE_FP16,"version":"2.0.0"})
 
133
 
134
  @app.route("/model/info", methods=["GET"])
135
  def model_info():
 
 
136
  try: stats = predictor.get_model_stats()
137
- except RuntimeError as e: return jsonify({"error": str(e)}), 503
138
  return jsonify({"device":config.DEVICE,"fp16":config.USE_FP16,
139
  "model_name":config.MODEL_NAME,"ontologies":stats,
140
  "top50_taxa_count":len(predictor.get_top50_taxa()),
141
  "thresholds":{
142
- "STRONG": {"min_ia":config.TIER_GOLD_IA, "min_conf":config.TIER_GOLD_CONF},
143
- "MODERATE": {"min_ia":config.TIER_GOOD_IA, "min_conf":config.TIER_GOOD_CONF},
144
- "INDICATIVE":{"min_ia":config.TIER_SILVER_IA, "min_conf":config.TIER_SILVER_CONF},
145
  },"display_limit":flt.TOP_N_DISPLAY})
146
 
147
  @app.route("/taxonomy/search", methods=["GET"])
148
  def taxonomy_search():
149
  q = request.args.get("q","").strip()
150
- if len(q) < 2: return jsonify({"error":"Query must be at least 2 characters."}), 400
151
  try: max_r = min(int(request.args.get("max_results",8)),20)
152
  except: max_r = 8
153
  return jsonify({"query":q,"results":taxonomy.search_species(q,max_results=max_r)})
@@ -162,6 +135,8 @@ def taxonomy_verify():
162
 
163
  @app.route("/predict", methods=["POST"])
164
  def predict():
 
 
165
  if not request.is_json: return jsonify({"error":"Content-Type must be application/json."}), 415
166
  body = request.get_json(silent=True) or {}
167
  fasta_text = body.get("fasta","").strip()
@@ -169,19 +144,17 @@ def predict():
169
  taxon_id_override = None
170
  if "taxon_id" in body:
171
  try: taxon_id_override = int(body["taxon_id"])
172
- except: return jsonify({"error":f"Invalid taxon_id"}), 400
173
  try:
174
  proteins, raw_preds, ia_weights, elapsed = _run_prediction(fasta_text, taxon_id_override)
175
- except ValueError as e: return jsonify({"error": str(e)}), 400
176
- except RuntimeError as e: return jsonify({"error": str(e)}), 503
177
  except Exception as e:
178
- log.exception("Prediction error"); return jsonify({"error": str(e)}), 500
179
-
180
  protein_ids = [p["id"] for p in proteins]
181
  raw_by_pid = {pid:[] for pid in protein_ids}
182
  for pred in raw_preds: raw_by_pid[pred["protein_id"]].append(pred)
183
-
184
- predictions, csv_data, total_display, total_all = {}, {}, 0, 0
185
  for prot in proteins:
186
  pid = prot["id"]
187
  res = flt.filter_predictions(raw_by_pid[pid], ia_weights)
@@ -191,7 +164,6 @@ def predict():
191
  "summary":flt.summarise(display,all_f,pid),
192
  "display":display,"total_all":len(all_f)}
193
  csv_data[pid] = {"all":all_f}
194
-
195
  job_id = str(int(time.time()*1000))
196
  _store_csv(job_id, csv_data)
197
  return jsonify({"job_id":job_id,
@@ -207,11 +179,13 @@ def download_csv():
207
  if not job_id: return jsonify({"error":"job_id required."}), 400
208
  job = _csv_store.get(job_id)
209
  if not job: return jsonify({"error":f"Job '{job_id}' not found."}), 404
210
- return Response(_make_csv(job["predictions"]), mimetype="text/csv",
211
  headers={"Content-Disposition":f"attachment; filename=fungo_{job_id}.csv"})
212
 
213
  @app.route("/predict/debug", methods=["POST"])
214
  def predict_debug():
 
 
215
  if not request.is_json: return jsonify({"error":"Content-Type must be application/json."}), 415
216
  body = request.get_json(silent=True) or {}
217
  fasta_text = body.get("fasta","").strip()
@@ -222,15 +196,11 @@ def predict_debug():
222
  except: return jsonify({"error":"Invalid taxon_id"}), 400
223
  try:
224
  proteins, raw_preds, ia_weights, elapsed = _run_prediction(fasta_text, taxon_id_override)
225
- except ValueError as e: return jsonify({"error":str(e)}), 400
226
- except RuntimeError as e: return jsonify({"error":str(e)}), 503
227
  except Exception as e:
228
  log.exception("Debug error"); return jsonify({"error":str(e)}), 500
229
-
230
  protein_ids = [p["id"] for p in proteins]
231
  raw_by_pid = {pid:[] for pid in protein_ids}
232
  for pred in raw_preds: raw_by_pid[pred["protein_id"]].append(pred)
233
-
234
  thr = {"STRONG":{"min_ia":config.TIER_GOLD_IA,"min_conf":config.TIER_GOLD_CONF},
235
  "MODERATE":{"min_ia":config.TIER_GOOD_IA,"min_conf":config.TIER_GOOD_CONF},
236
  "INDICATIVE":{"min_ia":config.TIER_SILVER_IA,"min_conf":config.TIER_SILVER_CONF}}
@@ -244,13 +214,13 @@ def predict_debug():
244
  for pred in raw_by_pid[pid]:
245
  go = pred["go_term"]
246
  if go in accepted: continue
247
- ia, conf = pred.get("ia_weight", float(ia_weights.get(go,0.0))), pred["confidence"]
248
  if go in config.BLACKLIST_TERMS: reason="blacklisted"
249
- elif ia <= config.TIER_SILVER_IA: reason=f"ia_too_low (ia={ia:.4f})"
250
- elif conf < config.TIER_SILVER_CONF: reason=f"conf_too_low (conf={conf:.4f})"
251
  else: reason="below_all_tiers"
252
- fo.append({"go_term":go,"ontology":pred["ontology"],"confidence":conf,
253
- "ia_weight":ia,"reason":reason})
254
  fo.sort(key=lambda x:-x["ia_weight"])
255
  predictions[pid] = {"taxon_id":prot["taxon_id"],
256
  "summary":flt.summarise(display,all_f,pid),
@@ -268,11 +238,20 @@ def internal(e):
268
  log.exception("Unhandled error"); return jsonify({"error":"Internal server error."}), 500
269
 
270
  if __name__ == "__main__":
 
271
  log.info("FunGO v2.0 — HuggingFace Space starting …")
272
  config.ensure_dirs()
273
- if not config.validate_paths():
274
- log.error("Model paths missing!")
275
- sys.exit(1)
276
- predictor.load_all()
277
- log.info("Models loaded. Serving on port 7860 …")
 
 
 
 
 
 
 
 
278
  app.run(host="0.0.0.0", port=7860, debug=False)
 
1
+ # app.py — FunGO HuggingFace Space v2
2
+ import csv, io, logging, os, re as _re, sys, time
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
3
  from collections import OrderedDict
4
 
 
5
  os.environ.setdefault("FUNGO_PKL_DIR", "/data/models")
6
  os.environ.setdefault("FUNGO_VOCAB_PKL", "/data/labels/vocabularies.pkl")
7
  os.environ.setdefault("FUNGO_IA_PKL", "/data/go_data/ia_weights.pkl")
 
14
 
15
  from flask import Flask, jsonify, request, Response
16
  from flask_cors import CORS
 
17
  import config
18
  import predictor
19
  import embedder
20
  import filter as flt
21
  import taxonomy
22
 
23
+ logging.basicConfig(level=logging.INFO,
24
+ format="%(asctime)s [%(levelname)s] %(name)s — %(message)s", datefmt="%H:%M:%S")
 
 
 
25
  log = logging.getLogger("fungo.app")
26
 
27
  app = Flask(__name__)
 
30
 
31
  _csv_store: OrderedDict = OrderedDict()
32
  _CSV_MAX = 50
33
+ _models_ready = False # flag — True only after successful load
34
 
35
  def _store_csv(job_id, predictions):
36
+ if len(_csv_store) >= _CSV_MAX: _csv_store.popitem(last=False)
 
37
  _csv_store[job_id] = {"predictions": predictions, "ts": time.time()}
38
 
39
  def _make_csv(predictions):
 
43
  "tier","tier_label","confidence","ia_weight","combined_score","threshold"])
44
  for pid, data in predictions.items():
45
  for p in data.get("all", []):
46
+ w.writerow([pid,p.get("go_term",""),p.get("ontology",""),
47
+ p.get("ontology_label",""),p.get("tier",""),p.get("tier_label",""),
48
+ p.get("confidence",""),p.get("ia_weight",""),
49
+ p.get("combined_score",""),p.get("threshold","")])
50
  return out.getvalue()
51
 
52
  _OX_RE = _re.compile(r"OX=(\d+)")
 
53
  def _parse_taxon_id(header):
54
  m = _OX_RE.search(header or "")
55
  return int(m.group(1)) if m else None
 
63
  if current_id is not None:
64
  seq = "".join(current_seq).upper()
65
  if seq:
66
+ proteins.append({"id":current_id,"seq":seq,
67
+ "header":current_hdr,"taxon_id":_parse_taxon_id(current_hdr)})
68
  current_hdr = line[1:].strip()
69
  parts = current_hdr.split("|")
70
+ current_id = parts[1] if len(parts)>=3 else current_hdr.split()[0]
71
  current_seq = []
72
+ else: current_seq.append(line)
 
73
  if current_id is not None:
74
  seq = "".join(current_seq).upper()
75
  if seq:
76
+ proteins.append({"id":current_id,"seq":seq,
77
+ "header":current_hdr,"taxon_id":_parse_taxon_id(current_hdr)})
78
+ if not proteins: raise ValueError("No valid protein sequences found.")
 
79
  return proteins
80
 
81
  def _run_prediction(fasta_text, taxon_id_override):
 
87
  taxon_ids = [taxon_id_override if taxon_id_override is not None
88
  else p["taxon_id"] for p in proteins]
89
  log.info("Proteins: %s | Taxon IDs: %s", protein_ids, taxon_ids)
90
+ t0 = time.perf_counter()
91
  X_esm = embedder.extract(sequences)
92
  top50 = predictor.get_top50_taxa()
93
  X_final = embedder.build_features(X_esm, taxon_ids, top50)
94
  raw_preds = predictor.predict(X_final, protein_ids)
95
  ia_weights = predictor.get_ia_weights()
96
  for p in raw_preds:
97
+ p["ia_weight"] = round(float(ia_weights.get(p["go_term"],0.0)),4)
98
+ return proteins, raw_preds, ia_weights, round(time.perf_counter()-t0,2)
99
 
100
  @app.route("/health", methods=["GET"])
101
  def health():
102
+ return jsonify({"status":"ok","device":config.DEVICE,
103
+ "fp16":config.USE_FP16,"version":"2.0.0","models_ready":_models_ready})
104
 
105
  @app.route("/model/info", methods=["GET"])
106
  def model_info():
107
+ if not _models_ready:
108
+ return jsonify({"error":"Models not loaded yet. Upload model files to /data/ first."}), 503
109
  try: stats = predictor.get_model_stats()
110
+ except RuntimeError as e: return jsonify({"error":str(e)}), 503
111
  return jsonify({"device":config.DEVICE,"fp16":config.USE_FP16,
112
  "model_name":config.MODEL_NAME,"ontologies":stats,
113
  "top50_taxa_count":len(predictor.get_top50_taxa()),
114
  "thresholds":{
115
+ "STRONG": {"min_ia":config.TIER_GOLD_IA, "min_conf":config.TIER_GOLD_CONF},
116
+ "MODERATE": {"min_ia":config.TIER_GOOD_IA, "min_conf":config.TIER_GOOD_CONF},
117
+ "INDICATIVE":{"min_ia":config.TIER_SILVER_IA,"min_conf":config.TIER_SILVER_CONF},
118
  },"display_limit":flt.TOP_N_DISPLAY})
119
 
120
  @app.route("/taxonomy/search", methods=["GET"])
121
  def taxonomy_search():
122
  q = request.args.get("q","").strip()
123
+ if len(q)<2: return jsonify({"error":"Query must be at least 2 characters."}), 400
124
  try: max_r = min(int(request.args.get("max_results",8)),20)
125
  except: max_r = 8
126
  return jsonify({"query":q,"results":taxonomy.search_species(q,max_results=max_r)})
 
135
 
136
  @app.route("/predict", methods=["POST"])
137
  def predict():
138
+ if not _models_ready:
139
+ return jsonify({"error":"Models not loaded. Upload model files to /data/ first."}), 503
140
  if not request.is_json: return jsonify({"error":"Content-Type must be application/json."}), 415
141
  body = request.get_json(silent=True) or {}
142
  fasta_text = body.get("fasta","").strip()
 
144
  taxon_id_override = None
145
  if "taxon_id" in body:
146
  try: taxon_id_override = int(body["taxon_id"])
147
+ except: return jsonify({"error":"Invalid taxon_id"}), 400
148
  try:
149
  proteins, raw_preds, ia_weights, elapsed = _run_prediction(fasta_text, taxon_id_override)
150
+ except ValueError as e: return jsonify({"error":str(e)}), 400
151
+ except RuntimeError as e: return jsonify({"error":str(e)}), 503
152
  except Exception as e:
153
+ log.exception("Prediction error"); return jsonify({"error":str(e)}), 500
 
154
  protein_ids = [p["id"] for p in proteins]
155
  raw_by_pid = {pid:[] for pid in protein_ids}
156
  for pred in raw_preds: raw_by_pid[pred["protein_id"]].append(pred)
157
+ predictions, csv_data, total_display, total_all = {},{},0,0
 
158
  for prot in proteins:
159
  pid = prot["id"]
160
  res = flt.filter_predictions(raw_by_pid[pid], ia_weights)
 
164
  "summary":flt.summarise(display,all_f,pid),
165
  "display":display,"total_all":len(all_f)}
166
  csv_data[pid] = {"all":all_f}
 
167
  job_id = str(int(time.time()*1000))
168
  _store_csv(job_id, csv_data)
169
  return jsonify({"job_id":job_id,
 
179
  if not job_id: return jsonify({"error":"job_id required."}), 400
180
  job = _csv_store.get(job_id)
181
  if not job: return jsonify({"error":f"Job '{job_id}' not found."}), 404
182
+ return Response(_make_csv(job["predictions"]),mimetype="text/csv",
183
  headers={"Content-Disposition":f"attachment; filename=fungo_{job_id}.csv"})
184
 
185
  @app.route("/predict/debug", methods=["POST"])
186
  def predict_debug():
187
+ if not _models_ready:
188
+ return jsonify({"error":"Models not loaded."}), 503
189
  if not request.is_json: return jsonify({"error":"Content-Type must be application/json."}), 415
190
  body = request.get_json(silent=True) or {}
191
  fasta_text = body.get("fasta","").strip()
 
196
  except: return jsonify({"error":"Invalid taxon_id"}), 400
197
  try:
198
  proteins, raw_preds, ia_weights, elapsed = _run_prediction(fasta_text, taxon_id_override)
 
 
199
  except Exception as e:
200
  log.exception("Debug error"); return jsonify({"error":str(e)}), 500
 
201
  protein_ids = [p["id"] for p in proteins]
202
  raw_by_pid = {pid:[] for pid in protein_ids}
203
  for pred in raw_preds: raw_by_pid[pred["protein_id"]].append(pred)
 
204
  thr = {"STRONG":{"min_ia":config.TIER_GOLD_IA,"min_conf":config.TIER_GOLD_CONF},
205
  "MODERATE":{"min_ia":config.TIER_GOOD_IA,"min_conf":config.TIER_GOOD_CONF},
206
  "INDICATIVE":{"min_ia":config.TIER_SILVER_IA,"min_conf":config.TIER_SILVER_CONF}}
 
214
  for pred in raw_by_pid[pid]:
215
  go = pred["go_term"]
216
  if go in accepted: continue
217
+ ia,conf = pred.get("ia_weight",float(ia_weights.get(go,0.0))),pred["confidence"]
218
  if go in config.BLACKLIST_TERMS: reason="blacklisted"
219
+ elif ia<=config.TIER_SILVER_IA: reason=f"ia_too_low (ia={ia:.4f})"
220
+ elif conf<config.TIER_SILVER_CONF: reason=f"conf_too_low (conf={conf:.4f})"
221
  else: reason="below_all_tiers"
222
+ fo.append({"go_term":go,"ontology":pred["ontology"],
223
+ "confidence":conf,"ia_weight":ia,"reason":reason})
224
  fo.sort(key=lambda x:-x["ia_weight"])
225
  predictions[pid] = {"taxon_id":prot["taxon_id"],
226
  "summary":flt.summarise(display,all_f,pid),
 
238
  log.exception("Unhandled error"); return jsonify({"error":"Internal server error."}), 500
239
 
240
  if __name__ == "__main__":
241
+ global _models_ready
242
  log.info("FunGO v2.0 — HuggingFace Space starting …")
243
  config.ensure_dirs()
244
+ paths_ok = config.validate_paths()
245
+ if paths_ok:
246
+ try:
247
+ predictor.load_all()
248
+ _models_ready = True
249
+ log.info("Models loaded successfully!")
250
+ except Exception as e:
251
+ log.error("Model loading failed: %s", e)
252
+ log.warning("Starting without models — upload files to /data/ to enable predictions")
253
+ else:
254
+ log.warning("Model files not found in /data/ — Space will run but predictions disabled")
255
+ log.warning("Upload model files using: huggingface-cli upload Muteeba/FunGO")
256
+ log.info("Serving on port 7860 …")
257
  app.run(host="0.0.0.0", port=7860, debug=False)
hf-space ADDED
@@ -0,0 +1 @@
 
 
1
+ Subproject commit 4e8a67686e6a35864a4b7c8810505d81811f8efa