# config.py
"""
FunGO Backend — Central Configuration
======================================
ONLY change paths in this file. Nothing else needs editing.
How to use:
- Update PKL_DIR, VOCAB_PKL, IA_PKL, FEAT_META to point to your model files
- Update MODEL_CACHE_DIR to point to your ESM2 weights cache
- All other settings work as-is
"""
import logging
import os
from pathlib import Path
import torch
logger = logging.getLogger("config")

# ── DEVICE (auto-detected) ──────────────────────────────────────
# Pick "cuda" when torch reports CUDA as available, otherwise "cpu".
if torch.cuda.is_available():
    DEVICE = "cuda"
else:
    DEVICE = "cpu"
# True exactly when DEVICE is "cuda".
USE_FP16 = DEVICE == "cuda"

# ── MODEL PATHS — UPDATE THESE TO MATCH YOUR SYSTEM ─────────────
# Each value is read from the named environment variable; the literal
# path is the fallback used when that variable is not set.
PKL_DIR = Path(
    os.getenv("FUNGO_PKL_DIR", "/mnt/f/research/thesis/pipeline_outputs/models")
)
VOCAB_PKL = Path(
    os.getenv(
        "FUNGO_VOCAB_PKL",
        "/mnt/f/research/thesis/pipeline_outputs/labels/vocabularies.pkl",
    )
)
IA_PKL = Path(
    os.getenv(
        "FUNGO_IA_PKL",
        "/mnt/f/research/thesis/pipeline_outputs/go_data/ia_weights.pkl",
    )
)
FEAT_META = Path(
    os.getenv(
        "FUNGO_FEAT_META",
        "/mnt/f/research/thesis/pipeline_outputs/features/feature_metadata.json",
    )
)

# ── ESM2 SETTINGS ───────────────────────────────────────────────
MODEL_CACHE_DIR = Path(
    os.getenv("FUNGO_MODEL_CACHE", "/mnt/e/repeat/embeddings/model_cache")
)
MODEL_NAME = "facebook/esm2_t36_3B_UR50D"
LAYERS_TO_USE = [30, 31, 32, 33, 34, 35]
MAX_SEQ_LENGTH = 1400
# Smaller batches on the CPU, larger ones on any other device.
BATCH_SIZE = 16 if DEVICE != "cpu" else 4
# Kept as the raw string from FUNGO_OFFLINE; defaults to "1".
TRANSFORMERS_OFFLINE = os.getenv("FUNGO_OFFLINE", "1")

# ── EMBEDDING CACHE ─────────────────────────────────────────────
EMB_CACHE_DIR = Path(os.getenv("FUNGO_EMB_CACHE", "./embedding_cache"))
# ── FILTER THRESHOLDS (do not change) ───────────────────────────
# Blacklisted term identifiers (all carry the "GO:" prefix).
# NOTE(review): presumably these are dropped from results as too
# generic — confirm against the code that consumes this set.
BLACKLIST_TERMS = {
    "GO:0003674",
    "GO:0008150",
    "GO:0005575",
    "GO:0005488",
    "GO:0043226",
    "GO:0043229",
    "GO:0043227",
    "GO:0043231",
    "GO:0110165",
    "GO:0005622",
    "GO:0005623",
    "GO:0044464",
    "GO:0043232",
    "GO:0044424",
    "GO:0009987",
    "GO:0065007",
    "GO:0050794",
    "GO:0019222",
    "GO:0060255",
    "GO:0080090",
    "GO:0050789",
}

# Each tier is an (IA, CONF) pair of thresholds.
# NOTE(review): presumably a minimum IA weight and a minimum confidence
# per tier — verify against the tiering logic.

# Strong Evidence (was GOLD)
TIER_GOLD_IA, TIER_GOLD_CONF = 5.0, 0.30

# Moderate Evidence (was GOOD)
TIER_GOOD_IA, TIER_GOOD_CONF = 2.0, 0.50

# Indicative (was SILVER)
TIER_SILVER_IA, TIER_SILVER_CONF = 1.0, 0.65
# ── NCBI TAXONOMY API ───────────────────────────────────────────
# Tool name and contact e-mail for NCBI requests.
NCBI_TOOL = "FunGO"
NCBI_EMAIL = "fungo@research.com"
# E-utilities endpoints: search, summary and fetch.
NCBI_SEARCH_URL = "https://eutils.ncbi.nlm.nih.gov/entrez/eutils/esearch.fcgi"
NCBI_SUMMARY_URL = "https://eutils.ncbi.nlm.nih.gov/entrez/eutils/esummary.fcgi"
NCBI_FETCH_URL = "https://eutils.ncbi.nlm.nih.gov/entrez/eutils/efetch.fcgi"

# ── FLASK ───────────────────────────────────────────────────────
# Integer settings: taken from the environment when set, else the fallback.
PORT = int(os.getenv("FUNGO_PORT", 5000))
MAX_SEQUENCES = int(os.getenv("FUNGO_MAX_SEQ", 10))
# Debug is on only when FUNGO_DEBUG is exactly the string "1".
DEBUG = os.getenv("FUNGO_DEBUG", "0") == "1"
# ── Runtime helpers ─────────────────────────────────────────────
def ensure_dirs() -> None:
    """Create required runtime directories. Called once at startup.

    Creates ``EMB_CACHE_DIR`` along with any missing parent directories.
    A directory that already exists is left as it is, so calling this
    again does not raise.
    """
    EMB_CACHE_DIR.mkdir(parents=True, exist_ok=True)
    # Separator glyph restored: it had been mis-encoded as a Greek beta.
    logger.info("[config] EMB_CACHE_DIR ready → %s", EMB_CACHE_DIR)
def validate_paths() -> bool:
    """Check that every required model path exists.

    Tests ``PKL_DIR``, ``VOCAB_PKL``, ``IA_PKL``, ``FEAT_META`` and
    ``MODEL_CACHE_DIR`` with ``Path.exists()``. One log line is written
    per path: INFO when it is present, ERROR when it is missing. All
    paths are checked, so a single call reports every missing entry
    rather than stopping at the first.

    Intended to be called at startup before loading models.

    Returns:
        True if all paths exist, False if at least one is missing.
    """
    required = {
        "PKL_DIR": PKL_DIR,
        "VOCAB_PKL": VOCAB_PKL,
        "IA_PKL": IA_PKL,
        "FEAT_META": FEAT_META,
        "MODEL_CACHE_DIR": MODEL_CACHE_DIR,
    }
    all_ok = True
    for name, path in required.items():
        # Status and separator glyphs restored: they had been mis-encoded
        # as Greek betas, making found and missing lines look alike.
        if path.exists():
            logger.info("[config] ✓ %-18s → %s", name, path)
        else:
            logger.error("[config] ✗ %-18s → %s (NOT FOUND)", name, path)
            all_ok = False
    return all_ok
|