| |
| """ |
| FunGO Backend — Central Configuration |
| ====================================== |
| ONLY change paths in this file. Nothing else needs editing. |
| |
| How to use: |
| - Update PKL_DIR, VOCAB_PKL, IA_PKL, FEAT_META to point to your model files |
| - Update MODEL_CACHE_DIR to point to your ESM2 weights cache |
| - All other settings work as-is |
| """ |
|
|
| import logging |
| import os |
| from pathlib import Path |
| import torch |
|
|
# Logger used by the startup helpers defined in this module.
logger = logging.getLogger("config")


# --- Compute device ---------------------------------------------------------
# Use the GPU whenever torch reports CUDA as available; half precision is
# switched on only in that case.
if torch.cuda.is_available():
    DEVICE = "cuda"
else:
    DEVICE = "cpu"
USE_FP16 = (DEVICE == "cuda")


# --- Model artefact locations -----------------------------------------------
# Each location can be overridden through the named environment variable; the
# literal is the fallback used when that variable is unset.
PKL_DIR = Path(
    os.getenv("FUNGO_PKL_DIR", "/mnt/f/research/thesis/pipeline_outputs/models")
)
VOCAB_PKL = Path(
    os.getenv(
        "FUNGO_VOCAB_PKL",
        "/mnt/f/research/thesis/pipeline_outputs/labels/vocabularies.pkl",
    )
)
IA_PKL = Path(
    os.getenv(
        "FUNGO_IA_PKL",
        "/mnt/f/research/thesis/pipeline_outputs/go_data/ia_weights.pkl",
    )
)
FEAT_META = Path(
    os.getenv(
        "FUNGO_FEAT_META",
        "/mnt/f/research/thesis/pipeline_outputs/features/feature_metadata.json",
    )
)


# --- ESM2 embedding model ---------------------------------------------------
# Cache directory holding the ESM2 weights (override with FUNGO_MODEL_CACHE).
MODEL_CACHE_DIR = Path(
    os.getenv("FUNGO_MODEL_CACHE", "/mnt/e/repeat/embeddings/model_cache")
)
MODEL_NAME = "facebook/esm2_t36_3B_UR50D"
# Indices 30..35 inclusive — presumably the model layers whose outputs are
# used for the embedding; confirm against the embedding code.
LAYERS_TO_USE = list(range(30, 36))
# NOTE(review): unit (residues vs. tokens) is not visible here — confirm.
MAX_SEQ_LENGTH = 1400
# Smaller batches on CPU, larger ones on any other device.
BATCH_SIZE = 16 if DEVICE != "cpu" else 4
# Kept as the raw string read from FUNGO_OFFLINE (defaults to "1"); it is not
# converted to a bool here.
TRANSFORMERS_OFFLINE = os.getenv("FUNGO_OFFLINE", "1")


# --- Embedding cache --------------------------------------------------------
# Relative to the working directory unless FUNGO_EMB_CACHE is set; the
# directory itself is created by ensure_dirs().
EMB_CACHE_DIR = Path(os.getenv("FUNGO_EMB_CACHE", "./embedding_cache"))
|
|
# --- GO term blacklist ------------------------------------------------------
# GO identifiers collected in a set for O(1) membership tests.
# NOTE(review): presumably terms too generic to be worth reporting — confirm
# against the code that consumes this set.
BLACKLIST_TERMS = {
    "GO:0003674", "GO:0008150", "GO:0005575",
    "GO:0005488", "GO:0043226", "GO:0043229",
    "GO:0043227", "GO:0043231", "GO:0110165",
    "GO:0005622", "GO:0005623", "GO:0044464",
    "GO:0043232", "GO:0044424", "GO:0009987",
    "GO:0065007", "GO:0050794", "GO:0019222",
    "GO:0060255", "GO:0080090", "GO:0050789",
}
|
|
# --- Prediction tier thresholds ---------------------------------------------
# One (IA, confidence) pair per tier.
# NOTE(review): presumably IA is the information-accretion weight loaded from
# IA_PKL and each value is a minimum for its tier — the comparison direction
# is not visible in this file; confirm against the tiering code.

# Gold tier
TIER_GOLD_IA, TIER_GOLD_CONF = 5.0, 0.30

# Good tier
TIER_GOOD_IA, TIER_GOOD_CONF = 2.0, 0.50

# Silver tier
TIER_SILVER_IA, TIER_SILVER_CONF = 1.0, 0.65
|
|
# --- NCBI E-utilities -------------------------------------------------------
# Endpoints of the three E-utilities referenced by this configuration.
NCBI_SEARCH_URL = (
    "https://eutils.ncbi.nlm.nih.gov/entrez/eutils/esearch.fcgi"
)
NCBI_SUMMARY_URL = (
    "https://eutils.ncbi.nlm.nih.gov/entrez/eutils/esummary.fcgi"
)
NCBI_FETCH_URL = (
    "https://eutils.ncbi.nlm.nih.gov/entrez/eutils/efetch.fcgi"
)
# Client identification — presumably sent as the `tool` / `email` request
# parameters; verify against the code that builds the requests.
# NOTE(review): the e-mail looks like a placeholder; confirm it is monitored.
NCBI_TOOL, NCBI_EMAIL = "FunGO", "fungo@research.com"
|
|
# --- Server settings --------------------------------------------------------
# All three values are read from the environment at import time.
# A non-numeric FUNGO_PORT or FUNGO_MAX_SEQ raises ValueError on import.
PORT = int(os.getenv("FUNGO_PORT", "5000"))
# Debug mode is on only when FUNGO_DEBUG is exactly the string "1".
DEBUG = (os.getenv("FUNGO_DEBUG", "0") == "1")
# NOTE(review): presumably the maximum number of sequences accepted per
# request — confirm against the request handler.
MAX_SEQUENCES = int(os.getenv("FUNGO_MAX_SEQ", "10"))
|
|
|
|
| |
|
|
def ensure_dirs() -> None:
    """Create the runtime directories this module is responsible for.

    At present that is only ``EMB_CACHE_DIR``. Missing parent directories are
    created as well, and an already existing directory is left untouched.
    Intended to be called once at startup.
    """
    EMB_CACHE_DIR.mkdir(parents=True, exist_ok=True)
    logger.info("[config] EMB_CACHE_DIR ready → %s", EMB_CACHE_DIR)
|
|
|
|
def validate_paths() -> bool:
    """Check that every required model path exists on disk.

    Intended to be called at startup, before any model is loaded. One log
    line is written per path: INFO when it is present, ERROR when it is
    missing. All paths are always checked, so a single run reports every
    missing entry rather than stopping at the first one.

    Only existence is tested; whether a path is a file or a directory is not
    verified.

    Returns:
        True when all required paths exist, False if at least one is missing.
    """
    required = {
        "PKL_DIR": PKL_DIR,
        "VOCAB_PKL": VOCAB_PKL,
        "IA_PKL": IA_PKL,
        "FEAT_META": FEAT_META,
        "MODEL_CACHE_DIR": MODEL_CACHE_DIR,
    }
    missing = 0
    for name, path in required.items():
        if path.exists():
            logger.info("[config] ✓ %-18s → %s", name, path)
        else:
            logger.error("[config] ✗ %-18s → %s (NOT FOUND)", name, path)
            missing += 1
    return missing == 0
|
|