bioflow / config.py
vignt97867896's picture
Upload folder using huggingface_hub
54592ce verified
"""
Shared configuration for DeepPurpose + Qdrant Pipeline.
Used by: ingest_qdrant.py, server/api.py
"""
import os
# --- MODEL CONFIG ---
# Best performing run with saved model.pt.
# The path is assembled with os.path.join so the separator matches the host
# OS: a hard-coded backslash is only a directory separator on Windows, while
# on POSIX it would be read as part of a single file name. On Windows the
# resulting string is identical to the previous literal.
BEST_MODEL_RUN = os.path.join("runs", "20260125_104915_KIBA")
# Encoder settings - these MUST be the same values that were used during
# training, otherwise the saved weights will not line up with the model.
# Verified from config.pkl: Morgan + CNN
MODEL_CONFIG = dict(
    drug_encoding="Morgan",  # Morgan fingerprints
    target_encoding="CNN",  # CNN for protein sequences
    cls_hidden_dims=[1024, 1024, 512],
    hidden_dim_drug=128,
    hidden_dim_protein=128,
)
# Data source - best CI run for ground truth.
# Joined component by component so the path resolves on any OS; the previous
# backslash-separated literal was only a valid nested path on Windows (where
# this expression still produces the exact same string).
PREDICTIONS_SOURCE = os.path.join(
    "runs", "20260125_080409_BindingDB_Kd", "predictions_test.csv"
)
# --- QDRANT CONFIG ---
# Host and port can be overridden with the QDRANT_HOST / QDRANT_PORT
# environment variables. A variable that is set but empty (for example an
# unexpanded placeholder in an env/compose file) is treated the same as an
# unset one, so the defaults apply instead of producing an empty host or
# failing on int("") while this module is being imported.
# A non-numeric QDRANT_PORT still raises ValueError at import time.
QDRANT_HOST = os.getenv("QDRANT_HOST") or "localhost"
QDRANT_PORT = int(os.getenv("QDRANT_PORT") or "6333")
# Collection name (presumably the Qdrant collection shared by the ingest
# script and the API - confirm against callers).
COLLECTION_NAME = "bio_discovery"
# --- METRICS (from best runs) ---
# One row per dataset; every row lists its scores in the fixed order
# (CI, Pearson, MSE) and is expanded into a {metric name: value} dict.
METRICS = {
    dataset: dict(zip(("CI", "Pearson", "MSE"), scores))
    for dataset, scores in (
        ("KIBA", (0.7003, 0.5219, 0.0008)),
        ("BindingDB_Kd", (0.8083, 0.7679, 0.6668)),
        ("DAVIS", (0.7914, 0.5446, 0.4684)),
    )
}
# --- VALID SEQUENCES FOR API ---
# Small but well-formed placeholder inputs for the encoders
# (avoids dummy data issues).

# Aspirin SMILES
VALID_DUMMY_DRUG = "CC(=O)OC1=CC=CC=C1C(=O)O"

# Short valid protein; the literal is split across lines purely for
# readability - adjacent string literals concatenate into one sequence.
VALID_DUMMY_TARGET = (
    "MKTAYIAKQRQISFVKSHFSRQLEERLGLI"
    "EVQAPILSRVGDGTQDNLSGAEKAVQVKVK"
    "ALPDAQFEVVHSLAKWKRQQIAAALEHHHHHH"
)