kenlkehl's picture
Update config.py
784ed65 verified
raw
history blame
4.12 kB
# Configuration for Clinical Trial Matching Pipeline
#
# Edit the values below to set your default models and trial database.
# Models will auto-load on application startup.
# ============================================================================
# MODEL PATHS - Set your default models here
# ============================================================================
# Set to None to skip auto-loading, or provide model path/HuggingFace ID
# Maps each pipeline role to the model loaded for it. Each value is a local
# path or a HuggingFace ID.
MODEL_CONFIG = dict(
    # Tagger (TinyBERT) that extracts the relevant excerpts.
    # Alternatives: "prajjwal1/bert-tiny" or "./auto-tiny-bert-tagger"
    tagger="ksg-dfci/TinyBertOncoTagger-1225",
    # Sentence transformer that embeds patient summaries and trials.
    # Alternatives: "Qwen/Qwen3-Embedding-0.6B" or "./reranker_round2.model"
    embedder="ksg-dfci/TrialSpace-1225",
    # Large language model that summarizes the patient history.
    # Alternative: "openai/gpt-oss-120b"
    llm="ksg-dfci/OncoReasoning-3B-1225",
    # ModernBERT classifier that predicts eligibility.
    # Alternatives: "answerdotai/ModernBERT-large" or "./modernbert-trial-checker"
    trial_checker="ksg-dfci/TrialChecker-1225",
    # ModernBERT classifier that predicts boilerplate exclusions.
    # Alternatives: "answerdotai/ModernBERT-large" or "./modernbert-boilerplate-checker"
    boilerplate_checker="ksg-dfci/BoilerplateChecker-1225",
)
# Example configuration with base models:
# MODEL_CONFIG = {
# "tagger": "prajjwal1/bert-tiny",
# "embedder": "Qwen/Qwen3-Embedding-0.6B",
# "llm": "microsoft/Phi-3-mini-4k-instruct",
# "trial_checker": "answerdotai/ModernBERT-large",
# "boilerplate_checker": "answerdotai/ModernBERT-large",
# }
# Example configuration with fine-tuned models:
# MODEL_CONFIG = {
# "tagger": "./auto-tiny-bert-tagger",
# "embedder": "./reranker_round2.model",
# "llm": "/data/models/gpt-oss-120b",
# "trial_checker": "./modernbert-trial-checker",
# "boilerplate_checker": "./modernbert-boilerplate-checker",
# }
# ============================================================================
# DEFAULT TRIAL DATABASE
# ============================================================================
# Path to default trial database CSV/Excel file
# Will auto-load and embed when embedder model is ready
# Set to None to disable auto-loading
# Example: DEFAULT_TRIAL_DB = "./trial_space_lineitems.csv"  (or "./my_trials.csv", "./sample_trials.csv")
# Kept defined as None rather than commented out, so the name always exists
# for code that reads it from this module; None is the "disabled" value.
DEFAULT_TRIAL_DB = None
# ============================================================================
# PRE-EMBEDDED TRIALS (Recommended for faster startup)
# ============================================================================
# Location of the pre-embedded trial database: a parquet file that carries an
# 'embedding' column. Preferred over DEFAULT_TRIAL_DB because it loads
# instantly, with no re-embedding step.
# To generate one:
#   python preembed_trials.py --trials trials.csv --embedder model --output trial_embeddings.parquet
# Set to None to disable pre-embedded loading (falls back to DEFAULT_TRIAL_DB).
# NOTE(review): the value below is an HTTPS URL rather than a local path;
# presumably the loader accepts both - confirm against the loading code.
PREEMBEDDED_TRIALS = (
    "https://huggingface.co/datasets/ksg-dfci/mmai-synthetic"
    "/resolve/main/trial_embeddings.parquet"
)
# ============================================================================
# USAGE NOTES
# ============================================================================
#
# 1. Set the model paths above to your preferred models
# 2. Optionally set DEFAULT_TRIAL_DB to your trial database file
# 3. For faster startup, pre-embed your trials:
#      python preembed_trials.py --trials your_trials.csv --embedder your_model --output trial_embeddings.parquet
#    Then set PREEMBEDDED_TRIALS = "trial_embeddings.parquet"
# 4. Save this file
# 5. Run: python app.py
# 6. Models will load automatically on startup
#
# You can still manually load different models through the web interface
# if you need to switch models during a session.
#
# PRE-EMBEDDED FORMAT:
# The parquet file contains all original trial columns plus an 'embedding' column
# where each row has a list of floats representing the trial's embedding vector.
# This format is compatible with HuggingFace Datasets for easy sharing.
#