Spaces:
Sleeping
Sleeping
| # Configuration for Clinical Trial Matching Pipeline | |
| # | |
| # Edit the values below to set your default models and trial database. | |
| # Models will auto-load on application startup. | |
| # ============================================================================ | |
| # MODEL PATHS - Set your default models here | |
| # ============================================================================ | |
| # Set to None to skip auto-loading, or provide model path/HuggingFace ID | |
| # Default model for each pipeline role; each value is a HuggingFace ID or a local path. | |
| MODEL_CONFIG = { | |
| # TinyBERT tagger for extracting relevant excerpts | |
| "tagger": "ksg-dfci/TinyBertOncoTagger-1225", # e.g., "prajjwal1/bert-tiny" or "./auto-tiny-bert-tagger" | |
| # Sentence transformer for embedding patient summaries and trials | |
| "embedder": "ksg-dfci/TrialSpace-1225", # e.g., "Qwen/Qwen3-Embedding-0.6B" or "./reranker_round2.model" | |
| # Large language model for patient history summarization | |
| "llm": "ksg-dfci/OncoReasoning-3B-1225", | |
| # Alternative LLM, currently disabled. To use it, swap it with the "llm" entry above (keep only one "llm" key). | |
| #"llm": "openai/gpt-oss-120b", | |
| # ModernBERT classifier for eligibility prediction | |
| "trial_checker": "ksg-dfci/TrialChecker-1225", # e.g., "answerdotai/ModernBERT-large" or "./modernbert-trial-checker" | |
| # ModernBERT classifier for boilerplate exclusion prediction | |
| "boilerplate_checker": "ksg-dfci/BoilerplateChecker-1225", # e.g., "answerdotai/ModernBERT-large" or "./modernbert-boilerplate-checker" | |
| } | |
| # Example configuration with base models: | |
| # MODEL_CONFIG = { | |
| # "tagger": "prajjwal1/bert-tiny", | |
| # "embedder": "Qwen/Qwen3-Embedding-0.6B", | |
| # "llm": "microsoft/Phi-3-mini-4k-instruct", | |
| # "trial_checker": "answerdotai/ModernBERT-large", | |
| # "boilerplate_checker": "answerdotai/ModernBERT-large", | |
| # } | |
| # Example configuration with fine-tuned models: | |
| # MODEL_CONFIG = { | |
| # "tagger": "./auto-tiny-bert-tagger", | |
| # "embedder": "./reranker_round2.model", | |
| # "llm": "/data/models/gpt-oss-120b", | |
| # "trial_checker": "./modernbert-trial-checker", | |
| # "boilerplate_checker": "./modernbert-boilerplate-checker", | |
| # } | |
| # ============================================================================ | |
| # DEFAULT TRIAL DATABASE | |
| # ============================================================================ | |
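| # Path to default trial database CSV/Excel file | |
| # Will auto-load and embed when embedder model is ready | |
| # Set to None to disable auto-loading | |
| # NOTE(review): the assignment below is commented out rather than set to None, so | |
| # DEFAULT_TRIAL_DB is not defined in this section - confirm the loader tolerates a missing name. | |
| #DEFAULT_TRIAL_DB = "./trial_space_lineitems.csv" # e.g., "./my_trials.csv" or "./sample_trials.csv" | |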
| # ============================================================================ | |
| # PRE-EMBEDDED TRIALS (Recommended for faster startup) | |
| # ============================================================================ | |
| # Path or URL to the pre-embedded trial database (parquet file with 'embedding' column) | |
| # This is preferred over DEFAULT_TRIAL_DB as it loads instantly without re-embedding | |
| # Generate with: python preembed_trials.py --trials trials.csv --embedder model --output trial_embeddings.parquet | |
| # Set to None to disable pre-embedded loading (will fall back to DEFAULT_TRIAL_DB) | |
| # The default below points at a remote parquet file on huggingface.co rather than a local path. | |
| PREEMBEDDED_TRIALS = "https://huggingface.co/datasets/ksg-dfci/mmai-synthetic/resolve/main/trial_embeddings.parquet" | |
| # ============================================================================ | |
| # USAGE NOTES | |
| # ============================================================================ | |
| # | |
| # 1. Set the model paths above to your preferred models | |
| # 2. Optionally set DEFAULT_TRIAL_DB to your trial database file | |
| # 3. For faster startup, pre-embed your trials: | |
| # python preembed_trials.py --trials your_trials.csv --embedder your_model --output trial_embeddings.parquet | |
| # Then set PREEMBEDDED_TRIALS = "trial_embeddings.parquet" | |
| # 4. Save this file | |
| # 5. Run: python app.py | |
| # 6. Models will load automatically on startup | |
| # | |
| # You can still manually load different models through the web interface | |
| # if you need to switch models during a session. | |
| # | |
| # PRE-EMBEDDED FORMAT: | |
| # The parquet file contains all original trial columns plus an 'embedding' column | |
| # where each row has a list of floats representing the trial's embedding vector. | |
| # This format is compatible with HuggingFace Datasets for easy sharing. | |
| # | |