# Spaces: Running on L4  (appears to be a hosting-page status banner captured with this file; not part of the configuration)
# Configuration for Patient Matching Pipeline
#
# Edit the values below to set your default models and patient database.
# Models will auto-load on application startup.
# ============================================================================
# MODEL PATHS - Set your default models here
# ============================================================================
# Set a value to None to skip auto-loading that model, or provide a local
# model path or a Hugging Face model ID.
# Default model for each pipeline role. Each value is a local model path or a
# Hugging Face model ID.
MODEL_CONFIG = {
    # Sentence transformer for embedding patient summaries and clinical spaces
    # e.g., "Qwen/Qwen3-Embedding-0.6B" or "./reranker_round2.model"
    "embedder": "ksg-dfci/TrialSpace-1225",
    # ModernBERT classifier for eligibility prediction
    # e.g., "answerdotai/ModernBERT-large" or "./modernbert-trial-checker"
    "trial_checker": "ksg-dfci/TrialChecker-1225",
    # ModernBERT classifier for boilerplate exclusion prediction
    # e.g., "answerdotai/ModernBERT-large" or "./modernbert-boilerplate-checker"
    "boilerplate_checker": "ksg-dfci/BoilerplateChecker-1225",
}
# Example configuration with base models:
# MODEL_CONFIG = {
#     "embedder": "Qwen/Qwen3-Embedding-0.6B",
#     "trial_checker": "answerdotai/ModernBERT-large",
#     "boilerplate_checker": "answerdotai/ModernBERT-large",
# }
#
# Example configuration with fine-tuned models:
# MODEL_CONFIG = {
#     "embedder": "./reranker_round2.model",
#     "trial_checker": "./modernbert-trial-checker",
#     "boilerplate_checker": "./modernbert-boilerplate-checker",
# }
# ============================================================================
# DEFAULT PATIENT DATABASE
# ============================================================================
# Path to the default patient database parquet file.
# Required columns: patient_id, patient_summary
# Optional columns: patient_boilerplate (for boilerplate checking)
# Will auto-load and embed when the embedder model is ready.
# Set to None to disable auto-loading.
# NOTE(review): the assignment below is commented out, so DEFAULT_PATIENT_DB is
# not defined in the visible part of this file (rather than being None) -
# confirm the application tolerates the missing name.
#DEFAULT_PATIENT_DB = "./synthetic_patient_summary_sample.parquet"  # e.g., "./patients.parquet" or "./patient_summaries.parquet"
# Path to the pre-embedded patient database (faster loading)
#
# NEW FORMAT (recommended): Single parquet file with an embedding column
#   - Created by: python preembed_patients.py --output patient_embeddings.parquet
#   - Contains all patient data + patient_embedding column (list of floats)
#   - Compatible with Hugging Face datasets
#   - Example: PREEMBEDDED_PATIENTS = "synthetic_patient_embeddings.parquet"
#
# LEGACY FORMAT (still supported): Prefix for pkl/npy/json files
#   - Created by the old version of preembed_patients.py
#   - Files: {prefix}_data.pkl, {prefix}_vectors.npy, {prefix}_metadata.json
#   - Example: PREEMBEDDED_PATIENTS = "synthetic_patient_embeddings"
#
# Location of the pre-embedded patient database.
# e.g., "patient_embeddings.parquet" or "patient_embeddings" (legacy)
PREEMBEDDED_PATIENTS = "https://huggingface.co/datasets/ksg-dfci/mmai-synthetic/resolve/main/synthetic_patient_embeddings.parquet"
# ============================================================================
# CLINICAL SPACE TEMPLATE
# ============================================================================
# Default template for the clinical space query input.
# Users fill in these fields to define their search criteria.
# Default clinical space query text: one "<field>: <value>" pair per line.
# The text is used verbatim, so the literal starts right after the opening
# quotes and has no trailing newline.
CLINICAL_SPACE_TEMPLATE = """Age range allowed: any
Sex allowed: Any
Cancer type allowed: Non-small cell lung cancer
Histology allowed: Adenocarcinoma
Cancer burden allowed: Metastatic
Prior treatment required: No requirements
Prior treatment excluded: No requirements
Biomarkers required: EGFR mutant
Biomarkers excluded: None"""
# Default boilerplate criterion text.
# NOTE(review): presumably the default input for the boilerplate checker -
# confirm against the application code.
BOILERPLATE_TEMPLATE = "Patients must have no history of pneumonitis"
# ============================================================================
# USAGE NOTES
# ============================================================================
#
# 1. Set the model paths above to your preferred models
# 2. Optionally set DEFAULT_PATIENT_DB or PREEMBEDDED_PATIENTS
# 3. Customize CLINICAL_SPACE_TEMPLATE if needed
# 4. Save this file
# 5. Run: streamlit run patient_matching_app.py
# 6. Models will load automatically on startup
#
# To create pre-embedded patients (new parquet format, recommended):
#   python preembed_patients.py --patients patients.parquet --embedder path/to/embedder --output patient_embeddings.parquet
#
# To upload pre-embedded patients to Hugging Face Hub:
#   from datasets import Dataset
#   ds = Dataset.from_parquet("patient_embeddings.parquet")
#   ds.push_to_hub("your-username/patient-embeddings")
#
# To load pre-embedded patients from Hugging Face Hub in your app:
#   from datasets import load_dataset
#   ds = load_dataset("your-username/patient-embeddings")
#   ds['train'].to_parquet("local_patient_embeddings.parquet")
#   # Then set PREEMBEDDED_PATIENTS = "local_patient_embeddings.parquet"
#