"""Configuration for the Patient Matching Pipeline.

Edit the values below to set your default models and patient database.
Models will auto-load on application startup.
"""

# ============================================================================
# MODEL PATHS - Set your default models here
# ============================================================================
# Each value is either a local model path or a Hugging Face model ID.
# Set a value to None to skip auto-loading that model.

MODEL_CONFIG = {
    # Sentence transformer for embedding patient summaries and clinical spaces.
    # e.g., "Qwen/Qwen3-Embedding-0.6B" or "./reranker_round2.model"
    "embedder": "ksg-dfci/TrialSpace-1225",
    # ModernBERT classifier for eligibility prediction.
    # e.g., "answerdotai/ModernBERT-large" or "./modernbert-trial-checker"
    "trial_checker": "ksg-dfci/TrialChecker-1225",
    # ModernBERT classifier for boilerplate exclusion prediction.
    # e.g., "answerdotai/ModernBERT-large" or "./modernbert-boilerplate-checker"
    "boilerplate_checker": "ksg-dfci/BoilerplateChecker-1225",
}

# Example configuration with base models:
# MODEL_CONFIG = {
#     "embedder": "Qwen/Qwen3-Embedding-0.6B",
#     "trial_checker": "answerdotai/ModernBERT-large",
#     "boilerplate_checker": "answerdotai/ModernBERT-large",
# }

# Example configuration with fine-tuned models:
# MODEL_CONFIG = {
#     "embedder": "./reranker_round2.model",
#     "trial_checker": "./modernbert-trial-checker",
#     "boilerplate_checker": "./modernbert-boilerplate-checker",
# }

# ============================================================================
# DEFAULT PATIENT DATABASE
# ============================================================================

# Path to default patient database parquet file.
#   Required columns: patient_id, patient_summary
#   Optional columns: patient_boilerplate (for boilerplate checking)
# Will auto-load and embed when the embedder model is ready.
# Set to None to disable auto-loading.
#
# Defined explicitly as None (rather than left commented out) so the name
# always exists for code importing this module. To enable, replace None with
# a path, e.g. "./synthetic_patient_summary_sample.parquet",
# "./patients.parquet" or "./patient_summaries.parquet".
DEFAULT_PATIENT_DB = None

# Path to pre-embedded patient database (faster loading).
#
# NEW FORMAT (recommended): single parquet file with an embedding column
#   - Created by: python preembed_patients.py --output patient_embeddings.parquet
#   - Contains all patient data + patient_embedding column (list of floats)
#   - Compatible with Hugging Face datasets
#   - Example: PREEMBEDDED_PATIENTS = "synthetic_patient_embeddings.parquet"
#
# LEGACY FORMAT (still supported): prefix for pkl/npy/json files
#   - Created by the old version of preembed_patients.py
#   - Files: {prefix}_data.pkl, {prefix}_vectors.npy, {prefix}_metadata.json
#   - Example: PREEMBEDDED_PATIENTS = "synthetic_patient_embeddings"
#
# The default below points at a parquet file hosted on the Hugging Face Hub.
# Other accepted forms: "patient_embeddings.parquet" (new format) or
# "patient_embeddings" (legacy prefix).
PREEMBEDDED_PATIENTS = "https://huggingface.co/datasets/ksg-dfci/mmai-synthetic/resolve/main/synthetic_patient_embeddings.parquet"

# ============================================================================
# CLINICAL SPACE TEMPLATE
# ============================================================================

# Default template for the clinical space query input.
# Users fill in these fields to define their search criteria.
CLINICAL_SPACE_TEMPLATE = """Age range allowed: any
Sex allowed: Any
Cancer type allowed: Non-small cell lung cancer
Histology allowed: Adenocarcinoma
Cancer burden allowed: Metastatic
Prior treatment required: No requirements
Prior treatment excluded: No requirements
Biomarkers required: EGFR mutant
Biomarkers excluded: None"""

# Default text for the boilerplate exclusion input.
BOILERPLATE_TEMPLATE = "Patients must have no history of pneumonitis"

# ============================================================================
# USAGE NOTES
# ============================================================================
#
# 1. Set the model paths above to your preferred models
# 2. Optionally set DEFAULT_PATIENT_DB or PREEMBEDDED_PATIENTS
# 3. Customize CLINICAL_SPACE_TEMPLATE if needed
# 4. Save this file
# 5. Run: streamlit run patient_matching_app.py
# 6. Models will load automatically on startup
#
# To create pre-embedded patients (new parquet format, recommended):
#   python preembed_patients.py --patients patients.parquet --embedder path/to/embedder --output patient_embeddings.parquet
#
# To upload pre-embedded patients to Hugging Face Hub:
#   from datasets import Dataset
#   ds = Dataset.from_parquet("patient_embeddings.parquet")
#   ds.push_to_hub("your-username/patient-embeddings")
#
# To load pre-embedded patients from Hugging Face Hub in your app:
#   from datasets import load_dataset
#   ds = load_dataset("your-username/patient-embeddings")
#   ds['train'].to_parquet("local_patient_embeddings.parquet")
#   # Then set PREEMBEDDED_PATIENTS = "local_patient_embeddings.parquet"
#