Spaces:

ksg-dfci
/

MatchMiner-AI-Patient-Search

Running on L4

App Files Files Community

MatchMiner-AI-Patient-Search / config.py

kenlkehl

Upload 3 files

0fe62c7 verified about 16 hours ago

raw

history blame

4.84 kB

	# Configuration for Patient Matching Pipeline
	#
	# Edit the values below to set your default models and patient database.
	# Models will auto-load on application startup.

	# ============================================================================
	# MODEL PATHS - Set your default models here
	# ============================================================================

	# Set to None to skip auto-loading, or provide model path/HuggingFace ID
	# Keys name the pipeline role; values are the model path or HuggingFace ID to load.
	MODEL_CONFIG = dict(
	    # Sentence transformer that embeds patient summaries and clinical spaces,
	    # e.g., "Qwen/Qwen3-Embedding-0.6B" or "./reranker_round2.model"
	    embedder="ksg-dfci/TrialSpace-1225",
	    # ModernBERT classifier that predicts eligibility,
	    # e.g., "answerdotai/ModernBERT-large" or "./modernbert-trial-checker"
	    trial_checker="ksg-dfci/TrialChecker-1225",
	    # ModernBERT classifier that predicts boilerplate exclusions,
	    # e.g., "answerdotai/ModernBERT-large" or "./modernbert-boilerplate-checker"
	    boilerplate_checker="ksg-dfci/BoilerplateChecker-1225",
	)

	# Example configuration with base models:
	# MODEL_CONFIG = {
	# "embedder": "Qwen/Qwen3-Embedding-0.6B",
	# "trial_checker": "answerdotai/ModernBERT-large",
	# "boilerplate_checker": "answerdotai/ModernBERT-large",
	# }

	# Example configuration with fine-tuned models:
	# MODEL_CONFIG = {
	# "embedder": "./reranker_round2.model",
	# "trial_checker": "./modernbert-trial-checker",
	# "boilerplate_checker": "./modernbert-boilerplate-checker",
	# }

	# ============================================================================
	# DEFAULT PATIENT DATABASE
	# ============================================================================

	# Path to default patient database parquet file
	# Required columns: patient_id, patient_summary
	# Optional columns: patient_boilerplate (for boilerplate checking)
	# Will auto-load and embed when embedder model is ready
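	# Set to None to disable auto-loading
	# NOTE(review): the assignment below is commented out, so DEFAULT_PATIENT_DB is
	# not defined in this file as shown (rather than being None) — confirm the app
	# tolerates the missing name.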
	#DEFAULT_PATIENT_DB = "./synthetic_patient_summary_sample.parquet" # e.g., "./patients.parquet" or "./patient_summaries.parquet"

	# Path or URL to pre-embedded patient database (faster loading)
	#
	# NEW FORMAT (recommended): Single parquet file with embedding column
	# - Created by: python preembed_patients.py --output patient_embeddings.parquet
	# - Contains all patient data + patient_embedding column (list of floats)
	# - Compatible with Hugging Face datasets
	# - Example: PREEMBEDDED_PATIENTS = "synthetic_patient_embeddings.parquet"
	#
	# LEGACY FORMAT (still supported): Prefix for pkl/npy/json files
	# - Created by old version of preembed_patients.py
	# - Files: {prefix}_data.pkl, {prefix}_vectors.npy, {prefix}_metadata.json
	# - Example: PREEMBEDDED_PATIENTS = "synthetic_patient_embeddings"
	#
	# Current default: a parquet file addressed by its Hugging Face Hub URL.
	# Other forms, e.g., "patient_embeddings.parquet" or "patient_embeddings" (legacy)
	PREEMBEDDED_PATIENTS = (
	    "https://huggingface.co/datasets/ksg-dfci/mmai-synthetic"
	    "/resolve/main/synthetic_patient_embeddings.parquet"
	)

	# ============================================================================
	# CLINICAL SPACE TEMPLATE
	# ============================================================================

	# Default template for the clinical space query input
	# Users will fill in these fields to define their search criteria
	# One "<field>: <value>" entry per line; the values below are the defaults
	# presented before the user edits them. The literal is whitespace-sensitive,
	# so keep its line breaks and leading characters exactly as written.
	CLINICAL_SPACE_TEMPLATE = """Age range allowed: any
	Sex allowed: Any
	Cancer type allowed: Non-small cell lung cancer
	Histology allowed: Adenocarcinoma
	Cancer burden allowed: Metastatic
	Prior treatment required: No requirements
	Prior treatment excluded: No requirements
	Biomarkers required: EGFR mutant
	Biomarkers excluded: None"""

	# Default boilerplate exclusion text — presumably the pre-filled input for the
	# boilerplate check; confirm against the app.
	BOILERPLATE_TEMPLATE = (
	    "Patients must have no history of pneumonitis"
	)


	# ============================================================================
	# USAGE NOTES
	# ============================================================================
	#
	# 1. Set the model paths above to your preferred models
	# 2. Optionally set DEFAULT_PATIENT_DB or PREEMBEDDED_PATIENTS
	# 3. Customize CLINICAL_SPACE_TEMPLATE if needed
	# 4. Save this file
	# 5. Run: streamlit run patient_matching_app.py
	# 6. Models will load automatically on startup
	#
	# To create pre-embedded patients (new parquet format, recommended):
	# python preembed_patients.py --patients patients.parquet --embedder path/to/embedder --output patient_embeddings.parquet
	#
	# To upload pre-embedded patients to Hugging Face Hub:
	# from datasets import Dataset
	# ds = Dataset.from_parquet("patient_embeddings.parquet")
	# ds.push_to_hub("your-username/patient-embeddings")
	#
	# To download pre-embedded patients from Hugging Face Hub to a local parquet file:
	# from datasets import load_dataset
	# ds = load_dataset("your-username/patient-embeddings")
	# ds['train'].to_parquet("local_patient_embeddings.parquet")
	# # Then set PREEMBEDDED_PATIENTS = "local_patient_embeddings.parquet"
	#