File size: 4,839 Bytes
0fe62c7
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
# Configuration for Patient Matching Pipeline
#
# Edit the values below to set your default models and patient database.
# Models will auto-load on application startup.

# ============================================================================
# MODEL PATHS - Set your default models here
# ============================================================================

# Model identifiers to auto-load. Each value is either a local model path or
# a HuggingFace ID; set an entry to None to skip auto-loading it.
#
#   embedder
#       Sentence transformer for embedding patient summaries and clinical
#       spaces, e.g., "Qwen/Qwen3-Embedding-0.6B" or "./reranker_round2.model"
#   trial_checker
#       ModernBERT classifier for eligibility prediction,
#       e.g., "answerdotai/ModernBERT-large" or "./modernbert-trial-checker"
#   boilerplate_checker
#       ModernBERT classifier for boilerplate exclusion prediction,
#       e.g., "answerdotai/ModernBERT-large" or "./modernbert-boilerplate-checker"
MODEL_CONFIG = {
    "embedder": "ksg-dfci/TrialSpace-1225",
    "trial_checker": "ksg-dfci/TrialChecker-1225",
    "boilerplate_checker": "ksg-dfci/BoilerplateChecker-1225",
}

# Example configuration with base models:
# MODEL_CONFIG = {
#     "embedder": "Qwen/Qwen3-Embedding-0.6B",
#     "trial_checker": "answerdotai/ModernBERT-large",
#     "boilerplate_checker": "answerdotai/ModernBERT-large",
# }

# Example configuration with fine-tuned models:
# MODEL_CONFIG = {
#     "embedder": "./reranker_round2.model",
#     "trial_checker": "./modernbert-trial-checker",
#     "boilerplate_checker": "./modernbert-boilerplate-checker",
# }

# ============================================================================
# DEFAULT PATIENT DATABASE
# ============================================================================

# Path to default patient database parquet file
# Required columns: patient_id, patient_summary
# Optional columns: patient_boilerplate (for boilerplate checking)
# Will auto-load and embed when embedder model is ready
# Set to None to disable auto-loading
#
# The name is kept defined (as None) instead of being commented out, so that
# code importing or reading this setting always finds it. To enable, replace
# None with a path, e.g., "./synthetic_patient_summary_sample.parquet",
# "./patients.parquet" or "./patient_summaries.parquet".
DEFAULT_PATIENT_DB = None

# Location of the pre-embedded patient database (faster loading).
# Two layouts are described:
#
# NEW FORMAT (recommended): a single parquet file with an embedding column
#   - Created by: python preembed_patients.py --output patient_embeddings.parquet
#   - Holds all patient data plus a patient_embedding column (list of floats)
#   - Compatible with Hugging Face datasets
#   - Example: PREEMBEDDED_PATIENTS = "synthetic_patient_embeddings.parquet"
#
# LEGACY FORMAT (still supported): a prefix shared by pkl/npy/json files
#   - Created by the old version of preembed_patients.py
#   - Files: {prefix}_data.pkl, {prefix}_vectors.npy, {prefix}_metadata.json
#   - Example: PREEMBEDDED_PATIENTS = "synthetic_patient_embeddings"
#
# The default below is a URL to a parquet file in a Hugging Face dataset
# repository. A local value looks like "patient_embeddings.parquet" or
# "patient_embeddings" (legacy).
PREEMBEDDED_PATIENTS = (
    "https://huggingface.co/datasets/ksg-dfci/mmai-synthetic"
    "/resolve/main/synthetic_patient_embeddings.parquet"
)

# ============================================================================
# CLINICAL SPACE TEMPLATE
# ============================================================================

# Default template for the clinical space query input.
# Users fill in these fields to define their search criteria. The template is
# one "field: value" entry per line, newline-separated, with no trailing
# newline.
CLINICAL_SPACE_TEMPLATE = "\n".join(
    (
        "Age range allowed: any",
        "Sex allowed: Any",
        "Cancer type allowed: Non-small cell lung cancer",
        "Histology allowed: Adenocarcinoma",
        "Cancer burden allowed: Metastatic",
        "Prior treatment required: No requirements",
        "Prior treatment excluded: No requirements",
        "Biomarkers required: EGFR mutant",
        "Biomarkers excluded: None",
    )
)

# Default boilerplate text.
# NOTE(review): presumably the default criterion shown for the boilerplate
# exclusion check - confirm against the app that consumes it.
BOILERPLATE_TEMPLATE = "Patients must have no history of pneumonitis"


# ============================================================================
# USAGE NOTES
# ============================================================================
# 
# 1. Set the model paths above to your preferred models
# 2. Optionally set DEFAULT_PATIENT_DB or PREEMBEDDED_PATIENTS
# 3. Customize CLINICAL_SPACE_TEMPLATE if needed
# 4. Save this file
# 5. Run: streamlit run patient_matching_app.py
# 6. Models will load automatically on startup
# 
# To create pre-embedded patients (new parquet format, recommended):
#   python preembed_patients.py --patients patients.parquet --embedder path/to/embedder --output patient_embeddings.parquet
#
# To upload pre-embedded patients to Hugging Face Hub:
#   from datasets import Dataset
#   ds = Dataset.from_parquet("patient_embeddings.parquet")
#   ds.push_to_hub("your-username/patient-embeddings")
#
# To load pre-embedded patients from Hugging Face Hub in your app:
#   from datasets import load_dataset
#   ds = load_dataset("your-username/patient-embeddings")
#   ds['train'].to_parquet("local_patient_embeddings.parquet")
#   # Then set PREEMBEDDED_PATIENTS = "local_patient_embeddings.parquet"
#