kenlkehl committed (verified)
Commit 0fe62c7 · 1 Parent(s): f064c8d

Upload 3 files

Files changed (3)
  1. app.py +1128 -0
  2. config.py +105 -0
  3. preembed_patients.py +392 -0
app.py ADDED
@@ -0,0 +1,1128 @@
1
+ #!/usr/bin/env python3
2
+ # -*- coding: utf-8 -*-
3
+
4
+ """
5
+ Patient Matching Pipeline - Gradio Web Interface
6
+
7
+ This interface allows users to:
8
+ 1. Configure models (embedder, trial_checker, boilerplate_checker)
9
+ 2. Upload patient database OR load pre-embedded patients
10
+ 3. Enter a set of clinical criteria (trial eligibility criteria)
11
+ 4. Get ranked patient recommendations with eligibility predictions
12
+ """
13
+
14
+ import gradio as gr
15
+ import pandas as pd
16
+ import numpy as np
17
+ import torch
18
+ import os
19
+ import json
20
+ import pickle
21
+ import html
22
+ from typing import List, Tuple
23
+ from pathlib import Path
24
+ import pyarrow.parquet as pq
25
+
26
+ # HuggingFace imports
27
+ from transformers import (
28
+ AutoTokenizer,
29
+ AutoModelForSequenceClassification,
30
+ )
31
+ from sentence_transformers import SentenceTransformer
32
+
33
+ # Try to import configuration
34
+ try:
35
+ import config
36
+ HAS_CONFIG = True
37
+ print("✓ Found config.py - will auto-load models on startup")
38
+ except ImportError:
39
+ HAS_CONFIG = False
40
+ print("○ No config.py found - using manual model loading")
41
+
42
+ # ============================================================================
43
+ # GLOBAL STATE
44
+ # ============================================================================
45
+
46
+ class AppState:
47
+ def __init__(self):
48
+ self.embedder_model = None
49
+ self.embedder_tokenizer = None
50
+ self.trial_checker_model = None
51
+ self.trial_checker_tokenizer = None
52
+ self.boilerplate_checker_model = None
53
+ self.boilerplate_checker_tokenizer = None
54
+
55
+ self.patient_df = None
56
+ self.patient_embeddings = None
57
+ self.patient_preview_df = None
58
+
59
+ # Store last results for export
60
+ self.last_results_df = None
61
+
62
+ self.device = "cuda" if torch.cuda.is_available() else "cpu"
63
+
64
+ self.auto_load_status = {
65
+ "embedder": "",
66
+ "trial_checker": "",
67
+ "boilerplate_checker": "",
68
+ "patients": ""
69
+ }
70
+
71
+ def reset_patients(self):
72
+ self.patient_df = None
73
+ self.patient_embeddings = None
74
+ self.patient_preview_df = None
75
+
76
+ state = AppState()
77
+
78
+ # ============================================================================
79
+ # CONSTANTS
80
+ # ============================================================================
81
+
82
+ MAX_EMBEDDER_SEQ_LEN = 2500
83
+ MAX_TRIAL_CHECKER_LENGTH = 4096
84
+ MAX_BOILERPLATE_CHECKER_LENGTH = 3192
85
+ CLASSIFIER_BATCH_SIZE = 32 # Batch size for trial_checker and boilerplate_checker inference
86
+
87
+ # Default templates
88
+ DEFAULT_CLINICAL_SPACE_TEMPLATE = """Age range allowed:
89
+ Sex allowed:
90
+ Cancer type allowed:
91
+ Histology allowed:
92
+ Cancer burden allowed:
93
+ Prior treatment required:
94
+ Prior treatment excluded:
95
+ Biomarkers required:
96
+ Biomarkers excluded: """
97
+
98
+ DEFAULT_BOILERPLATE_TEMPLATE = """History of pneumonitis:
99
+ Heart failure or cardiac dysfunction:
100
+ Renal dysfunction:
101
+ Liver dysfunction:
102
+ Uncontrolled brain metastases:
103
+ HIV or hepatitis infection:
104
+ Poor performance status (ECOG >= 2):
105
+ Other relevant exclusions: """
106
+
107
+ # ============================================================================
108
+ # UTILITY FUNCTIONS
109
+ # ============================================================================
110
+
111
+ def truncate_text(text: str, tokenizer, max_tokens: int = 1500) -> str:
112
+ """Truncate text to a maximum number of tokens."""
113
+ return tokenizer.decode(
114
+ tokenizer.encode(text, add_special_tokens=True, truncation=True, max_length=max_tokens),
115
+ skip_special_tokens=True
116
+ )
117
+
118
+
119
+ def format_probability_visual(val, is_exclusion=False):
120
+ """Format probabilities with visual indicators."""
121
+ try:
122
+ val_float = float(val)
123
+ except (TypeError, ValueError):
124
+ return val
125
+
126
+ if not is_exclusion:
127
+ # High eligibility is good
128
+ if val_float >= 0.8:
129
+ return f"🟢 **{val_float:.2f}**"
130
+ elif val_float >= 0.5:
131
+ return f"🟡 {val_float:.2f}"
132
+ else:
133
+ return f"🔴 {val_float:.2f}"
134
+ else:
135
+ # High exclusion is bad
136
+ if val_float >= 0.5:
137
+ return f"🔴 **{val_float:.2f}**"
138
+ elif val_float >= 0.2:
139
+ return f"🟡 {val_float:.2f}"
140
+ else:
141
+ return f"🟢 {val_float:.2f}"
142
+
143
+
144
+ # ============================================================================
145
+ # AUTO-LOADING FROM CONFIG
146
+ # ============================================================================
147
+
148
+ def auto_load_models_from_config():
149
+ """Auto-load models specified in config.py"""
150
+ if not HAS_CONFIG:
151
+ return
152
+
153
+ print("\n" + "="*70)
154
+ print("AUTO-LOADING MODELS FROM CONFIG")
155
+ print("="*70)
156
+
157
+ # Load embedder
158
+ if config.MODEL_CONFIG.get("embedder"):
159
+ print(f"\n[1/3] Loading embedder: {config.MODEL_CONFIG['embedder']}")
160
+ status, _, _ = load_embedder_model(config.MODEL_CONFIG["embedder"])
161
+ state.auto_load_status["embedder"] = status
162
+ print(status)
163
+
164
+ # Load trial checker
165
+ if config.MODEL_CONFIG.get("trial_checker"):
166
+ print(f"\n[2/3] Loading trial checker: {config.MODEL_CONFIG['trial_checker']}")
167
+ status, _ = load_trial_checker(config.MODEL_CONFIG["trial_checker"])
168
+ state.auto_load_status["trial_checker"] = status
169
+ print(status)
170
+
171
+ # Load boilerplate checker
172
+ if config.MODEL_CONFIG.get("boilerplate_checker"):
173
+ print(f"\n[3/3] Loading boilerplate checker: {config.MODEL_CONFIG['boilerplate_checker']}")
174
+ status, _ = load_boilerplate_checker(config.MODEL_CONFIG["boilerplate_checker"])
175
+ state.auto_load_status["boilerplate_checker"] = status
176
+ print(status)
177
+
178
+ print("\n" + "="*70)
179
+ print("MODEL AUTO-LOADING COMPLETE")
180
+ print("="*70 + "\n")
181
+
182
+
183
+ def auto_load_patients_from_config():
184
+ """Auto-load patient database from config.py - prefers pre-embedded over fresh embedding."""
185
+ if not HAS_CONFIG:
186
+ return
187
+
188
+ # Check for pre-embedded patients first (much faster)
189
+ if hasattr(config, 'PREEMBEDDED_PATIENTS') and config.PREEMBEDDED_PATIENTS:
190
+ preembed_path = config.PREEMBEDDED_PATIENTS
191
+
192
+ # Handle URL paths for Hugging Face datasets
193
+ if preembed_path.startswith("http://") or preembed_path.startswith("https://"):
194
+ print("\n" + "="*70)
195
+ print(f"AUTO-LOADING PRE-EMBEDDED PATIENTS (URL): {preembed_path}")
196
+ print("="*70)
197
+
198
+ status, preview = load_preembedded_patients(preembed_path)
199
+ state.auto_load_status["patients"] = status
200
+ state.patient_preview_df = preview
201
+
202
+ print("="*70)
203
+ print("PRE-EMBEDDED PATIENTS AUTO-LOADING COMPLETE")
204
+ print("="*70 + "\n")
205
+ return
206
+
207
+ # Check for new parquet format first, then fall back to old format
208
+ parquet_path = preembed_path if preembed_path.endswith('.parquet') else f"{preembed_path}.parquet"
209
+ old_format_data = f"{preembed_path}_data.pkl"
210
+
211
+ if os.path.exists(parquet_path):
212
+ # New parquet format
213
+ print("\n" + "="*70)
214
+ print(f"AUTO-LOADING PRE-EMBEDDED PATIENTS (parquet): {parquet_path}")
215
+ print("="*70)
216
+
217
+ status, preview = load_preembedded_patients(parquet_path)
218
+ state.auto_load_status["patients"] = status
219
+ state.patient_preview_df = preview
220
+
221
+ print("="*70)
222
+ print("PRE-EMBEDDED PATIENTS AUTO-LOADING COMPLETE")
223
+ print("="*70 + "\n")
224
+ return
225
+ elif os.path.exists(old_format_data):
226
+ # Old format (pkl + npy + json)
227
+ print("\n" + "="*70)
228
+ print(f"AUTO-LOADING PRE-EMBEDDED PATIENTS (legacy): {preembed_path}")
229
+ print("="*70)
230
+
231
+ status, preview = load_preembedded_patients(preembed_path)
232
+ state.auto_load_status["patients"] = status
233
+ state.patient_preview_df = preview
234
+
235
+ print("="*70)
236
+ print("PRE-EMBEDDED PATIENTS AUTO-LOADING COMPLETE")
237
+ print("="*70 + "\n")
238
+ return
239
+ else:
240
+ print(f"✗ Pre-embedded patient files not found: {preembed_path}")
241
+ state.auto_load_status["patients"] = f"✗ Pre-embedded files not found: {preembed_path}"
242
+ return
243
+
244
+ # Fall back to fresh embedding if no pre-embedded patients specified
245
+ if not hasattr(config, 'DEFAULT_PATIENT_DB') or not config.DEFAULT_PATIENT_DB:
246
+ print("○ No patient database specified in config")
247
+ return
248
+
249
+ if not os.path.exists(config.DEFAULT_PATIENT_DB):
250
+ print(f"✗ Default patient database not found: {config.DEFAULT_PATIENT_DB}")
251
+ state.auto_load_status["patients"] = f"✗ Patient database file not found: {config.DEFAULT_PATIENT_DB}"
252
+ return
253
+
254
+ if state.embedder_model is None:
255
+ print("○ Embedder not loaded yet - skipping patient database auto-load")
256
+ state.auto_load_status["patients"] = "○ Waiting for embedder model to be loaded..."
257
+ return
258
+
259
+ print("\n" + "="*70)
260
+ print(f"AUTO-LOADING PATIENT DATABASE: {config.DEFAULT_PATIENT_DB}")
261
+ print("="*70)
262
+
263
+ class FilePath:
264
+ def __init__(self, path):
265
+ self.name = path
266
+
267
+ status, preview = load_and_embed_patients(FilePath(config.DEFAULT_PATIENT_DB), show_progress=True)
268
+ state.auto_load_status["patients"] = status
269
+ state.patient_preview_df = preview
270
+
271
+ print("="*70)
272
+ print("PATIENT DATABASE AUTO-LOADING COMPLETE")
273
+ print("="*70 + "\n")
274
+
275
+
276
+ # ============================================================================
277
+ # MODEL LOADING FUNCTIONS
278
+ # ============================================================================
279
+
280
+ def load_embedder_model(model_path: str) -> Tuple[str, str, str]:
281
+ """Load sentence transformer embedder model."""
282
+ try:
283
+ will_need_reembed = state.patient_df is not None and len(state.patient_df) > 0
284
+
285
+ if will_need_reembed:
286
+ warning_msg = f"\n⚠️ Warning: {len(state.patient_df)} patients are currently loaded. They will need to be re-embedded with the new model."
287
+ else:
288
+ warning_msg = ""
289
+
290
+ state.embedder_model = SentenceTransformer(model_path, device=state.device, trust_remote_code=True)
291
+ state.embedder_tokenizer = AutoTokenizer.from_pretrained(model_path, trust_remote_code=True)
292
+
293
+ # Set the instruction prompt
294
+ try:
295
+ state.embedder_model.prompts['query'] = (
296
+ "Instruct: Given a cancer patient summary, retrieve clinical trial options "
297
+ "that are reasonable for that patient; or, given a clinical trial option, "
298
+ "retrieve cancer patients who are reasonable candidates for that trial."
299
+ )
300
+ except Exception:
301
+ pass
302
+
303
+ try:
304
+ state.embedder_model.max_seq_length = MAX_EMBEDDER_SEQ_LEN
305
+ except Exception:
306
+ pass
307
+
308
+ success_msg = f"✓ Embedder model loaded from {model_path}{warning_msg}"
309
+
310
+ if will_need_reembed:
311
+ state.patient_embeddings = None
312
+ success_msg += "\n→ Patient embeddings cleared. Please reload patient database to re-embed."
313
+
314
+ return success_msg, "", warning_msg
315
+ except Exception as e:
316
+ return f"✗ Error loading embedder model: {str(e)}", str(e), ""
317
+
318
+
319
+ def load_trial_checker(model_path: str) -> Tuple[str, str]:
320
+ """Load ModernBERT trial checker."""
321
+ try:
322
+ state.trial_checker_tokenizer = AutoTokenizer.from_pretrained(model_path)
323
+ state.trial_checker_model = AutoModelForSequenceClassification.from_pretrained(
324
+ model_path,
325
+ torch_dtype=torch.float16 if state.device == "cuda" else torch.float32
326
+ ).to(state.device)
327
+ state.trial_checker_model.eval()
328
+ return f"✓ Trial checker loaded from {model_path}", ""
329
+ except Exception as e:
330
+ return f"✗ Error loading trial checker: {str(e)}", str(e)
331
+
332
+
333
+ def load_boilerplate_checker(model_path: str) -> Tuple[str, str]:
334
+ """Load ModernBERT boilerplate checker."""
335
+ try:
336
+ state.boilerplate_checker_tokenizer = AutoTokenizer.from_pretrained(model_path)
337
+ state.boilerplate_checker_model = AutoModelForSequenceClassification.from_pretrained(
338
+ model_path,
339
+ torch_dtype=torch.float16 if state.device == "cuda" else torch.float32
340
+ ).to(state.device)
341
+ state.boilerplate_checker_model.eval()
342
+ return f"✓ Boilerplate checker loaded from {model_path}", ""
343
+ except Exception as e:
344
+ return f"✗ Error loading boilerplate checker: {str(e)}", str(e)
345
+
346
+
347
+ # ============================================================================
348
+ # PATIENT DATA LOADING
349
+ # ============================================================================
350
+
351
+ def load_preembedded_patients(preembedded_path: str) -> Tuple[str, pd.DataFrame]:
352
+ """Load pre-embedded patient database from disk.
353
+
354
+ Supports two formats:
355
+ 1. New format: Single parquet file with patient_embedding column
356
+ - Path should end with .parquet
357
+ - Embeddings stored as lists in patient_embedding column
358
+ - Metadata stored in parquet file metadata
359
+
360
+ 2. Legacy format: Separate pkl/npy/json files
361
+ - Path is a prefix (e.g., "patient_embeddings")
362
+ - Creates patient_embeddings_data.pkl, _vectors.npy, _metadata.json
363
+ """
364
+ try:
365
+ # Determine format based on path
366
+ is_parquet = preembedded_path.endswith('.parquet') or os.path.exists(f"{preembedded_path}.parquet")
367
+
368
+ if is_parquet:
369
+ return _load_preembedded_parquet(preembedded_path)
370
+ else:
371
+ return _load_preembedded_legacy(preembedded_path)
372
+
373
+ except Exception as e:
374
+ import traceback
375
+ traceback.print_exc()
376
+ return f"✗ Error loading pre-embedded patients: {str(e)}", None
377
+
378
+
379
+ def _load_preembedded_parquet(parquet_path: str) -> Tuple[str, pd.DataFrame]:
380
+ """Load pre-embedded patients from new single parquet format."""
381
+ is_url = parquet_path.startswith("http://") or parquet_path.startswith("https://")
382
+
383
+ # Ensure .parquet extension for local files
384
+ if not is_url and not parquet_path.endswith('.parquet'):
385
+ parquet_path = f"{parquet_path}.parquet"
386
+
387
+ if not is_url and not os.path.exists(parquet_path):
388
+ return f"✗ Pre-embedded parquet file not found: {parquet_path}", None
389
+
390
+ print(f"\n{'='*70}")
391
+ print(f"LOADING PRE-EMBEDDED PATIENTS (Parquet Format)")
392
+ print(f"{'='*70}")
393
+ print(f"Loading from: {parquet_path}")
394
+
395
+ try:
396
+ # Read parquet file - from URL or local path
397
+ if is_url:
398
+ df = pd.read_parquet(parquet_path)
399
+ # For remote files, we can't easily read pyarrow metadata without downloading
400
+ # the file first, so we'll just load the dataframe directly.
401
+ print(f"Metadata: (Skipped for URL)")
402
+ else:
403
+ # Read local parquet file with pyarrow to access metadata
404
+ parquet_file = pq.read_table(parquet_path)
405
+
406
+ # Extract metadata if available
407
+ if parquet_file.schema.metadata and b'patient_embedding_metadata' in parquet_file.schema.metadata:
408
+ metadata = json.loads(parquet_file.schema.metadata[b'patient_embedding_metadata'].decode('utf-8'))
409
+ print(f"Metadata:")
410
+ print(f" Created: {metadata.get('created_at', 'unknown')}")
411
+ print(f" Embedder: {metadata.get('embedder_model', 'unknown')}")
412
+ print(f" Patients: {metadata.get('num_patients', 'unknown')}")
413
+ print(f" Embedding dim: {metadata.get('embedding_dim', 'unknown')}")
414
+
415
+ # Convert to pandas
416
+ df = parquet_file.to_pandas()
417
+
418
+ except Exception as e:
419
+ error_msg = f"✗ Failed to read parquet file from {parquet_path}: {str(e)}"
420
+ print(error_msg)
421
+ return error_msg, None
422
+
423
+ print(f"✓ Loaded {len(df)} patients")
424
+ print(f" Columns: {', '.join(df.columns.tolist())}")
425
+
426
+ # Check for required columns
427
+ if 'patient_embedding' not in df.columns:
428
+ return f"✗ Parquet file missing 'patient_embedding' column: {parquet_path}", None
429
+
430
+ if 'patient_id' not in df.columns:
431
+ return f"✗ Parquet file missing 'patient_id' column: {parquet_path}", None
432
+
433
+ if 'patient_summary' not in df.columns:
434
+ return f"✗ Parquet file missing 'patient_summary' column: {parquet_path}", None
435
+
436
+ # Check boilerplate column
437
+ if 'patient_boilerplate' in df.columns:
438
+ non_empty_bp = (df['patient_boilerplate'].astype(str).str.strip().str.len() > 0).sum()
439
+ print(f" ✓ patient_boilerplate column: {non_empty_bp}/{len(df)} patients have boilerplate text")
440
+ else:
441
+ print(f" ⚠ No patient_boilerplate column found")
442
+ df['patient_boilerplate'] = ''
443
+
444
+ # Extract embeddings from column and convert to numpy array
445
+ print(f"Converting embeddings to numpy array...")
446
+ embeddings = np.array(df['patient_embedding'].tolist(), dtype=np.float32)
447
+ print(f"✓ Loaded embeddings: {embeddings.shape}")
448
+
449
+ # Remove embedding column from dataframe (we store it separately in memory)
450
+ df_without_embeddings = df.drop(columns=['patient_embedding'])
451
+
452
+ state.patient_df = df_without_embeddings
453
+ state.patient_embeddings = embeddings
454
+
455
+ print(f"{'='*70}")
456
+ print(f"PRE-EMBEDDED PATIENTS LOADED SUCCESSFULLY")
457
+ print(f"{'='*70}\n")
458
+
459
+ preview = df_without_embeddings[['patient_id', 'patient_summary']].head(10)
460
+ return f"✓ Loaded {len(df)} pre-embedded patients from {os.path.basename(parquet_path)}", preview
461
+
462
+
463
+ def _load_preembedded_legacy(preembedded_prefix: str) -> Tuple[str, pd.DataFrame]:
464
+ """Load pre-embedded patients from legacy format (pkl + npy + json files)."""
465
+ data_file = f"{preembedded_prefix}_data.pkl"
466
+ vectors_file = f"{preembedded_prefix}_vectors.npy"
467
+ metadata_file = f"{preembedded_prefix}_metadata.json"
468
+
469
+ if not os.path.exists(data_file):
470
+ return f"✗ Pre-embedded data file not found: {data_file}", None
471
+ if not os.path.exists(vectors_file):
472
+ return f"✗ Pre-embedded vectors file not found: {vectors_file}", None
473
+
474
+ print(f"\n{'='*70}")
475
+ print(f"LOADING PRE-EMBEDDED PATIENTS (Legacy Format)")
476
+ print(f"{'='*70}")
477
+ print(f"Loading from: {preembedded_prefix}_*")
478
+
479
+ if os.path.exists(metadata_file):
480
+ with open(metadata_file, 'r') as f:
481
+ metadata = json.load(f)
482
+ print(f"Metadata:")
483
+ print(f" Created: {metadata.get('created_at', 'unknown')}")
484
+ print(f" Embedder: {metadata.get('embedder_model', 'unknown')}")
485
+ print(f" Patients: {metadata.get('num_patients', 'unknown')}")
486
+ print(f" Embedding dim: {metadata.get('embedding_dim', 'unknown')}")
487
+
488
+ print(f"Loading patient dataframe...")
489
+ with open(data_file, 'rb') as f:
490
+ df = pickle.load(f)
491
+ print(f"✓ Loaded {len(df)} patients")
492
+ print(f" Columns: {', '.join(df.columns.tolist())}")
493
+
494
+ # Check boilerplate column
495
+ if 'patient_boilerplate' in df.columns:
496
+ non_empty_bp = (df['patient_boilerplate'].astype(str).str.strip().str.len() > 0).sum()
497
+ print(f" ✓ patient_boilerplate column: {non_empty_bp}/{len(df)} patients have boilerplate text")
498
+ else:
499
+ print(f" ⚠ No patient_boilerplate column found")
500
+ df['patient_boilerplate'] = ''
501
+
502
+ print(f"Loading embeddings...")
503
+ embeddings = np.load(vectors_file)
504
+ print(f"✓ Loaded embeddings: {embeddings.shape}")
505
+
506
+ if len(df) != embeddings.shape[0]:
507
+ return (
508
+ f"✗ Mismatch: {len(df)} patients but {embeddings.shape[0]} embeddings",
509
+ None
510
+ )
511
+
512
+ state.patient_df = df
513
+ state.patient_embeddings = embeddings
514
+
515
+ print(f"{'='*70}")
516
+ print(f"PRE-EMBEDDED PATIENTS LOADED SUCCESSFULLY")
517
+ print(f"{'='*70}\n")
518
+
519
+ preview = df[['patient_id', 'patient_summary']].head(10)
520
+ return f"✓ Loaded {len(df)} pre-embedded patients from {preembedded_prefix}_*", preview
521
+
522
+
523
+ def load_and_embed_patients(file, show_progress: bool = False) -> Tuple[str, pd.DataFrame]:
524
+ """Load patient database and embed summaries."""
525
+ try:
526
+ if state.embedder_model is None:
527
+ return "✗ Please load the embedder model first!", None
528
+
529
+ # Read file
530
+ if file.name.endswith('.parquet'):
531
+ df = pd.read_parquet(file.name)
532
+ elif file.name.endswith('.csv'):
533
+ df = pd.read_csv(file.name)
534
+ elif file.name.endswith(('.xlsx', '.xls')):
535
+ df = pd.read_excel(file.name)
536
+ else:
537
+ return "✗ Unsupported format. Use Parquet, CSV, or Excel.", None
538
+
539
+ # Check required columns
540
+ required_cols = ['patient_id', 'patient_summary']
541
+ missing = [col for col in required_cols if col not in df.columns]
542
+ if missing:
543
+ return f"✗ Missing columns: {', '.join(missing)}", None
544
+
545
+ # Clean data
546
+ df = df[~df['patient_summary'].isnull()].copy()
547
+ df = df[df['patient_summary'].astype(str).str.strip().str.len() > 0].copy()
548
+
549
+ if 'patient_boilerplate' not in df.columns:
550
+ df['patient_boilerplate'] = ''
551
+ else:
552
+ df['patient_boilerplate'] = df['patient_boilerplate'].fillna('')
553
+
554
+ # Prepare texts for embedding
555
+ df['patient_summary_trunc'] = df['patient_summary'].apply(
556
+ lambda x: truncate_text(str(x), state.embedder_tokenizer, max_tokens=1500)
557
+ )
558
+
559
+ prefix = (
560
+ "Instruct: Given a cancer patient summary, retrieve clinical trial options "
561
+ "that are reasonable for that patient; or, given a clinical trial option, "
562
+ "retrieve cancer patients who are reasonable candidates for that trial. "
563
+ )
564
+ texts_to_embed = [prefix + txt for txt in df['patient_summary_trunc'].tolist()]
565
+
566
+ if not show_progress:
567
+ gr.Info(f"Embedding {len(df)} patient summaries...")
568
+ else:
569
+ print(f"Embedding {len(df)} patient summaries...")
570
+
571
+ with torch.no_grad():
572
+ embeddings = state.embedder_model.encode(
573
+ texts_to_embed,
574
+ batch_size=64,
575
+ convert_to_tensor=True,
576
+ normalize_embeddings=True,
577
+ show_progress_bar=show_progress,
578
+ prompt='query'
579
+ )
580
+
581
+ state.patient_df = df
582
+ state.patient_embeddings = embeddings.cpu().numpy()
583
+
584
+ preview = df[['patient_id', 'patient_summary']].head(10)
585
+
586
+ success_msg = f"✓ Loaded and embedded {len(df)} patients"
587
+ if show_progress:
588
+ print(success_msg)
589
+
590
+ return success_msg, preview
591
+
592
+ except Exception as e:
593
+ return f"✗ Error processing patients: {str(e)}", None
594
+
595
+
596
+ # ============================================================================
597
+ # PATIENT MATCHING
598
+ # ============================================================================
599
+
600
+ def match_patients(
601
+ clinical_space: str,
602
+ boilerplate_criteria: str,
603
+ top_k_check: int = 1000,
604
+ eligibility_threshold: float = 0.5
605
+ ) -> Tuple[pd.DataFrame, str]:
606
+ """Match clinical query to patients and run eligibility checks."""
607
+ try:
608
+ if state.embedder_model is None:
609
+ raise ValueError("Embedder model not loaded")
610
+ if state.patient_embeddings is None:
611
+ raise ValueError("Patient database not loaded")
612
+ if state.trial_checker_model is None:
613
+ raise ValueError("Trial checker model not loaded")
614
+ if state.boilerplate_checker_model is None:
615
+ raise ValueError("Boilerplate checker model not loaded")
616
+
617
+ if not clinical_space or not clinical_space.strip():
618
+ raise ValueError("Please enter clinical criteria")
619
+
620
+ # Embed clinical query
621
+ prefix = (
622
+ "Instruct: Given a cancer patient summary, retrieve clinical trial options "
623
+ "that are reasonable for that patient; or, given a clinical trial option, "
624
+ "retrieve cancer patients who are reasonable candidates for that trial. "
625
+ )
626
+
627
+ query_text = truncate_text(clinical_space, state.embedder_tokenizer, max_tokens=MAX_EMBEDDER_SEQ_LEN)
628
+ query_text_with_prefix = prefix + query_text
629
+
630
+ gr.Info("Ranking all patients by similarity...")
631
+
632
+ with torch.no_grad():
633
+ query_emb = state.embedder_model.encode(
634
+ [query_text_with_prefix],
635
+ convert_to_tensor=True,
636
+ normalize_embeddings=True,
637
+ prompt='query'
638
+ )
639
+
640
+ # Calculate similarities for all patients
641
+ query_emb_np = query_emb.cpu().numpy()
642
+ similarities = np.dot(state.patient_embeddings, query_emb_np.T).squeeze()
643
+
644
+ # Rank all patients by similarity
645
+ sorted_indices = np.argsort(similarities)[::-1]
646
+
647
+ # Get all patients ranked
648
+ all_patients_ranked = state.patient_df.iloc[sorted_indices].copy()
649
+ all_patients_ranked['similarity_score'] = similarities[sorted_indices]
650
+
651
+ # Limit to top_k_check for classifier models
652
+ top_k_check = min(top_k_check, len(all_patients_ranked))
653
+ patients_to_check = all_patients_ranked.head(top_k_check).copy()
654
+
655
+ gr.Info(f"Running eligibility checks on top {len(patients_to_check)} patients...")
656
+
657
+ # Run trial checker in batches
658
+ trial_check_inputs = [
659
+ f"{clinical_space}\nNow here is the patient summary:{row['patient_summary']}"
660
+ for _, row in patients_to_check.iterrows()
661
+ ]
662
+
663
+ trial_probs_list = []
664
+ for i in range(0, len(trial_check_inputs), CLASSIFIER_BATCH_SIZE):
665
+ batch_inputs = trial_check_inputs[i:i + CLASSIFIER_BATCH_SIZE]
666
+
667
+ batch_encodings = state.trial_checker_tokenizer(
668
+ batch_inputs,
669
+ truncation=True,
670
+ max_length=MAX_TRIAL_CHECKER_LENGTH,
671
+ padding=True,
672
+ return_tensors='pt'
673
+ ).to(state.device)
674
+
675
+ with torch.no_grad():
676
+ batch_outputs = state.trial_checker_model(**batch_encodings)
677
+ batch_probs = torch.softmax(batch_outputs.logits, dim=1)[:, 1].cpu().numpy()
678
+ trial_probs_list.append(batch_probs)
679
+
680
+ trial_probs = np.concatenate(trial_probs_list)
681
+ patients_to_check['eligibility_probability'] = trial_probs
682
+
683
+ # Run boilerplate checker in batches
684
+ # Use patient_boilerplate if available, otherwise fall back to patient_summary
685
+ def get_boilerplate_text(row):
686
+ bp = row.get('patient_boilerplate', '')
687
+ if bp and isinstance(bp, str) and bp.strip():
688
+ return bp
689
+ return row['patient_summary']
690
+
691
+ boilerplate_check_inputs = [
692
+ f"Patient history: {get_boilerplate_text(row)}\nTrial exclusions:{boilerplate_criteria}"
693
+ for _, row in patients_to_check.iterrows()
694
+ ]
695
+
696
+ boilerplate_probs_list = []
697
+ for i in range(0, len(boilerplate_check_inputs), CLASSIFIER_BATCH_SIZE):
698
+ batch_inputs = boilerplate_check_inputs[i:i + CLASSIFIER_BATCH_SIZE]
699
+
700
+ batch_encodings = state.boilerplate_checker_tokenizer(
701
+ batch_inputs,
702
+ truncation=True,
703
+ max_length=MAX_BOILERPLATE_CHECKER_LENGTH,
704
+ padding=True,
705
+ return_tensors='pt'
706
+ ).to(state.device)
707
+
708
+ with torch.no_grad():
709
+ batch_outputs = state.boilerplate_checker_model(**batch_encodings)
710
+ batch_probs = torch.softmax(batch_outputs.logits, dim=1)[:, 1].cpu().numpy()
711
+ boilerplate_probs_list.append(batch_probs)
712
+
713
+ boilerplate_probs = np.concatenate(boilerplate_probs_list)
714
+ patients_to_check['exclusion_probability'] = boilerplate_probs
715
+
716
+ # Sort by eligibility probability
717
+ patients_to_check = patients_to_check.sort_values('eligibility_probability', ascending=False)
718
+
719
+ # Store full results for export
720
+ state.last_results_df = patients_to_check.copy()
721
+
722
+ # Calculate bottom line stats
723
+ num_eligible = (patients_to_check['eligibility_probability'] >= eligibility_threshold).sum()
724
+ num_no_exclusion = (patients_to_check['exclusion_probability'] < 0.5).sum()
725
+ num_both = ((patients_to_check['eligibility_probability'] >= eligibility_threshold) &
726
+ (patients_to_check['exclusion_probability'] < 0.5)).sum()
727
+
728
+ bottom_line = f"""
729
+ ### 📊 Summary: Patients Meeting Your Criteria
730
+ | Metric | Count |
731
+ |--------|-------|
732
+ | Total patients in database | **{len(state.patient_df)}** |
733
+ | Top patients checked with classifiers | **{len(patients_to_check)}** |
734
+ | Meeting eligibility criteria (≥{eligibility_threshold}) | **{num_eligible}** |
735
+ | Without boilerplate exclusions (<0.5) | **{num_no_exclusion}** |
736
+ | **Meeting BOTH criteria** | **{num_both}** |
737
+ """
738
+
739
+ # Format for display
740
+ patients_to_check['eligibility_display'] = patients_to_check['eligibility_probability'].apply(
741
+ lambda x: format_probability_visual(x, is_exclusion=False)
742
+ )
743
+ patients_to_check['exclusion_display'] = patients_to_check['exclusion_probability'].apply(
744
+ lambda x: format_probability_visual(x, is_exclusion=True)
745
+ )
746
+ patients_to_check['similarity_display'] = patients_to_check['similarity_score'].apply(
747
+ lambda x: f"{x:.3f}"
748
+ )
749
+
750
+ # Truncate summary for display
751
+ patients_to_check['summary_preview'] = patients_to_check['patient_summary'].apply(
752
+ lambda x: str(x)[:300] + "..." if len(str(x)) > 300 else str(x)
753
+ )
754
+
755
+ # Select columns for display
756
+ display_cols = [
757
+ 'patient_id',
758
+ 'eligibility_display',
759
+ 'exclusion_display',
760
+ 'similarity_display',
761
+ 'summary_preview'
762
+ ]
763
+
764
+ result_df = patients_to_check[display_cols].reset_index(drop=True)
765
+ result_df.columns = [
766
+ 'Patient ID',
767
+ 'Eligibility',
768
+ 'Exclusion',
769
+ 'Similarity',
770
+ 'Summary Preview'
771
+ ]
772
+
773
+ return result_df, bottom_line
774
+
775
+ except Exception as e:
776
+ gr.Warning(f"Error matching patients: {str(e)}")
777
+ return pd.DataFrame(), f"**Error:** {str(e)}"
778
+
779
+
780
+ def get_patient_details(df: pd.DataFrame, evt: gr.SelectData) -> str:
781
+ """Get full patient details when user clicks on a row."""
782
+ try:
783
+ if df is None or len(df) == 0:
784
+ return "No patient selected"
785
+
786
+ row_idx = evt.index[0]
787
+ patient_id = df.iloc[row_idx]['Patient ID']
788
+
789
+ # Find in full results
790
+ if state.last_results_df is None:
791
+ return "No results available"
792
+
793
+ matching_rows = state.last_results_df[
794
+ state.last_results_df['patient_id'] == patient_id
795
+ ]
796
+
797
+ if len(matching_rows) == 0:
798
+ return f"Error: Could not find patient {patient_id}"
799
+
800
+ patient_row = matching_rows.iloc[0]
801
+
802
+ # Get boilerplate text - use same fallback logic as the checker
803
+ raw_boilerplate = patient_row.get('patient_boilerplate', '')
804
+ has_separate_boilerplate = raw_boilerplate and isinstance(raw_boilerplate, str) and raw_boilerplate.strip()
805
+
806
+ if has_separate_boilerplate:
807
+ boilerplate_text = raw_boilerplate
808
+ else:
809
+ boilerplate_text = "(No separate boilerplate column - patient summary was used for boilerplate exclusion check)"
810
+
811
+ # Escape any HTML characters in the text
812
+ summary_escaped = html.escape(str(patient_row['patient_summary']))
813
+ boilerplate_escaped = html.escape(str(boilerplate_text))
814
+
815
+ details = f"""
816
+ # Patient Details: {patient_id}
817
+
818
+ ---
819
+
820
+ ## Scores
821
+ - **Eligibility Probability:** {patient_row['eligibility_probability']:.3f}
822
+ - **Exclusion Probability:** {patient_row['exclusion_probability']:.3f}
823
+ - **Similarity Score:** {patient_row['similarity_score']:.3f}
824
+
825
+ ---
826
+
827
+ ## Full Patient Summary
828
+ <pre style="white-space: pre-wrap; word-wrap: break-word; background-color: #1a1a1a; color: #ffffff; padding: 10px; border-radius: 5px; font-family: monospace; font-size: 0.9em;">{summary_escaped}</pre>
829
+
830
+ ---
831
+
832
+ ## Boilerplate Exclusion Check Input
833
+ <pre style="white-space: pre-wrap; word-wrap: break-word; background-color: #1a1a1a; color: #ffffff; padding: 10px; border-radius: 5px; font-family: monospace; font-size: 0.9em;">{boilerplate_escaped}</pre>
834
+ """
835
+ return details
836
+
837
+ except Exception as e:
838
+ return f"Error retrieving patient details: {str(e)}"
839
+
840
+
841
+ def request_identified_patients():
842
+ """Placeholder for requesting identified patient list."""
843
+ if state.last_results_df is None or len(state.last_results_df) == 0:
844
+ gr.Warning("No results to request - run a search first")
845
+ return
846
+
847
+ # TODO: Implement actual request functionality
848
+ gr.Info("Request functionality not yet implemented")
849
+
850
+
851
+ # ============================================================================
852
+ # GRADIO INTERFACE
853
+ # ============================================================================
854
+
855
+ def create_interface():
856
+
857
+ theme = gr.themes.Soft(
858
+ primary_hue="teal",
859
+ secondary_hue="slate",
860
+ ).set(
861
+ body_background_fill="*neutral_50",
862
+ block_background_fill="white",
863
+ block_border_width="1px",
864
+ block_label_background_fill="*primary_50",
865
+ )
866
+
867
+ custom_css = """
868
+ .gradio-container { font-family: 'Inter', Arial, sans-serif !important; }
869
+ .model-status { min-height: 80px !important; font-size: 0.9em; }
870
+ .status-box { background: #f9fafb; border: 1px solid #e5e7eb; border-radius: 8px; padding: 10px; }
871
+ h1 { color: #0d9488; }
872
+ """
873
+
874
+ # Get templates from config or use defaults
875
+ clinical_space_template = getattr(config, 'CLINICAL_SPACE_TEMPLATE', DEFAULT_CLINICAL_SPACE_TEMPLATE) if HAS_CONFIG else DEFAULT_CLINICAL_SPACE_TEMPLATE
876
+ boilerplate_template = getattr(config, 'BOILERPLATE_TEMPLATE', DEFAULT_BOILERPLATE_TEMPLATE) if HAS_CONFIG else DEFAULT_BOILERPLATE_TEMPLATE
877
+
878
+ with gr.Blocks(title="Patient Search Prototype", theme=theme, css=custom_css) as demo:
879
+
880
+ with gr.Row(variant="panel"):
881
+ with gr.Column(scale=4):
882
+ gr.Markdown("""
883
+ # 🔬 Patient Search Prototype
884
+ **Find patients matching clinical criteria. Designed for clinical trial matching.**
885
+ """)
886
+ with gr.Column(scale=1):
887
+ pass
888
+
889
+ with gr.Tabs():
890
+ # ============= TAB 1: SEARCH =============
891
+ with gr.Tab("1️⃣ Search"):
892
+ gr.Markdown("""
893
+ ### Define Your Search Criteria
894
+ Enter the clinical criteria to search for matching patients.
895
+ """)
896
+
897
+ with gr.Row():
898
+ with gr.Column():
899
+ clinical_space_input = gr.Textbox(
900
+ label="Clinical Criteria",
901
+ placeholder="Enter eligibility criteria...",
902
+ value=clinical_space_template,
903
+ lines=12,
904
+ info="Define age, sex, cancer type, histology, treatments, biomarkers, etc."
905
+ )
906
+
907
+ with gr.Column():
908
+ boilerplate_input = gr.Textbox(
909
+ label="Boilerplate Exclusion Criteria",
910
+ placeholder="Enter boilerplate exclusions...",
911
+ value=boilerplate_template,
912
+ lines=12,
913
+ info="Common exclusions like organ dysfunction, infections, etc."
914
+ )
915
+
916
+ gr.Markdown("---")
917
+
918
+ with gr.Row():
919
+ with gr.Column(scale=1):
920
+ match_btn = gr.Button("🔍 Find Matching Patients", variant="primary", size="lg")
921
+ with gr.Column(scale=3):
922
+ with gr.Accordion("Search Settings", open=False):
923
+ top_k_check_slider = gr.Slider(
924
+ minimum=5, maximum=10000, value=500, step=50,
925
+ label="Patients to Check with Classifiers",
926
+ info="Number of top-ranked patients to run through eligibility/boilerplate models (larger queries take more time)"
927
+ )
928
+ eligibility_threshold_slider = gr.Slider(
929
+ minimum=0.0, maximum=1.0, value=0.5, step=0.05,
930
+ label="Eligibility Threshold",
931
+ info="Threshold for counting patients as 'eligible'"
932
+ )
933
+
934
+ gr.Markdown("### 📊 Results")
935
+
936
+ # Bottom line summary
937
+ bottom_line_output = gr.Markdown(
938
+ value="*Run a search to see results*"
939
+ )
940
+
941
+ with gr.Row():
942
+ with gr.Column(scale=7):
943
+ results_df = gr.Dataframe(
944
+ label="Matched Patients",
945
+ interactive=False,
946
+ wrap=True,
947
+ datatype=["str", "markdown", "markdown", "str", "str"],
948
+ column_widths=["12%", "12%", "12%", "10%", "54%"]
949
+ )
950
+
951
+ with gr.Column(scale=5):
952
+ patient_details = gr.Markdown(
953
+ label="Patient Details",
954
+ value="<div style='text-align: center; padding: 50px; color: #666;'>👈 Click on a patient row to see full details here</div>"
955
+ )
956
+
957
+ # Request identified patients button
958
+ with gr.Row():
959
+ request_btn = gr.Button("📋 Request Identified Patient List", variant="secondary")
960
+
961
+ # Wire up matching
962
+ match_btn.click(
963
+ fn=match_patients,
964
+ inputs=[clinical_space_input, boilerplate_input, top_k_check_slider, eligibility_threshold_slider],
965
+ outputs=[results_df, bottom_line_output]
966
+ )
967
+
968
+ results_df.select(
969
+ fn=get_patient_details,
970
+ inputs=[results_df],
971
+ outputs=[patient_details]
972
+ )
973
+
974
+ request_btn.click(
975
+ fn=request_identified_patients,
976
+ inputs=[],
977
+ outputs=[]
978
+ )
979
+
980
+ # ============= TAB 2: PATIENT DATABASE =============
981
+ with gr.Tab("2️⃣ Patient Database"):
982
+ gr.Markdown("### 📊 Patient Database Management")
983
+
984
+ with gr.Row():
985
+ with gr.Column():
986
+ gr.Markdown("#### Load Pre-embedded Patients (Fast)")
987
+ preembed_prefix = gr.Textbox(
988
+ label="Pre-embedded Prefix",
989
+ placeholder="patient_embeddings",
990
+ value=(getattr(config, 'PREEMBEDDED_PATIENTS', '') or "") if HAS_CONFIG else ""
991
+ )
992
+ preembed_btn = gr.Button("Load Pre-embedded", variant="secondary")
993
+
994
+ with gr.Column():
995
+ gr.Markdown("#### Upload & Embed New Database")
996
+ patient_file = gr.File(
997
+ label="Upload Patient Database (Parquet/CSV/Excel)",
998
+ file_types=[".parquet", ".csv", ".xlsx", ".xls"]
999
+ )
1000
+ patient_upload_btn = gr.Button("Process & Embed", variant="secondary")
1001
+
1002
+ patient_status = gr.Textbox(
1003
+ label="Status",
1004
+ interactive=False,
1005
+ value=state.auto_load_status.get("patients", "No patients loaded")
1006
+ )
1007
+
1008
+ patient_preview = gr.Dataframe(
1009
+ label="Patient Preview (first 10)",
1010
+ value=state.patient_preview_df,
1011
+ wrap=True
1012
+ )
1013
+
1014
+ preembed_btn.click(
1015
+ fn=load_preembedded_patients,
1016
+ inputs=[preembed_prefix],
1017
+ outputs=[patient_status, patient_preview]
1018
+ )
1019
+
1020
+ patient_upload_btn.click(
1021
+ fn=load_and_embed_patients,
1022
+ inputs=[patient_file],
1023
+ outputs=[patient_status, patient_preview]
1024
+ )
1025
+
1026
+ # ============= TAB 3: MODEL CONFIGURATION =============
1027
+ with gr.Tab("3️⃣ Model Configuration"):
1028
+ gr.Markdown("### 🧠 Model Management")
1029
+
1030
+ status_msg = """
1031
+ **Config file detected** - Models will auto-load on startup.
1032
+ """ if HAS_CONFIG else """
1033
+ **No config file found** - Please load models manually below.
1034
+ """
1035
+ gr.Markdown(status_msg)
1036
+
1037
+ with gr.Group():
1038
+ with gr.Row():
1039
+ with gr.Column():
1040
+ embedder_input = gr.Textbox(
1041
+ label="Embedder Model",
1042
+ placeholder="Qwen/Qwen3-Embedding-0.6B",
1043
+ value=config.MODEL_CONFIG.get("embedder", "") if HAS_CONFIG else ""
1044
+ )
1045
+ embedder_btn = gr.Button("Load Embedder")
1046
+ embedder_status = gr.Textbox(
1047
+ label="Status",
1048
+ interactive=False,
1049
+ value=state.auto_load_status.get("embedder", ""),
1050
+ elem_classes=["model-status"]
1051
+ )
1052
+ embedder_warning = gr.Textbox(visible=False)
1053
+
1054
+ with gr.Column():
1055
+ trial_checker_input = gr.Textbox(
1056
+ label="Trial Checker Model",
1057
+ placeholder="answerdotai/ModernBERT-large",
1058
+ value=config.MODEL_CONFIG.get("trial_checker", "") if HAS_CONFIG else ""
1059
+ )
1060
+ trial_checker_btn = gr.Button("Load Trial Checker")
1061
+ trial_checker_status = gr.Textbox(
1062
+ label="Status",
1063
+ interactive=False,
1064
+ value=state.auto_load_status.get("trial_checker", ""),
1065
+ elem_classes=["model-status"]
1066
+ )
1067
+
1068
+ with gr.Row():
1069
+ with gr.Column(scale=1):
1070
+ boilerplate_checker_input = gr.Textbox(
1071
+ label="Boilerplate Checker Model",
1072
+ placeholder="answerdotai/ModernBERT-large",
1073
+ value=config.MODEL_CONFIG.get("boilerplate_checker", "") if HAS_CONFIG else ""
1074
+ )
1075
+ boilerplate_checker_btn = gr.Button("Load Boilerplate Checker")
1076
+ boilerplate_checker_status = gr.Textbox(
1077
+ label="Status",
1078
+ interactive=False,
1079
+ value=state.auto_load_status.get("boilerplate_checker", ""),
1080
+ elem_classes=["model-status"]
1081
+ )
1082
+ with gr.Column(scale=1):
1083
+ pass
1084
+
1085
+ # Wire up model loading
1086
+ embedder_btn.click(
1087
+ fn=load_embedder_model,
1088
+ inputs=[embedder_input],
1089
+ outputs=[embedder_status, gr.Textbox(visible=False), embedder_warning]
1090
+ )
1091
+ trial_checker_btn.click(
1092
+ fn=load_trial_checker,
1093
+ inputs=[trial_checker_input],
1094
+ outputs=[trial_checker_status, gr.Textbox(visible=False)]
1095
+ )
1096
+ boilerplate_checker_btn.click(
1097
+ fn=load_boilerplate_checker,
1098
+ inputs=[boilerplate_checker_input],
1099
+ outputs=[boilerplate_checker_status, gr.Textbox(visible=False)]
1100
+ )
1101
+
1102
+ return demo
1103
+
1104
+
1105
+ # ============================================================================
1106
+ # MAIN
1107
+ # ============================================================================
1108
+
1109
+ if __name__ == "__main__":
1110
+ print(f"Device: {state.device}")
1111
+ print(f"GPU Available: {torch.cuda.is_available()}")
1112
+ if torch.cuda.is_available():
1113
+ print(f"GPU Count: {torch.cuda.device_count()}")
1114
+
1115
+ # Auto-load models from config if available
1116
+ if HAS_CONFIG:
1117
+ auto_load_models_from_config()
1118
+
1119
+ # Auto-load patients after embedder is ready
1120
+ if state.embedder_model is not None or (hasattr(config, 'PREEMBEDDED_PATIENTS') and config.PREEMBEDDED_PATIENTS):
1121
+ auto_load_patients_from_config()
1122
+
1123
+ demo = create_interface()
1124
+ demo.launch(
1125
+ server_name="0.0.0.0",
1126
+ server_port=7861,
1127
+ share=False
1128
+ )
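
Note on the ranking step in match_patients above: because both the patient vectors and the query vector are encoded with normalize_embeddings=True, the np.dot call yields cosine similarities directly. A minimal, self-contained sketch of that retrieval step using toy vectors (placeholders, not real model outputs):

import numpy as np

# Toy stand-ins for L2-normalized embedder outputs (unit vectors).
patient_embeddings = np.array([[0.6, 0.8], [1.0, 0.0], [0.0, 1.0]], dtype=np.float32)
query_embedding = np.array([[0.8, 0.6]], dtype=np.float32)

# Dot product of unit vectors equals cosine similarity.
similarities = np.dot(patient_embeddings, query_embedding.T).squeeze()

# Highest-similarity patients first, same argsort idiom as match_patients.
ranked_indices = np.argsort(similarities)[::-1]
print(ranked_indices, similarities[ranked_indices])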
config.py ADDED
@@ -0,0 +1,105 @@
1
+ # Configuration for Patient Matching Pipeline
2
+ #
3
+ # Edit the values below to set your default models and patient database.
4
+ # Models will auto-load on application startup.
5
+
6
+ # ============================================================================
7
+ # MODEL PATHS - Set your default models here
8
+ # ============================================================================
9
+
10
+ # Set to None to skip auto-loading, or provide model path/HuggingFace ID
11
+ MODEL_CONFIG = {
12
+ # Sentence transformer for embedding patient summaries and clinical spaces
13
+ "embedder": "ksg-dfci/TrialSpace-1225", # e.g., "Qwen/Qwen3-Embedding-0.6B" or "./reranker_round2.model"
14
+
15
+ # ModernBERT classifier for eligibility prediction
16
+ "trial_checker": "ksg-dfci/TrialChecker-1225", # e.g., "answerdotai/ModernBERT-large" or "./modernbert-trial-checker"
17
+
18
+ # ModernBERT classifier for boilerplate exclusion prediction
19
+ "boilerplate_checker": "ksg-dfci/BoilerplateChecker-1225", # e.g., "answerdotai/ModernBERT-large" or "./modernbert-boilerplate-checker"
20
+ }
21
+
22
+ # Example configuration with base models:
23
+ # MODEL_CONFIG = {
24
+ # "embedder": "Qwen/Qwen3-Embedding-0.6B",
25
+ # "trial_checker": "answerdotai/ModernBERT-large",
26
+ # "boilerplate_checker": "answerdotai/ModernBERT-large",
27
+ # }
28
+
29
+ # Example configuration with fine-tuned models:
30
+ # MODEL_CONFIG = {
31
+ # "embedder": "./reranker_round2.model",
32
+ # "trial_checker": "./modernbert-trial-checker",
33
+ # "boilerplate_checker": "./modernbert-boilerplate-checker",
34
+ # }
35
+
36
+ # ============================================================================
37
+ # DEFAULT PATIENT DATABASE
38
+ # ============================================================================
39
+
40
+ # Path to default patient database parquet file
41
+ # Required columns: patient_id, patient_summary
42
+ # Optional columns: patient_boilerplate (for boilerplate checking)
43
+ # Will auto-load and embed when embedder model is ready
44
+ # Set to None to disable auto-loading
45
+ #DEFAULT_PATIENT_DB = "./synthetic_patient_summary_sample.parquet" # e.g., "./patients.parquet" or "./patient_summaries.parquet"
46
+
47
+ # Path to pre-embedded patient database (faster loading)
48
+ #
49
+ # NEW FORMAT (recommended): Single parquet file with embedding column
50
+ # - Created by: python preembed_patients.py --output patient_embeddings.parquet
51
+ # - Contains all patient data + patient_embedding column (list of floats)
52
+ # - Compatible with Hugging Face datasets
53
+ # - Example: PREEMBEDDED_PATIENTS = "synthetic_patient_embeddings.parquet"
54
+ #
55
+ # LEGACY FORMAT (still supported): Prefix for pkl/npy/json files
56
+ # - Created by old version of preembed_patients.py
57
+ # - Files: {prefix}_data.pkl, {prefix}_vectors.npy, {prefix}_metadata.json
58
+ # - Example: PREEMBEDDED_PATIENTS = "synthetic_patient_embeddings"
59
+ #
60
+ PREEMBEDDED_PATIENTS = "https://huggingface.co/datasets/ksg-dfci/mmai-synthetic/resolve/main/synthetic_patient_embeddings.parquet" # e.g., "patient_embeddings.parquet" or "patient_embeddings" (legacy)
61
+
62
+ # ============================================================================
63
+ # CLINICAL SPACE TEMPLATE
64
+ # ============================================================================
65
+
66
+ # Default template for the clinical space query input
67
+ # Users will fill in these fields to define their search criteria
68
+ CLINICAL_SPACE_TEMPLATE = """Age range allowed: any
69
+ Sex allowed: Any
70
+ Cancer type allowed: Non-small cell lung cancer
71
+ Histology allowed: Adenocarcinoma
72
+ Cancer burden allowed: Metastatic
73
+ Prior treatment required: No requirements
74
+ Prior treatment excluded: No requirements
75
+ Biomarkers required: EGFR mutant
76
+ Biomarkers excluded: None"""
77
+
78
+ BOILERPLATE_TEMPLATE = "Patients must have no history of pneumonitis"
79
+
80
+
81
+ # ============================================================================
82
+ # USAGE NOTES
83
+ # ============================================================================
84
+ #
85
+ # 1. Set the model paths above to your preferred models
86
+ # 2. Optionally set DEFAULT_PATIENT_DB or PREEMBEDDED_PATIENTS
87
+ # 3. Customize CLINICAL_SPACE_TEMPLATE if needed
88
+ # 4. Save this file
89
+ # 5. Run: python app.py
90
+ # 6. Models will load automatically on startup
91
+ #
92
+ # To create pre-embedded patients (new parquet format, recommended):
93
+ # python preembed_patients.py --patients patients.parquet --embedder path/to/embedder --output patient_embeddings.parquet
94
+ #
95
+ # To upload pre-embedded patients to Hugging Face Hub:
96
+ # from datasets import Dataset
97
+ # ds = Dataset.from_parquet("patient_embeddings.parquet")
98
+ # ds.push_to_hub("your-username/patient-embeddings")
99
+ #
100
+ # To load pre-embedded patients from Hugging Face Hub in your app:
101
+ # from datasets import load_dataset
102
+ # ds = load_dataset("your-username/patient-embeddings")
103
+ # ds['train'].to_parquet("local_patient_embeddings.parquet")
104
+ # # Then set PREEMBEDDED_PATIENTS = "local_patient_embeddings.parquet"
105
+ #
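
For clarity, the patient database referenced in this config only needs the columns documented above (patient_id and patient_summary required, patient_boilerplate optional). A sketch of building such a file with pandas; the file name and summaries below are placeholders, not part of this repository:

import pandas as pd

patients = pd.DataFrame({
    "patient_id": ["P001", "P002"],
    "patient_summary": [
        "62-year-old woman with metastatic EGFR-mutant lung adenocarcinoma, prior osimertinib.",
        "70-year-old man with stage III colon adenocarcinoma, status post FOLFOX.",
    ],
    # Optional: used by the boilerplate exclusion checker when non-empty.
    "patient_boilerplate": ["No history of pneumonitis. ECOG 1.", ""],
})
patients.to_parquet("patients.parquet", index=False)
# Then, e.g.: python preembed_patients.py --patients patients.parquet --embedder ksg-dfci/TrialSpace-1225 --output patient_embeddings.parquet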
preembed_patients.py ADDED
@@ -0,0 +1,392 @@
1
+ #!/usr/bin/env python3
2
+ # -*- coding: utf-8 -*-
3
+
4
+ """
5
+ Pre-embed Patient Summaries Script
6
+
7
+ This script pre-processes and embeds a patient database,
8
+ saving the results to a single Parquet file for faster loading
9
+ in the main application and compatibility with Hugging Face datasets.
10
+
11
+ Usage:
12
+ python preembed_patients.py --patients ../v20_public_data/patient_summaries_and_their_spaces.parquet --embedder ksg-dfci/TrialSpace-1225 --output synthetic_patient_embeddings.parquet --gpus 0,1 --patient-boilerplate-col patient_boilerplate_text --patient-id-col pseudo_mrn
13
+
14
+
15
+ This will create:
16
+ - synthetic_patient_embeddings.parquet: Patient dataframe with embedding vectors as a column
17
+
18
+ The parquet file contains:
19
+ - All original patient columns (patient_id, patient_summary, patient_boilerplate, etc.)
20
+ - patient_embedding: The embedding vector for each patient (stored as list of floats)
21
+ - Metadata stored in parquet file metadata (embedder model, creation date, etc.)
22
+
23
+ To upload to Hugging Face:
24
+ from datasets import Dataset
25
+ ds = Dataset.from_parquet("synthetic_patient_embeddings.parquet")
26
+ ds.push_to_hub("your-username/patient-embeddings")
27
+ """
28
+
29
+ import argparse
30
+ import pandas as pd
31
+ import numpy as np
32
+ import torch
33
+ import json
34
+ import pyarrow as pa
35
+ import pyarrow.parquet as pq
36
+ from pathlib import Path
37
+ from datetime import datetime
38
+ from typing import Tuple, List
39
+ from sentence_transformers import SentenceTransformer
40
+ from transformers import AutoTokenizer
41
+
42
+
43
+ def truncate_text(text: str, tokenizer, max_tokens: int = 1500) -> str:
44
+ """Truncate text to a maximum number of tokens."""
45
+ return tokenizer.decode(
46
+ tokenizer.encode(text, add_special_tokens=True, truncation=True, max_length=max_tokens),
47
+ skip_special_tokens=True
48
+ )
49
+
50
+
51
+ def load_patients(file_path: str, patient_id_col: str = 'patient_id', patient_boilerplate_col: str = 'patient_boilerplate') -> pd.DataFrame:
52
+ """Load patients from parquet file."""
53
+ print(f"\n{'='*70}")
54
+ print(f"Loading patient database from: {file_path}")
55
+ print(f"{'='*70}")
56
+
57
+ if file_path.endswith('.parquet'):
58
+ df = pd.read_parquet(file_path)
59
+ elif file_path.endswith('.csv'):
60
+ df = pd.read_csv(file_path)
61
+ elif file_path.endswith(('.xlsx', '.xls')):
62
+ df = pd.read_excel(file_path)
63
+ else:
64
+ raise ValueError("Unsupported file format. Use Parquet, CSV, or Excel.")
65
+
66
+ # Check required columns
67
+ required_cols = [patient_id_col, 'patient_summary']
68
+ missing = [col for col in required_cols if col not in df.columns]
69
+ if missing:
70
+ raise ValueError(f"Missing required columns: {', '.join(missing)}")
71
+
72
+ # Rename patient_id column to standard name if different
73
+ if patient_id_col != 'patient_id':
74
+ df = df.rename(columns={patient_id_col: 'patient_id'})
75
+ print(f" Renamed column '{patient_id_col}' to 'patient_id'")
76
+
77
+ print(f"✓ Loaded {len(df)} patients")
78
+ print(f" Columns: {', '.join(df.columns.tolist())}")
79
+
80
+ # Clean data
81
+ original_count = len(df)
82
+ df = df[~df['patient_summary'].isnull()].copy()
83
+ df = df[df['patient_summary'].str.strip().str.len() > 0].copy()
84
+
85
+ # Handle boilerplate column
86
+ if patient_boilerplate_col and patient_boilerplate_col in df.columns:
87
+ if patient_boilerplate_col != 'patient_boilerplate':
88
+ df = df.rename(columns={patient_boilerplate_col: 'patient_boilerplate'})
89
+ print(f" Renamed column '{patient_boilerplate_col}' to 'patient_boilerplate'")
90
+ df['patient_boilerplate'] = df['patient_boilerplate'].fillna('')
91
+ non_empty_bp = (df['patient_boilerplate'].str.strip().str.len() > 0).sum()
92
+ print(f" ✓ Found patient_boilerplate column: {non_empty_bp}/{len(df)} patients have boilerplate text")
93
+ else:
94
+ df['patient_boilerplate'] = ''
95
+ if patient_boilerplate_col:
96
+ print(f" ⚠ Column '{patient_boilerplate_col}' not found - patient_boilerplate will be empty")
97
+ else:
98
+ print(f" ○ No boilerplate column specified - patient_boilerplate will be empty")
99
+
100
+ if len(df) < original_count:
101
+ print(f" ⚠ Removed {original_count - len(df)} patients with missing/empty 'patient_summary'")
102
+
103
+ return df
104
+
105
+
106
+ def embed_patients(df: pd.DataFrame, embedder_path: str, device: str = None, gpus: list = None) -> Tuple[np.ndarray, str]:
+     """Embed patient summaries using the specified embedder model.
+
+     Args:
+         df: DataFrame with patient data
+         embedder_path: Path to embedder model
+         device: Single device string (e.g., 'cuda:0', 'cpu') - used if gpus not specified
+         gpus: List of GPU indices for multi-GPU parallel processing (e.g., [0, 1, 2, 3])
+     """
+     print(f"\n{'='*70}")
+     print(f"Loading embedder model: {embedder_path}")
+     print(f"{'='*70}")
+
+     # Determine device configuration
+     use_multi_gpu = gpus is not None and len(gpus) > 1
+
+     if use_multi_gpu:
+         target_devices = [f"cuda:{gpu}" for gpu in gpus]
+         print(f"Multi-GPU mode: {target_devices}")
+         # Load model on CPU first for multi-process pool
+         embedder_model = SentenceTransformer(embedder_path, device='cpu', trust_remote_code=True)
+     else:
+         if gpus is not None and len(gpus) == 1:
+             device = f"cuda:{gpus[0]}"
+         elif device is None:
+             device = "cuda" if torch.cuda.is_available() else "cpu"
+         print(f"Device: {device}")
+         embedder_model = SentenceTransformer(embedder_path, device=device, trust_remote_code=True)
+
+     embedder_tokenizer = AutoTokenizer.from_pretrained(embedder_path, trust_remote_code=True)
+
+     print(f"✓ Embedder loaded")
+
+     # Set the instruction prompt; ignored if the loaded model does not expose prompt configuration
+     try:
+         embedder_model.prompts['query'] = (
+             "Instruct: Given a cancer patient summary, retrieve clinical trial options "
+             "that are reasonable for that patient; or, given a clinical trial option, "
+             "retrieve cancer patients who are reasonable candidates for that trial."
+         )
+     except Exception:
+         pass
+
+     # Allow long summaries; ignored if the model does not support overriding max_seq_length
+     try:
+         embedder_model.max_seq_length = 2500
+     except Exception:
+         pass
+
+ print(f"\n{'='*70}")
155
+ print(f"Embedding {len(df)} patient summaries")
156
+ print(f"{'='*70}")
157
+
158
+ # Prepare texts for embedding
159
+ df['patient_summary_trunc'] = df['patient_summary'].apply(
160
+ lambda x: truncate_text(str(x), embedder_tokenizer, max_tokens=1500)
161
+ )
162
+
163
+ # Add instruction prefix
164
+ prefix = (
165
+ "Instruct: Given a cancer patient summary, retrieve clinical trial options "
166
+ "that are reasonable for that patient; or, given a clinical trial option, "
167
+ "retrieve cancer patients who are reasonable candidates for that trial. "
168
+ )
169
+ texts_to_embed = [prefix + txt for txt in df['patient_summary_trunc'].tolist()]
170
+
171
+ print(f" Text length stats:")
172
+ print(f" Mean: {np.mean([len(t) for t in texts_to_embed]):.0f} chars")
173
+ print(f" Max: {max([len(t) for t in texts_to_embed])} chars")
174
+
175
+ # Embed with progress bar
176
+ if use_multi_gpu:
177
+ print(f" Starting multi-GPU pool on {target_devices}...")
178
+ pool = embedder_model.start_multi_process_pool(target_devices=target_devices)
179
+
180
+ try:
181
+ embeddings_np = embedder_model.encode_multi_process(
182
+ texts_to_embed,
183
+ pool,
184
+ batch_size=64,
185
+ normalize_embeddings=True,
186
+ )
187
+ finally:
188
+ embedder_model.stop_multi_process_pool(pool)
189
+ else:
190
+ with torch.no_grad():
191
+ embeddings = embedder_model.encode(
192
+ texts_to_embed,
193
+ batch_size=64,
194
+ convert_to_tensor=True,
195
+ normalize_embeddings=True,
196
+ show_progress_bar=True,
197
+ prompt='query'
198
+ )
199
+ embeddings_np = embeddings.cpu().numpy()
200
+
201
+ print(f"✓ Embedding complete")
202
+ print(f" Shape: {embeddings_np.shape}")
203
+ print(f" Dtype: {embeddings_np.dtype}")
204
+
205
+ return embeddings_np, embedder_path
206
+
207
+
208
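+ # Note: load_patients() and embed_patients() can also be called directly from Python
+ # rather than via the CLI in main() below - an illustrative sketch with hypothetical paths:
+ #     df = load_patients("patients.parquet", "patient_id", "patient_boilerplate")
+ #     embeddings, model_path = embed_patients(df, "models/embedder", gpus=[0, 1])
+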
+ def save_embeddings(df: pd.DataFrame, embeddings: np.ndarray, output_path: str, embedder_path: str, gpus: list = None):
+     """Save patient data with embeddings to a single Parquet file.
+
+     The embeddings are stored as a column of lists, which is compatible with
+     Hugging Face datasets and PyArrow.
+     """
+     print(f"\n{'='*70}")
+     print(f"Saving to: {output_path}")
+     print(f"{'='*70}")
+
+     # Ensure output path ends with .parquet
+     if not output_path.endswith('.parquet'):
+         output_path = f"{output_path}.parquet"
+
+     output_dir = Path(output_path).parent
+     if str(output_dir) and str(output_dir) != '.':
+         output_dir.mkdir(parents=True, exist_ok=True)
+
+     # Add embeddings as a column (convert numpy arrays to lists for parquet compatibility)
+     df_out = df.copy()
+     df_out['patient_embedding'] = [emb.tolist() for emb in embeddings]
+
+     # Create metadata dictionary
+     metadata = {
+         "created_at": datetime.now().isoformat(),
+         "embedder_model": embedder_path,
+         "num_patients": str(len(df)),
+         "embedding_dim": str(embeddings.shape[1]),
+         "embedding_dtype": str(embeddings.dtype),
+         "normalized": "true",
+         "gpus_used": str(gpus) if gpus else "single device",
+         "format_version": "2.0",  # Version indicator for the new format
+     }
+
+     # Convert DataFrame to PyArrow Table
+     table = pa.Table.from_pandas(df_out)
+
+     # Add metadata to the table schema
+     existing_metadata = table.schema.metadata or {}
+     existing_metadata[b'patient_embedding_metadata'] = json.dumps(metadata).encode('utf-8')
+     table = table.replace_schema_metadata(existing_metadata)
+
+     # Write to parquet
+     pq.write_table(table, output_path)
+
+     file_size_mb = Path(output_path).stat().st_size / 1024 / 1024
+     print(f"✓ Saved parquet file: {output_path}")
+     print(f"  Size: {file_size_mb:.2f} MB")
+     print(f"  Columns: {', '.join(df_out.columns.tolist())}")
+     print(f"  Embedding column: patient_embedding (dim={embeddings.shape[1]})")
+
+     print(f"\n{'='*70}")
+     print(f"PRE-EMBEDDING COMPLETE")
+     print(f"{'='*70}")
+     print(f"\nTo use these pre-embedded patients in your app:")
+     print(f"1. Update config.py with:")
+     print(f"   PREEMBEDDED_PATIENTS = '{output_path}'")
+     print(f"2. Restart the application")
+     print(f"\nThe app will automatically load these embeddings on startup!")
+     print(f"\nTo upload to Hugging Face Hub:")
+     print(f"  from datasets import Dataset")
+     print(f"  ds = Dataset.from_parquet('{output_path}')")
+     print(f"  ds.push_to_hub('your-username/patient-embeddings')")
+
+
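+ # Illustrative sketch, not used by this script: one way a downstream consumer could read
+ # a file written by save_embeddings() back into memory. The column and metadata key names
+ # match what save_embeddings() writes; the float32 cast is an assumption about how the
+ # embeddings will be consumed.
+ def load_preembedded_patients(path: str):
+     """Load a parquet file produced by save_embeddings() and rebuild the embedding matrix."""
+     table = pq.read_table(path)
+     meta_bytes = (table.schema.metadata or {}).get(b'patient_embedding_metadata', b'{}')
+     metadata = json.loads(meta_bytes.decode('utf-8'))
+     df = table.to_pandas()
+     embeddings = np.array(df['patient_embedding'].tolist(), dtype=np.float32)
+     return df, embeddings, metadata
+
+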
+ def main():
+     parser = argparse.ArgumentParser(
+         description="Pre-embed patient summaries for faster loading",
+         formatter_class=argparse.RawDescriptionHelpFormatter,
+         epilog="""
+ Examples:
+   python preembed_patients.py --patients data/patients.parquet --embedder models/embedder --output embeddings/patient_embeddings.parquet
+   python preembed_patients.py --patients patients.csv --embedder Qwen/Qwen3-Embedding-0.6B --output patient_embeddings.parquet --device cuda
+   python preembed_patients.py --patients data.parquet --embedder models/embedder --output out.parquet --patient-id-col mrn
+   python preembed_patients.py --patients data.parquet --embedder models/embedder --output out.parquet --gpus 0,1,2,3
+   python preembed_patients.py --patients data.parquet --embedder models/embedder --output out.parquet --patient-boilerplate-col boilerplate_summary
+
+ Hugging Face Upload:
+   After creating the parquet file, you can upload to Hugging Face Hub:
+     from datasets import Dataset
+     ds = Dataset.from_parquet("patient_embeddings.parquet")
+     ds.push_to_hub("your-username/patient-embeddings")
+ """
+     )
+
+     parser.add_argument(
+         '--patients',
+         type=str,
+         required=True,
+         help='Path to patient database (Parquet, CSV, or Excel). Required columns: patient_summary and the patient ID column (default: patient_id, or specify with --patient-id-col)'
+     )
+
+     parser.add_argument(
+         '--embedder',
+         type=str,
+         required=True,
+         help='Path to embedder model or HuggingFace model name'
+     )
+
+     parser.add_argument(
+         '--output',
+         type=str,
+         required=True,
+         help='Output path for the parquet file (e.g., "patient_embeddings.parquet")'
+     )
+
+     parser.add_argument(
+         '--device',
+         type=str,
+         default=None,
+         help='Device to use for embedding (default: auto-detect). Examples: cuda, cuda:0, cuda:3, cpu. Ignored if --gpus is specified.'
+     )
+
+     parser.add_argument(
+         '--patient-id-col',
+         type=str,
+         default='patient_id',
+         help='Name of the patient ID column in the input file (default: patient_id)'
+     )
+
+     parser.add_argument(
+         '--patient-boilerplate-col',
+         type=str,
+         default='patient_boilerplate',
+         help='Name of the patient boilerplate column in the input file (default: patient_boilerplate). Set to empty string to skip.'
+     )
+
+     parser.add_argument(
+         '--gpus',
+         type=str,
+         default=None,
+         help='Comma-separated list of GPU indices for multi-GPU parallel processing (e.g., "0,1,2,3"). Overrides --device if specified.'
+     )
+
+     args = parser.parse_args()
+
+     # Parse GPU list if provided
+     gpu_list = None
+     if args.gpus:
+         try:
+             gpu_list = [int(g.strip()) for g in args.gpus.split(',')]
+         except ValueError:
+             print(f"✗ ERROR: Invalid GPU list format: {args.gpus}")
+             print("  Use comma-separated integers, e.g., '0,1,2,3'")
+             return 1
+
+     print(f"\n{'='*70}")
+     print(f"PATIENT SUMMARY PRE-EMBEDDING SCRIPT")
+     print(f"{'='*70}")
+     print(f"Patient Database:  {args.patients}")
+     print(f"Embedder Model:    {args.embedder}")
+     print(f"Output File:       {args.output}")
+     print(f"Patient ID Col:    {args.patient_id_col}")
+     print(f"Boilerplate Col:   {args.patient_boilerplate_col or '(none)'}")
+     if gpu_list:
+         print(f"GPUs:              {gpu_list} (multi-GPU mode)")
+     elif args.device:
+         print(f"Device:            {args.device}")
+     else:
+         print(f"Device:            auto-detect")
+     print(f"{'='*70}\n")
+
+     try:
+         # Load patients
+         df = load_patients(args.patients, args.patient_id_col, args.patient_boilerplate_col)
+
+         # Embed patients
+         embeddings, embedder_path = embed_patients(df, args.embedder, args.device, gpu_list)
+
+         # Save everything to a single parquet file
+         save_embeddings(df, embeddings, args.output, embedder_path, gpu_list)
+
+         print(f"\n✓ SUCCESS!")
+
+     except Exception as e:
+         print(f"\n✗ ERROR: {str(e)}")
+         import traceback
+         traceback.print_exc()
+         return 1
+
+     return 0
+
+
+ if __name__ == "__main__":
+     exit(main())