kenlkehl committed on
Commit
2ed7323
·
verified ·
1 Parent(s): 3a186e7

Upload 8 files

Browse files
config.py ADDED
@@ -0,0 +1,69 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
# Configuration for Clinical Trial Matching Pipeline.
#
# Edit the values below to choose the default models and the trial database;
# everything configured here is loaded automatically when the app starts.

# ---------------------------------------------------------------------------
# Model paths
# ---------------------------------------------------------------------------
# Each entry is either a local filesystem path or a HuggingFace model ID.
# Set an entry to None to skip auto-loading that model.
MODEL_CONFIG = {
    # TinyBERT tagger: extracts relevant excerpts from clinical notes.
    # (e.g. "prajjwal1/bert-tiny" or "./auto-tiny-bert-tagger")
    "tagger": "/ksg/kehl_mm_data/meta/2024/v17/v17_models/auto-tiny-bert-tagger",

    # Sentence transformer: embeds patient summaries and trials.
    # (e.g. "Qwen/Qwen3-Embedding-0.6B" or "./reranker_round2.model")
    "embedder": "/ksg/kehl_mm_data/meta/2024/v17/v17_models/reranker_round2.model",

    # Large language model used to summarize the patient history.
    # (e.g. "microsoft/Phi-3-mini-4k-instruct" or "openai/gpt-oss-120b")
    "llm": "meta-llama/Llama-3.2-3B-Instruct",

    # ModernBERT classifier: trial eligibility prediction.
    # (e.g. "answerdotai/ModernBERT-large" or "./modernbert-trial-checker")
    "trial_checker": "/ksg/kehl_mm_data/meta/2024/v17/v17_models/modernbert-trial-checker",

    # ModernBERT classifier: boilerplate-exclusion prediction.
    # (e.g. "answerdotai/ModernBERT-large" or "./modernbert-boilerplate-checker")
    "boilerplate_checker": "/ksg/kehl_mm_data/meta/2024/v17/v17_models/modernbert-boilerplate-checker",
}

# Example: base (untrained) models.
# MODEL_CONFIG = {
#     "tagger": "prajjwal1/bert-tiny",
#     "embedder": "Qwen/Qwen3-Embedding-0.6B",
#     "llm": "microsoft/Phi-3-mini-4k-instruct",
#     "trial_checker": "answerdotai/ModernBERT-large",
#     "boilerplate_checker": "answerdotai/ModernBERT-large",
# }

# Example: locally fine-tuned models.
# MODEL_CONFIG = {
#     "tagger": "./auto-tiny-bert-tagger",
#     "embedder": "./reranker_round2.model",
#     "llm": "/data/models/gpt-oss-120b",
#     "trial_checker": "./modernbert-trial-checker",
#     "boilerplate_checker": "./modernbert-boilerplate-checker",
# }

# ---------------------------------------------------------------------------
# Default trial database
# ---------------------------------------------------------------------------
# CSV/Excel file auto-loaded and embedded once the embedder model is ready.
# Set to None to disable auto-loading. (e.g. "./my_trials.csv")
DEFAULT_TRIAL_DB = "/data1/ken/meta/2024/v17b/trial_space_lineitems.csv"

# Prefix of pre-computed trial embeddings produced by preembed_trials.py.
PREEMBEDDED_TRIALS = "trial_embeddings"

# ---------------------------------------------------------------------------
# Usage
# ---------------------------------------------------------------------------
# 1. Point the model paths above at your preferred models.
# 2. Optionally point DEFAULT_TRIAL_DB at your trial database file.
# 3. Save this file and run: python trial_matching_app.py
# 4. Models load automatically on startup.
#
# Different models can still be loaded manually through the web interface
# if you need to switch models during a session.
+ #
create_sample_data.py ADDED
@@ -0,0 +1,249 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #!/usr/bin/env python3
2
+ """
3
+ Generate sample data for testing the Clinical Trial Matching Pipeline
4
+ """
5
+
6
+ import pandas as pd
7
+ from datetime import datetime, timedelta
8
+
9
def create_sample_trials():
    """Create a sample trial database CSV.

    Writes ``sample_trials.csv`` to the current working directory and
    returns the dataframe.  The columns match what the matching pipeline
    expects: ``nct_id``, ``this_space`` (key eligibility criteria),
    ``trial_text`` (free-text description), and ``trial_boilerplate_text``
    (common exclusion criteria).
    """

    # Five hand-written oncology trials spanning different tumor types and
    # biomarkers.  Continuation lines of the multiline strings sit at
    # column 0 so the CSV text carries no stray leading indentation.
    trials = [
        {
            'nct_id': 'NCT12345678',
            'this_space': '''Metastatic non-small cell lung cancer (NSCLC) with EGFR exon 19 deletion or L858R mutation
Prior treatment: At least one prior platinum-based chemotherapy regimen
ECOG performance status: 0-2
Measurable disease per RECIST v1.1
Adequate organ function''',
            'trial_text': '''Phase III randomized study of osimertinib versus platinum-based chemotherapy in patients with
EGFR-mutated metastatic NSCLC who have progressed on first-line EGFR TKI therapy. Primary endpoint is progression-free
survival. Secondary endpoints include overall survival, objective response rate, and quality of life.''',
            'trial_boilerplate_text': '''No active brain metastases requiring immediate intervention
No prior treatment with third-generation EGFR TKIs
No interstitial lung disease or pneumonitis
No congestive heart failure NYHA class III-IV
No HIV, hepatitis B, or hepatitis C infection'''
        },
        {
            'nct_id': 'NCT23456789',
            'this_space': '''HER2-positive metastatic breast cancer
Prior treatment: Trastuzumab and pertuzumab in any setting
ECOG performance status: 0-1
Brain metastases allowed if treated and stable
LVEF ≥50%''',
            'trial_text': '''Phase II study of trastuzumab deruxtecan in HER2-positive metastatic breast cancer patients
who have received prior trastuzumab and pertuzumab. Primary endpoint is objective response rate. Key secondary endpoints
include duration of response, progression-free survival, and safety.''',
            'trial_boilerplate_text': '''No history of pneumonitis or interstitial lung disease
No concurrent cardiac dysfunction
No active hepatitis B or C infection
No pregnancy or breastfeeding'''
        },
        {
            'nct_id': 'NCT34567890',
            'this_space': '''Advanced melanoma with BRAF V600E or V600K mutation
Treatment-naive for metastatic disease (adjuvant therapy allowed if completed >6 months prior)
ECOG performance status: 0-1
No active autoimmune disease requiring systemic therapy
Adequate bone marrow, hepatic, and renal function''',
            'trial_text': '''Phase III randomized trial comparing dabrafenib plus trametinib versus vemurafenib monotherapy
in previously untreated BRAF-mutant metastatic melanoma. Primary endpoint is overall survival. Secondary endpoints include
progression-free survival, response rate, and toxicity.''',
            'trial_boilerplate_text': '''No prior systemic therapy for metastatic melanoma
No active brain metastases (treated and stable brain metastases allowed)
No history of inflammatory bowel disease
No significant cardiac disease
No HIV infection on antiretroviral therapy'''
        },
        {
            'nct_id': 'NCT45678901',
            'this_space': '''Microsatellite instability-high (MSI-H) or mismatch repair deficient (dMMR) advanced solid tumors
Progressive disease on or after prior standard therapy
ECOG performance status: 0-2
Measurable disease per RECIST v1.1
No prior checkpoint inhibitor therapy''',
            'trial_text': '''Phase II basket study of pembrolizumab in patients with MSI-H/dMMR advanced solid tumors.
Primary endpoint is objective response rate by tumor type. Secondary endpoints include duration of response,
progression-free survival, and overall survival.''',
            'trial_boilerplate_text': '''No active autoimmune disease requiring systemic therapy
No history of severe immune-related adverse events
No active pneumonitis or interstitial lung disease
No concurrent systemic corticosteroids (>10mg prednisone equivalent daily)
No HIV, hepatitis B, or hepatitis C infection'''
        },
        {
            'nct_id': 'NCT56789012',
            'this_space': '''Advanced or metastatic renal cell carcinoma (RCC), clear cell histology
No prior systemic therapy for advanced disease
Intermediate or poor risk per IMDC criteria
ECOG performance status: 0-1
Measurable disease per RECIST v1.1''',
            'trial_text': '''Phase III randomized study of cabozantinib plus nivolumab versus sunitinib in previously
untreated advanced RCC. Primary endpoint is progression-free survival. Secondary endpoints include overall survival,
objective response rate, and safety.''',
            'trial_boilerplate_text': '''No prior systemic therapy for metastatic RCC
No active brain metastases
No history of bowel perforation or fistula
No poorly controlled hypertension
No active hepatitis B or C infection
No significant cardiovascular disease'''
        }
    ]

    # Persist as CSV for the Gradio app to pick up.
    df = pd.DataFrame(trials)
    df.to_csv('sample_trials.csv', index=False)
    print(f"✓ Created sample_trials.csv with {len(df)} trials")
    return df
99
+
100
def create_sample_patient_notes():
    """Create sample patient clinical notes CSV.

    Writes ``sample_patient_notes.csv`` (columns: ``date``, ``text``,
    ``note_type``) and returns the dataframe.  The notes trace a single
    synthetic patient's course: EGFR-mutant NSCLC diagnosed at stage IIIA,
    progressing to stage IV, responding to then progressing on osimertinib.
    """

    # All note dates are offsets from a fixed anchor so the timeline is
    # deterministic.
    base_date = datetime(2023, 1, 1)

    notes = [
        {
            'date': base_date,
            'text': 'Patient is a 67-year-old male with a 40 pack-year smoking history presenting with cough and weight loss. CT chest shows a 4.5 cm right upper lobe mass with mediastinal lymphadenopathy.',
            'note_type': 'clinical_note'
        },
        {
            'date': base_date + timedelta(days=7),
            'text': 'CT-guided lung biopsy performed. Pathology shows adenocarcinoma, moderately differentiated.',
            'note_type': 'pathology_report'
        },
        {
            'date': base_date + timedelta(days=14),
            'text': 'PET/CT shows FDG-avid right upper lobe mass (SUVmax 12.3), right hilar nodes (SUVmax 8.7), and mediastinal nodes (SUVmax 9.2). No distant metastatic disease identified.',
            'note_type': 'imaging_report'
        },
        {
            'date': base_date + timedelta(days=21),
            # Multiline NGS report; continuation lines sit at column 0 so the
            # stored text carries no stray indentation.
            'text': '''Next-generation sequencing (NGS) performed on lung biopsy specimen.
Results: EGFR exon 19 deletion (L747_A750delinsP) detected.
Other findings: TP53 p.R273H mutation, MYC amplification (copy number gain).
PD-L1 expression by immunohistochemistry: 75% tumor proportion score.
TMB: 4 mutations/Mb (low).
No ALK, ROS1, BRAF, MET, RET, or KRAS alterations detected.''',
            'note_type': 'ngs_report'
        },
        {
            'date': base_date + timedelta(days=28),
            'text': 'Mediastinoscopy with biopsy of station 4R and 7 lymph nodes. Pathology confirms metastatic adenocarcinoma. Clinical stage: T2aN2M0, stage IIIA.',
            'note_type': 'pathology_report'
        },
        {
            'date': base_date + timedelta(days=42),
            'text': 'Patient underwent concurrent chemoradiation with carboplatin/pemetrexed and 60 Gy radiation to primary tumor and mediastinum. Tolerated well with grade 2 esophagitis.',
            'note_type': 'clinical_note'
        },
        {
            'date': base_date + timedelta(days=112),
            'text': 'Post-treatment CT chest shows near-complete response of primary tumor (now 1.2 cm) and resolution of lymphadenopathy. Started consolidation durvalumab.',
            'note_type': 'imaging_report'
        },
        {
            'date': base_date + timedelta(days=280),
            'text': 'Surveillance CT shows new liver lesions (segment 6 and 7, largest 2.3 cm) and increase in size of lung primary to 3.1 cm. Progression of disease.',
            'note_type': 'imaging_report'
        },
        {
            'date': base_date + timedelta(days=287),
            'text': 'Patient now has metastatic NSCLC (stage IV). ECOG performance status 1. Discussed treatment options. Given EGFR mutation, recommend EGFR TKI therapy.',
            'note_type': 'clinical_note'
        },
        {
            'date': base_date + timedelta(days=294),
            'text': 'Started osimertinib 80 mg daily for EGFR-mutant metastatic NSCLC.',
            'note_type': 'clinical_note'
        },
        {
            'date': base_date + timedelta(days=378),
            'text': 'Restaging CT shows partial response. Liver lesions decreased to 1.2 and 0.9 cm. Primary lung tumor stable at 2.8 cm. Tolerating osimertinib well with mild diarrhea and dry skin.',
            'note_type': 'imaging_report'
        },
        {
            'date': base_date + timedelta(days=560),
            'text': 'Patient reports increased fatigue and back pain over past 3 weeks.',
            'note_type': 'clinical_note'
        },
        {
            'date': base_date + timedelta(days=567),
            'text': '''CT chest/abdomen/pelvis shows:
- Progression of liver metastases (segment 6: 3.8 cm, previously 1.2 cm; segment 7: 2.9 cm, previously 0.9 cm)
- New liver lesions in segments 4 and 5
- Lung primary increased to 4.2 cm
- New small pleural effusion
Assessment: Progressive disease on osimertinib.''',
            'note_type': 'imaging_report'
        },
        {
            'date': base_date + timedelta(days=574),
            'text': 'MRI brain with contrast shows no brain metastases. Patient has progressive EGFR-mutant NSCLC after first-line osimertinib. ECOG PS 1. Discussing clinical trial options for second-line therapy.',
            'note_type': 'clinical_note'
        }
    ]

    # Persist as CSV for the Gradio app to pick up.
    df = pd.DataFrame(notes)
    df.to_csv('sample_patient_notes.csv', index=False)
    print(f"✓ Created sample_patient_notes.csv with {len(df)} notes")
    return df
192
+
193
def create_sample_patient_summary():
    """Create a sample patient summary text file.

    Writes ``sample_patient_summary.txt`` and returns the summary string.
    The summary is the pre-made counterpart to the notes produced by
    ``create_sample_patient_notes`` (same synthetic EGFR-mutant NSCLC
    patient), including a trailing "Boilerplate:" section used for
    boilerplate-exclusion checking.
    """

    # Continuation lines sit at column 0 so the written file carries no
    # stray indentation.  The '#' characters inside the string are part of
    # the summary text, not Python comments.
    summary = """Cancer type: Non-small cell lung cancer (NSCLC)
Histology: Adenocarcinoma, moderately differentiated
Stage at diagnosis: Stage IIIA (T2aN2M0)
Current extent: Metastatic (stage IV) with liver metastases

Biomarkers:
- EGFR exon 19 deletion (L747_A750delinsP)
- TP53 p.R273H mutation
- MYC amplification
- PD-L1 75% TPS
- TMB: 4 mutations/Mb (low)

Treatment history:
# 1/28/2023 - 4/15/2023: Concurrent chemoradiation (carboplatin/pemetrexed with 60 Gy)
# 4/22/2023 - 10/5/2023: Consolidation durvalumab
# 10/19/2023 - present: Osimertinib 80 mg daily for metastatic disease

Disease course:
- Initial diagnosis: January 2023, stage IIIA
- Near-complete response to chemoradiation
- Progression to stage IV in September 2023 (liver metastases)
- Partial response to osimertinib
- Current progression on osimertinib (July 2024) after ~9 months of therapy

Current status:
- ECOG performance status: 1
- Progressive disease with liver metastases
- No brain metastases on recent MRI

Boilerplate:
No evidence of brain metastases (MRI brain 7/22/2024).
No history of pneumonitis, interstitial lung disease, congestive heart failure, HIV, or hepatitis infection documented.
Adequate performance status (ECOG 1).
"""

    with open('sample_patient_summary.txt', 'w') as f:
        f.write(summary)

    print(f"✓ Created sample_patient_summary.txt")
    return summary
236
+
237
+ if __name__ == "__main__":
238
+ print("Generating sample data for Clinical Trial Matching Pipeline...\n")
239
+
240
+ create_sample_trials()
241
+ create_sample_patient_notes()
242
+ create_sample_patient_summary()
243
+
244
+ print("\n✓ All sample files created successfully!")
245
+ print("\nFiles generated:")
246
+ print(" - sample_trials.csv (5 clinical trials)")
247
+ print(" - sample_patient_notes.csv (14 clinical notes)")
248
+ print(" - sample_patient_summary.txt (pre-made summary)")
249
+ print("\nYou can now use these files to test the Gradio application.")
launch.py ADDED
@@ -0,0 +1,122 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #!/usr/bin/env python3
2
+ """
3
+ Launch script for Clinical Trial Matching Pipeline
4
+
5
+ Checks dependencies and provides helpful startup information.
6
+ """
7
+
8
+ import sys
9
+ import subprocess
10
+ import importlib.util
11
+
12
def check_package(package_name, display_name=None):
    """Check whether *package_name* is importable.

    Returns a ``(is_installed, display_name)`` pair; the display name
    defaults to the import name when not given.
    """
    label = package_name if display_name is None else display_name
    found = importlib.util.find_spec(package_name) is not None
    return found, label
21
+
22
def check_dependencies():
    """Check that all required third-party packages are importable.

    Prints one status line per required and optional package.  When
    required packages are missing, prints the exact ``pip install``
    command needed.

    Returns:
        bool: True if every required package is installed, else False.
    """

    # (import name, display name, pip package name).  The import name and
    # the pip name differ for some packages — e.g. the module
    # `sentence_transformers` installs as `sentence-transformers` — so the
    # install hint must use the pip name, not the import name.
    required_packages = [
        ('gradio', 'gradio', 'gradio'),
        ('pandas', 'pandas', 'pandas'),
        ('numpy', 'numpy', 'numpy'),
        ('torch', 'PyTorch', 'torch'),
        ('transformers', 'transformers', 'transformers'),
        ('sentence_transformers', 'sentence-transformers', 'sentence-transformers'),
    ]

    optional_packages = [
        ('vllm', 'vLLM (for faster LLM inference)'),
    ]

    print("Checking dependencies...\n")

    missing = []  # pip names of required packages that are not installed
    for package, display, pip_name in required_packages:
        installed, name = check_package(package, display)
        status = "✓" if installed else "✗"
        print(f"  {status} {name}")
        if not installed:
            missing.append(pip_name)

    print("\nOptional packages:")
    for package, display in optional_packages:
        installed, name = check_package(package, display)
        # Optional packages get a hollow marker instead of a failure cross.
        status = "✓" if installed else "○"
        print(f"  {status} {name}")

    if missing:
        print(f"\n❌ Missing required packages: {', '.join(missing)}")
        print("\nInstall with:")
        print(f"  pip install {' '.join(missing)}")
        print("\nOr install all requirements:")
        print("  pip install -r requirements.txt")
        return False

    print("\n✓ All required dependencies installed!")
    return True
64
+
65
def check_cuda():
    """Report CUDA availability.

    Prints GPU details when CUDA is usable, or install guidance when it is
    not.  Returns True only when torch is importable and CUDA is available.
    """
    try:
        import torch
    except ImportError:
        return False

    if not torch.cuda.is_available():
        print("\n⚠️ CUDA not available - running on CPU")
        print("  For better performance, install PyTorch with CUDA:")
        print("  pip install torch torchvision torchaudio --index-url https://download.pytorch.org/whl/cu121")
        return False

    gpu_total = torch.cuda.device_count()
    print(f"\n🚀 CUDA available!")
    print(f"  GPU count: {gpu_total}")
    for idx in range(gpu_total):
        print(f"  GPU {idx}: {torch.cuda.get_device_name(idx)}")
    return True
82
+
83
def print_startup_info():
    """Print the startup banner with the URLs where the UI will be served."""
    banner = "=" * 70
    print("\n" + banner)
    print("Clinical Trial Matching Pipeline")
    print(banner)
    print("\nStarting Gradio web interface...")
    print("\nOnce started, the interface will be available at:")
    print("  Local: http://localhost:7860")
    print("  Network: http://0.0.0.0:7860")
    print("\nPress Ctrl+C to stop the server.")
    print("\n" + banner + "\n")
94
+
95
def main():
    """Entry point: verify the environment, then start the Gradio app."""

    # Abort early when required packages are missing.
    if not check_dependencies():
        sys.exit(1)

    # Informational only — CPU fallback is allowed.
    check_cuda()

    print_startup_info()

    try:
        # Importing the module launches the app as a side effect.
        import trial_matching_app
    except KeyboardInterrupt:
        print("\n\nShutting down gracefully...")
        sys.exit(0)
    except Exception as e:
        print(f"\n❌ Error launching application: {e}")
        import traceback
        traceback.print_exc()
        sys.exit(1)

if __name__ == "__main__":
    main()
preembed_trials.py ADDED
@@ -0,0 +1,261 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #!/usr/bin/env python3
2
+ # -*- coding: utf-8 -*-
3
+
4
+ """
5
+ Pre-embed Clinical Trials Script
6
+
7
+ This script pre-processes and embeds a clinical trial database,
8
+ saving the results to disk for faster loading in the main application.
9
+
10
+ Usage:
11
+ python preembed_trials.py --trials trials.csv --embedder path/to/embedder --output trial_embeddings
12
+ python preembed_trials.py --trials /data1/ken/meta/2024/v17b/trial_space_lineitems.csv --embedder /ksg/kehl_mm_data/meta/2024/v17/v17_models/reranker_round2.model --output trial_embeddings --device cuda:2
13
+
14
+
15
+ This will create:
16
+ - trial_embeddings_data.pkl: Trial dataframe
17
+ - trial_embeddings_vectors.npy: Embedding vectors
18
+ - trial_embeddings_metadata.json: Metadata about the embedding process
19
+ """
20
+
21
+ import argparse
22
+ import pandas as pd
23
+ import numpy as np
24
+ import torch
25
+ import json
26
+ import re
27
+ from pathlib import Path
28
+ from datetime import datetime
29
+ from typing import Tuple
30
+ from sentence_transformers import SentenceTransformer
31
+ from transformers import AutoTokenizer
32
+
33
def truncate_text(text: str, tokenizer, max_tokens: int = 1500) -> str:
    """Clip *text* so it spans at most *max_tokens* tokenizer tokens.

    Round-trips through the tokenizer: encode with truncation, then decode
    without special tokens, so the result is valid text at a token boundary.
    """
    token_ids = tokenizer.encode(
        text, add_special_tokens=True, truncation=True, max_length=max_tokens
    )
    return tokenizer.decode(token_ids, skip_special_tokens=True)
39
+
40
def load_trials(file_path: str) -> pd.DataFrame:
    """Load a trial database from a CSV or Excel file and clean it.

    Rows with a missing 'this_space' are dropped (they cannot be embedded),
    and missing boilerplate text is replaced with an empty string.

    Raises:
        ValueError: for unsupported file extensions or missing columns.
    """
    print(f"\n{'='*70}")
    print(f"Loading trial database from: {file_path}")
    print(f"{'='*70}")

    if file_path.endswith('.csv'):
        df = pd.read_csv(file_path)
    elif file_path.endswith(('.xlsx', '.xls')):
        df = pd.read_excel(file_path)
    else:
        raise ValueError("Unsupported file format. Use CSV or Excel.")

    # Fail fast when the schema is wrong.
    required_cols = ['nct_id', 'this_space', 'trial_text', 'trial_boilerplate_text']
    missing = [col for col in required_cols if col not in df.columns]
    if missing:
        raise ValueError(f"Missing required columns: {', '.join(missing)}")

    print(f"✓ Loaded {len(df)} trials")
    print(f"  Columns: {', '.join(df.columns.tolist())}")

    # Clean: drop rows we cannot embed, blank out missing boilerplate.
    before = len(df)
    df = df[df['this_space'].notna()].copy()
    df['trial_boilerplate_text'] = df['trial_boilerplate_text'].fillna('')

    dropped = before - len(df)
    if dropped:
        print(f"  ⚠ Removed {dropped} trials with missing 'this_space'")

    return df
71
+
72
def embed_trials(df: pd.DataFrame, embedder_path: str, device: str = None) -> Tuple[np.ndarray, str]:
    """Embed the 'this_space' text of every trial with a SentenceTransformer.

    Args:
        df: Trial dataframe; must contain a 'this_space' column.  A
            'this_space_trunc' column is added to *df* as a side effect.
        embedder_path: Local path or HuggingFace ID of the embedder model.
        device: Torch device string; auto-detects CUDA when None.

    Returns:
        Tuple of (normalized embedding matrix, the embedder path used).
    """
    print(f"\n{'='*70}")
    print(f"Loading embedder model: {embedder_path}")
    print(f"{'='*70}")

    if device is None:
        device = "cuda" if torch.cuda.is_available() else "cpu"

    print(f"Device: {device}")

    # Load embedder and its tokenizer (tokenizer used only for truncation).
    embedder_model = SentenceTransformer(embedder_path, device=device, trust_remote_code=True)
    embedder_tokenizer = AutoTokenizer.from_pretrained(embedder_path, trust_remote_code=True)

    print(f"✓ Embedder loaded")

    # Configure the instruction prompt and sequence length.  Some embedder
    # models/versions may not support these attributes, so failures are
    # tolerated — but only Exception is caught; a bare `except:` would also
    # swallow KeyboardInterrupt/SystemExit.
    try:
        embedder_model.prompts['query'] = (
            "Instruct: Given a cancer patient summary, retrieve clinical trial options "
            "that are reasonable for that patient; or, given a clinical trial option, "
            "retrieve cancer patients who are reasonable candidates for that trial."
        )
    except Exception:
        pass

    try:
        embedder_model.max_seq_length = 1500
    except Exception:
        pass

    print(f"\n{'='*70}")
    print(f"Embedding {len(df)} trials")
    print(f"{'='*70}")

    # Truncate each trial's eligibility text to the model's token window.
    df['this_space_trunc'] = df['this_space'].apply(
        lambda x: truncate_text(str(x), embedder_tokenizer, max_tokens=1500)
    )

    # Instruction prefix prepended to every text (mirrors the 'query'
    # prompt configured above).
    prefix = (
        "Instruct: Given a cancer patient summary, retrieve clinical trial options "
        "that are reasonable for that patient; or, given a clinical trial option, "
        "retrieve cancer patients who are reasonable candidates for that trial. "
    )
    texts_to_embed = [prefix + txt for txt in df['this_space_trunc'].tolist()]

    print(f"  Text length stats:")
    print(f"    Mean: {np.mean([len(t) for t in texts_to_embed]):.0f} chars")
    print(f"    Max: {max(len(t) for t in texts_to_embed)} chars")

    # Embed with a progress bar; no_grad avoids building autograd graphs.
    with torch.no_grad():
        embeddings = embedder_model.encode(
            texts_to_embed,
            batch_size=64,
            convert_to_tensor=True,
            normalize_embeddings=True,
            show_progress_bar=True,
            prompt='query'
        )

    embeddings_np = embeddings.cpu().numpy()

    print(f"✓ Embedding complete")
    print(f"  Shape: {embeddings_np.shape}")
    print(f"  Dtype: {embeddings_np.dtype}")

    return embeddings_np, embedder_path
143
+
144
def save_embeddings(df: pd.DataFrame, embeddings: np.ndarray, output_prefix: str, embedder_path: str):
    """Persist trial data, embedding vectors, and run metadata to disk.

    Writes three files sharing *output_prefix*: ``_data.pkl`` (dataframe),
    ``_vectors.npy`` (embeddings), and ``_metadata.json`` (run metadata).
    """
    print(f"\n{'='*70}")
    print(f"Saving to: {output_prefix}_*")
    print(f"{'='*70}")

    # Ensure the destination directory exists.
    Path(output_prefix).parent.mkdir(parents=True, exist_ok=True)

    def _report(label, path):
        # Confirm a file was written and show its on-disk size.
        print(f"✓ Saved {label}: {path}")
        print(f"  Size: {Path(path).stat().st_size / 1024 / 1024:.2f} MB")

    df_file = f"{output_prefix}_data.pkl"
    df.to_pickle(df_file)
    _report("trial dataframe", df_file)

    vectors_file = f"{output_prefix}_vectors.npy"
    np.save(vectors_file, embeddings)
    _report("embeddings", vectors_file)

    # Keep the metadata readable: preview at most the first 10 NCT IDs.
    if len(df) > 10:
        id_preview = df['nct_id'].tolist()[:10] + ["..."]
    else:
        id_preview = df['nct_id'].tolist()

    metadata = {
        "created_at": datetime.now().isoformat(),
        "embedder_model": embedder_path,
        "num_trials": len(df),
        "embedding_dim": embeddings.shape[1],
        "nct_ids": id_preview,
        "embedding_dtype": str(embeddings.dtype),
        "normalized": True,
    }

    metadata_file = f"{output_prefix}_metadata.json"
    with open(metadata_file, 'w') as f:
        json.dump(metadata, f, indent=2)
    print(f"✓ Saved metadata: {metadata_file}")

    print(f"\n{'='*70}")
    print(f"PRE-EMBEDDING COMPLETE")
    print(f"{'='*70}")
    print(f"\nTo use these pre-embedded trials in your app:")
    print(f"1. Update config.py with:")
    print(f"   PREEMBEDDED_TRIALS = '{output_prefix}'")
    print(f"2. Restart the application")
    print(f"\nThe app will automatically load these embeddings on startup!")
189
+
190
def main():
    """Parse CLI arguments and run the load → embed → save pipeline.

    Returns:
        int: 0 on success, 1 on failure (suitable as a process exit code).
    """
    parser = argparse.ArgumentParser(
        description="Pre-embed clinical trials for faster loading",
        formatter_class=argparse.RawDescriptionHelpFormatter,
        epilog="""
Examples:
  python preembed_trials.py --trials data/trials.csv --embedder models/embedder --output embeddings/trial_embeddings
  python preembed_trials.py --trials trials.xlsx --embedder Qwen/Qwen3-Embedding-0.6B --output trial_embeddings --device cuda
        """
    )

    parser.add_argument(
        '--trials',
        type=str,
        required=True,
        help='Path to trial database (CSV or Excel)'
    )

    parser.add_argument(
        '--embedder',
        type=str,
        required=True,
        help='Path to embedder model or HuggingFace model name'
    )

    parser.add_argument(
        '--output',
        type=str,
        required=True,
        help='Output prefix for saved files (e.g., "trial_embeddings" will create trial_embeddings_data.pkl, etc.)'
    )

    parser.add_argument(
        '--device',
        type=str,
        default=None,
        # Unconstrained on purpose: allows device strings like "cuda:2".
        help='Device to use for embedding (default: auto-detect)'
    )

    args = parser.parse_args()

    print(f"\n{'='*70}")
    print(f"CLINICAL TRIAL PRE-EMBEDDING SCRIPT")
    print(f"{'='*70}")
    print(f"Trial Database: {args.trials}")
    print(f"Embedder Model: {args.embedder}")
    print(f"Output Prefix: {args.output}")
    print(f"{'='*70}\n")

    try:
        # Load, embed, then persist — each step prints its own progress.
        df = load_trials(args.trials)
        embeddings, embedder_path = embed_trials(df, args.embedder, args.device)
        save_embeddings(df, embeddings, args.output, embedder_path)
    except Exception as e:
        print(f"\n✗ ERROR: {str(e)}")
        import traceback
        traceback.print_exc()
        return 1

    print(f"\n✓ SUCCESS!")
    return 0

if __name__ == "__main__":
    # `raise SystemExit(...)` instead of `exit(...)`: the `exit` builtin is
    # installed by the `site` module and is absent under `python -S`.
    raise SystemExit(main())
requirements.txt ADDED
@@ -0,0 +1,15 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ gradio>=4.0.0
2
+ pandas>=2.0.0
3
+ numpy>=1.24.0
4
+ torch>=2.0.0
5
+ transformers>=4.35.0
6
+ sentence-transformers>=2.2.0
7
+ openpyxl>=3.1.0
8
+ xlrd>=2.0.0
9
+
10
+ # Optional but recommended for faster LLM inference
11
+ vllm>=0.5.0
12
+
13
+ # For CUDA support (if using GPU)
14
+ # Install PyTorch with CUDA separately:
15
+ # pip install torch torchvision torchaudio --index-url https://download.pytorch.org/whl/cu121
trial_embeddings_data.pkl ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:c733ad315bba227bca0aecf3f7e4947a9383f312520ec87c785d6110afb826a3
3
+ size 362844655
trial_embeddings_metadata.json ADDED
@@ -0,0 +1,21 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "created_at": "2025-10-28T16:17:07.203636",
3
+ "embedder_model": "/ksg/kehl_mm_data/meta/2024/v17/v17_models/reranker_round2.model",
4
+ "num_trials": 39266,
5
+ "embedding_dim": 1024,
6
+ "nct_ids": [
7
+ "NCT00001160",
8
+ "NCT00001160",
9
+ "NCT00001186",
10
+ "NCT00001186",
11
+ "NCT00001238",
12
+ "NCT00001238",
13
+ "NCT00001238",
14
+ "NCT00001238",
15
+ "NCT00001238",
16
+ "NCT00001238",
17
+ "..."
18
+ ],
19
+ "embedding_dtype": "float32",
20
+ "normalized": true
21
+ }
trial_embeddings_vectors.npy ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:42f6e5f85a93a8b95ec04ea59c57d4be73d042fb483522c144a7a6f0720c4379
3
+ size 160833664