monajm36 committed on
Commit
a5114c2
·
unverified ·
1 Parent(s): e6890e6

Update ohca_inference.py

Browse files
Files changed (1) hide show
  1. src/ohca_inference.py +379 -158
src/ohca_inference.py CHANGED
@@ -1,5 +1,5 @@
1
- # OHCA Inference Module
2
- # Apply pre-trained OHCA classifier to new datasets
3
 
4
  import pandas as pd
5
  import numpy as np
@@ -8,6 +8,7 @@ import torch.nn.functional as F
8
  from torch.utils.data import DataLoader, Dataset
9
  from tqdm import tqdm
10
  import os
 
11
  from transformers import AutoTokenizer, AutoModelForSequenceClassification
12
  import warnings
13
  warnings.filterwarnings('ignore')
@@ -17,7 +18,7 @@ warnings.filterwarnings('ignore')
17
  # =============================================================================
18
 
19
  DEVICE = torch.device("cuda" if torch.cuda.is_available() else "cpu")
20
- print(f"Inference Module - Using device: {DEVICE}")
21
 
22
  # =============================================================================
23
  # INFERENCE DATASET CLASS
@@ -61,12 +62,58 @@ class OHCAInferenceDataset(Dataset):
61
  }
62
 
63
  # =============================================================================
64
- # MODEL LOADING FUNCTIONS
65
  # =============================================================================
66
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
67
  def load_ohca_model(model_path):
68
  """
69
- Load pre-trained OHCA model and tokenizer
70
 
71
  Args:
72
  model_path: Path to saved model directory
@@ -74,7 +121,7 @@ def load_ohca_model(model_path):
74
  Returns:
75
  tuple: (model, tokenizer)
76
  """
77
- print(f"📂 Loading OHCA model from: {model_path}")
78
 
79
  if not os.path.exists(model_path):
80
  raise FileNotFoundError(f"Model not found at: {model_path}")
@@ -86,7 +133,7 @@ def load_ohca_model(model_path):
86
  model = model.to(DEVICE)
87
  model.eval()
88
 
89
- print("Model loaded successfully")
90
  print(f" Device: {DEVICE}")
91
  print(f" Model type: {type(model).__name__}")
92
 
@@ -96,26 +143,53 @@ def load_ohca_model(model_path):
96
  raise RuntimeError(f"Failed to load model: {str(e)}")
97
 
98
  # =============================================================================
99
- # INFERENCE FUNCTIONS
100
  # =============================================================================
101
 
102
- def run_inference(model, tokenizer, inference_df, batch_size=16,
103
- output_path=None, probability_threshold=0.5):
104
  """
105
- Run OHCA inference on new data
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
106
 
107
  Args:
108
  model: Pre-trained OHCA model
109
  tokenizer: Model tokenizer
110
  inference_df: DataFrame with columns ['hadm_id', 'clean_text']
 
111
  batch_size: Batch size for inference
112
  output_path: Optional path to save results CSV
113
- probability_threshold: Threshold for binary predictions
114
 
115
  Returns:
116
- DataFrame: Results with probabilities and predictions
117
  """
118
- print(f"🔍 Running OHCA inference on {len(inference_df):,} cases...")
 
119
 
120
  # Validate input data
121
  required_cols = ['hadm_id', 'clean_text']
@@ -126,7 +200,7 @@ def run_inference(model, tokenizer, inference_df, batch_size=16,
126
  # Remove any rows with missing data
127
  clean_df = inference_df.dropna(subset=required_cols).copy()
128
  if len(clean_df) < len(inference_df):
129
- print(f"⚠️ Removed {len(inference_df) - len(clean_df)} rows with missing data")
130
 
131
  # Create dataset and dataloader
132
  inference_dataset = OHCAInferenceDataset(clean_df, tokenizer)
@@ -158,47 +232,47 @@ def run_inference(model, tokenizer, inference_df, batch_size=16,
158
  'ohca_probability': all_probabilities
159
  })
160
 
161
- # Add predictions with different thresholds
 
 
 
 
162
  results_df['prediction_050'] = (results_df['ohca_probability'] >= 0.5).astype(int)
163
  results_df['prediction_070'] = (results_df['ohca_probability'] >= 0.7).astype(int)
164
  results_df['prediction_090'] = (results_df['ohca_probability'] >= 0.9).astype(int)
165
- results_df['prediction_custom'] = (results_df['ohca_probability'] >= probability_threshold).astype(int)
166
-
167
- # Add confidence categories
168
- def categorize_confidence(prob):
169
- if prob >= 0.9:
170
- return "Very High"
171
- elif prob >= 0.7:
172
- return "High"
173
- elif prob >= 0.3:
174
- return "Medium"
175
- elif prob >= 0.1:
176
- return "Low"
177
- else:
178
- return "Very Low"
179
 
180
- results_df['confidence_category'] = results_df['ohca_probability'].apply(categorize_confidence)
 
 
 
 
 
 
 
 
 
 
 
181
 
182
  # Sort by probability (highest first)
183
  results_df = results_df.sort_values('ohca_probability', ascending=False).reset_index(drop=True)
184
 
185
- # Print summary
186
- print(f"\n📊 Inference Results Summary:")
187
  print(f" Total cases processed: {len(results_df):,}")
188
  print(f" Mean OHCA probability: {results_df['ohca_probability'].mean():.4f}")
189
- print(f" Max OHCA probability: {results_df['ohca_probability'].max():.3f}")
190
- print(f" Min OHCA probability: {results_df['ohca_probability'].min():.3f}")
191
-
192
- # Probability distribution
193
- print(f"\n🎯 Probability Distribution:")
194
- thresholds = [0.9, 0.8, 0.7, 0.6, 0.5, 0.3, 0.1]
195
- for threshold in thresholds:
196
- count = (results_df['ohca_probability'] >= threshold).sum()
197
  pct = count / len(results_df) * 100
198
- print(f" {threshold}: {count:,} cases ({pct:.2f}%)")
199
 
200
- # Confidence categories
201
- print(f"\n📈 Confidence Distribution:")
202
  conf_dist = results_df['confidence_category'].value_counts()
203
  for category, count in conf_dist.items():
204
  pct = count / len(results_df) * 100
@@ -206,98 +280,203 @@ def run_inference(model, tokenizer, inference_df, batch_size=16,
206
 
207
  # Save results if path provided
208
  if output_path:
 
 
209
  results_df.to_csv(output_path, index=False)
210
- print(f"\n💾 Results saved to: {output_path}")
211
 
212
  return results_df
213
 
214
- def get_high_confidence_cases(results_df, threshold=0.8, max_cases=100):
 
 
 
215
  """
216
- Extract high-confidence OHCA predictions for manual review
 
 
 
 
 
 
 
 
 
 
 
 
217
 
218
  Args:
219
- results_df: Results from run_inference()
220
- threshold: Minimum probability threshold
221
- max_cases: Maximum number of cases to return
222
 
223
  Returns:
224
- DataFrame: High-confidence cases sorted by probability
225
  """
226
- high_conf = results_df[results_df['ohca_probability'] >= threshold].copy()
227
- high_conf = high_conf.head(max_cases)
228
 
229
- print(f"🎯 Found {len(high_conf)} high-confidence cases (≥{threshold})")
 
230
 
231
- return high_conf
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
232
 
233
- def analyze_predictions(results_df, original_df=None):
234
  """
235
- Analyze prediction patterns and provide clinical insights
 
236
 
237
  Args:
238
- results_df: Results from run_inference()
239
- original_df: Optional original dataframe to merge with results
 
240
 
241
  Returns:
242
- dict: Analysis summary
243
  """
244
- print("📋 Analyzing prediction patterns...")
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
245
 
246
  # Basic statistics
247
  stats = {
248
  'total_cases': len(results_df),
 
249
  'mean_probability': results_df['ohca_probability'].mean(),
250
  'std_probability': results_df['ohca_probability'].std(),
251
  'median_probability': results_df['ohca_probability'].median(),
 
252
  'high_confidence_cases': (results_df['ohca_probability'] >= 0.8).sum(),
253
- 'predicted_ohca_050': results_df['prediction_050'].sum(),
254
- 'predicted_ohca_070': results_df['prediction_070'].sum(),
255
- 'predicted_ohca_090': results_df['prediction_090'].sum(),
256
  }
257
 
258
- # Confidence distribution
259
- conf_dist = results_df['confidence_category'].value_counts().to_dict()
 
 
 
260
 
261
- # Print analysis
262
- print(f"\n📊 Prediction Analysis:")
 
 
 
 
 
 
263
  print(f" Total cases: {stats['total_cases']:,}")
 
264
  print(f" Mean probability: {stats['mean_probability']:.4f}")
265
- print(f" Predicted OHCA (≥0.5): {stats['predicted_ohca_050']:,}")
266
- print(f" High confidence (≥0.8): {stats['high_confidence_cases']:,}")
267
 
268
- if stats['predicted_ohca_050'] > 0:
269
- prevalence = stats['predicted_ohca_050'] / stats['total_cases'] * 100
270
  print(f" Estimated OHCA prevalence: {prevalence:.2f}%")
271
 
272
- # Clinical recommendations
273
- print(f"\n🏥 Clinical Recommendations:")
274
- if stats['high_confidence_cases'] > 0:
275
- print(f" Priority review: {stats['high_confidence_cases']} high-confidence cases")
276
- if stats['predicted_ohca_070'] > 0:
277
- print(f" • Clinical review: {stats['predicted_ohca_070']} cases ≥0.7 probability")
278
 
279
- uncertain_cases = ((results_df['ohca_probability'] >= 0.3) &
280
- (results_df['ohca_probability'] < 0.7)).sum()
281
- if uncertain_cases > 0:
282
- print(f" • Manual review suggested: {uncertain_cases} uncertain cases")
 
 
283
 
284
  return {
285
  'statistics': stats,
 
286
  'confidence_distribution': conf_dist,
287
- 'high_confidence_cases': results_df[results_df['ohca_probability'] >= 0.8]
 
288
  }
289
 
290
  # =============================================================================
291
- # BATCH PROCESSING FUNCTIONS
292
  # =============================================================================
293
 
294
- def process_large_dataset(model_path, data_path, output_path,
295
- chunk_size=10000, batch_size=16):
296
  """
297
- Process large datasets in chunks to avoid memory issues
298
 
299
  Args:
300
- model_path: Path to trained model
301
  data_path: Path to input CSV file
302
  output_path: Path for output results
303
  chunk_size: Number of rows per chunk
@@ -306,10 +485,10 @@ def process_large_dataset(model_path, data_path, output_path,
306
  Returns:
307
  str: Path to completed results file
308
  """
309
- print(f"🔄 Processing large dataset in chunks of {chunk_size:,}...")
310
 
311
- # Load model once
312
- model, tokenizer = load_ohca_model(model_path)
313
 
314
  # Read data in chunks
315
  chunk_results = []
@@ -317,11 +496,11 @@ def process_large_dataset(model_path, data_path, output_path,
317
 
318
  for chunk_df in pd.read_csv(data_path, chunksize=chunk_size):
319
  chunk_num += 1
320
- print(f"\n📦 Processing chunk {chunk_num} ({len(chunk_df):,} rows)...")
321
 
322
- # Run inference on chunk
323
- chunk_result = run_inference(
324
- model, tokenizer, chunk_df,
325
  batch_size=batch_size, output_path=None
326
  )
327
 
@@ -330,18 +509,24 @@ def process_large_dataset(model_path, data_path, output_path,
330
  # Save intermediate results
331
  temp_path = f"{output_path}.chunk_{chunk_num}.csv"
332
  chunk_result.to_csv(temp_path, index=False)
333
- print(f"💾 Chunk {chunk_num} saved to: {temp_path}")
334
 
335
  # Combine all chunks
336
- print(f"\n🔗 Combining {len(chunk_results)} chunks...")
337
  final_results = pd.concat(chunk_results, ignore_index=True)
338
 
339
  # Sort by probability and save
340
  final_results = final_results.sort_values('ohca_probability', ascending=False)
 
 
 
 
 
341
  final_results.to_csv(output_path, index=False)
342
 
343
- print(f"Complete results saved to: {output_path}")
344
- print(f"📊 Total cases processed: {len(final_results):,}")
 
345
 
346
  # Clean up intermediate files
347
  for i in range(1, chunk_num + 1):
@@ -351,60 +536,56 @@ def process_large_dataset(model_path, data_path, output_path,
351
 
352
  return output_path
353
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
354
  # =============================================================================
355
- # CONVENIENCE FUNCTIONS
356
  # =============================================================================
357
 
358
- def quick_inference(model_path, data_path, output_path=None):
359
- """
360
- Quick inference function for simple use cases
361
-
362
- Args:
363
- model_path: Path to trained model
364
- data_path: Path to input CSV (or DataFrame)
365
- output_path: Optional output path
366
-
367
- Returns:
368
- DataFrame: Inference results
369
- """
370
- print("🚀 Quick OHCA Inference")
371
-
372
- # Load model
373
- model, tokenizer = load_ohca_model(model_path)
374
-
375
- # Load data
376
- if isinstance(data_path, str):
377
- df = pd.read_csv(data_path)
378
- print(f"📂 Loaded {len(df):,} cases from {data_path}")
379
- else:
380
- df = data_path.copy()
381
- print(f"📊 Processing {len(df):,} cases from DataFrame")
382
-
383
- # Run inference
384
- results = run_inference(model, tokenizer, df, output_path=output_path)
385
-
386
- # Quick summary
387
- ohca_cases = (results['ohca_probability'] >= 0.5).sum()
388
- high_conf = (results['ohca_probability'] >= 0.8).sum()
389
-
390
- print(f"\n✅ Quick Summary:")
391
- print(f" Predicted OHCA cases: {ohca_cases:,}")
392
- print(f" High confidence: {high_conf:,}")
393
-
394
- return results
395
-
396
  def test_model_on_sample(model_path, sample_texts):
397
  """
398
- Test model on a few sample texts for quick validation
399
 
400
  Args:
401
  model_path: Path to trained model
402
  sample_texts: List of text strings or dict with hadm_id: text
403
 
404
  Returns:
405
- DataFrame: Test results
406
  """
407
- print("🧪 Testing model on sample texts...")
408
 
409
  # Prepare test data
410
  if isinstance(sample_texts, dict):
@@ -418,18 +599,28 @@ def test_model_on_sample(model_path, sample_texts):
418
  for i, text in enumerate(sample_texts, 1)
419
  ])
420
 
421
- # Run inference
422
- model, tokenizer = load_ohca_model(model_path)
423
- results = run_inference(model, tokenizer, test_df, output_path=None)
 
 
 
 
 
 
 
 
 
424
 
425
- # Print results
426
- print(f"\n🔍 Test Results:")
427
  for _, row in results.iterrows():
428
  prob = row['ohca_probability']
429
- pred = "OHCA" if prob >= 0.5 else "Non-OHCA"
430
  conf = row['confidence_category']
 
431
 
432
- print(f" {row['hadm_id']}: {pred} (prob={prob:.3f}, {conf})")
433
 
434
  # Show text preview
435
  text_preview = test_df[test_df['hadm_id']==row['hadm_id']]['clean_text'].iloc[0]
@@ -438,18 +629,48 @@ def test_model_on_sample(model_path, sample_texts):
438
 
439
  return results
440
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
441
  # =============================================================================
442
  # EXAMPLE USAGE
443
  # =============================================================================
444
 
445
  if __name__ == "__main__":
446
- print("OHCA Inference Module")
447
- print("="*25)
448
- print("This module provides inference capabilities for pre-trained OHCA models.")
449
- print("\nMain functions:")
450
- print(" load_ohca_model() - Load pre-trained model")
451
- print(" run_inference() - Run inference on new data")
452
- print(" quick_inference() - Simple inference function")
453
- print(" process_large_dataset() - Handle large datasets")
454
- print("• test_model_on_sample() - Test on sample texts")
455
- print("\nSee examples/ folder for detailed usage examples.")
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # OHCA Inference Module v3.0 - Improved with Optimal Threshold Support
2
+ # Apply pre-trained OHCA classifier to new datasets using optimal thresholds
3
 
4
  import pandas as pd
5
  import numpy as np
 
8
  from torch.utils.data import DataLoader, Dataset
9
  from tqdm import tqdm
10
  import os
11
+ import json
12
  from transformers import AutoTokenizer, AutoModelForSequenceClassification
13
  import warnings
14
  warnings.filterwarnings('ignore')
 
18
  # =============================================================================
19
 
20
  DEVICE = torch.device("cuda" if torch.cuda.is_available() else "cpu")
21
+ print(f"Inference Module v3.0 - Using device: {DEVICE}")
22
 
23
  # =============================================================================
24
  # INFERENCE DATASET CLASS
 
62
  }
63
 
64
  # =============================================================================
65
+ # IMPROVED MODEL LOADING FUNCTIONS
66
  # =============================================================================
67
 
68
def load_ohca_model_with_metadata(model_path):
    """
    Load a pre-trained OHCA classifier, its tokenizer, and any saved
    metadata — notably the optimal decision threshold chosen on the
    validation set.

    Args:
        model_path: Path to saved model directory

    Returns:
        tuple: (model, tokenizer, optimal_threshold, metadata)

    Raises:
        FileNotFoundError: If model_path does not exist.
        RuntimeError: If the model or tokenizer fails to load.
    """
    print(f"Loading OHCA model with metadata from: {model_path}")

    if not os.path.exists(model_path):
        raise FileNotFoundError(f"Model not found at: {model_path}")

    try:
        # Bring up tokenizer and model, then pin the model to the module
        # device in eval mode (inference only — no dropout/grad).
        tokenizer = AutoTokenizer.from_pretrained(model_path)
        model = AutoModelForSequenceClassification.from_pretrained(model_path).to(DEVICE)
        model.eval()

        # Metadata (if shipped with the model) carries the tuned threshold.
        metadata_path = os.path.join(model_path, 'model_metadata.json')
        if os.path.exists(metadata_path):
            with open(metadata_path, 'r') as handle:
                metadata = json.load(handle)
            optimal_threshold = metadata.get('optimal_threshold', 0.5)
            print(f"Loaded optimal threshold: {optimal_threshold:.3f}")
            print(f"Model version: {metadata.get('model_version', 'unknown')}")
        else:
            # Older model directories have no metadata file; fall back to 0.5.
            print("Warning: No metadata file found. Using default threshold of 0.5")
            optimal_threshold = 0.5
            metadata = {'optimal_threshold': 0.5, 'model_version': 'legacy'}

        print("Model loaded successfully")
        print(f" Device: {DEVICE}")
        print(f" Model type: {type(model).__name__}")
        print(f" Optimal threshold: {optimal_threshold:.3f}")

        return model, tokenizer, optimal_threshold, metadata

    except Exception as e:
        raise RuntimeError(f"Failed to load model: {str(e)}")
113
+
114
  def load_ohca_model(model_path):
115
  """
116
+ Backward compatibility function - loads model without metadata
117
 
118
  Args:
119
  model_path: Path to saved model directory
 
121
  Returns:
122
  tuple: (model, tokenizer)
123
  """
124
+ print(f"Loading OHCA model from: {model_path}")
125
 
126
  if not os.path.exists(model_path):
127
  raise FileNotFoundError(f"Model not found at: {model_path}")
 
133
  model = model.to(DEVICE)
134
  model.eval()
135
 
136
+ print("Model loaded successfully (legacy mode)")
137
  print(f" Device: {DEVICE}")
138
  print(f" Model type: {type(model).__name__}")
139
 
 
143
  raise RuntimeError(f"Failed to load model: {str(e)}")
144
 
145
  # =============================================================================
146
+ # IMPROVED INFERENCE FUNCTIONS
147
  # =============================================================================
148
 
149
def categorize_confidence_with_optimal_threshold(prob, optimal_threshold):
    """
    Map a probability score to a (confidence_category, clinical_priority)
    pair, with the model's optimal threshold serving as one cut point.

    Args:
        prob: Probability score from the classifier.
        optimal_threshold: Optimal threshold from training.

    Returns:
        tuple: (confidence_category, clinical_priority)
    """
    # Bands are evaluated highest-first, exactly mirroring the original
    # if/elif chain; the optimal threshold sits between 0.7 and 0.3.
    bands = (
        (0.9, ("Very High", "Immediate Review")),
        (0.7, ("High", "Priority Review")),
        (optimal_threshold, ("Medium-High", "Clinical Review")),
        (0.3, ("Medium", "Consider Review")),
        (0.1, ("Low", "Routine Processing")),
    )
    for cutoff, labels in bands:
        if prob >= cutoff:
            return labels
    return "Very Low", "Routine Processing"
172
+
173
+ def run_inference_with_optimal_threshold(model, tokenizer, inference_df,
174
+ optimal_threshold=0.5, batch_size=16,
175
+ output_path=None):
176
+ """
177
+ Run OHCA inference using the optimal threshold from training.
178
+ This addresses the data scientist's feedback about threshold consistency.
179
 
180
  Args:
181
  model: Pre-trained OHCA model
182
  tokenizer: Model tokenizer
183
  inference_df: DataFrame with columns ['hadm_id', 'clean_text']
184
+ optimal_threshold: Optimal threshold from validation set
185
  batch_size: Batch size for inference
186
  output_path: Optional path to save results CSV
 
187
 
188
  Returns:
189
+ DataFrame: Results with probabilities and predictions using optimal threshold
190
  """
191
+ print(f"Running OHCA inference on {len(inference_df):,} cases...")
192
+ print(f"Using optimal threshold: {optimal_threshold:.3f}")
193
 
194
  # Validate input data
195
  required_cols = ['hadm_id', 'clean_text']
 
200
  # Remove any rows with missing data
201
  clean_df = inference_df.dropna(subset=required_cols).copy()
202
  if len(clean_df) < len(inference_df):
203
+ print(f"Warning: Removed {len(inference_df) - len(clean_df)} rows with missing data")
204
 
205
  # Create dataset and dataloader
206
  inference_dataset = OHCAInferenceDataset(clean_df, tokenizer)
 
232
  'ohca_probability': all_probabilities
233
  })
234
 
235
+ # Add prediction using optimal threshold (primary prediction)
236
+ results_df['ohca_prediction'] = (results_df['ohca_probability'] >= optimal_threshold).astype(int)
237
+ results_df['optimal_threshold_used'] = optimal_threshold
238
+
239
+ # Add legacy predictions for comparison
240
  results_df['prediction_050'] = (results_df['ohca_probability'] >= 0.5).astype(int)
241
  results_df['prediction_070'] = (results_df['ohca_probability'] >= 0.7).astype(int)
242
  results_df['prediction_090'] = (results_df['ohca_probability'] >= 0.9).astype(int)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
243
 
244
+ # Add improved confidence categories and clinical priorities
245
+ confidence_info = [categorize_confidence_with_optimal_threshold(prob, optimal_threshold)
246
+ for prob in results_df['ohca_probability']]
247
+ results_df['confidence_category'] = [info[0] for info in confidence_info]
248
+ results_df['clinical_priority'] = [info[1] for info in confidence_info]
249
+
250
+ # Add interpretation column
251
+ results_df['interpretation'] = results_df.apply(
252
+ lambda row: f"OHCA detected (p={row['ohca_probability']:.3f})"
253
+ if row['ohca_prediction'] == 1
254
+ else f"No OHCA (p={row['ohca_probability']:.3f})", axis=1
255
+ )
256
 
257
  # Sort by probability (highest first)
258
  results_df = results_df.sort_values('ohca_probability', ascending=False).reset_index(drop=True)
259
 
260
+ # Print improved summary
261
+ print(f"\nInference Results Summary:")
262
  print(f" Total cases processed: {len(results_df):,}")
263
  print(f" Mean OHCA probability: {results_df['ohca_probability'].mean():.4f}")
264
+ print(f" OHCA detected (optimal threshold): {results_df['ohca_prediction'].sum():,}")
265
+ print(f" Detection rate: {results_df['ohca_prediction'].mean()*100:.2f}%")
266
+
267
+ # Clinical priority distribution
268
+ print(f"\nClinical Priority Distribution:")
269
+ priority_dist = results_df['clinical_priority'].value_counts()
270
+ for priority, count in priority_dist.items():
 
271
  pct = count / len(results_df) * 100
272
+ print(f" {priority}: {count:,} cases ({pct:.1f}%)")
273
 
274
+ # Confidence distribution
275
+ print(f"\nConfidence Distribution:")
276
  conf_dist = results_df['confidence_category'].value_counts()
277
  for category, count in conf_dist.items():
278
  pct = count / len(results_df) * 100
 
280
 
281
  # Save results if path provided
282
  if output_path:
283
+ # Add metadata to the saved file
284
+ results_df['inference_date'] = pd.Timestamp.now().isoformat()
285
  results_df.to_csv(output_path, index=False)
286
+ print(f"\nResults saved to: {output_path}")
287
 
288
  return results_df
289
 
290
def run_inference(model, tokenizer, inference_df, batch_size=16,
                  output_path=None, probability_threshold=0.5):
    """
    Legacy inference entry point kept for backward compatibility.

    Delegates to run_inference_with_optimal_threshold(), using
    probability_threshold as the decision threshold.
    """
    print("Warning: Using legacy inference function. Consider upgrading to run_inference_with_optimal_threshold()")
    return run_inference_with_optimal_threshold(
        model,
        tokenizer,
        inference_df,
        optimal_threshold=probability_threshold,
        batch_size=batch_size,
        output_path=output_path,
    )
299
+
300
+ # =============================================================================
301
+ # IMPROVED CONVENIENCE FUNCTIONS
302
+ # =============================================================================
303
+
304
def quick_inference_with_optimal_threshold(model_path, data_path, output_path=None):
    """
    One-call inference helper: load a v3.0 model and score data with the
    optimal threshold stored in the model's metadata.

    Args:
        model_path: Path to trained model (must include metadata)
        data_path: Path to input CSV (or DataFrame)
        output_path: Optional output path

    Returns:
        DataFrame: Inference results using optimal threshold
    """
    print("Quick OHCA Inference v3.0 with Optimal Threshold")

    # Model + tokenizer + tuned threshold in one call.
    model, tokenizer, optimal_threshold, metadata = load_ohca_model_with_metadata(model_path)

    # Accept either a CSV path or an in-memory DataFrame.
    if isinstance(data_path, str):
        frame = pd.read_csv(data_path)
        print(f"Loaded {len(frame):,} cases from {data_path}")
    else:
        frame = data_path.copy()
        print(f"Processing {len(frame):,} cases from DataFrame")

    scored = run_inference_with_optimal_threshold(
        model, tokenizer, frame, optimal_threshold, output_path=output_path
    )

    # Headline numbers for the console summary.
    detected = scored['ohca_prediction'].sum()
    immediate = (scored['clinical_priority'] == 'Immediate Review').sum()
    to_review = (scored['clinical_priority'] == 'Priority Review').sum()

    print(f"\nEnhanced Summary:")
    print(f" OHCA detected (optimal threshold): {detected:,}")
    print(f" Immediate review needed: {immediate:,}")
    print(f" Priority review needed: {to_review:,}")
    print(f" Model version: {metadata.get('model_version', 'unknown')}")
    print(f" Optimal threshold used: {optimal_threshold:.3f}")

    return scored
348
 
349
def quick_inference(model_path, data_path, output_path=None):
    """
    Backward compatible quick inference entry point.

    Routes to the optimal-threshold pipeline when the model directory ships
    a metadata file; otherwise falls back to a fixed 0.5 threshold.

    Args:
        model_path: Path to trained model
        data_path: Path to input CSV (or DataFrame)
        output_path: Optional output path

    Returns:
        DataFrame: Inference results
    """
    print("Quick OHCA Inference")

    # A metadata file marks a v3.0 model carrying a stored optimal threshold.
    if os.path.exists(os.path.join(model_path, 'model_metadata.json')):
        print("Detected v3.0 model with metadata - using optimal threshold")
        return quick_inference_with_optimal_threshold(model_path, data_path, output_path)

    # Legacy path: no metadata, so score with the default 0.5 threshold.
    print("Detected legacy model - using default threshold 0.5")
    model, tokenizer = load_ohca_model(model_path)

    if isinstance(data_path, str):
        frame = pd.read_csv(data_path)
        print(f"Loaded {len(frame):,} cases from {data_path}")
    else:
        frame = data_path.copy()
        print(f"Processing {len(frame):,} cases from DataFrame")

    results = run_inference_with_optimal_threshold(
        model, tokenizer, frame, optimal_threshold=0.5, output_path=output_path
    )

    predicted = results['ohca_prediction'].sum()
    confident = (results['ohca_probability'] >= 0.8).sum()

    print(f"\nQuick Summary:")
    print(f" Predicted OHCA cases: {predicted:,}")
    print(f" High confidence: {confident:,}")

    return results
396
+
397
def analyze_predictions_enhanced(results_df):
    """
    Enhanced prediction analysis with optimal threshold insights.

    Args:
        results_df: Results DataFrame from inference. Must contain
            'ohca_probability'; 'ohca_prediction', 'prediction_050/070/090',
            'optimal_threshold_used', 'clinical_priority' and
            'confidence_category' are optional and default gracefully.

    Returns:
        dict: Summary with keys 'statistics',
            'clinical_priority_distribution', 'confidence_distribution',
            'optimal_threshold', and 'high_confidence_cases' (DataFrame of
            rows with probability >= 0.8).
    """
    print("Analyzing prediction patterns with optimal threshold insights...")

    optimal_threshold = results_df['optimal_threshold_used'].iloc[0] if 'optimal_threshold_used' in results_df.columns else 0.5

    def _column_sum(col):
        # BUG FIX: the previous results_df.get(col, []).sum() raised
        # AttributeError when the column was absent, because DataFrame.get
        # returns the plain-list default and a list has no .sum().
        return results_df[col].sum() if col in results_df.columns else 0

    # Basic statistics
    stats = {
        'total_cases': len(results_df),
        'optimal_threshold_used': optimal_threshold,
        'mean_probability': results_df['ohca_probability'].mean(),
        'std_probability': results_df['ohca_probability'].std(),
        'median_probability': results_df['ohca_probability'].median(),
        'ohca_detected_optimal': _column_sum('ohca_prediction'),
        'high_confidence_cases': (results_df['ohca_probability'] >= 0.8).sum(),
        'predicted_ohca_050': _column_sum('prediction_050'),
        'predicted_ohca_070': _column_sum('prediction_070'),
        'predicted_ohca_090': _column_sum('prediction_090'),
    }

    # Clinical priority distribution (optional column)
    if 'clinical_priority' in results_df.columns:
        priority_dist = results_df['clinical_priority'].value_counts().to_dict()
    else:
        priority_dist = {}

    # Confidence distribution (optional column)
    if 'confidence_category' in results_df.columns:
        conf_dist = results_df['confidence_category'].value_counts().to_dict()
    else:
        conf_dist = {}

    # Print enhanced analysis
    print(f"\nEnhanced Prediction Analysis:")
    print(f" Total cases: {stats['total_cases']:,}")
    print(f" Optimal threshold used: {stats['optimal_threshold_used']:.3f}")
    print(f" Mean probability: {stats['mean_probability']:.4f}")
    print(f" OHCA detected (optimal): {stats['ohca_detected_optimal']:,}")

    if stats['ohca_detected_optimal'] > 0:
        prevalence = stats['ohca_detected_optimal'] / stats['total_cases'] * 100
        print(f" Estimated OHCA prevalence: {prevalence:.2f}%")

    # Comparison with static thresholds
    print(f"\nThreshold Comparison:")
    print(f" Optimal threshold ({optimal_threshold:.3f}): {stats['ohca_detected_optimal']:,} cases")
    print(f" Static threshold (0.5): {stats['predicted_ohca_050']:,} cases")
    print(f" Static threshold (0.7): {stats['predicted_ohca_070']:,} cases")

    # Clinical recommendations
    print(f"\nClinical Recommendations:")
    if priority_dist:
        for priority, count in priority_dist.items():
            if count > 0:
                print(f" {priority}: {count:,} cases")

    return {
        'statistics': stats,
        'clinical_priority_distribution': priority_dist,
        'confidence_distribution': conf_dist,
        'optimal_threshold': optimal_threshold,
        'high_confidence_cases': results_df[results_df['ohca_probability'] >= 0.8] if len(results_df) > 0 else pd.DataFrame()
    }
468
 
469
  # =============================================================================
470
+ # ENHANCED BATCH PROCESSING
471
  # =============================================================================
472
 
473
def process_large_dataset_with_optimal_threshold(model_path, data_path, output_path,
                                                 chunk_size=10000, batch_size=16):
    """
    Process a large dataset in chunks using the optimal threshold stored
    in the model's metadata.

    The model is loaded once, then the input CSV is streamed chunk by chunk
    through inference. Each chunk is checkpointed to a temporary CSV so a
    crash mid-run does not lose completed work; the temporary files are
    removed after the combined result is written.

    Args:
        model_path: Path to trained model directory with model_metadata.json
        data_path: Path to input CSV file
        output_path: Path for the combined output results CSV
        chunk_size: Number of rows read per chunk
        batch_size: Inference batch size passed to the inference routine

    Returns:
        str: Path to the completed results file (== output_path)

    Raises:
        ValueError: If the input CSV yields no rows at all.
    """
    print(f"Processing large dataset in chunks of {chunk_size:,} with optimal threshold...")

    # Load model + metadata once; the optimal threshold is reused for every chunk.
    model, tokenizer, optimal_threshold, metadata = load_ohca_model_with_metadata(model_path)

    chunk_results = []
    chunk_num = 0

    # Stream the CSV so the full dataset never has to fit in memory at once.
    for chunk_df in pd.read_csv(data_path, chunksize=chunk_size):
        chunk_num += 1
        print(f"\nProcessing chunk {chunk_num} ({len(chunk_df):,} rows)...")

        # Run inference on this chunk with the model's optimal threshold.
        chunk_result = run_inference_with_optimal_threshold(
            model, tokenizer, chunk_df, optimal_threshold,
            batch_size=batch_size, output_path=None
        )
        chunk_results.append(chunk_result)

        # Checkpoint intermediate results so progress survives interruptions.
        temp_path = f"{output_path}.chunk_{chunk_num}.csv"
        chunk_result.to_csv(temp_path, index=False)
        print(f"Chunk {chunk_num} saved to: {temp_path}")

    # Guard the empty-input case explicitly: pd.concat([]) raises a cryptic
    # ValueError, so fail with an actionable message instead.
    if not chunk_results:
        raise ValueError(f"No data found in {data_path}; nothing to process.")

    # Combine all chunks into one frame, highest probability first.
    print(f"\nCombining {len(chunk_results)} chunks...")
    final_results = pd.concat(chunk_results, ignore_index=True)
    final_results = final_results.sort_values('ohca_probability', ascending=False)

    # Stamp provenance columns so downstream consumers know which model
    # produced these predictions and when.
    final_results['model_version'] = metadata.get('model_version', 'unknown')
    final_results['processing_date'] = pd.Timestamp.now().isoformat()

    final_results.to_csv(output_path, index=False)

    print(f"Complete results saved to: {output_path}")
    print(f"Total cases processed: {len(final_results):,}")
    print(f"OHCA detected with optimal threshold: {final_results['ohca_prediction'].sum():,}")

    # Clean up the per-chunk checkpoint files now that the combined file exists.
    for i in range(1, chunk_num + 1):
        temp_path = f"{output_path}.chunk_{i}.csv"
        if os.path.exists(temp_path):
            os.remove(temp_path)

    return output_path
538
 
539
def process_large_dataset(model_path, data_path, output_path,
                          chunk_size=10000, batch_size=16):
    """
    Legacy batch-processing entry point kept for backward compatibility.

    If the model directory contains model_metadata.json (a v3.0 model),
    processing is delegated to process_large_dataset_with_optimal_threshold
    so the stored optimal threshold is used. Otherwise the model is treated
    as a legacy model and processed with a default 0.5 threshold.

    Args:
        model_path: Path to trained model directory
        data_path: Path to input CSV file
        output_path: Path for the combined output results CSV
        chunk_size: Number of rows read per chunk
        batch_size: Inference batch size

    Returns:
        str: Path to the completed results file (== output_path)

    Raises:
        ValueError: If the input CSV yields no rows at all (legacy path).
    """
    metadata_path = os.path.join(model_path, 'model_metadata.json')
    if os.path.exists(metadata_path):
        # v3.0 model: use the enhanced pipeline with its stored threshold.
        return process_large_dataset_with_optimal_threshold(
            model_path, data_path, output_path, chunk_size, batch_size
        )

    # Legacy model: no metadata available, fall back to a fixed 0.5 threshold.
    print("Warning: Legacy model detected. Using default threshold processing.")
    model, tokenizer = load_ohca_model(model_path)

    chunk_results = []
    chunk_num = 0

    for chunk_df in pd.read_csv(data_path, chunksize=chunk_size):
        chunk_num += 1
        print(f"\nProcessing chunk {chunk_num} ({len(chunk_df):,} rows)...")

        chunk_result = run_inference_with_optimal_threshold(
            model, tokenizer, chunk_df, optimal_threshold=0.5,
            batch_size=batch_size, output_path=None
        )
        chunk_results.append(chunk_result)

    # Fail clearly on an empty input instead of letting pd.concat([]) raise
    # a cryptic ValueError.
    if not chunk_results:
        raise ValueError(f"No data found in {data_path}; nothing to process.")

    final_results = pd.concat(chunk_results, ignore_index=True)
    final_results = final_results.sort_values('ohca_probability', ascending=False)
    final_results.to_csv(output_path, index=False)

    return output_path
+
573
  # =============================================================================
574
+ # ENHANCED TESTING FUNCTIONS
575
  # =============================================================================
576
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
577
  def test_model_on_sample(model_path, sample_texts):
578
  """
579
+ Test model on sample texts using optimal threshold if available
580
 
581
  Args:
582
  model_path: Path to trained model
583
  sample_texts: List of text strings or dict with hadm_id: text
584
 
585
  Returns:
586
+ DataFrame: Test results with optimal threshold predictions
587
  """
588
+ print("Testing model on sample texts...")
589
 
590
  # Prepare test data
591
  if isinstance(sample_texts, dict):
 
599
  for i, text in enumerate(sample_texts, 1)
600
  ])
601
 
602
+ # Try to load with metadata
603
+ metadata_path = os.path.join(model_path, 'model_metadata.json')
604
+ if os.path.exists(metadata_path):
605
+ model, tokenizer, optimal_threshold, metadata = load_ohca_model_with_metadata(model_path)
606
+ results = run_inference_with_optimal_threshold(
607
+ model, tokenizer, test_df, optimal_threshold, output_path=None
608
+ )
609
+ else:
610
+ model, tokenizer = load_ohca_model(model_path)
611
+ results = run_inference_with_optimal_threshold(
612
+ model, tokenizer, test_df, optimal_threshold=0.5, output_path=None
613
+ )
614
 
615
+ # Print enhanced results
616
+ print(f"\nTest Results:")
617
  for _, row in results.iterrows():
618
  prob = row['ohca_probability']
619
+ pred = "OHCA" if row['ohca_prediction'] == 1 else "Non-OHCA"
620
  conf = row['confidence_category']
621
+ priority = row['clinical_priority']
622
 
623
+ print(f" {row['hadm_id']}: {pred} (p={prob:.3f}, {conf}, {priority})")
624
 
625
  # Show text preview
626
  text_preview = test_df[test_df['hadm_id']==row['hadm_id']]['clean_text'].iloc[0]
 
629
 
630
  return results
631
 
632
+ # =============================================================================
633
+ # LEGACY FUNCTIONS FOR BACKWARD COMPATIBILITY
634
+ # =============================================================================
635
+
636
def get_high_confidence_cases(results_df, threshold=0.8, max_cases=100):
    """
    Extract high-confidence OHCA predictions for manual review.

    Args:
        results_df: Inference results containing an 'ohca_probability' column.
        threshold: Minimum probability to count as high confidence.
        max_cases: Maximum number of cases to return.

    Returns:
        DataFrame: Up to max_cases rows with ohca_probability >= threshold,
        ordered by probability (highest first).
    """
    high_conf = results_df[results_df['ohca_probability'] >= threshold].copy()
    # Sort before truncating: otherwise head() keeps whatever rows happen to
    # come first and can drop the highest-probability cases when the input is
    # not already ordered by probability.
    high_conf = high_conf.sort_values('ohca_probability', ascending=False)
    high_conf = high_conf.head(max_cases)

    print(f"Found {len(high_conf)} high-confidence cases (≥{threshold})")

    return high_conf
644
+
645
def analyze_predictions(results_df, original_df=None):
    """Legacy analysis function - redirects to enhanced version.

    Note: original_df is accepted only for backward compatibility with the
    old signature and is ignored; the enhanced analysis works from
    results_df alone.
    """
    return analyze_predictions_enhanced(results_df)
648
+
649
  # =============================================================================
650
  # EXAMPLE USAGE
651
  # =============================================================================
652
 
653
if __name__ == "__main__":
    # Informational banner shown when the module is executed directly:
    # summarizes the v3.0 API and the legacy compatibility functions.
    banner_lines = [
        "OHCA Inference Module v3.0 - Enhanced with Optimal Threshold Support",
        "=" * 75,
        "Key improvements:",
        "  Automatic optimal threshold loading and usage",
        "  Enhanced confidence categories based on optimal threshold",
        "  Clinical priority recommendations",
        "  Backward compatibility with legacy models",
        "  Enhanced analysis and reporting",
        "",
        "Main functions:",
        "• quick_inference_with_optimal_threshold() - Recommended for v3.0 models",
        "• load_ohca_model_with_metadata() - Load model with optimal threshold",
        "• run_inference_with_optimal_threshold() - Enhanced inference",
        "• process_large_dataset_with_optimal_threshold() - Batch processing",
        "• analyze_predictions_enhanced() - Enhanced prediction analysis",
        "",
        "Legacy functions (maintained for compatibility):",
        "• quick_inference() - Auto-detects model version",
        "• load_ohca_model() - Basic model loading",
        "• run_inference() - Basic inference",
        "",
        "See examples/ folder for detailed usage examples.",
        "=" * 75,
    ]
    for banner_line in banner_lines:
        print(banner_line)