Add user-friendly scripts for training and prediction workflows

Files changed (3) hide show

examples/scripts/predict_ohca.py +137 -0
examples/scripts/prepare_data.py +118 -0
examples/scripts/train_from_labeled_data.py +155 -0

examples/scripts/predict_ohca.py ADDED Viewed

	@@ -0,0 +1,137 @@

+#!/usr/bin/env python3
+"""
+Apply OHCA Classifier to New Discharge Notes
+This script applies a trained OHCA classifier to new discharge notes.
+Input data should have columns: hadm_id, clean_text
+"""
+import sys
+import os
+sys.path.append(os.path.join(os.path.dirname(__file__), '..', 'src'))
+import pandas as pd
+from ohca_inference import quick_inference
+def validate_discharge_data(df):
+    """
+    Check that a discharge-note table has hadm_id and clean_text, then report on it
+    Raises ValueError when either column is absent. Missing values are only
+    reported (one warning per affected column); no rows are removed and
+    nothing is returned.
+    """
+    needed = ['hadm_id', 'clean_text']
+    absent = []
+    for name in needed:
+        if name not in df.columns:
+            absent.append(name)
+    if absent:
+        raise ValueError(f"Missing required columns: {absent}")
+    # One warning per required column that contains nulls, in column order
+    for name in needed:
+        null_count = df[name].isna().sum()
+        if null_count > 0:
+            print(f"Warning: {null_count} rows have missing {name}")
+    complete_rows = df.dropna(subset=needed)
+    print("Data validation:")
+    print(f"  Total discharge notes: {len(df)}")
+    print(f"  Valid records: {len(complete_rows)}")
+def predict_ohca(model_path, data_path, output_path=None):
+    """
+    Apply OHCA model to discharge notes and print a summary of the results
+    Args:
+        model_path: Path to trained model
+        data_path: Path to CSV with discharge notes
+        output_path: Where to save results (optional). Defaults to
+            "<data file name without extension>_ohca_predictions.csv"
+    Returns:
+        The results table returned by quick_inference
+    Raises:
+        FileNotFoundError: If model_path does not exist
+        ValueError: If the CSV lacks the hadm_id or clean_text column
+    """
+    print("OHCA Classifier Prediction")
+    print("="*30)
+    # Validate model exists
+    if not os.path.exists(model_path):
+        raise FileNotFoundError(f"Model not found: {model_path}")
+    print(f"Model: {model_path}")
+    print(f"Data: {data_path}")
+    # Load and validate data
+    df = pd.read_csv(data_path)
+    validate_discharge_data(df)
+    # Set default output path
+    if output_path is None:
+        base_name = os.path.splitext(os.path.basename(data_path))[0]
+        output_path = f"{base_name}_ohca_predictions.csv"
+    print(f"Output: {output_path}")
+    # Run inference
+    # NOTE(review): the loaded DataFrame (not the path) is passed as data_path,
+    # and rows with missing values are only warned about, not removed -
+    # confirm quick_inference accepts both.
+    print(f"\nRunning OHCA prediction on {len(df)} discharge notes...")
+    results = quick_inference(
+        model_path=model_path,
+        data_path=df,
+        output_path=output_path
+    )
+    total_cases = len(results)
+    if total_cases == 0:
+        # The percentage and first-row lookups below need at least one row
+        print("\nNo predictions were returned.")
+        return results
+    # Analyze results
+    default_threshold = 0.5
+    if 'ohca_prediction' in results.columns:
+        ohca_detected = results['ohca_prediction'].sum()
+        # The threshold column may be absent; fall back to a plain scalar
+        # (a list default would have no .iloc to index)
+        if 'optimal_threshold_used' in results.columns:
+            threshold_used = results['optimal_threshold_used'].iloc[0]
+        else:
+            threshold_used = default_threshold
+    else:
+        # Fallback for legacy models
+        ohca_detected = (results['ohca_probability'] >= default_threshold).sum()
+        threshold_used = default_threshold
+    high_confidence = (results['ohca_probability'] >= 0.8).sum()
+    very_high_confidence = (results['ohca_probability'] >= 0.9).sum()
+    print(f"\nResults Summary:")
+    print(f"  Total cases analyzed: {total_cases}")
+    print(f"  OHCA detected: {ohca_detected} ({ohca_detected/total_cases*100:.1f}%)")
+    print(f"  High confidence (≥0.8): {high_confidence}")
+    print(f"  Very high confidence (≥0.9): {very_high_confidence}")
+    print(f"  Threshold used: {threshold_used:.3f}")
+    # Show highest probability cases
+    print(f"\nTop 5 highest probability cases:")
+    top_cases = results.nlargest(5, 'ohca_probability')
+    for _, row in top_cases.iterrows():
+        print(f"  {row['hadm_id']}: {row['ohca_probability']:.3f}")
+    print(f"\nResults saved to: {output_path}")
+    # Clinical recommendations
+    # high_confidence (>=0.8) includes the >=0.9 cases, so it decides whether
+    # the heading is needed for either line below
+    if high_confidence > 0:
+        print(f"\nClinical Recommendations:")
+    if very_high_confidence > 0:
+        print(f"  → {very_high_confidence} cases need immediate review (≥90% probability)")
+    if high_confidence > very_high_confidence:
+        print(f"  → {high_confidence - very_high_confidence} cases need priority review (80-90% probability)")
+    return results
+if __name__ == "__main__":
+    # Command-line entry point: check both paths up front, then run prediction.
+    import argparse
+    cli = argparse.ArgumentParser(description='Apply OHCA classifier to discharge notes')
+    cli.add_argument('model_path', help='Path to trained model directory')
+    cli.add_argument('data_path', help='Path to CSV file with discharge notes')
+    cli.add_argument('--output', help='Output CSV path (default: auto-generated)')
+    options = cli.parse_args()
+    if not os.path.exists(options.model_path):
+        for line in (
+            f"Error: Model not found: {options.model_path}",
+            "Train a model first using: python scripts/train_from_labeled_data.py",
+        ):
+            print(line)
+        sys.exit(1)
+    if not os.path.exists(options.data_path):
+        for line in (
+            f"Error: Data file not found: {options.data_path}",
+            "\nYour CSV file should have columns:",
+            "  hadm_id: Unique admission identifier",
+            "  clean_text: Discharge note text",
+        ):
+            print(line)
+        sys.exit(1)
+    # Any failure inside the pipeline is reported as one line and a non-zero exit code
+    try:
+        predict_ohca(options.model_path, options.data_path, options.output)
+    except Exception as err:
+        print(f"Prediction failed: {err}")
+        sys.exit(1)

examples/scripts/prepare_data.py ADDED Viewed

	@@ -0,0 +1,118 @@

+#!/usr/bin/env python3
+"""
+Data Preparation Helper for OHCA Classifier
+This script helps prepare your data in the correct format for training or inference.
+"""
+import pandas as pd
+import sys
+def prepare_labeled_data(input_path, output_path=None):
+    """
+    Prepare manually labeled data for training
+    Loads the CSV, asks interactively which column to use for each of
+    hadm_id / clean_text / ohca_label that is absent, adds subject_id and
+    confidence when missing, drops rows lacking a required value and writes
+    the result as CSV.
+    Args:
+        input_path: Path (string) to the labeled CSV file
+        output_path: Destination CSV (default: input_path with a trailing
+            ".csv" replaced by "_prepared.csv")
+    Raises:
+        ValueError: If a required column is still missing after mapping
+    """
+    print("Preparing labeled data for training...")
+    df = pd.read_csv(input_path)
+    print(f"Loaded {len(df)} records")
+    print(f"Columns: {list(df.columns)}")
+    # Interactive column mapping
+    required_cols = ['hadm_id', 'clean_text', 'ohca_label']
+    column_mapping = {}
+    for req_col in required_cols:
+        if req_col in df.columns:
+            continue
+        print(f"\nColumn '{req_col}' not found.")
+        print(f"Available columns: {list(df.columns)}")
+        mapped_col = input(f"Which column should be used for '{req_col}'? ")
+        if mapped_col not in df.columns:
+            print(f"Column '{mapped_col}' not found. Skipping...")
+        elif mapped_col in required_cols or mapped_col in column_mapping:
+            # Renaming it would remove another required column, or one
+            # source column would have to serve two targets
+            print(f"Column '{mapped_col}' is already in use. Skipping...")
+        else:
+            column_mapping[mapped_col] = req_col
+    # Apply mapping
+    if column_mapping:
+        df = df.rename(columns=column_mapping)
+        print(f"Applied column mapping: {column_mapping}")
+    # Stop with a clear message instead of a KeyError further down
+    still_missing = [col for col in required_cols if col not in df.columns]
+    if still_missing:
+        raise ValueError(f"Missing required columns: {still_missing}")
+    # Add missing optional columns
+    if 'subject_id' not in df.columns:
+        df['subject_id'] = df['hadm_id']
+        print("Added subject_id column (copied from hadm_id)")
+    if 'confidence' not in df.columns:
+        df['confidence'] = 4
+        print("Added default confidence scores")
+    # Validate and clean
+    df = df.dropna(subset=required_cols)
+    # Set output path
+    if output_path is None:
+        # Strip only a trailing ".csv"; str.replace would also rewrite
+        # ".csv" occurring inside directory names
+        suffix = '.csv'
+        base_name = input_path[:-len(suffix)] if input_path.endswith(suffix) else input_path
+        output_path = f"{base_name}_prepared.csv"
+    df.to_csv(output_path, index=False)
+    print(f"\nData prepared successfully:")
+    print(f"  Output: {output_path}")
+    print(f"  Records: {len(df)}")
+    print(f"  OHCA cases: {(df['ohca_label']==1).sum()}")
+    print(f"  Columns: {list(df.columns)}")
+def prepare_discharge_notes(input_path, output_path=None):
+    """
+    Prepare discharge notes for inference
+    Loads the CSV, asks interactively which column to use for hadm_id /
+    clean_text when either is absent, drops rows lacking one of them and
+    writes the result as CSV.
+    Args:
+        input_path: Path (string) to the discharge-note CSV file
+        output_path: Destination CSV (default: input_path with a trailing
+            ".csv" replaced by "_prepared.csv")
+    Raises:
+        ValueError: If a required column is still missing after mapping
+    """
+    print("Preparing discharge notes for inference...")
+    df = pd.read_csv(input_path)
+    print(f"Loaded {len(df)} records")
+    print(f"Columns: {list(df.columns)}")
+    # Interactive column mapping
+    required_cols = ['hadm_id', 'clean_text']
+    column_mapping = {}
+    for req_col in required_cols:
+        if req_col in df.columns:
+            continue
+        print(f"\nColumn '{req_col}' not found.")
+        print(f"Available columns: {list(df.columns)}")
+        mapped_col = input(f"Which column should be used for '{req_col}'? ")
+        if mapped_col not in df.columns:
+            print(f"Column '{mapped_col}' not found. Skipping...")
+        elif mapped_col in required_cols or mapped_col in column_mapping:
+            # Renaming it would remove another required column, or one
+            # source column would have to serve two targets
+            print(f"Column '{mapped_col}' is already in use. Skipping...")
+        else:
+            column_mapping[mapped_col] = req_col
+    # Apply mapping
+    if column_mapping:
+        df = df.rename(columns=column_mapping)
+        print(f"Applied column mapping: {column_mapping}")
+    # Stop with a clear message instead of a KeyError from dropna
+    still_missing = [col for col in required_cols if col not in df.columns]
+    if still_missing:
+        raise ValueError(f"Missing required columns: {still_missing}")
+    # Clean data
+    df = df.dropna(subset=required_cols)
+    # Set output path
+    if output_path is None:
+        # Strip only a trailing ".csv"; str.replace would also rewrite
+        # ".csv" occurring inside directory names
+        suffix = '.csv'
+        base_name = input_path[:-len(suffix)] if input_path.endswith(suffix) else input_path
+        output_path = f"{base_name}_prepared.csv"
+    df.to_csv(output_path, index=False)
+    print(f"\nDischarge notes prepared:")
+    print(f"  Output: {output_path}")
+    print(f"  Records: {len(df)}")
+if __name__ == "__main__":
+    # Command-line entry point. Both the data type and the input path are
+    # required, so anything shorter than three argv entries shows the usage
+    # text (sys.argv[2] is read below).
+    if len(sys.argv) < 3:
+        print("Usage:")
+        print("  python scripts/prepare_data.py labeled <input.csv>     # For training data")
+        print("  python scripts/prepare_data.py discharge <input.csv>   # For inference data")
+        sys.exit(1)
+    data_type = sys.argv[1]
+    input_path = sys.argv[2]
+    if data_type == "labeled":
+        prepare_labeled_data(input_path)
+    elif data_type == "discharge":
+        prepare_discharge_notes(input_path)
+    else:
+        print("Data type must be 'labeled' or 'discharge'")
+        sys.exit(1)

examples/scripts/train_from_labeled_data.py ADDED Viewed

	@@ -0,0 +1,155 @@

+#!/usr/bin/env python3
+"""
+Train OHCA Classifier from Pre-labeled Data
+This script trains a v3.0 OHCA classifier using your manually labeled data.
+Your data should have columns: hadm_id, clean_text, ohca_label (and optionally subject_id, confidence)
+"""
+import sys
+import os
+sys.path.append(os.path.join(os.path.dirname(__file__), '..', 'src'))
+import pandas as pd
+from sklearn.model_selection import train_test_split
+from ohca_training_pipeline import prepare_training_data, train_ohca_model, find_optimal_threshold, save_model_with_metadata
+def validate_labeled_data(df):
+    """
+    Check labeled training data and print a short class-balance report
+    Raises ValueError when hadm_id, clean_text or ohca_label is absent, or
+    when ohca_label holds any value other than 0 or 1. Returns nothing.
+    """
+    absent = [
+        name for name in ['hadm_id', 'clean_text', 'ohca_label']
+        if name not in df.columns
+    ]
+    if absent:
+        raise ValueError(f"Missing required columns: {absent}")
+    labels = df['ohca_label']
+    unique_labels = labels.unique()
+    # Anything left after removing 0 and 1 is an invalid label
+    if set(unique_labels) - {0, 1}:
+        raise ValueError(f"ohca_label must be 0 or 1, found: {unique_labels}")
+    positives = labels == 1
+    negatives = labels == 0
+    report = [
+        "Data validation passed:",
+        f"  Total cases: {len(df)}",
+        f"  OHCA cases (label=1): {positives.sum()}",
+        f"  Non-OHCA cases (label=0): {negatives.sum()}",
+        f"  OHCA prevalence: {positives.mean():.1%}",
+    ]
+    for line in report:
+        print(line)
+def train_from_labeled_data(data_path, model_save_path="./trained_ohca_model",
+                           test_size=0.2, num_epochs=3):
+    """
+    Train OHCA model from pre-labeled data
+    Args:
+        data_path: Path to CSV with labeled data
+        model_save_path: Where to save the trained model
+        test_size: Fraction to use for validation (default 0.2 = 20%)
+        num_epochs: Number of training epochs
+    Returns:
+        dict with 'model_path', 'optimal_threshold' and 'metrics'
+        (the validation metrics from find_optimal_threshold)
+    Raises:
+        ValueError: If a required column is missing or ohca_label holds
+            values other than 0 or 1 (raised by validate_labeled_data)
+    """
+    import tempfile  # local import: only this function needs it
+    print("OHCA Classifier Training from Pre-labeled Data")
+    print("="*50)
+    # Load and validate data
+    print(f"Loading labeled data from: {data_path}")
+    df = pd.read_csv(data_path)
+    # Validate before hadm_id is read below, so a missing column produces the
+    # descriptive ValueError rather than a bare KeyError
+    validate_labeled_data(df)
+    # Add missing columns if needed
+    if 'subject_id' not in df.columns:
+        print("Adding subject_id column (using hadm_id as patient ID)")
+        df['subject_id'] = df['hadm_id']
+    if 'confidence' not in df.columns:
+        print("Adding default confidence scores")
+        df['confidence'] = 4  # Default confidence
+    # Split into train/validation
+    print(f"\nSplitting data (train: {1-test_size:.0%}, validation: {test_size:.0%})")
+    train_df, val_df = train_test_split(
+        df, test_size=test_size,
+        stratify=df['ohca_label'],
+        random_state=42
+    )
+    print(f"Training data: {len(train_df)} cases ({(train_df['ohca_label']==1).sum()} OHCA)")
+    print(f"Validation data: {len(val_df)} cases ({(val_df['ohca_label']==1).sum()} OHCA)")
+    # Save as temporary Excel files inside a private directory: unique per
+    # run, cannot overwrite files in the working directory, and removed on
+    # exit even when writing or training fails
+    with tempfile.TemporaryDirectory(prefix='ohca_train_') as temp_dir:
+        temp_train = os.path.join(temp_dir, 'temp_train_data.xlsx')
+        temp_val = os.path.join(temp_dir, 'temp_val_data.xlsx')
+        train_df.to_excel(temp_train, index=False)
+        val_df.to_excel(temp_val, index=False)
+        # Prepare training datasets
+        print("\nPreparing training datasets...")
+        train_dataset, val_dataset, train_df_balanced, val_df_clean, tokenizer = prepare_training_data(
+            temp_train, temp_val
+        )
+        # Train the model
+        print(f"\nTraining model for {num_epochs} epochs...")
+        model, trained_tokenizer = train_ohca_model(
+            train_dataset, val_dataset, train_df_balanced, tokenizer,
+            num_epochs=num_epochs,
+            save_path=model_save_path
+        )
+        # Find optimal threshold
+        print("\nFinding optimal threshold...")
+        optimal_threshold, val_metrics = find_optimal_threshold(
+            model, trained_tokenizer, val_df_clean
+        )
+        # Save model with metadata
+        print("\nSaving model with metadata...")
+        # No separate test set exists in this workflow, so a placeholder is recorded
+        test_metrics = {'message': 'Trained on user-provided labeled data', 'test_set_size': 0}
+        save_model_with_metadata(
+            model, trained_tokenizer, optimal_threshold,
+            val_metrics, test_metrics, model_save_path
+        )
+        print(f"\nTraining completed successfully!")
+        print(f"Model saved to: {model_save_path}")
+        print(f"Optimal threshold: {optimal_threshold:.3f}")
+        print(f"Validation F1-score: {val_metrics['f1_score']:.3f}")
+        return {
+            'model_path': model_save_path,
+            'optimal_threshold': optimal_threshold,
+            'metrics': val_metrics
+        }
+if __name__ == "__main__":
+    # Command-line entry point: parse options, confirm the input file exists, then train.
+    import argparse
+    cli = argparse.ArgumentParser(description='Train OHCA classifier from labeled data')
+    cli.add_argument('data_path', help='Path to CSV file with labeled data')
+    cli.add_argument(
+        '--model_path',
+        default='./trained_ohca_model',
+        help='Where to save trained model (default: ./trained_ohca_model)',
+    )
+    cli.add_argument(
+        '--epochs',
+        type=int,
+        default=3,
+        help='Number of training epochs (default: 3)',
+    )
+    cli.add_argument(
+        '--test_size',
+        type=float,
+        default=0.2,
+        help='Validation split fraction (default: 0.2)',
+    )
+    options = cli.parse_args()
+    if not os.path.exists(options.data_path):
+        for line in (
+            f"Error: Data file not found: {options.data_path}",
+            "\nYour CSV file should have columns:",
+            "  hadm_id: Unique admission identifier",
+            "  clean_text: Discharge note text",
+            "  ohca_label: 1 for OHCA, 0 for non-OHCA",
+            "  subject_id: Patient ID (optional - will use hadm_id if missing)",
+        ):
+            print(line)
+        sys.exit(1)
+    # Any failure inside the pipeline is reported as one line and a non-zero exit code
+    try:
+        train_from_labeled_data(options.data_path, options.model_path, options.test_size, options.epochs)
+    except Exception as err:
+        print(f"Training failed: {err}")
+        sys.exit(1)