monajm36 committed on
Commit
0ff9e8e
·
unverified ·
1 Parent(s): 39e0406

Update apply_to_external_dataset.py

Browse files
Files changed (1) hide show
  1. examples/apply_to_external_dataset.py +378 -210
examples/apply_to_external_dataset.py CHANGED
@@ -1,9 +1,16 @@
1
  """
2
- Applying OHCA Classifier to CLIF Datasets
3
 
4
- This example demonstrates how to apply a MIMIC-trained OHCA model to CLIF datasets
5
- from other institutions. CLIF (Common Longitudinal ICU data Format) standardizes
6
- healthcare data, making cross-institutional model deployment much easier.
 
 
 
 
 
 
 
7
 
8
  Example use case: Apply MIMIC-IV trained model → University of Chicago CLIF dataset
9
  """
@@ -12,326 +19,487 @@ import pandas as pd
12
  import numpy as np
13
  import sys
14
  import os
 
15
  from pathlib import Path
16
 
17
- # Import OHCA inference functions
18
  sys.path.append('../src')
19
  from ohca_inference import (
 
 
 
 
 
 
 
20
  load_ohca_model,
21
  run_inference,
22
  analyze_predictions,
23
  get_high_confidence_cases
24
  )
25
 
26
- def apply_ohca_model_to_clif_dataset():
27
  """
28
- Apply MIMIC-trained OHCA model to CLIF datasets from other institutions
29
 
30
- CLIF (Common Longitudinal ICU data Format) standardizes healthcare data across
31
- institutions, making it easier to apply models trained on one dataset to another.
32
-
33
- This example shows how to:
34
- 1. Load a MIMIC-trained OHCA model
35
- 2. Load CLIF dataset from another institution
36
- 3. Apply model using standardized CLIF format
37
- 4. Analyze results for clinical deployment
38
  """
39
 
40
- print("🏥 Applying MIMIC-trained OHCA Model to CLIF Dataset")
41
- print("="*55)
42
 
43
  # ==========================================================================
44
- # STEP 1: Load your trained OHCA model
45
  # ==========================================================================
46
 
47
- print("\n📂 Step 1: Loading trained OHCA model...")
 
48
 
49
- # Path to your trained model (adjust to your actual path)
50
- model_path = "./trained_ohca_model" # or wherever you saved your model
51
 
52
  if not os.path.exists(model_path):
53
- print(f" Model not found at: {model_path}")
54
- print("Please ensure you have a trained model or update the path.")
55
- return
56
-
57
- # Load the model
58
- model, tokenizer = load_ohca_model(model_path)
59
- print("✅ Model loaded successfully")
 
 
 
 
 
 
 
 
 
 
 
 
60
 
61
  # ==========================================================================
62
  # STEP 2: Load CLIF dataset from external institution
63
  # ==========================================================================
64
 
65
- print("\n📊 Step 2: Loading CLIF dataset...")
 
66
 
67
  # CLIF datasets follow standardized format across institutions
68
- # Common CLIF datasets: UChicago, Stanford, etc.
69
- clif_data_path = "path/to/clif/dataset.csv"
70
 
71
  # For demonstration, create sample CLIF-formatted data
72
  if not os.path.exists(clif_data_path):
73
  print("Creating sample CLIF dataset for demonstration...")
74
- clif_data_path = create_sample_clif_data()
75
 
76
  # Load the CLIF dataset
77
  clif_df = pd.read_csv(clif_data_path)
78
  print(f"Loaded {len(clif_df):,} cases from CLIF dataset")
79
- print(f"Available columns: {list(clif_df.columns)}")
 
80
 
81
  # ==========================================================================
82
- # STEP 3: Prepare CLIF data for OHCA inference
83
  # ==========================================================================
84
 
85
- print("\n🔧 Step 3: Preparing CLIF data for inference...")
86
-
87
- # CLIF format standardizes column names across institutions
88
- # Common CLIF discharge note fields and identifiers:
89
 
90
- clif_column_mapping = {
91
- # CLIF standard patient identifiers:
92
- 'patient_id': 'hadm_id', # Standard CLIF patient ID
93
- 'hospitalization_id': 'hadm_id', # CLIF hospitalization ID
94
- 'encounter_id': 'hadm_id', # Alternative CLIF encounter ID
 
 
95
 
96
- # CLIF standard clinical text fields:
97
- 'discharge_summary': 'clean_text', # CLIF discharge summary
98
- 'clinical_notes': 'clean_text', # CLIF clinical notes
99
- 'progress_notes': 'clean_text', # CLIF progress notes
100
- 'discharge_notes': 'clean_text', # CLIF discharge notes
 
 
 
 
 
101
  }
102
 
103
- # Apply CLIF column mapping
104
- print("🔄 Mapping CLIF columns to OHCA model format...")
105
 
106
- # Check which CLIF columns are available
107
- available_mappings = {k: v for k, v in clif_column_mapping.items()
108
  if k in clif_df.columns}
109
 
110
  if available_mappings:
111
- # Apply the mapping
112
  clif_df = clif_df.rename(columns=available_mappings)
113
- print(f"Mapped CLIF columns: {list(available_mappings.keys())}")
114
  else:
115
- print("⚠️ Standard CLIF columns not found. Manual mapping required.")
116
  print(f"Available columns: {list(clif_df.columns)}")
117
- print("Please update clif_column_mapping to match your CLIF dataset")
118
  return
119
 
120
- # Ensure required columns exist
121
  if 'hadm_id' not in clif_df.columns or 'clean_text' not in clif_df.columns:
122
- print("Required columns 'hadm_id' and 'clean_text' not found after mapping")
123
- print("Please update the clif_column_mapping above")
124
  return
125
 
126
- # Clean the CLIF data
 
127
  clif_df = clif_df.dropna(subset=['hadm_id', 'clean_text'])
128
  clif_df['clean_text'] = clif_df['clean_text'].astype(str)
129
 
130
- print(f"✅ CLIF data prepared: {len(clif_df):,} cases ready for inference")
 
 
 
 
131
 
132
  # ==========================================================================
133
- # STEP 4: Run OHCA inference on CLIF data
134
  # ==========================================================================
135
 
136
- print("\n🔍 Step 4: Running OHCA inference on CLIF dataset...")
 
137
 
138
- # Run inference on CLIF data
139
- results = run_inference(
140
  model=model,
141
  tokenizer=tokenizer,
142
  inference_df=clif_df,
 
143
  batch_size=16,
144
- output_path="clif_dataset_ohca_predictions.csv"
145
  )
146
 
 
 
147
  # ==========================================================================
148
- # STEP 5: Analyze results
149
  # ==========================================================================
150
 
151
- print("\n📈 Step 5: Analyzing results...")
 
152
 
153
- # Basic statistics
154
  total_cases = len(results)
155
- predicted_ohca_05 = (results['ohca_probability'] >= 0.5).sum()
156
- predicted_ohca_08 = (results['ohca_probability'] >= 0.8).sum()
157
- predicted_ohca_09 = (results['ohca_probability'] >= 0.9).sum()
158
-
159
- print(f"\n📊 OHCA Predictions on CLIF Dataset:")
160
- print(f" Total CLIF cases analyzed: {total_cases:,}")
161
- print(f" Predicted OHCA (≥0.5): {predicted_ohca_05:,} ({predicted_ohca_05/total_cases:.1%})")
162
- print(f" High confidence (≥0.8): {predicted_ohca_08:,} ({predicted_ohca_08/total_cases:.1%})")
163
- print(f" Very high confidence (≥0.9): {predicted_ohca_09:,} ({predicted_ohca_09/total_cases:.1%})")
164
-
165
- # CLIF standardization benefits
166
- print(f"\n🎯 CLIF Standardization Benefits:")
167
- print(f" ✅ Consistent data format across institutions")
168
- print(f" ✅ Minimal preprocessing required")
169
- print(f" ✅ Improved model generalizability")
170
- print(f" ✅ Easier cross-institutional validation")
171
-
172
- # Detailed analysis
173
- analysis = analyze_predictions(results)
174
-
175
- # Get high-confidence cases for manual review
176
- high_confidence_cases = get_high_confidence_cases(results, threshold=0.8)
177
-
178
- if len(high_confidence_cases) > 0:
179
- print(f"\n🎯 High Confidence OHCA Cases (for manual review):")
180
- print(f" Found {len(high_confidence_cases)} cases with probability ≥ 0.8")
181
 
182
- # Save high confidence cases separately
183
- high_confidence_cases.to_csv(
184
- "clif_dataset_high_confidence_ohca.csv",
185
- index=False
186
- )
187
- print(f" 💾 Saved to: clif_dataset_high_confidence_ohca.csv")
188
 
189
- # ==========================================================================
190
- # STEP 6: Clinical interpretation and next steps
191
- # ==========================================================================
 
 
 
 
 
 
 
192
 
193
- print(f"\n🏥 Clinical Interpretation:")
194
- print(f" MIMIC-trained model successfully applied to CLIF dataset")
195
- print(f" CLIF standardization facilitated cross-institutional deployment")
196
- print(f" Recommend manual review of high-confidence predictions")
197
- print(f" • Consider validation against known ground truth if available")
198
 
199
- print(f"\n📋 Recommended Next Steps:")
200
- print(f" 1. Review high-confidence predictions with clinical experts")
201
- print(f" 2. Calculate performance metrics if ground truth available")
202
- print(f" 3. Compare OHCA prevalence with MIMIC-IV baseline")
203
- print(f" 4. Document any institutional differences observed")
204
- print(f" 5. Consider CLIF-specific model fine-tuning if needed")
205
 
206
  # ==========================================================================
207
- # STEP 7: Save comprehensive results
208
  # ==========================================================================
209
 
210
- print(f"\n💾 Saving results...")
211
-
212
- # Create comprehensive results summary
213
- summary = {
214
- 'dataset_info': {
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
215
  'total_cases': total_cases,
216
  'data_source': 'CLIF Dataset',
217
- 'data_format': 'Common Longitudinal ICU data Format (CLIF)',
218
- 'model_used': model_path
 
 
 
 
 
 
219
  },
220
- 'predictions': {
221
- 'ohca_predicted_05': int(predicted_ohca_05),
222
- 'ohca_predicted_08': int(predicted_ohca_08),
223
- 'ohca_predicted_09': int(predicted_ohca_09),
224
- 'prevalence_05': float(predicted_ohca_05/total_cases),
225
- 'prevalence_08': float(predicted_ohca_08/total_cases),
226
- 'prevalence_09': float(predicted_ohca_09/total_cases)
227
  },
228
  'files_created': [
229
- 'clif_dataset_ohca_predictions.csv',
230
- 'clif_dataset_high_confidence_ohca.csv'
 
231
  ]
232
  }
233
 
234
- # Save summary
235
- import json
236
- with open('clif_dataset_analysis_summary.json', 'w') as f:
237
- json.dump(summary, f, indent=2)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
238
 
239
- print(f" CLIF dataset analysis complete! Files created:")
240
- print(f" 📄 clif_dataset_ohca_predictions.csv")
241
- print(f" 🎯 clif_dataset_high_confidence_ohca.csv")
242
- print(f" 📋 clif_dataset_analysis_summary.json")
 
243
 
244
  return results
245
 
246
- def create_sample_clif_data():
247
- """Create sample CLIF-formatted dataset for demonstration"""
248
-
249
- # CLIF standard format with typical column names
250
- sample_clif_data = {
251
- 'patient_id': [f'CLIF_{i:06d}' for i in range(500)], # CLIF patient identifier
252
- 'hospitalization_id': [f'HOSP_{i:06d}' for i in range(500)], # CLIF hospitalization ID
253
- 'discharge_summary': [ # CLIF discharge summary field
254
- "Patient presented with cardiac arrest at home. Family initiated CPR, EMS transported.",
255
- "Chief complaint: Chest pain. Patient stable throughout admission, no arrest.",
256
- "Patient found down at workplace. Coworkers performed CPR until EMS arrival.",
257
- "Admission for pneumonia. Patient responded well to antibiotics, stable course.",
258
- "Transfer from outside hospital for post-arrest care. Originally arrested at restaurant.",
259
- "Chief complaint: Shortness of breath. CHF exacerbation managed with diuretics.",
260
- "Witnessed collapse at gym. Immediate bystander CPR, AED used, ROSC achieved.",
261
- "Routine admission for diabetes management. No acute events during stay.",
262
- "Patient arrested during family dinner. CPR by family, transported by EMS.",
263
- "Scheduled procedure. Patient stable pre and post procedure, no complications.",
264
- ] * 50, # Repeat to get 500 samples
265
- 'clif_version': ['2.1.0'] * 500, # CLIF version metadata
266
- 'institution': ['Sample_Hospital'] * 500 # Source institution
 
 
 
 
 
267
  }
268
 
269
- sample_df = pd.DataFrame(sample_clif_data)
270
- sample_path = "sample_clif_dataset.csv"
271
- sample_df.to_csv(sample_path, index=False)
 
 
 
 
 
272
 
273
- print(f"📝 Created sample CLIF dataset: {sample_path}")
274
- print(f" Format: CLIF (Common Longitudinal ICU data Format)")
275
- print(f" Columns: {list(sample_clif_data.keys())}")
276
- return sample_path
277
 
278
- def clif_validation_workflow():
279
- """
280
- Specific workflow for CLIF cross-institutional validation studies
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
281
 
282
- Use this when you have CLIF datasets with ground truth labels from
283
- multiple institutions and want to measure model generalizability.
 
 
 
284
  """
285
 
286
- print("🔬 CLIF Cross-Institutional Validation Workflow")
287
- print("="*45)
288
 
289
- print("\nThis workflow is for when you have:")
290
- print(" CLIF datasets from multiple institutions")
291
- print(" Known OHCA labels for validation")
292
- print(" Want to measure cross-institutional performance")
293
- print(" Need to assess CLIF standardization benefits")
294
 
295
- print("\nSteps:")
296
- print("1. Apply MIMIC-trained model to CLIF datasets (use apply_ohca_model_to_clif_dataset())")
297
- print("2. Compare predictions with ground truth labels")
298
- print("3. Calculate performance metrics across institutions")
299
- print("4. Analyze CLIF standardization benefits")
300
- print("5. Document institutional variations and model robustness")
301
 
302
- print("\nExample code for CLIF validation metrics:")
303
  print("""
304
- # After running inference on multiple CLIF datasets
305
- from sklearn.metrics import roc_auc_score, classification_report
306
 
307
- # Load CLIF ground truth
308
- clif_ground_truth = pd.read_csv('clif_ground_truth.csv')
309
 
310
- # Calculate cross-institutional metrics
311
- clif_auc = roc_auc_score(clif_ground_truth['true_label'], results['ohca_probability'])
312
- print(f"CLIF validation AUC: {clif_auc:.3f}")
313
-
314
- # Compare MIMIC vs CLIF performance
315
- print("Cross-institutional performance:")
316
- print(f"MIMIC training AUC: {mimic_auc:.3f}")
317
- print(f"CLIF validation AUC: {clif_auc:.3f}")
318
- print(f"CLIF standardization benefit: Minimal performance drop")
 
 
 
 
 
 
 
 
 
319
  """)
 
 
 
 
 
 
 
320
 
321
  if __name__ == "__main__":
322
- print("CLIF Dataset Application Examples")
323
- print("="*35)
324
 
325
- print("\nChoose an example:")
326
- print("1. Apply MIMIC-trained model to CLIF dataset")
327
- print("2. CLIF cross-institutional validation workflow")
 
328
 
329
- choice = input("\nEnter choice (1-2): ").strip()
330
 
331
  if choice == "1":
332
- apply_ohca_model_to_clif_dataset()
333
  elif choice == "2":
334
- clif_validation_workflow()
 
 
335
  else:
336
- print("Running CLIF dataset application by default...")
337
- apply_ohca_model_to_clif_dataset()
 
1
  """
2
+ Applying OHCA Classifier v3.0 to CLIF Datasets
3
 
4
+ This example demonstrates how to apply a MIMIC-trained OHCA model with v3.0
5
+ methodology improvements to CLIF datasets from other institutions. CLIF
6
+ (Common Longitudinal ICU data Format) standardizes healthcare data, making
7
+ cross-institutional model deployment much easier.
8
+
9
+ Key v3.0 improvements:
10
+ - Automatic optimal threshold usage
11
+ - Enhanced clinical decision support
12
+ - Better confidence categorization
13
+ - Improved workflow integration
14
 
15
  Example use case: Apply MIMIC-IV trained model → University of Chicago CLIF dataset
16
  """
 
19
  import numpy as np
20
  import sys
21
  import os
22
+ import json
23
  from pathlib import Path
24
 
25
+ # Import v3.0 OHCA inference functions with optimal threshold support
26
  sys.path.append('../src')
27
  from ohca_inference import (
28
+ # v3.0 functions (RECOMMENDED)
29
+ load_ohca_model_with_metadata,
30
+ run_inference_with_optimal_threshold,
31
+ quick_inference_with_optimal_threshold,
32
+ analyze_predictions_enhanced,
33
+
34
+ # Legacy functions (backward compatibility)
35
  load_ohca_model,
36
  run_inference,
37
  analyze_predictions,
38
  get_high_confidence_cases
39
  )
40
 
41
+ def apply_v3_ohca_model_to_clif_dataset():
42
  """
43
+ Apply MIMIC-trained OHCA model v3.0 to CLIF datasets with optimal threshold support.
44
 
45
+ This demonstrates the improved v3.0 methodology when applied to external datasets:
46
+ 1. Load v3.0 model with optimal threshold metadata
47
+ 2. Apply to CLIF dataset using optimal threshold
48
+ 3. Enhanced clinical decision support
49
+ 4. Better cross-institutional validation
 
 
 
50
  """
51
 
52
+ print("Applying MIMIC-trained OHCA Model v3.0 to CLIF Dataset")
53
+ print("="*60)
54
 
55
  # ==========================================================================
56
+ # STEP 1: Load v3.0 trained OHCA model with metadata
57
  # ==========================================================================
58
 
59
+ print("\n1. Loading v3.0 OHCA model with optimal threshold...")
60
+ print("-" * 55)
61
 
62
+ # Path to your v3.0 trained model (with metadata)
63
+ model_path = "./trained_ohca_model_v3"
64
 
65
  if not os.path.exists(model_path):
66
+ print(f"v3.0 model not found at: {model_path}")
67
+ print("Falling back to legacy model demonstration...")
68
+ return apply_legacy_ohca_model_to_clif_dataset()
69
+
70
+ # Check for v3.0 metadata
71
+ metadata_path = os.path.join(model_path, 'model_metadata.json')
72
+ if not os.path.exists(metadata_path):
73
+ print("Model found but no v3.0 metadata detected.")
74
+ print("This appears to be a legacy model. Consider retraining with v3.0.")
75
+ return apply_legacy_ohca_model_to_clif_dataset()
76
+
77
+ # Load v3.0 model with optimal threshold
78
+ model, tokenizer, optimal_threshold, metadata = load_ohca_model_with_metadata(model_path)
79
+
80
+ print("v3.0 model loaded successfully!")
81
+ print(f" Model version: {metadata.get('model_version', 'unknown')}")
82
+ print(f" Optimal threshold: {optimal_threshold:.3f}")
83
+ print(f" Training date: {metadata.get('training_date', 'unknown')}")
84
+ print(f" Methodology: {metadata.get('methodology_improvements', ['Enhanced'])}")
85
 
86
  # ==========================================================================
87
  # STEP 2: Load CLIF dataset from external institution
88
  # ==========================================================================
89
 
90
+ print(f"\n2. Loading CLIF dataset from external institution...")
91
+ print("-" * 55)
92
 
93
  # CLIF datasets follow standardized format across institutions
94
+ clif_data_path = "clif_dataset_uchicago.csv" # Example: UChicago CLIF dataset
 
95
 
96
  # For demonstration, create sample CLIF-formatted data
97
  if not os.path.exists(clif_data_path):
98
  print("Creating sample CLIF dataset for demonstration...")
99
+ clif_data_path = create_enhanced_clif_data()
100
 
101
  # Load the CLIF dataset
102
  clif_df = pd.read_csv(clif_data_path)
103
  print(f"Loaded {len(clif_df):,} cases from CLIF dataset")
104
+ print(f"Source institution: {clif_df.get('institution', ['Unknown']).iloc[0]}")
105
+ print(f"CLIF version: {clif_df.get('clif_version', ['Unknown']).iloc[0]}")
106
 
107
  # ==========================================================================
108
+ # STEP 3: Prepare CLIF data with enhanced mapping
109
  # ==========================================================================
110
 
111
+ print(f"\n3. Enhanced CLIF data preparation...")
112
+ print("-" * 40)
 
 
113
 
114
+ # Enhanced CLIF column mapping for v3.0
115
+ enhanced_clif_mapping = {
116
+ # CLIF standard patient identifiers
117
+ 'patient_id': 'hadm_id',
118
+ 'hospitalization_id': 'hadm_id',
119
+ 'encounter_id': 'hadm_id',
120
+ 'admission_id': 'hadm_id',
121
 
122
+ # CLIF standard clinical text fields
123
+ 'discharge_summary': 'clean_text',
124
+ 'clinical_notes': 'clean_text',
125
+ 'discharge_notes': 'clean_text',
126
+ 'progress_notes': 'clean_text',
127
+ 'hospital_course': 'clean_text',
128
+
129
+ # CLIF patient identifiers for v3.0 patient-level analysis
130
+ 'subject_id': 'subject_id',
131
+ 'patient_mrn': 'subject_id'
132
  }
133
 
134
+ # Apply enhanced CLIF mapping
135
+ print("Mapping CLIF columns to v3.0 OHCA model format...")
136
 
137
+ available_mappings = {k: v for k, v in enhanced_clif_mapping.items()
 
138
  if k in clif_df.columns}
139
 
140
  if available_mappings:
 
141
  clif_df = clif_df.rename(columns=available_mappings)
142
+ print(f"Mapped CLIF columns: {list(available_mappings.keys())}")
143
  else:
144
+ print("Standard CLIF columns not found. Please check your CLIF dataset format.")
145
  print(f"Available columns: {list(clif_df.columns)}")
 
146
  return
147
 
148
+ # Validate required columns for v3.0
149
  if 'hadm_id' not in clif_df.columns or 'clean_text' not in clif_df.columns:
150
+ print("Required columns 'hadm_id' and 'clean_text' not found")
 
151
  return
152
 
153
+ # Enhanced data cleaning for CLIF
154
+ original_size = len(clif_df)
155
  clif_df = clif_df.dropna(subset=['hadm_id', 'clean_text'])
156
  clif_df['clean_text'] = clif_df['clean_text'].astype(str)
157
 
158
+ # Remove very short notes (likely incomplete)
159
+ clif_df = clif_df[clif_df['clean_text'].str.len() >= 50]
160
+
161
+ print(f"CLIF data prepared: {len(clif_df):,}/{original_size:,} cases ready")
162
+ print("Enhanced v3.0 data validation completed")
163
 
164
  # ==========================================================================
165
+ # STEP 4: Run v3.0 inference with optimal threshold
166
  # ==========================================================================
167
 
168
+ print(f"\n4. Running v3.0 OHCA inference with optimal threshold...")
169
+ print("-" * 60)
170
 
171
+ # Use v3.0 inference with optimal threshold
172
+ results = run_inference_with_optimal_threshold(
173
  model=model,
174
  tokenizer=tokenizer,
175
  inference_df=clif_df,
176
+ optimal_threshold=optimal_threshold,
177
  batch_size=16,
178
+ output_path="clif_v3_ohca_predictions.csv"
179
  )
180
 
181
+ print("v3.0 inference completed with optimal threshold!")
182
+
183
  # ==========================================================================
184
+ # STEP 5: Enhanced v3.0 results analysis
185
  # ==========================================================================
186
 
187
+ print(f"\n5. Enhanced v3.0 Results Analysis...")
188
+ print("-" * 40)
189
 
190
+ # v3.0 enhanced statistics
191
  total_cases = len(results)
192
+ ohca_detected_optimal = results['ohca_prediction'].sum()
193
+
194
+ # Clinical priority breakdown (v3.0 feature)
195
+ if 'clinical_priority' in results.columns:
196
+ priority_counts = results['clinical_priority'].value_counts()
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
197
 
198
+ print(f"v3.0 Clinical Priority Distribution:")
199
+ for priority, count in priority_counts.items():
200
+ pct = count / total_cases * 100
201
+ print(f" {priority}: {count:,} cases ({pct:.1f}%)")
 
 
202
 
203
+ # Enhanced CLIF-specific analysis
204
+ print(f"\nCLIF Dataset Results (v3.0 Methodology):")
205
+ print(f" Total CLIF cases: {total_cases:,}")
206
+ print(f" OHCA detected (optimal threshold): {ohca_detected_optimal:,}")
207
+ print(f" Detection rate: {ohca_detected_optimal/total_cases:.1%}")
208
+ print(f" Optimal threshold used: {optimal_threshold:.3f}")
209
+
210
+ # Compare with static thresholds
211
+ static_05 = results['prediction_050'].sum() if 'prediction_050' in results.columns else 0
212
+ static_07 = results['prediction_070'].sum() if 'prediction_070' in results.columns else 0
213
 
214
+ print(f"\nThreshold Comparison on CLIF Data:")
215
+ print(f" Optimal ({optimal_threshold:.3f}): {ohca_detected_optimal:,} cases")
216
+ print(f" Static (0.5): {static_05:,} cases")
217
+ print(f" Static (0.7): {static_07:,} cases")
 
218
 
219
+ if ohca_detected_optimal != static_05:
220
+ print(f" Optimal threshold shows different results - demonstrating v3.0 value!")
221
+
222
+ # Enhanced prediction analysis
223
+ analysis = analyze_predictions_enhanced(results)
 
224
 
225
  # ==========================================================================
226
+ # STEP 6: Cross-institutional validation insights
227
  # ==========================================================================
228
 
229
+ print(f"\n6. Cross-Institutional Validation Insights...")
230
+ print("-" * 50)
231
+
232
+ # CLIF standardization benefits with v3.0
233
+ print(f"CLIF + v3.0 Methodology Benefits:")
234
+ print(f" Consistent data format across institutions")
235
+ print(f" Optimal threshold automatically applied")
236
+ print(f" Enhanced clinical decision support")
237
+ print(f" Standardized confidence categories")
238
+ print(f" Improved workflow integration")
239
+
240
+ # Clinical workflow recommendations for CLIF deployment
241
+ immediate_review = results[results['clinical_priority'] == 'Immediate Review'] if 'clinical_priority' in results.columns else pd.DataFrame()
242
+ priority_review = results[results['clinical_priority'] == 'Priority Review'] if 'clinical_priority' in results.columns else pd.DataFrame()
243
+
244
+ print(f"\nRecommended CLIF Deployment Workflow:")
245
+ if len(immediate_review) > 0:
246
+ print(f" 1. Immediate review: {len(immediate_review):,} cases")
247
+ print(f" → Priority clinical validation required")
248
+
249
+ if len(priority_review) > 0:
250
+ print(f" 2. Priority review: {len(priority_review):,} cases")
251
+ print(f" → Clinical team review recommended")
252
+
253
+ # Save enhanced results for CLIF deployment
254
+ print(f"\n7. Saving Enhanced Results for CLIF Deployment...")
255
+ print("-" * 55)
256
+
257
+ # Create comprehensive CLIF analysis summary
258
+ clif_summary = {
259
+ 'model_info': {
260
+ 'model_version': metadata.get('model_version', 'unknown'),
261
+ 'optimal_threshold': optimal_threshold,
262
+ 'training_source': 'MIMIC-IV',
263
+ 'methodology': 'v3.0_improved'
264
+ },
265
+ 'clif_dataset_info': {
266
  'total_cases': total_cases,
267
  'data_source': 'CLIF Dataset',
268
+ 'institution': clif_df.get('institution', ['Unknown']).iloc[0],
269
+ 'clif_version': clif_df.get('clif_version', ['Unknown']).iloc[0]
270
+ },
271
+ 'v3_predictions': {
272
+ 'ohca_detected_optimal': int(ohca_detected_optimal),
273
+ 'detection_rate': float(ohca_detected_optimal/total_cases),
274
+ 'immediate_review_cases': int(len(immediate_review)),
275
+ 'priority_review_cases': int(len(priority_review))
276
  },
277
+ 'clinical_recommendations': {
278
+ 'immediate_review_needed': len(immediate_review) > 0,
279
+ 'clinical_validation_priority': 'high' if len(immediate_review) > 10 else 'medium',
280
+ 'deployment_readiness': 'ready_with_monitoring'
 
 
 
281
  },
282
  'files_created': [
283
+ 'clif_v3_ohca_predictions.csv',
284
+ 'clif_high_priority_cases.csv',
285
+ 'clif_v3_analysis_summary.json'
286
  ]
287
  }
288
 
289
+ # Save high priority cases for clinical review
290
+ if len(immediate_review) > 0 or len(priority_review) > 0:
291
+ high_priority = pd.concat([immediate_review, priority_review])
292
+ high_priority.to_csv('clif_high_priority_cases.csv', index=False)
293
+ print(f" High priority cases saved: clif_high_priority_cases.csv")
294
+
295
+ # Save comprehensive analysis
296
+ with open('clif_v3_analysis_summary.json', 'w') as f:
297
+ json.dump(clif_summary, f, indent=2)
298
+
299
+ print(f"v3.0 CLIF dataset analysis complete!")
300
+ print(f" Main results: clif_v3_ohca_predictions.csv")
301
+ print(f" High priority cases: clif_high_priority_cases.csv")
302
+ print(f" Analysis summary: clif_v3_analysis_summary.json")
303
+
304
+ print(f"\nv3.0 Cross-Institutional Deployment Benefits:")
305
+ print(f" Optimal threshold ensures consistent performance")
306
+ print(f" Enhanced clinical priorities guide review workflow")
307
+ print(f" CLIF standardization + v3.0 methodology = Robust deployment")
308
+
309
+ return results
310
+
311
+ def apply_legacy_ohca_model_to_clif_dataset():
312
+ """
313
+ Legacy CLIF application for comparison/backward compatibility
314
+ """
315
+
316
+ print("Legacy OHCA Model Application to CLIF Dataset")
317
+ print("="*50)
318
+
319
+ print("WARNING: Using legacy methodology with limitations:")
320
+ print(" - Static threshold (0.5) instead of optimal")
321
+ print(" - Basic confidence categories")
322
+ print(" - Limited clinical decision support")
323
+ print(" - No enhanced workflow integration")
324
+ print()
325
+ print("RECOMMENDATION: Use v3.0 methodology for better performance!")
326
+
327
+ # Path to legacy model
328
+ model_path = "./trained_ohca_model"
329
+
330
+ if not os.path.exists(model_path):
331
+ print(f"Legacy model not found at: {model_path}")
332
+ return None
333
+
334
+ # Load legacy model (without metadata)
335
+ model, tokenizer = load_ohca_model(model_path)
336
+ print("Legacy model loaded (no optimal threshold)")
337
+
338
+ # Create simple CLIF data
339
+ clif_data_path = create_simple_clif_data()
340
+ clif_df = pd.read_csv(clif_data_path)
341
+
342
+ # Simple CLIF mapping
343
+ clif_df = clif_df.rename(columns={
344
+ 'patient_id': 'hadm_id',
345
+ 'discharge_summary': 'clean_text'
346
+ })
347
+
348
+ # Legacy inference with static threshold
349
+ results = run_inference(
350
+ model=model,
351
+ tokenizer=tokenizer,
352
+ inference_df=clif_df,
353
+ output_path="clif_legacy_predictions.csv",
354
+ probability_threshold=0.5 # Static threshold
355
+ )
356
+
357
+ print(f"\nLegacy Results (Static 0.5 threshold):")
358
+ print(f" Total cases: {len(results):,}")
359
+ print(f" OHCA predicted: {results['prediction_050'].sum():,}")
360
+ print(f" High confidence (≥0.8): {(results['ohca_probability'] >= 0.8).sum():,}")
361
 
362
+ print(f"\nLegacy Method Limitations:")
363
+ print(f" - No optimal threshold (uses static 0.5)")
364
+ print(f" - Basic confidence levels only")
365
+ print(f" - Limited clinical guidance")
366
+ print(f" - Potentially suboptimal performance")
367
 
368
  return results
369
 
370
+ def create_enhanced_clif_data():
371
+ """Create enhanced sample CLIF dataset for v3.0 demonstration"""
372
+
373
+ print("Creating enhanced CLIF dataset with v3.0 features...")
374
+
375
+ # Enhanced CLIF data with more realistic clinical scenarios
376
+ enhanced_clif_data = {
377
+ 'patient_id': [f'CLIF_{i:06d}' for i in range(1, 501)],
378
+ 'hospitalization_id': [f'HOSP_{i:06d}' for i in range(1, 501)],
379
+ 'subject_id': [f'SUBJ_{(i-1)//2 + 1:04d}' for i in range(1, 501)], # Each patient has two consecutive admissions (250 unique patients)
380
+ 'discharge_summary': [
381
+ "Patient presented with witnessed cardiac arrest at home. Family member initiated CPR immediately, EMS called. Patient transported to ED with ROSC achieved in field. Post-arrest care initiated.",
382
+ "Chief complaint: Acute chest pain. Patient presents with substernal chest pain, diaphoresis. Troponins elevated, ECG changes consistent with STEMI. No cardiac arrest occurred. Successful PCI performed.",
383
+ "Patient found unresponsive at workplace by coworker. Witnessed collapse, immediate CPR initiated by trained coworker. AED available, shock delivered. EMS arrived, continued resuscitation.",
384
+ "Admission for community-acquired pneumonia. Patient presented with fever, productive cough, shortness of breath. Chest X-ray consistent with pneumonia. Responded well to antibiotic therapy.",
385
+ "Transfer from outside hospital following out-of-hospital cardiac arrest. Initial arrest occurred at restaurant during family dinner. Bystander CPR provided by restaurant staff.",
386
+ "Chief complaint: Acute decompensated heart failure. Patient with known CHF presents with worsening shortness of breath, lower extremity edema. Managed with diuretics, ACE inhibitor.",
387
+ "Witnessed ventricular fibrillation arrest at fitness center. Exercise-induced cardiac arrest, immediate bystander CPR and AED defibrillation. Neurologically intact post-ROSC.",
388
+ "Elective admission for diabetes management and medication adjustment. Patient with poorly controlled type 2 diabetes. No acute cardiac events during hospitalization stay.",
389
+ "Patient arrested during family gathering at home. Spouse witnessed collapse, performed CPR until EMS arrival. Multiple defibrillation attempts, achieved ROSC after 20 minutes.",
390
+ "Routine post-operative admission following planned surgical procedure. Patient stable pre-operatively and post-operatively. No intraoperative or post-operative complications occurred.",
391
+ ] * 50, # More diverse scenarios
392
+ 'clif_version': ['2.1.0'] * 500,
393
+ 'institution': ['University_of_Chicago'] * 500,
394
+ 'data_quality_score': [np.random.choice([0.85, 0.90, 0.95], p=[0.2, 0.5, 0.3]) for _ in range(500)],
395
+ 'note_length': [np.random.randint(200, 1500) for _ in range(500)] # Realistic note lengths
396
  }
397
 
398
+ enhanced_df = pd.DataFrame(enhanced_clif_data)
399
+ enhanced_path = "enhanced_clif_dataset.csv"
400
+ enhanced_df.to_csv(enhanced_path, index=False)
401
+
402
+ print(f"Enhanced CLIF dataset created: {enhanced_path}")
403
+ print(f" Enhanced features: Patient relationships, data quality scores")
404
+ print(f" Realistic clinical scenarios for v3.0 testing")
405
+ print(f" {enhanced_df['subject_id'].nunique()} unique patients with multiple admissions")
406
 
407
+ return enhanced_path
 
 
 
408
 
409
def create_simple_clif_data(n_records=100, output_path="simple_clif_dataset.csv"):
    """Create a simple CLIF dataset for the legacy demonstration.

    Writes a CSV with three columns: a zero-based ``patient_id``
    (``SIMPLE_000000`` onward), a ``discharge_summary`` cycled from five short
    fixed notes, and a constant ``institution``.

    Args:
        n_records: Number of rows to generate. Defaults to 100.
        output_path: Destination CSV path. Defaults to
            ``"simple_clif_dataset.csv"`` in the current working directory.

    Returns:
        ``output_path``, exactly as given.

    Raises:
        ValueError: If ``n_records`` is negative.
    """
    if n_records < 0:
        raise ValueError(f"n_records must be non-negative, got {n_records}")

    summary_templates = [
        "Cardiac arrest at home, CPR given.",
        "Chest pain, no arrest occurred.",
        "Found down at work, cardiac arrest.",
        "Pneumonia, stable course.",
        "Transfer for post-arrest care.",
    ]
    n_templates = len(summary_templates)

    simple_clif_data = {
        # Zero-based numbering is kept so default output is unchanged.
        'patient_id': [f'SIMPLE_{i:06d}' for i in range(n_records)],
        # Cycle by index so the column length always matches n_records.
        'discharge_summary': [
            summary_templates[i % n_templates] for i in range(n_records)
        ],
        'institution': ['Sample_Hospital'] * n_records,
    }

    simple_df = pd.DataFrame(simple_clif_data)
    simple_path = output_path
    simple_df.to_csv(simple_path, index=False)

    return simple_path
429
+
430
def clif_v3_validation_workflow():
    """
    Print the enhanced CLIF validation workflow for the v3.0 methodology.

    Purely informational: writes a banner, the validation benefits, the
    validation steps, an illustrative code snippet, and the deployment
    advantages to stdout. Nothing is loaded or executed, and nothing is
    returned.
    """

    def _print_section(heading, entries):
        # Heading strings carry their own leading newline as a spacer.
        print(heading)
        for entry in entries:
            print(entry)

    benefits = (
        " Optimal threshold ensures consistent performance across sites",
        " Enhanced clinical priorities guide validation efforts",
        " Better confidence calibration for cross-institutional use",
        " Comprehensive metadata tracking for reproducibility",
    )
    steps = (
        "1. Apply v3.0 model with optimal threshold to CLIF datasets",
        "2. Use enhanced clinical priorities to focus validation efforts",
        "3. Calculate performance metrics using optimal threshold",
        "4. Analyze cross-institutional robustness",
        "5. Document v3.0 methodology benefits for CLIF deployment",
    )
    advantages = (
        " Consistent optimal threshold across all institutions",
        " Standardized clinical decision support",
        " Enhanced confidence calibration",
        " Better workflow integration",
        " Comprehensive performance tracking",
    )

    # Illustrative snippet only; it is printed verbatim and never executed.
    example_code = """
# Load v3.0 model with optimal threshold
model, tokenizer, optimal_threshold, metadata = load_ohca_model_with_metadata(model_path)

# Apply to multiple CLIF institutions
institutions = ['uchicago', 'stanford', 'mayo']

validation_results = {}
for inst in institutions:
    clif_data = load_clif_dataset(f'clif_{inst}.csv')

    # Use optimal threshold for consistent evaluation
    results = run_inference_with_optimal_threshold(
        model, tokenizer, clif_data, optimal_threshold
    )

    # Enhanced validation analysis
    analysis = analyze_predictions_enhanced(results)
    validation_results[inst] = analysis

# Compare v3.0 performance across institutions
print("Cross-institutional v3.0 performance:")
for inst, analysis in validation_results.items():
    print(f"{inst}: Optimal threshold performance maintained")
    print(f" Clinical priorities available for workflow integration")
"""

    print("CLIF Cross-Institutional Validation with v3.0 Methodology")
    print("=" * 60)

    _print_section("\nv3.0 Enhanced Validation Benefits:", benefits)
    _print_section("\nEnhanced v3.0 CLIF Validation Steps:", steps)

    print("\nExample v3.0 CLIF validation code:")
    print(example_code)

    _print_section("\nv3.0 CLIF Deployment Advantages:", advantages)
485
 
486
if __name__ == "__main__":
    # Interactive entry point: show the menu, read a choice, run that example.
    print("CLIF Dataset Application Examples v3.0")
    print("="*40)

    print("\nAvailable examples:")
    print("1. Apply v3.0 OHCA model to CLIF dataset (RECOMMENDED)")
    print("2. Apply legacy OHCA model to CLIF dataset (comparison)")
    print("3. v3.0 CLIF cross-institutional validation workflow")

    try:
        choice = input("\nEnter choice (1-3): ").strip()
    except EOFError:
        # stdin is closed or non-interactive (piped run, CI): treat it as
        # "no choice" so the default example below still runs.
        print()
        choice = ""

    if choice == "1":
        apply_v3_ohca_model_to_clif_dataset()
    elif choice == "2":
        apply_legacy_ohca_model_to_clif_dataset()
    elif choice == "3":
        clif_v3_validation_workflow()
    else:
        # Any unrecognised or empty input falls back to the recommended example.
        print("Running v3.0 CLIF application by default...")
        apply_v3_ohca_model_to_clif_dataset()