|
|
""" |
|
|
NLP OHCA Classifier v3.0 - Improved Methodology |
|
|
A BERT-based classifier for detecting Out-of-Hospital Cardiac Arrest (OHCA) |
|
|
cases in medical discharge notes using improved machine learning methodology. |
|
|
|
|
|
Key Improvements in v3.0: |
|
|
- Patient-level data splits to prevent data leakage |
|
|
- Proper train/validation/test methodology |
|
|
- Optimal threshold finding and usage |
|
|
- Larger annotation samples for better performance |
|
|
- Unbiased evaluation framework |
|
|
|
|
|
This package contains two main modules: |
|
|
1. ohca_training_pipeline: Complete training pipeline with improved methodology |
|
|
2. ohca_inference: Apply pre-trained models with optimal threshold support |
|
|
""" |
|
|
|
|
|
|
|
|
from .ohca_training_pipeline import ( |
|
|
|
|
|
create_patient_level_splits, |
|
|
complete_improved_training_pipeline, |
|
|
complete_annotation_and_train_v3, |
|
|
find_optimal_threshold, |
|
|
evaluate_on_test_set, |
|
|
save_model_with_metadata, |
|
|
|
|
|
|
|
|
create_training_sample, |
|
|
prepare_training_data, |
|
|
train_ohca_model, |
|
|
evaluate_model, |
|
|
complete_training_pipeline, |
|
|
complete_annotation_and_train, |
|
|
|
|
|
|
|
|
OHCATrainingDataset |
|
|
) |
|
|
|
|
|
|
|
|
from .ohca_inference import ( |
|
|
|
|
|
load_ohca_model_with_metadata, |
|
|
run_inference_with_optimal_threshold, |
|
|
quick_inference_with_optimal_threshold, |
|
|
process_large_dataset_with_optimal_threshold, |
|
|
analyze_predictions_enhanced, |
|
|
|
|
|
|
|
|
load_ohca_model, |
|
|
run_inference, |
|
|
quick_inference, |
|
|
process_large_dataset, |
|
|
test_model_on_sample, |
|
|
get_high_confidence_cases, |
|
|
analyze_predictions, |
|
|
|
|
|
|
|
|
OHCAInferenceDataset |
|
|
) |
|
|
|
|
|
# Package metadata.
__version__ = "3.0.0"
__author__ = "Mona Moukaddem"
# NOTE(review): this value looks like a template placeholder address --
# confirm the real contact email before publishing.
__email__ = "your.email@example.com"
|
|
|
|
|
|
|
|
# Names of the v3.0 training helpers imported above from
# .ohca_training_pipeline. The list is concatenated into __all__ further down.
__improved_training_functions__ = [
    "create_patient_level_splits",
    "complete_improved_training_pipeline",
    "complete_annotation_and_train_v3",
    "find_optimal_threshold",
    "evaluate_on_test_set",
    "save_model_with_metadata"
]

# Names of the v3.0 inference helpers imported above from .ohca_inference.
# The list is concatenated into __all__ further down.
__improved_inference_functions__ = [
    "load_ohca_model_with_metadata",
    "run_inference_with_optimal_threshold",
    "quick_inference_with_optimal_threshold",
    "process_large_dataset_with_optimal_threshold",
    "analyze_predictions_enhanced"
]
|
|
|
|
|
|
|
|
# Names of the older training entry points (and the training dataset class)
# imported above from .ohca_training_pipeline. The welcome banner describes
# these as kept for backward compatibility.
__legacy_training_functions__ = [
    "create_training_sample",
    "prepare_training_data",
    "train_ohca_model",
    "evaluate_model",
    "complete_training_pipeline",
    "complete_annotation_and_train",
    "OHCATrainingDataset"
]

# Names of the older inference entry points (and the inference dataset class)
# imported above from .ohca_inference.
__legacy_inference_functions__ = [
    "load_ohca_model",
    "run_inference",
    "quick_inference",
    "process_large_dataset",
    "test_model_on_sample",
    "get_high_confidence_cases",
    "analyze_predictions",
    "OHCAInferenceDataset"
]
|
|
|
|
|
|
|
|
# Public API of the package: everything listed here is what
# ``from <package> import *`` exports. It is the union of the four name groups
# defined above plus the two helper functions defined in this module, which
# were previously public but missing from the export list.
__all__ = (
    __improved_training_functions__
    + __improved_inference_functions__
    + __legacy_training_functions__
    + __legacy_inference_functions__
    + ["get_version_info", "print_welcome_message"]
)
|
|
|
|
|
|
|
|
# Methodology version label; reported by get_version_info() alongside
# __version__.
__methodology_version__ = "3.0"

# Human-readable list of the v3.0 changes. print_welcome_message() prints one
# line per entry and get_version_info() returns it under 'improvements'.
__improvements__ = [
    "Patient-level data splits prevent data leakage",
    "Proper train/validation/test methodology",
    "Optimal threshold finding and consistent usage",
    "Larger annotation samples (800 train + 200 val)",
    "Unbiased evaluation on independent test set",
    "Enhanced clinical decision support",
    "Backward compatibility with legacy models"
]
|
|
|
|
|
def get_version_info():
    """Return detailed version and methodology information.

    Returns:
        dict: A newly built mapping with the keys ``'version'``,
        ``'methodology_version'``, ``'improvements'``, ``'author'`` and
        ``'recommended_functions'`` (itself a dict with the keys
        ``'training'`` and ``'inference'`` naming the recommended entry
        points). The ``'improvements'`` value is a copy of the module-level
        list, so callers may modify the returned structure freely.
    """
    return {
        'version': __version__,
        'methodology_version': __methodology_version__,
        # Copy the list: handing out the module-level object would let a
        # caller's in-place edit change what the welcome banner prints later.
        'improvements': list(__improvements__),
        'author': __author__,
        'recommended_functions': {
            'training': 'complete_improved_training_pipeline',
            'inference': 'quick_inference_with_optimal_threshold',
        },
    }
|
|
|
|
|
def print_welcome_message():
    """Print welcome message with key improvements.

    Writes a banner to standard output: a title, one line per entry of
    ``__improvements__``, the recommended training/inference entry points and
    a note about the legacy functions.

    The check-mark and bullet glyphs are used only when the encoding reported
    by ``sys.stdout`` can represent them; otherwise plain ASCII markers are
    substituted. This function is called at import time, so an unhandled
    ``UnicodeEncodeError`` here (for example when output is piped under a
    non-UTF-8 locale) would make the whole package unimportable.
    """
    import sys  # local import keeps the block self-contained

    check_mark, bullet = "✅", "•"
    # Streams without a declared encoding (e.g. io.StringIO) accept any text,
    # so only probe when an encoding is actually reported.
    stream_encoding = getattr(sys.stdout, "encoding", None)
    if stream_encoding:
        try:
            (check_mark + bullet).encode(stream_encoding)
        except (UnicodeEncodeError, LookupError):
            check_mark, bullet = "[OK]", "-"

    rule = "=" * 60
    print(rule)
    print("NLP OHCA Classifier v3.0 - Improved Methodology")
    print(rule)
    print("Key improvements addressing data scientist feedback:")
    for improvement in __improvements__:
        print(f"{check_mark} {improvement}")
    print()
    print("Recommended functions:")
    print(f"{bullet} Training: complete_improved_training_pipeline()")
    print(f"{bullet} Inference: quick_inference_with_optimal_threshold()")
    print()
    print("Legacy functions maintained for backward compatibility.")
    print(rule)
|
|
|
|
|
|
|
|
# NOTE(review): this call executes whenever the package is imported, so every
# import writes the banner to stdout. Confirm that this is intended for
# library use (e.g. inside pipelines or notebooks that parse stdout).
print_welcome_message()
|
|
|