Upload folder using huggingface_hub
Browse files
- README.md +44 -0
- label_encoders.pkl +3 -0
- lgbm_model.txt +0 -0
- metadata.json +20 -0
- scaler_combined.pkl +3 -0
- scaler_struct.pkl +3 -0
README.md
ADDED
|
@@ -0,0 +1,44 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# Hybrid Readmission Classifier
|
| 2 |
+
|
| 3 |
+
## Model Description
|
| 4 |
+
This is a hybrid model combining:
|
| 5 |
+
- **Text embeddings**: Extracted from emilyalsentzer/Bio_ClinicalBERT
|
| 6 |
+
- **Structured features**: Patient demographics and admission details
|
| 7 |
+
- **Classifier**: LightGBM gradient boosting
|
| 8 |
+
|
| 9 |
+
## Performance
|
| 10 |
+
- ROC-AUC: 0.6602
|
| 11 |
+
- AUPRC: 0.3334
|
| 12 |
+
- Accuracy: 0.6386
|
| 13 |
+
- F1-Score: 0.3871
|
| 14 |
+
|
| 15 |
+
## Model Artifacts
|
| 16 |
+
- `lgbm_model.txt`: LightGBM model
|
| 17 |
+
- `scaler_combined.pkl`: StandardScaler for combined features
|
| 18 |
+
- `scaler_struct.pkl`: StandardScaler for structured features
|
| 19 |
+
- `label_encoders.pkl`: Label encoders for categorical features
|
| 20 |
+
- `metadata.json`: Model metadata
|
| 21 |
+
|
| 22 |
+
## Usage
|
| 23 |
+
```python
|
| 24 |
+
import joblib
|
| 25 |
+
import numpy as np
|
| 26 |
+
from transformers import AutoTokenizer, AutoModel
|
| 27 |
+
|
| 28 |
+
# Load components
|
| 29 |
+
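import lightgbm as lgb  # lgbm_model.txt is a plain-text LightGBM model (presumably written by Booster.save_model), not a joblib pickle
lgbm_model = lgb.Booster(model_file='lgbm_model.txt')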
|
| 30 |
+
scaler = joblib.load('scaler_combined.pkl')
|
| 31 |
+
tokenizer = AutoTokenizer.from_pretrained('emilyalsentzer/Bio_ClinicalBERT')
|
| 32 |
+
embedding_model = AutoModel.from_pretrained('emilyalsentzer/Bio_ClinicalBERT')
|
| 33 |
+
|
| 34 |
+
# Extract embeddings
|
| 35 |
+
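inputs = tokenizer(text, return_tensors='pt', truncation=True, max_length=512)
embeddings = embedding_model(**inputs).last_hidden_state[:, 0, :].detach().numpy()  # [CLS] pooling assumed; confirm against hybrid.py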
|
| 36 |
+
|
| 37 |
+
# Combine with structured features and predict
|
| 38 |
+
X = np.hstack([embeddings, structured_features])
|
| 39 |
+
X_scaled = scaler.transform(X)
|
| 40 |
+
predictions = lgbm_model.predict(X_scaled)
|
| 41 |
+
```
|
| 42 |
+
|
| 43 |
+
## License
|
| 44 |
+
MIT
|
label_encoders.pkl
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:4787788a49fd683cfca16eddd1dae6c543c53d2c650daaa8291776676775536d
|
| 3 |
+
size 1431
|
lgbm_model.txt
ADDED
|
The diff for this file is too large to render.
See raw diff
|
|
|
metadata.json
ADDED
|
@@ -0,0 +1,20 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"model_type": "hybrid_classifier",
|
| 3 |
+
"description": "Hybrid model combining ClinicalBERT embeddings + structured features with LightGBM",
|
| 4 |
+
"embedding_model": "emilyalsentzer/Bio_ClinicalBERT",
|
| 5 |
+
"embedding_dim": 768,
|
| 6 |
+
"structured_features": [
|
| 7 |
+
"insurance",
|
| 8 |
+
"admission_type",
|
| 9 |
+
"admission_location",
|
| 10 |
+
"age"
|
| 11 |
+
],
|
| 12 |
+
"max_seq_length": 512,
|
| 13 |
+
"metrics": {
|
| 14 |
+
"roc_auc": 0.6602300974779907,
|
| 15 |
+
"auprc": 0.3334125955302034,
|
| 16 |
+
"accuracy": 0.638584487090394,
|
| 17 |
+
"f1_score": 0.38711870552508726
|
| 18 |
+
},
|
| 19 |
+
"created_with": "hybrid.py"
|
| 20 |
+
}
|
scaler_combined.pkl
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:9364887585d9388847f7da78cb9d547e0f75eaa5446b29542b335aaad020080b
|
| 3 |
+
size 19103
|
scaler_struct.pkl
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:e64c9022301dc233f3100af19fe478925153e0f25bd7bc3813c85a91a871d3ae
|
| 3 |
+
size 671
|