File size: 9,263 Bytes
86b932c
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
import os
import sys
import json
import logging
import joblib
import torch
import numpy as np
import pandas as pd
from xgboost import XGBClassifier
from sklearn.calibration import CalibratedClassifierCV
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.metrics import classification_report, confusion_matrix, roc_auc_score
from matplotlib import pyplot as plt
from torch.utils.data import TensorDataset, DataLoader

_PROJECT_ROOT = os.path.dirname(os.path.dirname(os.path.dirname(os.path.abspath(__file__))))
if str(_PROJECT_ROOT) not in sys.path:
    sys.path.insert(0, str(_PROJECT_ROOT))

from src.models.lstm_model import BiLSTMClassifier, pad_sequences
from src.stage2_preprocessing import KerasStyleTokenizer
import sys
setattr(sys.modules['__main__'], 'KerasStyleTokenizer', KerasStyleTokenizer)

from transformers import AutoTokenizer, AutoModelForSequenceClassification

logging.basicConfig(level=logging.INFO, format="%(asctime)s | %(name)s | %(levelname)s | %(message)s")
logger = logging.getLogger("meta_classifier")

def build_meta_features(df, lr_proba, lstm_proba, distil_proba, roberta_proba, is_train=True, preprocessor=None):
    """
    Construct the meta-feature matrix.
    If is_train is True, preprocessor is fit on the categorical columns.
    """
    df_meta = pd.DataFrame({
        "lr_proba": lr_proba,
        "lstm_proba": lstm_proba,
        "distilbert_proba": distil_proba,
        "roberta_proba": roberta_proba,
        "word_count": df["word_count"],
        "has_date": df["has_date"].astype(int),
        "freshness_score": df["freshness_score"]
    })
    
    # Categoricals to encode
    cats = df[["text_length_bucket", "source_domain"]].fillna("unknown")
    
    if is_train:
        preprocessor = OneHotEncoder(handle_unknown="ignore", sparse_output=False)
        cat_features = preprocessor.fit_transform(cats)
    else:
        cat_features = preprocessor.transform(cats)
        
    X_meta = np.hstack((df_meta.values, cat_features))
    return X_meta, preprocessor


def train_meta_classifier(cfg: dict, splits_dir: str, models_dir: str) -> None:
    """
    Train, calibrate, and persist the XGBoost stacking meta-classifier.

    Pipeline:
      1. Load train/val splits and the base models' out-of-fold (OOF)
         probabilities for the training set.
      2. Re-run all four base models (logistic regression, BiLSTM,
         DistilBERT, RoBERTa) on the validation split to produce val-set
         probabilities.
      3. Train XGBoost on the stacked features with early stopping
         against the validation set.
      4. Sigmoid-calibrate the fitted model on the validation set, log
         metrics, save a confusion-matrix plot, metrics.json, and a
         joblib bundle {"preprocessor", "model"} under
         ``models_dir/meta_classifier``.

    Args:
        cfg: Parsed config dict; reads ``paths.glove_path`` and
            ``preprocessing.lstm_max_len``.
        splits_dir: Directory holding ``df_train.csv`` and ``df_val.csv``.
        models_dir: Root directory containing each base model's artifacts.

    Returns:
        None. Returns early (after logging an error) if any OOF file is
        missing. Side effects: files written under the save directory.
    """
    save_dir = os.path.join(models_dir, "meta_classifier")
    os.makedirs(save_dir, exist_ok=True)

    logger.info("Loading dataset splits...")
    train_df = pd.read_csv(os.path.join(splits_dir, "df_train.csv"))
    val_df = pd.read_csv(os.path.join(splits_dir, "df_val.csv"))

    y_train = train_df["binary_label"].values
    y_val = val_df["binary_label"].values

    # ── 1. Load OOF predictions for Train Set ──
    # OOF probabilities avoid leakage: each training row was predicted by a
    # model that did not see it during base-model training.
    logger.info("Gathering base model OOF predictions...")
    try:
        lr_oof = np.load(os.path.join(models_dir, "logistic_model", "lr_oof.npy"))
        lstm_oof = np.load(os.path.join(models_dir, "lstm_model", "lstm_oof.npy"))
        distil_oof = np.load(os.path.join(models_dir, "distilbert_model", "distilbert_oof.npy"))
        roberta_oof = np.load(os.path.join(models_dir, "roberta_model", "roberta_oof.npy"))
    except FileNotFoundError as e:
        logger.error(f"Missing OOF file: {e}. Please ensure all base models have trained completely.")
        return

    # NOTE(review): magic constant — RoBERTa probabilities are down-weighted
    # by 0.92, presumably to temper an overconfident base model. The same
    # factor is applied to the val predictions below; confirm and consider
    # moving it into the config.
    roberta_oof = roberta_oof * 0.92

    # Fits the one-hot preprocessor on the training categoricals.
    X_meta_train, meta_preprocessor = build_meta_features(
        train_df, lr_oof, lstm_oof, distil_oof, roberta_oof, is_train=True
    )

    # ── 2. Dynamically Generate Val predictions ──
    # Since we need a val set for early stopping, we predict them here.
    logger.info("Generating base model predictions for Validation set...")

    # Logistic: the saved pipeline takes the raw dataframe; [:, 1] is the
    # positive-class probability.
    lr_pipeline = joblib.load(os.path.join(models_dir, "logistic_model", "logistic_model.pkl"))
    lr_val = lr_pipeline.predict_proba(val_df)[:, 1]

    # LSTM: rebuild the model (embedding matrix from GloVe + saved weights)
    # and run batched inference on padded token sequences.
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    import pickle
    with open(os.path.join(models_dir, "tokenizer.pkl"), "rb") as f:
        tok = pickle.load(f)
    glove_path = os.path.join(_PROJECT_ROOT, cfg["paths"]["glove_path"])
    from src.models.lstm_model import load_glove_embeddings
    emb_matrix, vocab_size = load_glove_embeddings(glove_path, tok.word_index)

    # Falls back to 512 tokens if the config omits preprocessing.lstm_max_len.
    maxlen = cfg.get("preprocessing", {}).get("lstm_max_len", 512)
    X_val_seq = tok.texts_to_sequences(val_df["clean_text"].fillna(""))
    X_val_pad = pad_sequences(X_val_seq, maxlen=maxlen, padding='post')

    lstm_model = BiLSTMClassifier(vocab_size, emb_matrix).to(device)
    lstm_model.load_state_dict(torch.load(os.path.join(models_dir, "lstm_model", "model.pt"), map_location=device))
    lstm_model.eval()

    # shuffle=False keeps predictions aligned with val_df row order.
    val_loader = DataLoader(TensorDataset(torch.from_numpy(X_val_pad).long()), batch_size=64, shuffle=False)
    lstm_val_preds = []
    with torch.no_grad():
        for x_b in val_loader:
            logits = lstm_model(x_b[0].to(device))
            lstm_val_preds.extend(torch.sigmoid(logits).cpu().numpy())
    lstm_val = np.array(lstm_val_preds)

    # DistilBERT
    d_tok = AutoTokenizer.from_pretrained(os.path.join(models_dir, "distilbert_model"))
    d_mod = AutoModelForSequenceClassification.from_pretrained(os.path.join(models_dir, "distilbert_model")).to(device)
    d_mod.eval()

    # NOTE(review): one text per forward pass — simple but slow; batching
    # would speed this up if the val set grows.
    distil_val = []
    with torch.no_grad():
        for text in val_df["clean_text"].fillna(""):
            inputs = d_tok(text, padding=True, truncation=True, max_length=512, return_tensors="pt").to(device)
            out = d_mod(**inputs)
            # Softmax over the two logits; index 1 is the positive class.
            distil_val.append(torch.softmax(out.logits, dim=-1)[0, 1].item())
    distil_val = np.array(distil_val)

    # RoBERTa (same per-text loop as DistilBERT)
    r_tok = AutoTokenizer.from_pretrained(os.path.join(models_dir, "roberta_model"))
    r_mod = AutoModelForSequenceClassification.from_pretrained(os.path.join(models_dir, "roberta_model")).to(device)
    r_mod.eval()

    roberta_val = []
    with torch.no_grad():
        for text in val_df["clean_text"].fillna(""):
            inputs = r_tok(text, padding=True, truncation=True, max_length=512, return_tensors="pt").to(device)
            out = r_mod(**inputs)
            roberta_val.append(torch.softmax(out.logits, dim=-1)[0, 1].item())
    # Same 0.92 down-weighting as applied to the OOF probabilities above.
    roberta_val = np.array(roberta_val) * 0.92

    # Reuse the preprocessor fitted on the training set (no refitting).
    X_meta_val, _ = build_meta_features(
        val_df, lr_val, lstm_val, distil_val, roberta_val, is_train=False, preprocessor=meta_preprocessor
    )

    # ── 3. Train Meta-Classifier (XGBoost) ──
    logger.info("Training XGBoost meta-classifier...")
    # early_stopping_rounds in the constructor requires xgboost >= 1.6.
    xgb = XGBClassifier(
        n_estimators=500,
        learning_rate=0.05,
        max_depth=5,
        eval_metric='logloss',
        early_stopping_rounds=20,
        random_state=42
    )

    xgb.fit(
        X_meta_train, y_train,
        eval_set=[(X_meta_val, y_val)],
        verbose=False
    )
    logger.info(f"XGBoost best iteration: {xgb.best_iteration}")

    # ── 4. Calibrate Probabilities ──
    logger.info("Calibrating final probabilities via CalibratedClassifierCV on Val set...")
    # 'prefit' means it will only use X_meta_val to calibrate the output
    # NOTE(review): calibrating and evaluating on the same val split gives
    # an optimistic estimate; a held-out calibration split would be cleaner.
    calibrated_meta = CalibratedClassifierCV(estimator=xgb, method='sigmoid', cv='prefit')
    calibrated_meta.fit(X_meta_val, y_val)

    # Final Val Score Check
    final_val_probas = calibrated_meta.predict_proba(X_meta_val)[:, 1]

    # For short texts, dampen confidence toward 0.5 (more uncertain)
    # rather than making a confident wrong prediction
    # NOTE(review): this shrinkage (and the 0.55 threshold below) affects
    # only the metrics reported here — it is NOT saved in the bundle, so
    # inference code must replicate it to match these numbers.
    for i in range(len(final_val_probas)):
        if val_df["word_count"].iloc[i] < 50:
            final_val_probas[i] = 0.5 + (final_val_probas[i] - 0.5) * 0.6

    # Decision threshold of 0.55 (not 0.5) — presumably tuned to favor
    # precision on the positive class; confirm against experiment logs.
    final_val_preds = (final_val_probas >= 0.55).astype(int)

    logger.info("Final Meta-Classifier Classification Report:\n" + classification_report(y_val, final_val_preds))
    # ROC-AUC is computed on the (dampened) probabilities, not the labels.
    roc_auc = roc_auc_score(y_val, final_val_probas)
    logger.info(f"ROC-AUC: {roc_auc:.4f}")

    from src.models.logistic_model import plot_and_save_cm
    plot_and_save_cm(
        y_val, 
        final_val_preds, 
        os.path.join(save_dir, "cm.png"),
        title="XGBoost Meta-Classifier Confusion Matrix"
    )

    # Per-bucket accuracy: only buckets present in the val split are reported.
    bucket_acc = {}
    for b in ["short", "medium", "long"]:
        b_mask = (val_df["text_length_bucket"] == b).values
        if b_mask.sum() > 0:
            acc = (final_val_preds[b_mask] == y_val[b_mask]).mean()
            bucket_acc[b] = acc

    metrics = {
        "roc_auc": float(roc_auc),
        "bucket_accuracy": {k: float(v) for k, v in bucket_acc.items()}
    }
    with open(os.path.join(save_dir, "metrics.json"), "w") as f:
        json.dump(metrics, f, indent=2)

    # Save Model Bundle (Pre-processor + Calibrated XGBoost)
    bundle = {
        "preprocessor": meta_preprocessor,
        "model": calibrated_meta
    }
    joblib.dump(bundle, os.path.join(save_dir, "meta_classifier.pkl"))
    logger.info("Saved Meta-Classifier bundle.")

if __name__ == "__main__":
    import yaml

    # Load the project config and resolve all paths relative to the repo root.
    config_file = os.path.join(_PROJECT_ROOT, "config", "config.yaml")
    with open(config_file, "r", encoding="utf-8") as fh:
        cfg = yaml.safe_load(fh)

    paths = cfg["paths"]
    train_meta_classifier(
        cfg,
        os.path.join(_PROJECT_ROOT, paths["splits_dir"]),
        os.path.join(_PROJECT_ROOT, paths["models_dir"]),
    )