"""
Evaluate all PhilVerify classifiers on the held-out validation split.

Prints per-class precision/recall/F1, confusion matrix, and a side-by-side
accuracy summary for all model variants:

  Classical (trained on train split):
    BoW + LogReg
    BoW + LogReg + Lemma
    TF-IDF + LogReg  (legacy SEED_DATA baseline)
    TF-IDF + NB
    TF-IDF + NB + Lemma
    LDA features + LogReg

  Transformer (loaded from saved checkpoints):
    XLM-RoBERTa
    Tagalog-RoBERTa
    Ensemble (XLM-R + Tagalog-RoBERTa)

Usage:
    cd PhilVerify
    python -m ml.eval
    python -m ml.eval --seed 42 --train-ratio 0.8 --skip-lda-analysis
"""
import argparse
import logging

from sklearn.metrics import accuracy_score, classification_report, confusion_matrix

from ml.bow_classifier import BoWClassifier
from ml.dataset import LABEL_NAMES, get_split
from ml.ensemble_classifier import EnsembleClassifier
from ml.lda_analysis import LDAFeatureClassifier, run_topic_analysis
from ml.naive_bayes_classifier import NaiveBayesClassifier
from ml.tagalog_roberta_classifier import TagalogRobertaClassifier
from ml.tfidf_classifier import TFIDFClassifier
from ml.xlm_roberta_classifier import ModelNotFoundError, XLMRobertaClassifier

logging.basicConfig(level=logging.INFO, format="%(levelname)s: %(message)s")
logger = logging.getLogger(__name__)

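# Label names ordered by their integer label ids, so the classification report
# and confusion matrix always use the same row/column order.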
LABEL_LIST = [LABEL_NAMES[i] for i in sorted(LABEL_NAMES)]


def evaluate_classifier(name: str, clf, samples: list) -> dict:
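    """Predict a verdict for every sample with clf, print the per-class report
    and confusion matrix, and return the model name with its overall accuracy."""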
    true_labels, pred_labels = [], []
    for s in samples:
        result = clf.predict(s.text)
        true_labels.append(LABEL_NAMES[s.label])
        pred_labels.append(result.verdict)

    print(f"\n{'='*62}")
    print(f"  {name}")
    print(f"{'='*62}")
    print(classification_report(true_labels, pred_labels, labels=LABEL_LIST, zero_division=0))

    print("Confusion matrix (rows = true, cols = predicted):")
    print(f"  {'':<14}", "  ".join(f"{lbl[:6]:>6}" for lbl in LABEL_LIST))
    cm = confusion_matrix(true_labels, pred_labels, labels=LABEL_LIST)
    for row_label, row in zip(LABEL_LIST, cm):
        print(f"  {row_label:<14}", "  ".join(f"{v:>6}" for v in row))

    acc = accuracy_score(true_labels, pred_labels)
    return {"name": name, "accuracy": acc}


def main() -> None:
    parser = argparse.ArgumentParser(description="Evaluate PhilVerify classifiers")
    parser.add_argument("--seed", type=int, default=42,
                        help="Random seed (must match training seed)")
    parser.add_argument("--train-ratio", type=float, default=0.8,
                        help="Train split ratio (must match training)")
    parser.add_argument("--skip-lda-analysis", action="store_true",
                        help="Skip the LDA topic analysis printout")
    args = parser.parse_args()

    train_samples, val_samples = get_split(train_ratio=args.train_ratio, seed=args.seed)
    logger.info(
        "Train: %d samples  |  Val: %d samples  (seed=%d, train_ratio=%.1f)",
        len(train_samples), len(val_samples), args.seed, args.train_ratio,
    )

    # ── LDA topic analysis (printed before classifier comparison) ────────────
    if not args.skip_lda_analysis:
        run_topic_analysis(train_samples)

    results: list[dict] = []

    # ── Classical baselines (all trained on train_samples for fair comparison) ─

    results.append(evaluate_classifier(
        "BoW + LogReg",
        BoWClassifier(train_samples),
        val_samples,
    ))

    results.append(evaluate_classifier(
        "BoW + LogReg + Lemma",
        BoWClassifier(train_samples, lemmatize=True),
        val_samples,
    ))

    # Legacy baseline (trains on internal SEED_DATA, not the split; included for reference)
    results.append(evaluate_classifier(
        "TF-IDF + LogReg  [legacy SEED_DATA]",
        TFIDFClassifier(),
        val_samples,
    ))

    results.append(evaluate_classifier(
        "TF-IDF + NB",
        NaiveBayesClassifier(train_samples),
        val_samples,
    ))

    results.append(evaluate_classifier(
        "TF-IDF + NB + Lemma",
        NaiveBayesClassifier(train_samples, lemmatize=True),
        val_samples,
    ))

    results.append(evaluate_classifier(
        "LDA features + LogReg",
        LDAFeatureClassifier(train_samples),
        val_samples,
    ))

    # ── Transformer models ───────────────────────────────────────────────────
    xlmr = None
    try:
        xlmr = XLMRobertaClassifier()
        results.append(evaluate_classifier("XLM-RoBERTa", xlmr, val_samples))
    except ModelNotFoundError:
        logger.warning("XLM-RoBERTa checkpoint not found; skipping")

    tl = None
    try:
        tl = TagalogRobertaClassifier()
        results.append(evaluate_classifier("Tagalog-RoBERTa", tl, val_samples))
    except ModelNotFoundError:
        logger.warning("Tagalog-RoBERTa checkpoint not found; skipping")

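    # The ensemble needs both transformer checkpoints; build it only when both loaded.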
    if xlmr is not None and tl is not None:
        ensemble = EnsembleClassifier([xlmr, tl])
        results.append(evaluate_classifier(
            "Ensemble (XLM-R + Tagalog-RoBERTa)", ensemble, val_samples
        ))

    # ── Summary table ────────────────────────────────────────────────────────
    print(f"\n{'='*62}")
    print("  Summary")
    print(f"{'='*62}")
    print(f"  {'Model':<44} {'Accuracy':>8}")
    print(f"  {'-'*44} {'-'*8}")

    classical_done = False
    for r in results:
        is_transformer = any(
            kw in r["name"] for kw in ("XLM", "RoBERTa", "Tagalog", "Ensemble")
        )
        if is_transformer and not classical_done:
            print()  # blank separator between classical and transformer sections
            classical_done = True
        print(f"  {r['name']:<44} {r['accuracy'] * 100:>7.1f}%")

    best = max(results, key=lambda r: r["accuracy"])
    print(f"\n  Best: {best['name']}  ({best['accuracy'] * 100:.1f}%)")
    print()


if __name__ == "__main__":
    main()