Spaces:
Running
Running
File size: 13,016 Bytes
b1c84b5 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 | """
Tests for Phase 10 — XLM-RoBERTa fine-tuning components.
These tests are designed to pass whether or not the fine-tuned model has
been generated (ml/train_xlmr.py has been run). Tests that require an actual
checkpoint are skipped when ml/models/xlmr_model/ is absent.
"""
import sys
from pathlib import Path
import pytest
# Ensure the PhilVerify package root is importable
sys.path.insert(0, str(Path(__file__).parent.parent))
from ml.dataset import (
DATASET,
LABEL_NAMES,
NUM_LABELS,
get_dataset,
get_split,
class_weights,
Sample,
)
XLMR_MODEL_DIR = Path(__file__).parent.parent / "ml" / "models" / "xlmr_model"
MODEL_PRESENT = XLMR_MODEL_DIR.exists()
# ─────────────────────────────────────────────────────────────────────────────
# Dataset tests (always run)
# ─────────────────────────────────────────────────────────────────────────────
class TestDataset:
    """Invariants of the bundled fine-tuning dataset (runs without a model)."""

    def test_has_minimum_samples(self):
        """Dataset contains at least 90 samples across all 3 classes."""
        assert len(DATASET) >= 90

    def test_all_labels_present(self):
        observed_labels = {sample.label for sample in DATASET}
        assert observed_labels == {0, 1, 2}, "All three label classes must be present"

    def test_minimum_samples_per_class(self):
        """Each class has at least 25 samples for meaningful fine-tuning."""
        from collections import Counter
        per_class = Counter(sample.label for sample in DATASET)
        for label in range(NUM_LABELS):
            assert per_class[label] >= 25, (
                f"Class {LABEL_NAMES[label]} has only {per_class[label]} samples"
            )

    def test_no_empty_texts(self):
        for sample in DATASET:
            assert sample.text.strip(), "All samples must have non-empty text"

    def test_all_label_ids_valid(self):
        for sample in DATASET:
            assert sample.label in LABEL_NAMES, f"Invalid label: {sample.label}"

    def test_tagalog_samples_present(self):
        """Filipino/Tagalog samples must exist (dataset is multilingual)."""
        tagalog_keywords = {"ayon", "sinabi", "nagbigay", "ang", "ng", "sa"}
        tagalog_count = 0
        for sample in DATASET:
            # Tokenize once per sample; a single keyword hit counts the sample.
            tokens = sample.text.lower().split()
            if any(kw in tokens for kw in tagalog_keywords):
                tagalog_count += 1
        assert tagalog_count >= 15, (
            f"Expected at least 15 Tagalog samples, found {tagalog_count}"
        )

    def test_get_dataset_returns_all(self):
        assert len(get_dataset()) == len(DATASET)

    def test_get_split_sizes(self):
        train, val = get_split(train_ratio=0.8)
        assert len(train) + len(val) == len(DATASET), "split must account for all samples"
        assert len(train) > len(val), "train set must be larger"

    def test_get_split_is_stratified(self):
        """Both train and val splits contain all 3 classes."""
        from collections import Counter
        train, val = get_split(train_ratio=0.8)
        train_counts = Counter(sample.label for sample in train)
        val_counts = Counter(sample.label for sample in val)
        for label in range(NUM_LABELS):
            assert train_counts[label] > 0, f"Class {label} absent in train split"
            assert val_counts[label] > 0, f"Class {label} absent in val split"

    def test_get_split_reproducible(self):
        """Same seed produces same split."""
        first_train, _ = get_split(seed=7)
        second_train, _ = get_split(seed=7)
        assert [sample.text for sample in first_train] == [
            sample.text for sample in second_train
        ]

    def test_class_weights_positive(self):
        train_samples, _ = get_split()
        weights = class_weights(train_samples)
        assert len(weights) == NUM_LABELS
        for weight in weights:
            assert weight > 0, "All class weights must be positive"

    def test_class_weights_inversely_proportional(self):
        """
        Minority classes must have higher weight than majority.
        (May not hold when all classes are equal, so check ordering only
        when counts differ by at least 2).
        """
        from collections import Counter
        train_samples, _ = get_split()
        label_counts = Counter(sample.label for sample in train_samples)
        weights = class_weights(train_samples)
        # If class i has fewer samples than class j, i should have >= weight
        for i in range(NUM_LABELS):
            for j in range(NUM_LABELS):
                if label_counts[i] < label_counts[j] - 2:
                    assert weights[i] >= weights[j], (
                        f"Class {i} (count={label_counts[i]}) should have >= weight "
                        f"than class {j} (count={label_counts[j]})"
                    )
# ─────────────────────────────────────────────────────────────────────────────
# Classifier instantiation tests (always run)
# ─────────────────────────────────────────────────────────────────────────────
class TestXLMRClassifierInstantiation:
    """Construction-time behaviour of the classifier and engine fallback.

    These tests run even without a trained checkpoint: they point MODEL_DIR
    at a missing path and assert on the error/fallback behaviour.
    """

    def test_model_not_found_raises_correct_error(self, tmp_path, monkeypatch):
        """When checkpoint dir is missing, ModelNotFoundError is raised."""
        from ml.xlm_roberta_classifier import XLMRobertaClassifier, ModelNotFoundError
        import ml.xlm_roberta_classifier as xlmr_mod
        monkeypatch.setattr(xlmr_mod, "MODEL_DIR", tmp_path / "nonexistent")
        with pytest.raises(ModelNotFoundError):
            XLMRobertaClassifier()

    def test_model_not_found_is_file_not_found_subclass(self):
        """ModelNotFoundError must be catchable as FileNotFoundError."""
        from ml.xlm_roberta_classifier import ModelNotFoundError
        assert issubclass(ModelNotFoundError, FileNotFoundError)

    def test_engine_falls_back_to_tfidf_when_xlmr_absent(self, tmp_path, monkeypatch):
        """
        scoring.engine.run_verification uses TF-IDF when no XLMR checkpoint.
        We verify it still produces a valid VerificationResponse.
        """
        import asyncio
        import ml.xlm_roberta_classifier as xlmr_mod
        from scoring.engine import run_verification
        # Fix: the original used tempfile.mkdtemp(), which leaves a stray
        # directory on disk after every test run. pytest's tmp_path fixture
        # (already used by the sibling test above) is auto-cleaned.
        monkeypatch.setattr(xlmr_mod, "MODEL_DIR", tmp_path / "missing")
        # Run a small verification — should complete without exception
        result = asyncio.run(run_verification("Libreng kuryente na simula bukas ayon sa Pangulo"))
        assert result.verdict in ("Credible", "Unverified", "Likely Fake")
        assert 0 <= result.final_score <= 100
        assert result.layer1 is not None
# ─────────────────────────────────────────────────────────────────────────────
# Classifier prediction tests (skipped when model absent)
# ─────────────────────────────────────────────────────────────────────────────
@pytest.mark.skipif(
    not MODEL_PRESENT,
    # Fix: repaired mojibake ("β" was a mis-encoded em dash) in the skip reason.
    reason="XLM-RoBERTa checkpoint not found — run ml/train_xlmr.py first",
)
class TestXLMRClassifierPredict:
    """Prediction-path tests; require a trained checkpoint in ml/models/xlmr_model/."""

    @pytest.fixture(scope="class")
    def classifier(self):
        """Load the fine-tuned classifier once per test class (loading is slow)."""
        from ml.xlm_roberta_classifier import XLMRobertaClassifier
        return XLMRobertaClassifier()

    def test_predict_returns_layer1_result(self, classifier):
        from ml.xlm_roberta_classifier import Layer1Result
        result = classifier.predict("DOH confirms 200 new COVID cases in Metro Manila")
        assert isinstance(result, Layer1Result)

    def test_verdict_is_valid_string(self, classifier):
        from ml.xlm_roberta_classifier import LABEL_NAMES
        result = classifier.predict("Rappler: BSP keeps rate at 6.5 percent")
        assert result.verdict in LABEL_NAMES.values()

    def test_confidence_in_range(self, classifier):
        result = classifier.predict("GRABE! Libreng kuryente na simula bukas!")
        assert 0.0 <= result.confidence <= 100.0

    def test_triggered_features_are_strings(self, classifier):
        result = classifier.predict("SHOCKING: Senator caught stealing in Senate vault")
        assert isinstance(result.triggered_features, list)
        assert all(isinstance(f, str) for f in result.triggered_features)

    def test_handles_empty_ish_input_gracefully(self, classifier):
        # Very short inputs should not crash
        result = classifier.predict("ok")
        assert result.verdict in ("Credible", "Unverified", "Likely Fake")

    def test_handles_tagalog_input(self, classifier):
        result = classifier.predict("Ayon sa DOH, bumaba na ang bilang ng bagong kaso ng COVID sa Pilipinas")
        assert result.verdict in ("Credible", "Unverified", "Likely Fake")
        assert 0.0 <= result.confidence <= 100.0

    def test_handles_taglish_input(self, classifier):
        result = classifier.predict("Kinumpirma ng Malacañang ang bagong EO about minimum wage increase")
        assert result.verdict in ("Credible", "Unverified", "Likely Fake")

    def test_fake_news_correctly_classified(self, classifier):
        """
        Obvious fake-news patterns should lean toward Likely Fake.
        This is a sanity test, not a hard assertion — model may vary.
        """
        # Fix: repaired mojibake ("β" → "—") in the sample text below.
        result = classifier.predict(
            "TOTOO! Bill Gates microchip natuklasan sa bakuna — PANGANIB!"
        )
        # Just check it doesn't crash and returns a valid result
        assert result.verdict in ("Credible", "Unverified", "Likely Fake")

    def test_credible_news_correctly_classified(self, classifier):
        result = classifier.predict(
            "PSA reports Philippine GDP grew 5.2 percent in Q3 2025 based on official statistics"
        )
        assert result.verdict in ("Credible", "Unverified", "Likely Fake")
# ─────────────────────────────────────────────────────────────────────────────
# Training script unit tests (no actual training — just imports + data loading)
# ─────────────────────────────────────────────────────────────────────────────
class TestTrainingScript:
    """Unit tests for ml/train_xlmr.py that perform no actual training."""

    def test_parse_args_defaults(self, monkeypatch):
        """train_xlmr.parse_args returns expected defaults with empty argv."""
        import ml.train_xlmr as train_mod
        # Fix: the original manually saved/restored sys.argv in a try/finally
        # and carried an unused `import argparse` plus a redundant in-method
        # `import sys` shadowing the module-level import. monkeypatch undoes
        # the patch automatically, even if parse_args raises.
        monkeypatch.setattr(sys, "argv", ["train_xlmr.py"])
        args = train_mod.parse_args()
        assert args.epochs == 5
        assert args.lr == 2e-5
        assert args.batch_size == 8
        assert args.keep_top_n == 2
        assert args.no_freeze is False
        assert args.seed == 42

    def test_philverify_dataset_class_needs_torch(self):
        """PhilVerifyDataset should work with tokenizer+samples (no network call)."""
        import torch
        from ml.dataset import get_split
        from ml.train_xlmr import PhilVerifyDataset
        train_samples, _ = get_split()

        # Minimal mock tokenizer so the test never downloads a real one.
        class MockTokenizer:
            def __call__(self, texts, **kwargs):
                n = len(texts)
                return {
                    "input_ids": torch.zeros(n, 8, dtype=torch.long),
                    "attention_mask": torch.ones(n, 8, dtype=torch.long),
                }

        ds = PhilVerifyDataset(train_samples, MockTokenizer())
        assert len(ds) == len(train_samples)
        item = ds[0]
        assert "input_ids" in item
        assert "attention_mask" in item
        assert "labels" in item
        assert int(item["labels"].item()) in (0, 1, 2)

    def test_freeze_lower_layers_import(self):
        """freeze_lower_layers is importable and callable."""
        from ml.train_xlmr import freeze_lower_layers
        assert callable(freeze_lower_layers)

    def test_evaluate_import(self):
        """evaluate function is importable."""
        from ml.train_xlmr import evaluate
        assert callable(evaluate)
|