philverify-api / tests /test_xlm_roberta.py
Ryan Christian D. Deniega
fix: cold start 502, favicon, verify state persistence
b1c84b5
"""
Tests for Phase 10 β€” XLM-RoBERTa fine-tuning components.
These tests are designed to pass whether or not the fine-tuned model has
been generated (ml/train_xlmr.py has been run). Tests that require an actual
checkpoint are skipped when ml/models/xlmr_model/ is absent.
"""
import sys
from pathlib import Path
import pytest
# Ensure the PhilVerify package root is importable
sys.path.insert(0, str(Path(__file__).parent.parent))
from ml.dataset import (
DATASET,
LABEL_NAMES,
NUM_LABELS,
get_dataset,
get_split,
class_weights,
Sample,
)
XLMR_MODEL_DIR = Path(__file__).parent.parent / "ml" / "models" / "xlmr_model"
MODEL_PRESENT = XLMR_MODEL_DIR.exists()
# ───────────────────────────────────────────────────────────────────────────────
# Dataset tests (always run)
# ───────────────────────────────────────────────────────────────────────────────
class TestDataset:
def test_has_minimum_samples(self):
"""Dataset contains at least 90 samples across all 3 classes."""
assert len(DATASET) >= 90
def test_all_labels_present(self):
labels = {s.label for s in DATASET}
assert labels == {0, 1, 2}, "All three label classes must be present"
def test_minimum_samples_per_class(self):
"""Each class has at least 25 samples for meaningful fine-tuning."""
from collections import Counter
counts = Counter(s.label for s in DATASET)
for label in range(NUM_LABELS):
assert counts[label] >= 25, (
f"Class {LABEL_NAMES[label]} has only {counts[label]} samples"
)
def test_no_empty_texts(self):
for s in DATASET:
assert s.text.strip(), "All samples must have non-empty text"
def test_all_label_ids_valid(self):
for s in DATASET:
assert s.label in LABEL_NAMES, f"Invalid label: {s.label}"
def test_tagalog_samples_present(self):
"""Filipino/Tagalog samples must exist (dataset is multilingual)."""
tagalog_keywords = {"ayon", "sinabi", "nagbigay", "ang", "ng", "sa"}
tagalog_count = sum(
1 for s in DATASET
if any(kw in s.text.lower().split() for kw in tagalog_keywords)
)
assert tagalog_count >= 15, (
f"Expected at least 15 Tagalog samples, found {tagalog_count}"
)
def test_get_dataset_returns_all(self):
ds = get_dataset()
assert len(ds) == len(DATASET)
def test_get_split_sizes(self):
train, val = get_split(train_ratio=0.8)
total = len(train) + len(val)
assert total == len(DATASET), "split must account for all samples"
assert len(train) > len(val), "train set must be larger"
def test_get_split_is_stratified(self):
"""Both train and val splits contain all 3 classes."""
from collections import Counter
train, val = get_split(train_ratio=0.8)
train_labels = Counter(s.label for s in train)
val_labels = Counter(s.label for s in val)
for label in range(NUM_LABELS):
assert train_labels[label] > 0, f"Class {label} absent in train split"
assert val_labels[label] > 0, f"Class {label} absent in val split"
def test_get_split_reproducible(self):
"""Same seed produces same split."""
train_a, val_a = get_split(seed=7)
train_b, val_b = get_split(seed=7)
assert [s.text for s in train_a] == [s.text for s in train_b]
def test_class_weights_positive(self):
train, _ = get_split()
weights = class_weights(train)
assert len(weights) == NUM_LABELS
for w in weights:
assert w > 0, "All class weights must be positive"
def test_class_weights_inversely_proportional(self):
"""
Minority classes must have higher weight than majority.
(May not hold when all classes are equal, so check ordering only
when counts differ by at least 2).
"""
from collections import Counter
train, _ = get_split()
counts = Counter(s.label for s in train)
weights = class_weights(train)
# If class i has fewer samples than class j, i should have >= weight
for i in range(NUM_LABELS):
for j in range(NUM_LABELS):
if counts[i] < counts[j] - 2:
assert weights[i] >= weights[j], (
f"Class {i} (count={counts[i]}) should have >= weight "
f"than class {j} (count={counts[j]})"
)
# ───────────────────────────────────────────────────────────────────────────────
# Classifier instantiation tests (always run)
# ───────────────────────────────────────────────────────────────────────────────
class TestXLMRClassifierInstantiation:
def test_model_not_found_raises_correct_error(self, tmp_path, monkeypatch):
"""When checkpoint dir is missing, ModelNotFoundError is raised."""
from ml.xlm_roberta_classifier import XLMRobertaClassifier, ModelNotFoundError
import ml.xlm_roberta_classifier as xlmr_mod
monkeypatch.setattr(xlmr_mod, "MODEL_DIR", tmp_path / "nonexistent")
with pytest.raises(ModelNotFoundError):
XLMRobertaClassifier()
def test_model_not_found_is_file_not_found_subclass(self):
"""ModelNotFoundError must be catchable as FileNotFoundError."""
from ml.xlm_roberta_classifier import ModelNotFoundError
assert issubclass(ModelNotFoundError, FileNotFoundError)
def test_engine_falls_back_to_tfidf_when_xlmr_absent(self, monkeypatch):
"""
scoring.engine.run_verification uses TF-IDF when no XLMR checkpoint.
We verify it still produces a valid VerificationResponse.
"""
import ml.xlm_roberta_classifier as xlmr_mod
from pathlib import Path
import tempfile
# Point MODEL_DIR at missing path so XLMRobertaClassifier raises
monkeypatch.setattr(xlmr_mod, "MODEL_DIR", Path(tempfile.mkdtemp()) / "missing")
# Run a small verification β€” should complete without exception
import asyncio
from scoring.engine import run_verification
result = asyncio.run(run_verification("Libreng kuryente na simula bukas ayon sa Pangulo"))
assert result.verdict in ("Credible", "Unverified", "Likely Fake")
assert 0 <= result.final_score <= 100
assert result.layer1 is not None
# ───────────────────────────────────────────────────────────────────────────────
# Classifier prediction tests (skipped when model absent)
# ───────────────────────────────────────────────────────────────────────────────
@pytest.mark.skipif(not MODEL_PRESENT, reason="XLM-RoBERTa checkpoint not found β€” run ml/train_xlmr.py first")
class TestXLMRClassifierPredict:
@pytest.fixture(scope="class")
def classifier(self):
from ml.xlm_roberta_classifier import XLMRobertaClassifier
return XLMRobertaClassifier()
def test_predict_returns_layer1_result(self, classifier):
from ml.xlm_roberta_classifier import Layer1Result
result = classifier.predict("DOH confirms 200 new COVID cases in Metro Manila")
assert isinstance(result, Layer1Result)
def test_verdict_is_valid_string(self, classifier):
from ml.xlm_roberta_classifier import LABEL_NAMES
result = classifier.predict("Rappler: BSP keeps rate at 6.5 percent")
assert result.verdict in LABEL_NAMES.values()
def test_confidence_in_range(self, classifier):
result = classifier.predict("GRABE! Libreng kuryente na simula bukas!")
assert 0.0 <= result.confidence <= 100.0
def test_triggered_features_are_strings(self, classifier):
result = classifier.predict("SHOCKING: Senator caught stealing in Senate vault")
assert isinstance(result.triggered_features, list)
assert all(isinstance(f, str) for f in result.triggered_features)
def test_handles_empty_ish_input_gracefully(self, classifier):
# Very short inputs should not crash
result = classifier.predict("ok")
assert result.verdict in ("Credible", "Unverified", "Likely Fake")
def test_handles_tagalog_input(self, classifier):
result = classifier.predict("Ayon sa DOH, bumaba na ang bilang ng bagong kaso ng COVID sa Pilipinas")
assert result.verdict in ("Credible", "Unverified", "Likely Fake")
assert 0.0 <= result.confidence <= 100.0
def test_handles_taglish_input(self, classifier):
result = classifier.predict("Kinumpirma ng MalacaΓ±ang ang bagong EO about minimum wage increase")
assert result.verdict in ("Credible", "Unverified", "Likely Fake")
def test_fake_news_correctly_classified(self, classifier):
"""
Obvious fake-news patterns should lean toward Likely Fake.
This is a sanity test, not a hard assertion β€” model may vary.
"""
result = classifier.predict(
"TOTOO! Bill Gates microchip natuklasan sa bakuna β€” PANGANIB!"
)
# Just check it doesn't crash and returns a valid result
assert result.verdict in ("Credible", "Unverified", "Likely Fake")
def test_credible_news_correctly_classified(self, classifier):
result = classifier.predict(
"PSA reports Philippine GDP grew 5.2 percent in Q3 2025 based on official statistics"
)
assert result.verdict in ("Credible", "Unverified", "Likely Fake")
# ───────────────────────────────────────────────────────────────────────────────
# Training script unit tests (no actual training β€” just imports + data loading)
# ───────────────────────────────────────────────────────────────────────────────
class TestTrainingScript:
def test_parse_args_defaults(self):
"""train_xlmr.parse_args returns expected defaults with empty argv."""
import ml.train_xlmr as train_mod
import argparse
# Patch sys.argv for argparse
import sys
orig = sys.argv
sys.argv = ["train_xlmr.py"]
try:
args = train_mod.parse_args()
finally:
sys.argv = orig
assert args.epochs == 5
assert args.lr == 2e-5
assert args.batch_size == 8
assert args.keep_top_n == 2
assert args.no_freeze is False
assert args.seed == 42
def test_philverify_dataset_class_needs_torch(self):
"""PhilVerifyDataset should work with tokenizer+samples (no network call)."""
import torch
from ml.dataset import get_split
from ml.train_xlmr import PhilVerifyDataset
train_samples, _ = get_split()
# Use a minimal mock tokenizer to avoid hitting the network
class MockTokenizer:
def __call__(self, texts, **kwargs):
n = len(texts)
return {
"input_ids": torch.zeros(n, 8, dtype=torch.long),
"attention_mask": torch.ones(n, 8, dtype=torch.long),
}
ds = PhilVerifyDataset(train_samples, MockTokenizer())
assert len(ds) == len(train_samples)
item = ds[0]
assert "input_ids" in item
assert "attention_mask" in item
assert "labels" in item
assert int(item["labels"].item()) in (0, 1, 2)
def test_freeze_lower_layers_import(self):
"""freeze_lower_layers is importable and callable."""
from ml.train_xlmr import freeze_lower_layers
assert callable(freeze_lower_layers)
def test_evaluate_import(self):
"""evaluate function is importable."""
from ml.train_xlmr import evaluate
assert callable(evaluate)