Spaces:

SemiAutomat1c
/

philverify-api

Running

philverify-api / tests /test_improvements.py

Ryan Christian D. Deniega

feat: extension button placement, text extraction, OCR display + ML improvements

c78c2c1 about 1 month ago

19.6 kB

	"""
	Tests for the 6 NLP pipeline improvements:
	1. calamanCy NER fallback chain
	2. Tagalog-RoBERTa classifier (ModelNotFoundError)
	3. EnsembleClassifier
	4. EDA augmentation
	5. Sentence-scoring ClaimExtractor
	6. NLI stance detection (Rule 1.5)
	"""
	import sys
	from pathlib import Path
	from unittest.mock import patch, MagicMock

	import pytest

	# Ensure project root is on path
	sys.path.insert(0, str(Path(__file__).parent.parent))


	# ── Helpers ───────────────────────────────────────────────────────────────────

	def _make_sample(text: str, label: int = 0):
	    """Build a ``Sample`` from ``ml.dataset`` with the given text and label.

	    ``ml.dataset`` is imported at call time rather than at module import.
	    """
	    import ml.dataset as dataset
	    return dataset.Sample(label=label, text=text)


	# ══════════════════════════════════════════════════════════════════════════════
	# Part 1 — EDA Augmentation
	# ══════════════════════════════════════════════════════════════════════════════

	class TestEDAugmentation:
	    """Behavioural checks for ``ml.dataset.augment_samples``."""

	    def test_empty_input_returns_empty(self):
	        """An empty input list produces an empty output list."""
	        from ml.dataset import augment_samples
	        assert augment_samples([]) == []

	    def test_augment_produces_two_variants_per_sample(self):
	        """One sufficiently long sample yields exactly two augmented variants."""
	        from ml.dataset import augment_samples
	        samples = [_make_sample("DOH confirms 500 new COVID cases today", 0)]
	        aug = augment_samples(samples, seed=42)
	        # One deletion + one swap variant per sample
	        assert len(aug) == 2

	    def test_augmented_labels_match_originals(self):
	        """Every augmented sample carries a label that occurs in the input."""
	        from ml.dataset import augment_samples
	        samples = [
	            _make_sample("Senate passes new bill on health care reform", 0),
	            _make_sample("SHOCKING truth about vaccines hidden by government", 2),
	        ]
	        aug = augment_samples(samples, seed=42)
	        # Guard: an empty result would make the loop below pass vacuously.
	        assert aug, "expected at least one augmented sample"
	        orig_labels = {s.label for s in samples}
	        for a in aug:
	            assert a.label in orig_labels

	    def test_short_samples_skipped(self):
	        """Samples of one or two words produce no augmented variants."""
	        from ml.dataset import augment_samples
	        samples = [
	            _make_sample("ok", 1),  # 1 word — too short
	            _make_sample("fake news", 2),  # 2 words — too short
	        ]
	        aug = augment_samples(samples, seed=42)
	        assert aug == []

	    def test_augmented_texts_differ_from_original(self):
	        """At least one variant must differ textually from its source."""
	        from ml.dataset import augment_samples
	        original = "GRABE sinabi ng DOH na 200 bata ang nagkasakit sa bagong virus"
	        samples = [_make_sample(original, 2)]
	        aug = augment_samples(samples, seed=99)
	        # any() over an empty list is False, so an empty result also fails here.
	        assert any(a.text != original for a in aug)

	    def test_augment_triples_training_set_size(self):
	        """Augmenting the training split adds between 1x and 2x its size.

	        "Triples" refers to train + augmented together reaching up to three
	        times the original training set.
	        """
	        from ml.dataset import get_split, augment_samples
	        train, _ = get_split()
	        aug = augment_samples(train, seed=42)
	        # Upper bound: at most two variants per sample.
	        # Lower bound: short samples may be skipped, but the test still
	        # requires at least as many variants as there are training samples.
	        assert len(aug) >= len(train)
	        assert len(aug) <= 2 * len(train)

	    def test_augmented_samples_are_non_empty(self):
	        """No augmented variant may consist solely of whitespace."""
	        from ml.dataset import augment_samples
	        samples = [_make_sample("The senator confirmed signing the new law today", 0)]
	        aug = augment_samples(samples, seed=42)
	        # Guard: an empty result would make the loop below pass vacuously.
	        assert aug, "expected at least one augmented sample"
	        for a in aug:
	            assert len(a.text.strip()) > 0


	# ══════════════════════════════════════════════════════════════════════════════
	# Part 2 — Sentence-scoring ClaimExtractor
	# ══════════════════════════════════════════════════════════════════════════════

	class TestClaimExtractor:
	    """Checks for the sentence-scoring ``nlp.claim_extractor.ClaimExtractor``."""

	    def test_instantiates_without_loading_model(self):
	        """A fresh extractor exposes neither ``_pipe`` nor ``_loaded``."""
	        from nlp.claim_extractor import ClaimExtractor
	        extractor = ClaimExtractor()
	        # Attributes a lazy model loader would have left behind.
	        for lazy_attr in ('_pipe', '_loaded'):
	            assert not hasattr(extractor, lazy_attr)

	    def test_passthrough_for_short_text(self):
	        """Very short input is returned unchanged via the passthrough method."""
	        from nlp.claim_extractor import ClaimExtractor
	        extractor = ClaimExtractor()
	        outcome = extractor.extract("hi")
	        assert outcome.method == "passthrough"
	        assert outcome.claim == "hi"

	    def test_sentence_scoring_method_on_informative_sentence(self):
	        """An informative sentence is selected through sentence scoring."""
	        from nlp.claim_extractor import ClaimExtractor
	        # Contains a date, a verb and a named organisation.
	        text = "GRABE! Sinabi ng DOH noong Martes na 200 bata ang nagkasakit sa bagong virus sa Maynila."
	        extractor = ClaimExtractor()
	        outcome = extractor.extract(text)
	        # The DOH sentence is expected, not the whole text or just "GRABE!".
	        assert outcome.method == "sentence_scoring"
	        mentions_org = "DOH" in outcome.claim
	        mentions_count = "200" in outcome.claim
	        assert mentions_org or mentions_count

	    def test_heuristic_fallback_when_no_scored_sentences(self):
	        """Text without dates, numbers or verbs resolves via either sentence method."""
	        from nlp.claim_extractor import ClaimExtractor
	        text = "Wow amazing incredible unbelievable spectacular incomprehensible."
	        extractor = ClaimExtractor()
	        outcome = extractor.extract(text)
	        accepted_methods = ("sentence_heuristic", "sentence_scoring")
	        assert outcome.method in accepted_methods

	    def test_returns_claim_result_dataclass(self):
	        """``extract`` returns a ``ClaimResult`` whose fields are strings."""
	        from nlp.claim_extractor import ClaimExtractor, ClaimResult
	        extractor = ClaimExtractor()
	        outcome = extractor.extract("The president signed the new healthcare law today.")
	        assert isinstance(outcome, ClaimResult)
	        assert isinstance(outcome.claim, str)
	        assert isinstance(outcome.method, str)

	    def test_picks_specific_sentence_over_clickbait_opener(self):
	        """The concrete claim is preferred over the "OMG!" opener."""
	        from nlp.claim_extractor import ClaimExtractor
	        text = "OMG! Natuklasan ng mga siyentipiko na 5,000 tao ang namatay dahil sa bagong sakit ngayong Enero."
	        extractor = ClaimExtractor()
	        outcome = extractor.extract(text)
	        # Any one of these indicators is accepted as evidence of the right pick.
	        mentions_figure = "5,000" in outcome.claim
	        mentions_subject = "siyentipiko" in outcome.claim
	        used_scoring = outcome.method == "sentence_scoring"
	        assert mentions_figure or mentions_subject or used_scoring


	# ══════════════════════════════════════════════════════════════════════════════
	# Part 3 — TagalogRobertaClassifier
	# ══════════════════════════════════════════════════════════════════════════════

	class TestTagalogRobertaClassifier:
	    """Checks for the missing-checkpoint behaviour of the Tagalog classifier."""

	    def test_raises_model_not_found_when_checkpoint_missing(self, tmp_path, monkeypatch):
	        """Constructing the classifier raises when ``MODEL_DIR`` does not exist."""
	        import ml.tagalog_roberta_classifier as tagalog
	        missing_dir = tmp_path / "nonexistent_model"
	        monkeypatch.setattr(tagalog, "MODEL_DIR", missing_dir)
	        with pytest.raises(tagalog.ModelNotFoundError):
	            tagalog.TagalogRobertaClassifier()

	    def test_model_not_found_is_subclass_of_file_not_found(self):
	        """``ModelNotFoundError`` derives from the builtin ``FileNotFoundError``."""
	        import ml.xlm_roberta_classifier as xlm
	        assert issubclass(xlm.ModelNotFoundError, FileNotFoundError)

	    def test_shares_same_model_not_found_error(self):
	        """Both classifier modules expose the very same exception class.

	        A single ``except ModelNotFoundError`` clause written against the
	        XLM-RoBERTa module therefore also catches errors from the Tagalog one.
	        """
	        import ml.xlm_roberta_classifier as xlm
	        import ml.tagalog_roberta_classifier as tagalog
	        assert xlm.ModelNotFoundError is tagalog.ModelNotFoundError


	# ══════════════════════════════════════════════════════════════════════════════
	# Part 4 — EnsembleClassifier
	# ══════════════════════════════════════════════════════════════════════════════

	class TestEnsembleClassifier:
	    """Checks for ``ml.ensemble_classifier.EnsembleClassifier`` using stub members."""

	    def _make_stub(self, probs_list: list[float]):
	        """Return a stub classifier whose predict_probs returns fixed probabilities.

	        ``predict_probs`` yields a 3-tuple: a float32 tensor built from
	        ``probs_list`` followed by two ``None`` placeholders.
	        ``_salient_tokens`` always returns ``["token1"]``.
	        """
	        import torch
	        stub = MagicMock()
	        stub.predict_probs.return_value = (
	            torch.tensor(probs_list, dtype=torch.float32),
	            None,
	            None,
	        )
	        stub._salient_tokens = MagicMock(return_value=["token1"])
	        return stub

	    def test_raises_value_error_for_empty_list(self):
	        """An ensemble cannot be built from zero classifiers."""
	        from ml.ensemble_classifier import EnsembleClassifier
	        with pytest.raises(ValueError):
	            EnsembleClassifier([])

	    def test_single_classifier_returns_its_prediction(self):
	        """With one member the ensemble reports that member's top class."""
	        from ml.ensemble_classifier import EnsembleClassifier
	        stub = self._make_stub([0.7, 0.2, 0.1])
	        ens = EnsembleClassifier([stub])
	        result = ens.predict("any text")
	        assert result.verdict == "Credible"
	        assert abs(result.confidence - 70.0) < 1.0

	    def test_two_classifiers_averages_probabilities(self):
	        """Member probabilities are averaged before the verdict is chosen."""
	        from ml.ensemble_classifier import EnsembleClassifier
	        # First: [0.8, 0.1, 0.1] → Credible 80%
	        # Second: [0.4, 0.5, 0.1] → Unverified 50%
	        # Average: [0.6, 0.3, 0.1] → Credible 60%
	        stub1 = self._make_stub([0.8, 0.1, 0.1])
	        stub2 = self._make_stub([0.4, 0.5, 0.1])
	        ens = EnsembleClassifier([stub1, stub2])
	        result = ens.predict("test text")
	        assert result.verdict == "Credible"
	        assert abs(result.confidence - 60.0) < 1.5

	    def test_failing_classifier_gracefully_skipped(self):
	        """A member that raises is ignored; the remaining member decides."""
	        from ml.ensemble_classifier import EnsembleClassifier
	        good = self._make_stub([0.1, 0.1, 0.8])  # Likely Fake
	        bad = MagicMock()
	        bad.predict_probs.side_effect = RuntimeError("model failed")
	        ens = EnsembleClassifier([good, bad])
	        result = ens.predict("test text")
	        # Should still get a result from the good classifier
	        assert result.verdict == "Likely Fake"

	    def test_all_classifiers_failing_returns_unverified_neutral(self):
	        """When every member raises, the result is a neutral "Unverified"."""
	        from ml.ensemble_classifier import EnsembleClassifier
	        bad = MagicMock()
	        bad.predict_probs.side_effect = RuntimeError("fail")
	        ens = EnsembleClassifier([bad])
	        result = ens.predict("test")
	        assert result.verdict == "Unverified"
	        # approx() avoids depending on the exact float representation of 33.3.
	        assert result.confidence == pytest.approx(33.3)

	    def test_result_has_correct_type(self):
	        """``predict`` returns a ``Layer1Result`` with a list of triggered features."""
	        from ml.ensemble_classifier import EnsembleClassifier
	        from ml.xlm_roberta_classifier import Layer1Result
	        stub = self._make_stub([0.5, 0.3, 0.2])
	        ens = EnsembleClassifier([stub])
	        result = ens.predict("test")
	        assert isinstance(result, Layer1Result)
	        assert isinstance(result.triggered_features, list)


	# ══════════════════════════════════════════════════════════════════════════════
	# Part 5 — NLI Stance Detection
	# ══════════════════════════════════════════════════════════════════════════════

	class TestNLIStanceDetector:
	    """Checks for the NLI step of ``evidence.stance_detector.detect_stance``."""

	    def _reset_nli_cache(self):
	        """Clear the module-level NLI pipeline cache in ``evidence.stance_detector``."""
	        import evidence.stance_detector as detector
	        detector._nli_loaded = False
	        detector._nli_pipe = None

	    def test_falls_through_to_keywords_when_nli_unavailable(self):
	        """With ``_get_nli`` returning ``None`` detection still completes."""
	        import evidence.stance_detector as mod
	        self._reset_nli_cache()
	        with patch.object(mod, '_get_nli', return_value=None):
	            result = mod.detect_stance(
	                claim="Vaccines are safe",
	                article_title="Fact check: COVID vaccines proven effective",
	                article_description="Experts confirm vaccines are safe and effective after extensive testing.",
	                article_url="",
	                similarity=0.7,
	            )
	        # Any of these stances is accepted; the point is that the call
	        # returns a stance instead of raising.
	        accepted = (
	            mod.Stance.SUPPORTS,
	            mod.Stance.NOT_ENOUGH_INFO,
	            mod.Stance.REFUTES,
	        )
	        assert result.stance in accepted

	    def test_nli_supports_high_confidence(self):
	        """A top "supports" score at or above 0.65 yields SUPPORTS with an NLI reason."""
	        import evidence.stance_detector as mod
	        self._reset_nli_cache()
	        mock_nli = MagicMock(return_value={
	            "labels": ["supports the claim", "contradicts the claim", "unrelated"],
	            "scores": [0.82, 0.12, 0.06],
	        })
	        with patch.object(mod, '_get_nli', return_value=mock_nli):
	            result = mod.detect_stance(
	                claim="Government confirmed 500 new cases",
	                article_title="Government says 500 new cases recorded",
	                article_description="Officials confirmed today that 500 new cases were recorded nationwide.",
	                similarity=0.75,
	            )
	        assert result.stance == mod.Stance.SUPPORTS
	        assert "NLI" in result.reason

	    def test_nli_contradicts_high_confidence(self):
	        """A top "contradicts" score at or above 0.65 yields REFUTES with an NLI reason."""
	        import evidence.stance_detector as mod
	        self._reset_nli_cache()
	        mock_nli = MagicMock(return_value={
	            "labels": ["contradicts the claim", "supports the claim", "unrelated"],
	            "scores": [0.78, 0.15, 0.07],
	        })
	        with patch.object(mod, '_get_nli', return_value=mock_nli):
	            result = mod.detect_stance(
	                claim="There is no evidence of fraud",
	                article_title="Evidence of widespread fraud found",
	                article_description="Investigators found extensive evidence of fraud in the election.",
	                similarity=0.6,
	            )
	        assert result.stance == mod.Stance.REFUTES
	        assert "NLI" in result.reason

	    def test_nli_low_confidence_falls_through_to_keywords(self):
	        """An NLI score under 0.65 defers to the keyword rules."""
	        import evidence.stance_detector as mod
	        self._reset_nli_cache()
	        mock_nli = MagicMock(return_value={
	            "labels": ["supports the claim", "contradicts the claim", "unrelated"],
	            "scores": [0.45, 0.35, 0.20],  # below 0.65 threshold
	        })
	        with patch.object(mod, '_get_nli', return_value=mock_nli):
	            result = mod.detect_stance(
	                claim="Senator is guilty of corruption",
	                article_title="Fact check: False claim about senator",
	                article_description="This claim has been debunked by multiple fact-checkers.",
	                similarity=0.5,
	            )
	        # The keyword "debunked" is expected to produce REFUTES.
	        assert result.stance == mod.Stance.REFUTES

	    def test_short_description_skips_nli(self):
	        """A description under 30 characters means the NLI callable is never invoked."""
	        import evidence.stance_detector as mod
	        self._reset_nli_cache()
	        mock_nli = MagicMock()
	        with patch.object(mod, '_get_nli', return_value=mock_nli):
	            mod.detect_stance(
	                claim="Some claim",
	                article_title="Short article",
	                article_description="Short.",  # <30 chars
	                similarity=0.5,
	            )
	        # The NLI pipeline itself must not have been called.
	        mock_nli.assert_not_called()


	# ══════════════════════════════════════════════════════════════════════════════
	# Part 6 — calamanCy NER Fallback Chain
	# ══════════════════════════════════════════════════════════════════════════════

	class TestCalamanCyNERFallback:
	    """Checks for the fallback paths of ``nlp.ner.EntityExtractor``."""

	    def _fresh_extractor(self):
	        """Return a fresh (unloaded) EntityExtractor.

	        ``nlp.ner`` is reloaded first so the extractor comes from a freshly
	        executed module.
	        """
	        import importlib
	        import nlp.ner
	        importlib.reload(nlp.ner)
	        return nlp.ner.EntityExtractor()

	    def test_falls_back_to_spacy_when_calamancy_missing(self, monkeypatch):
	        """When calamancy is unavailable, loading goes through spaCy en_core_web_sm.

	        The replacement loader below stands in for the "calamancy import
	        failed" branch: it tries spaCy and settles for ``None`` if that fails.
	        """
	        import types
	        import nlp.ner as mod

	        extractor = mod.EntityExtractor()
	        extractor._loaded = False  # force reload

	        def patched_load(self):
	            # Simulates calamancy not being installed: go straight to spaCy.
	            self._loaded = True
	            try:
	                import spacy
	                self._nlp = spacy.load("en_core_web_sm")
	            except Exception:
	                # Deliberate best-effort: spaCy or its model may be absent in
	                # the test environment; fall back to "no model".
	                self._nlp = None

	        # setattr via the fixture fails if ``_load_model`` does not exist on
	        # the extractor, so the test cannot silently patch a misspelled name.
	        monkeypatch.setattr(
	            extractor, "_load_model", types.MethodType(patched_load, extractor)
	        )
	        extractor._load_model()
	        # Either spaCy loaded successfully or fell back to None
	        assert extractor._loaded is True

	    def test_hint_based_fallback_when_both_unavailable(self):
	        """When both calamancy and spaCy fail, hint-based NER still works."""
	        import nlp.ner as mod
	        extractor = mod.EntityExtractor()
	        extractor._loaded = True
	        extractor._nlp = None  # force hint-based path

	        result = extractor.extract("Sinabi ni Marcos sa Davao tungkol sa DOH")
	        assert isinstance(result.persons, list)
	        assert isinstance(result.organizations, list)
	        assert isinstance(result.locations, list)
	        # Should find hint-based entities
	        assert any("Marcos" in p for p in result.persons)

	    def test_ner_result_method_reflects_path(self):
	        """method field on NERResult reflects which extraction path was used."""
	        import nlp.ner as mod
	        extractor = mod.EntityExtractor()
	        extractor._loaded = True
	        extractor._nlp = None

	        result = extractor._hint_based_extract("Marcos is in Manila with DOH")
	        assert result.method == "hints"

	    def test_extract_with_no_model_returns_ner_result(self):
	        """With no model loaded, ``extract`` still returns an ``NERResult`` with dates."""
	        from nlp.ner import EntityExtractor, NERResult
	        e = EntityExtractor()
	        e._loaded = True
	        e._nlp = None
	        result = e.extract("DOH confirmed 500 cases in Cebu on January 2026")
	        assert isinstance(result, NERResult)
	        assert len(result.dates) > 0  # Should find "January 2026"