# philverify-api/tests/test_philverify.py
# Ryan Christian D. Deniega
# fix: cold start 502, favicon, verify state persistence
# b1c84b5
"""
PhilVerify — Unit Tests
Covers: text preprocessor, language detector, clickbait detector, scoring engine,
and Phase 5 evidence modules (similarity, stance detection, domain credibility).
Run: pytest tests/ -v
"""
import sys
from pathlib import Path
# Ensure the project root is importable by prepending it to sys.path
sys.path.insert(0, str(Path(__file__).parent.parent))
import pytest
# ── TextPreprocessor ──────────────────────────────────────────────────────────
class TestTextPreprocessor:
    """Tests for ``TextPreprocessor``: clean, remove_stopwords, normalize, preprocess."""

    def setup_method(self):
        """Create a new ``TextPreprocessor`` before each test."""
        # The project import lives inside the method, not at module level.
        from nlp.preprocessor import TextPreprocessor

        self.preprocessor = TextPreprocessor()

    def test_lowercases_text(self):
        """``clean`` lowercases an all-caps input."""
        cleaned = self.preprocessor.clean("HELLO WORLD")
        assert cleaned == "hello world"

    def test_strips_urls(self):
        """``clean`` leaves neither the URL scheme nor the host in the output."""
        raw = "Check this out https://rappler.com/news/article123"
        cleaned = self.preprocessor.clean(raw)
        for fragment in ("https://", "rappler.com"):
            assert fragment not in cleaned

    def test_strips_html_tags(self):
        """``clean`` leaves no angle brackets behind."""
        cleaned = self.preprocessor.clean("<p>Hello <b>World</b></p>")
        assert "<" not in cleaned
        assert ">" not in cleaned

    def test_strips_mentions(self):
        """``clean`` leaves no ``@`` characters from mentions."""
        raw = "Great post @PresidentPH and @DOH_Philippines!"
        cleaned = self.preprocessor.clean(raw)
        assert "@" not in cleaned

    def test_removes_stopwords(self):
        """``remove_stopwords`` drops "ang" but keeps "fake"."""
        tokens = ["ang", "fake", "news", "sa", "pilipinas"]
        kept = self.preprocessor.remove_stopwords(tokens)
        assert "ang" not in kept
        assert "fake" in kept

    def test_normalizes_repeated_chars(self):
        """``normalize`` does not leave the stretched word "graaabe" as-is."""
        normalized = self.preprocessor.normalize("graaabe ang gaaalit ko")
        assert "graaabe" not in normalized

    def test_full_pipeline_returns_result(self):
        """``preprocess`` returns a populated ``PreprocessResult``."""
        from nlp.preprocessor import PreprocessResult

        headline = "GRABE! Namatay daw ang tatlong tao sa bagong sakit na kumakalat!"
        outcome = self.preprocessor.preprocess(headline)
        assert isinstance(outcome, PreprocessResult)
        assert outcome.char_count > 0
        assert len(outcome.tokens) > 0
# ── LanguageDetector ──────────────────────────────────────────────────────────
class TestLanguageDetector:
    """Tests for ``LanguageDetector.detect`` labels and confidence bounds."""

    def setup_method(self):
        """Create a new ``LanguageDetector`` before each test."""
        from nlp.language_detector import LanguageDetector

        self.detector = LanguageDetector()

    def test_detects_tagalog(self):
        """A Tagalog sentence is labelled Tagalog or Taglish."""
        sentence = "Ang mga mamamayan ay nag-aalala sa bagong batas na isinusulong ng pangulo."
        detection = self.detector.detect(sentence)
        assert detection.language in ("Tagalog", "Taglish")

    def test_detects_english(self):
        """An English sentence is labelled English or Taglish."""
        sentence = "The Supreme Court ruled in favor of the petition filed by the opposition."
        detection = self.detector.detect(sentence)
        assert detection.language in ("English", "Taglish")

    def test_detects_taglish(self):
        """A mixed sentence gets one of the three known language labels."""
        sentence = "Grabe ang news ngayon! The president announced na libre ang lahat!"
        detection = self.detector.detect(sentence)
        # Deliberately lenient: any of the three labels is accepted here.
        accepted = ("Tagalog", "English", "Taglish")
        assert detection.language in accepted

    def test_unknown_for_empty(self):
        """An empty string is labelled "Unknown"."""
        detection = self.detector.detect("")
        assert detection.language == "Unknown"

    def test_confidence_between_0_and_1(self):
        """The reported confidence lies in the closed interval [0, 1]."""
        detection = self.detector.detect("Ang balita ay napakalaki!")
        assert 0.0 <= detection.confidence <= 1.0
# ── ClickbaitDetector ─────────────────────────────────────────────────────────
class TestClickbaitDetector:
    """Tests for ``ClickbaitDetector.detect`` flags and score bounds."""

    def setup_method(self):
        """Create a new ``ClickbaitDetector`` before each test."""
        from nlp.clickbait import ClickbaitDetector

        self.detector = ClickbaitDetector()

    def test_detects_clickbait_all_caps(self):
        """An all-caps sensational headline is flagged and scores above 0.3."""
        headline = "SHOCKING NEWS: GOVERNMENT CAUGHT LYING TO EVERYONE!"
        outcome = self.detector.detect(headline)
        assert outcome.is_clickbait is True
        assert outcome.score > 0.3

    def test_detects_clickbait_tagalog(self):
        """A Tagalog headline with heavy punctuation scores above 0.3."""
        headline = "GRABE!! Natuklasan na ang katotohanan ng bigas scandal!!!"
        outcome = self.detector.detect(headline)
        assert outcome.score > 0.3

    def test_clean_headline_not_clickbait(self):
        """A neutral news headline is not flagged as clickbait."""
        headline = "DOH reports 500 new cases as vaccination drive continues in Metro Manila"
        outcome = self.detector.detect(headline)
        assert outcome.is_clickbait is False

    def test_score_between_0_and_1(self):
        """The clickbait score lies in the closed interval [0, 1]."""
        outcome = self.detector.detect("Breaking news today")
        assert 0.0 <= outcome.score <= 1.0
# ── TF-IDF Classifier ─────────────────────────────────────────────────────────
class TestTFIDFClassifier:
    """Tests for ``TFIDFClassifier.predict`` after training with default arguments.

    The verdict vocabulary is kept in class-level constants so that every test
    in this class agrees on which labels exist.
    """

    # Labels that count as "not credible" for an obviously fake claim.
    NON_CREDIBLE_VERDICTS = ("Unverified", "Fake", "Likely Fake")

    # Every label the tests treat as a well-formed verdict. This is the union
    # of the labels the individual tests accept, so a "Likely Fake" prediction
    # is valid everywhere rather than in only one test.
    # NOTE(review): the classifier's real label set is not visible from this
    # file; confirm against ml.tfidf_classifier.
    VALID_VERDICTS = ("Credible",) + NON_CREDIBLE_VERDICTS

    def setup_method(self):
        """Create and train a new ``TFIDFClassifier`` before each test."""
        from ml.tfidf_classifier import TFIDFClassifier

        self.clf = TFIDFClassifier()
        self.clf.train()

    def test_predict_returns_valid_verdict(self):
        """``predict`` returns one of the known verdict labels."""
        prediction = self.clf.predict("DOH reports 500 new COVID cases today in Metro Manila")
        assert prediction.verdict in self.VALID_VERDICTS

    def test_confidence_in_valid_range(self):
        """The confidence is on a 0-100 scale."""
        prediction = self.clf.predict("SHOCKING: Government hid the truth about vaccines!")
        assert 0.0 <= prediction.confidence <= 100.0

    def test_triggered_features_are_strings(self):
        """Every entry of ``triggered_features`` is a ``str``."""
        prediction = self.clf.predict("GRABE! Namatay daw ang tatlong tao sa bagong sakit!")
        assert all(isinstance(feature, str) for feature in prediction.triggered_features)

    def test_seed_fake_news_detected(self):
        """An obviously fake claim must not be classified as Credible."""
        prediction = self.clf.predict("CONFIRMED: Philippines to become 51st state of USA in 2026!")
        assert prediction.verdict in self.NON_CREDIBLE_VERDICTS
# ── Scoring Engine (lightweight integration) ──────────────────────────────────
class TestScoringEngine:
    """Lightweight integration tests for ``run_verification`` on text input.

    The original author's note says no API keys are needed and that the
    evidence score defaults to 50; the assertions below do not check either.
    """

    @pytest.mark.asyncio
    async def test_verify_text_returns_response(self):
        """The result is a ``VerificationResponse`` with a verdict and a 0-100 score."""
        from scoring.engine import run_verification
        from api.schemas import VerificationResponse

        claim = "GRABE! Nakita ko raw namatay ang tatlong tao sa bagong sakit na kumakalat sa Pilipinas!"
        response = await run_verification(claim, input_type="text")
        assert isinstance(response, VerificationResponse)
        assert response.verdict is not None
        assert 0.0 <= response.final_score <= 100.0

    @pytest.mark.asyncio
    async def test_verify_credible_text(self):
        """A neutral news sentence yields a non-None score and language."""
        from scoring.engine import run_verification

        claim = "DOH reports 500 new COVID-19 cases as vaccination drive continues in Metro Manila"
        response = await run_verification(claim, input_type="text")
        assert response.final_score is not None
        assert response.language is not None

    @pytest.mark.asyncio
    async def test_entities_extracted(self):
        """The response carries a non-None ``entities`` field."""
        from scoring.engine import run_verification

        claim = "President Marcos announced new policies in Manila regarding the AFP and PNP."
        response = await run_verification(claim, input_type="text")
        assert response.entities is not None
# ── Phase 5: Domain Credibility ───────────────────────────────────────────────
class TestDomainCredibility:
    """Tests for the domain-credibility helpers: tier lookup, blacklist, domain extraction."""

    def setup_method(self):
        """Bind the module-level helpers to the instance for use in the tests."""
        from evidence.domain_credibility import (
            lookup_domain,
            extract_domain,
            is_blacklisted,
            DomainTier,
        )

        self.lookup = lookup_domain
        self.extract = extract_domain
        self.is_blacklisted = is_blacklisted
        self.DomainTier = DomainTier

    def test_rappler_is_tier1(self):
        """A full rappler.com URL resolves to the CREDIBLE tier."""
        info = self.lookup("https://www.rappler.com/news/something")
        assert info.tier == self.DomainTier.CREDIBLE

    def test_inquirer_is_tier1(self):
        """The bare domain inquirer.net resolves to the CREDIBLE tier."""
        info = self.lookup("inquirer.net")
        assert info.tier == self.DomainTier.CREDIBLE

    def test_known_fake_is_tier4(self):
        """duterte.news resolves to the KNOWN_FAKE tier."""
        info = self.lookup("duterte.news")
        assert info.tier == self.DomainTier.KNOWN_FAKE

    def test_unknown_domain_is_tier3(self):
        """An unlisted domain resolves to the SUSPICIOUS tier."""
        info = self.lookup("some-totally-random-blog.ph")
        assert info.tier == self.DomainTier.SUSPICIOUS

    def test_blacklisted_returns_true(self):
        """maharlikanews.com is reported as blacklisted."""
        verdict = self.is_blacklisted("maharlikanews.com")
        assert verdict is True

    def test_rappler_not_blacklisted(self):
        """rappler.com is reported as not blacklisted."""
        verdict = self.is_blacklisted("rappler.com")
        assert verdict is False

    def test_extract_domain_strips_www(self):
        """``extract_domain`` returns the host without the ``www.`` prefix."""
        domain = self.extract("https://www.gmanetwork.com/news/story")
        assert domain == "gmanetwork.com"

    def test_tier1_score_adjustment_positive(self):
        """rappler.com carries a positive score adjustment."""
        info = self.lookup("rappler.com")
        assert info.score_adjustment > 0

    def test_tier4_score_adjustment_negative(self):
        """pinoyakoblog.com carries a negative score adjustment."""
        info = self.lookup("pinoyakoblog.com")
        assert info.score_adjustment < 0
# ── Phase 5: Similarity ───────────────────────────────────────────────────────
class TestSimilarity:
    """Tests for the similarity helpers: Jaccard score, ``compute_similarity``, ranking."""

    def setup_method(self):
        """Bind the similarity functions to the instance for use in the tests."""
        from evidence.similarity import (
            compute_similarity,
            _jaccard_similarity,
            rank_articles_by_similarity,
        )

        self.compute = compute_similarity
        self.jaccard = _jaccard_similarity
        self.rank = rank_articles_by_similarity

    def test_identical_texts_score_1(self):
        """Two identical texts have a Jaccard similarity of exactly 1.0."""
        text = "free vaccines available now"
        score = self.jaccard(text, "free vaccines available now")
        assert score == 1.0

    def test_unrelated_texts_low_score(self):
        """Texts on unrelated topics score below 0.2."""
        score = self.jaccard(
            "banana pancakes recipe",
            "supreme court ruling on property tax",
        )
        assert score < 0.2

    def test_empty_claim_returns_0(self):
        """An empty claim yields a similarity of 0.0."""
        score = self.compute("", "some article text")
        assert score == 0.0

    def test_score_in_range(self):
        """``compute_similarity`` stays within the closed interval [0, 1]."""
        score = self.compute(
            "government hid truth about vaccines",
            "vaccine rollout delayed by officials",
        )
        assert 0.0 <= score <= 1.0

    def test_rank_articles_sorted_desc(self):
        """Ranked articles carry a "similarity" key in descending order."""
        candidates = [
            {"title": "Banana split recipe tips", "description": ""},
            {"title": "Government vaccine program expanded", "description": "DOH announces rollout"},
            {"title": "COVID vaccination drive update", "description": "Metro Manila sites open"},
        ]
        ranked = self.rank("vaccine rollout in Metro Manila", candidates)
        scores = [article["similarity"] for article in ranked]
        expected_order = sorted(scores, reverse=True)
        assert scores == expected_order
# ── Phase 5: Stance Detection ─────────────────────────────────────────────────
class TestStanceDetector:
    """Tests for ``detect_stance``: stance label, confidence bounds, reason text."""

    def setup_method(self):
        """Bind ``detect_stance`` and the ``Stance`` enum to the instance."""
        from evidence.stance_detector import detect_stance, Stance

        self.detect = detect_stance
        self.Stance = Stance

    def test_refutation_keywords_trigger_refutes(self):
        """A fact-check style title that calls the claim false yields REFUTES."""
        result = self.detect(
            claim="Government distributed free rice to all families",
            # The separator is a real em dash, as a headline would carry it.
            article_title="FACT CHECK: False — No free rice distribution was authorized",
            article_description="Officials confirmed no such program exists",
            similarity=0.55,
        )
        assert result.stance == self.Stance.REFUTES

    def test_low_similarity_returns_nei(self):
        """An unrelated article with similarity 0.05 yields NOT_ENOUGH_INFO."""
        result = self.detect(
            claim="Earthquake hits Mindanao",
            article_title="Restaurant review: Best adobo in Quezon City",
            article_description="Five star dining experience downtown",
            similarity=0.05,
        )
        assert result.stance == self.Stance.NOT_ENOUGH_INFO

    def test_fact_check_domain_returns_refutes(self):
        """An article hosted on vera-files.org yields REFUTES despite a neutral title."""
        result = self.detect(
            claim="New law passed by senate",
            article_title="Article about laws",
            article_description="Senate session coverage",
            article_url="https://vera-files.org/fact-check/123",
            similarity=0.40,
        )
        assert result.stance == self.Stance.REFUTES

    def test_confidence_in_range(self):
        """The reported confidence lies in the closed interval [0, 1]."""
        result = self.detect(
            claim="DOH confirms new disease outbreak",
            article_title="DOH official statement on health alert confirmed",
            article_description="Health officials verified the outbreak in Metro Manila",
            similarity=0.60,
        )
        assert 0.0 <= result.confidence <= 1.0

    def test_result_has_reason(self):
        """The result always carries a non-empty string ``reason``."""
        result = self.detect("Some claim", "Some title", "Some description", similarity=0.30)
        assert isinstance(result.reason, str) and len(result.reason) > 0