Spaces:

SemiAutomat1c
/

philverify-api

Running

philverify-api / tests /test_philverify.py

Ryan Christian D. Deniega

fix: cold start 502, favicon, verify state persistence

b1c84b5 18 days ago

12.9 kB

	"""
	PhilVerify — Unit Tests
	Covers: text preprocessor, language detector, clickbait detector, scoring engine,
	and Phase 5 evidence modules (similarity, stance detection, domain credibility).
	Run: pytest tests/ -v
	"""
	import sys
	from pathlib import Path

	# Ensure project root is on PYTHONPATH
	sys.path.insert(0, str(Path(__file__).parent.parent))

	import pytest


	# ── TextPreprocessor ──────────────────────────────────────────────────────────

	class TestTextPreprocessor:
	def setup_method(self):
	from nlp.preprocessor import TextPreprocessor
	self.preprocessor = TextPreprocessor()

	def test_lowercases_text(self):
	result = self.preprocessor.clean("HELLO WORLD")
	assert result == "hello world"

	def test_strips_urls(self):
	result = self.preprocessor.clean("Check this out https://rappler.com/news/article123")
	assert "https://" not in result
	assert "rappler.com" not in result

	def test_strips_html_tags(self):
	result = self.preprocessor.clean("<p>Hello <b>World</b></p>")
	assert "<" not in result and ">" not in result

	def test_strips_mentions(self):
	result = self.preprocessor.clean("Great post @PresidentPH and @DOH_Philippines!")
	assert "@" not in result

	def test_removes_stopwords(self):
	filtered = self.preprocessor.remove_stopwords(["ang", "fake", "news", "sa", "pilipinas"])
	assert "ang" not in filtered
	assert "fake" in filtered

	def test_normalizes_repeated_chars(self):
	result = self.preprocessor.normalize("graaabe ang gaaalit ko")
	assert "graaabe" not in result

	def test_full_pipeline_returns_result(self):
	from nlp.preprocessor import PreprocessResult
	result = self.preprocessor.preprocess("GRABE! Namatay daw ang tatlong tao sa bagong sakit na kumakalat!")
	assert isinstance(result, PreprocessResult)
	assert result.char_count > 0
	assert len(result.tokens) > 0


	# ── LanguageDetector ──────────────────────────────────────────────────────────

	class TestLanguageDetector:
	def setup_method(self):
	from nlp.language_detector import LanguageDetector
	self.detector = LanguageDetector()

	def test_detects_tagalog(self):
	result = self.detector.detect(
	"Ang mga mamamayan ay nag-aalala sa bagong batas na isinusulong ng pangulo."
	)
	assert result.language in ("Tagalog", "Taglish")

	def test_detects_english(self):
	result = self.detector.detect(
	"The Supreme Court ruled in favor of the petition filed by the opposition."
	)
	assert result.language in ("English", "Taglish")

	def test_detects_taglish(self):
	result = self.detector.detect(
	"Grabe ang news ngayon! The president announced na libre ang lahat!"
	)
	# Should detect either Taglish or remain consistent
	assert result.language in ("Tagalog", "English", "Taglish")

	def test_unknown_for_empty(self):
	result = self.detector.detect("")
	assert result.language == "Unknown"

	def test_confidence_between_0_and_1(self):
	result = self.detector.detect("Ang balita ay napakalaki!")
	assert 0.0 <= result.confidence <= 1.0


	# ── ClickbaitDetector ─────────────────────────────────────────────────────────

	class TestClickbaitDetector:
	def setup_method(self):
	from nlp.clickbait import ClickbaitDetector
	self.detector = ClickbaitDetector()

	def test_detects_clickbait_all_caps(self):
	result = self.detector.detect("SHOCKING NEWS: GOVERNMENT CAUGHT LYING TO EVERYONE!")
	assert result.is_clickbait is True
	assert result.score > 0.3

	def test_detects_clickbait_tagalog(self):
	result = self.detector.detect("GRABE!! Natuklasan na ang katotohanan ng bigas scandal!!!")
	assert result.score > 0.3

	def test_clean_headline_not_clickbait(self):
	result = self.detector.detect(
	"DOH reports 500 new cases as vaccination drive continues in Metro Manila"
	)
	assert result.is_clickbait is False

	def test_score_between_0_and_1(self):
	result = self.detector.detect("Breaking news today")
	assert 0.0 <= result.score <= 1.0


	# ── TF-IDF Classifier ─────────────────────────────────────────────────────────

	class TestTFIDFClassifier:
	def setup_method(self):
	from ml.tfidf_classifier import TFIDFClassifier
	self.clf = TFIDFClassifier()
	self.clf.train()

	def test_predict_returns_valid_verdict(self):
	result = self.clf.predict("DOH reports 500 new COVID cases today in Metro Manila")
	assert result.verdict in ("Credible", "Unverified", "Fake")

	def test_confidence_in_valid_range(self):
	result = self.clf.predict("SHOCKING: Government hid the truth about vaccines!")
	assert 0.0 <= result.confidence <= 100.0

	def test_triggered_features_are_strings(self):
	result = self.clf.predict("GRABE! Namatay daw ang tatlong tao sa bagong sakit!")
	assert all(isinstance(f, str) for f in result.triggered_features)

	def test_seed_fake_news_detected(self):
	result = self.clf.predict("CONFIRMED: Philippines to become 51st state of USA in 2026!")
	# Should not be Credible for obvious fake claim
	assert result.verdict in ("Unverified", "Fake", "Likely Fake")


	# ── Scoring Engine (lightweight integration) ──────────────────────────────────

	class TestScoringEngine:
	"""Integration test — no API keys needed, evidence score defaults to 50."""

	@pytest.mark.asyncio
	async def test_verify_text_returns_response(self):
	from scoring.engine import run_verification
	from api.schemas import VerificationResponse

	result = await run_verification(
	"GRABE! Nakita ko raw namatay ang tatlong tao sa bagong sakit na kumakalat sa Pilipinas!",
	input_type="text",
	)
	assert isinstance(result, VerificationResponse)
	assert result.verdict is not None
	assert 0.0 <= result.final_score <= 100.0

	@pytest.mark.asyncio
	async def test_verify_credible_text(self):
	from scoring.engine import run_verification

	result = await run_verification(
	"DOH reports 500 new COVID-19 cases as vaccination drive continues in Metro Manila",
	input_type="text",
	)
	assert result.final_score is not None
	assert result.language is not None

	@pytest.mark.asyncio
	async def test_entities_extracted(self):
	from scoring.engine import run_verification

	result = await run_verification(
	"President Marcos announced new policies in Manila regarding the AFP and PNP.",
	input_type="text",
	)
	assert result.entities is not None


	# ── Phase 5: Domain Credibility ───────────────────────────────────────────────

	class TestDomainCredibility:
	def setup_method(self):
	from evidence.domain_credibility import lookup_domain, extract_domain, is_blacklisted, DomainTier
	self.lookup = lookup_domain
	self.extract = extract_domain
	self.is_blacklisted = is_blacklisted
	self.DomainTier = DomainTier

	def test_rappler_is_tier1(self):
	result = self.lookup("https://www.rappler.com/news/something")
	assert result.tier == self.DomainTier.CREDIBLE

	def test_inquirer_is_tier1(self):
	result = self.lookup("inquirer.net")
	assert result.tier == self.DomainTier.CREDIBLE

	def test_known_fake_is_tier4(self):
	result = self.lookup("duterte.news")
	assert result.tier == self.DomainTier.KNOWN_FAKE

	def test_unknown_domain_is_tier3(self):
	result = self.lookup("some-totally-random-blog.ph")
	assert result.tier == self.DomainTier.SUSPICIOUS

	def test_blacklisted_returns_true(self):
	assert self.is_blacklisted("maharlikanews.com") is True

	def test_rappler_not_blacklisted(self):
	assert self.is_blacklisted("rappler.com") is False

	def test_extract_domain_strips_www(self):
	assert self.extract("https://www.gmanetwork.com/news/story") == "gmanetwork.com"

	def test_tier1_score_adjustment_positive(self):
	result = self.lookup("rappler.com")
	assert result.score_adjustment > 0

	def test_tier4_score_adjustment_negative(self):
	result = self.lookup("pinoyakoblog.com")
	assert result.score_adjustment < 0


	# ── Phase 5: Similarity ───────────────────────────────────────────────────────

	class TestSimilarity:
	def setup_method(self):
	from evidence.similarity import compute_similarity, _jaccard_similarity, rank_articles_by_similarity
	self.compute = compute_similarity
	self.jaccard = _jaccard_similarity
	self.rank = rank_articles_by_similarity

	def test_identical_texts_score_1(self):
	score = self.jaccard("free vaccines available now", "free vaccines available now")
	assert score == 1.0

	def test_unrelated_texts_low_score(self):
	score = self.jaccard("banana pancakes recipe", "supreme court ruling on property tax")
	assert score < 0.2

	def test_empty_claim_returns_0(self):
	assert self.compute("", "some article text") == 0.0

	def test_score_in_range(self):
	score = self.compute("government hid truth about vaccines", "vaccine rollout delayed by officials")
	assert 0.0 <= score <= 1.0

	def test_rank_articles_sorted_desc(self):
	articles = [
	{"title": "Banana split recipe tips", "description": ""},
	{"title": "Government vaccine program expanded", "description": "DOH announces rollout"},
	{"title": "COVID vaccination drive update", "description": "Metro Manila sites open"},
	]
	ranked = self.rank("vaccine rollout in Metro Manila", articles)
	similarities = [a["similarity"] for a in ranked]
	assert similarities == sorted(similarities, reverse=True)


	# ── Phase 5: Stance Detection ─────────────────────────────────────────────────

	class TestStanceDetector:
	def setup_method(self):
	from evidence.stance_detector import detect_stance, Stance
	self.detect = detect_stance
	self.Stance = Stance

	def test_refutation_keywords_trigger_refutes(self):
	result = self.detect(
	claim="Government distributed free rice to all families",
	article_title="FACT CHECK: False — No free rice distribution was authorized",
	article_description="Officials confirmed no such program exists",
	similarity=0.55,
	)
	assert result.stance == self.Stance.REFUTES

	def test_low_similarity_returns_nei(self):
	result = self.detect(
	claim="Earthquake hits Mindanao",
	article_title="Restaurant review: Best adobo in Quezon City",
	article_description="Five star dining experience downtown",
	similarity=0.05,
	)
	assert result.stance == self.Stance.NOT_ENOUGH_INFO

	def test_fact_check_domain_returns_refutes(self):
	result = self.detect(
	claim="New law passed by senate",
	article_title="Article about laws",
	article_description="Senate session coverage",
	article_url="https://vera-files.org/fact-check/123",
	similarity=0.40,
	)
	assert result.stance == self.Stance.REFUTES

	def test_confidence_in_range(self):
	result = self.detect(
	claim="DOH confirms new disease outbreak",
	article_title="DOH official statement on health alert confirmed",
	article_description="Health officials verified the outbreak in Metro Manila",
	similarity=0.60,
	)
	assert 0.0 <= result.confidence <= 1.0

	def test_result_has_reason(self):
	result = self.detect("Some claim", "Some title", "Some description", similarity=0.30)
	assert isinstance(result.reason, str) and len(result.reason) > 0