Spaces:
Running
Running
File size: 12,948 Bytes
6c9b8f1 b1c84b5 6c9b8f1 b1c84b5 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 | """
PhilVerify — Unit Tests
Covers: text preprocessor, language detector, clickbait detector, scoring engine,
and Phase 5 evidence modules (similarity, stance detection, domain credibility).
Run: pytest tests/ -v
"""
import sys
from pathlib import Path
# Ensure project root is on PYTHONPATH
sys.path.insert(0, str(Path(__file__).parent.parent))
import pytest
# ── TextPreprocessor ─────────────────────────────────────────────────────────
class TestTextPreprocessor:
    """Checks for the cleaning, stop-word, normalisation and pipeline APIs
    exposed by ``nlp.preprocessor.TextPreprocessor``."""

    def setup_method(self):
        """Create a new ``TextPreprocessor`` for the test about to run."""
        from nlp.preprocessor import TextPreprocessor

        self.preprocessor = TextPreprocessor()

    def test_lowercases_text(self):
        """``clean`` turns an upper-case phrase into lower case."""
        cleaned = self.preprocessor.clean("HELLO WORLD")
        assert cleaned == "hello world"

    def test_strips_urls(self):
        """Neither the scheme nor the host of a URL survives ``clean``."""
        raw = "Check this out https://rappler.com/news/article123"
        cleaned = self.preprocessor.clean(raw)
        assert "https://" not in cleaned
        assert "rappler.com" not in cleaned

    def test_strips_html_tags(self):
        """No angle brackets remain after ``clean``."""
        cleaned = self.preprocessor.clean("<p>Hello <b>World</b></p>")
        assert "<" not in cleaned
        assert ">" not in cleaned

    def test_strips_mentions(self):
        """The ``@`` of social-media mentions is removed by ``clean``."""
        raw = "Great post @PresidentPH and @DOH_Philippines!"
        cleaned = self.preprocessor.clean(raw)
        assert "@" not in cleaned

    def test_removes_stopwords(self):
        """``remove_stopwords`` drops "ang" and keeps "fake"."""
        tokens = ["ang", "fake", "news", "sa", "pilipinas"]
        kept = self.preprocessor.remove_stopwords(tokens)
        assert "ang" not in kept
        assert "fake" in kept

    def test_normalizes_repeated_chars(self):
        """The elongated spelling "graaabe" does not survive ``normalize``."""
        normalized = self.preprocessor.normalize("graaabe ang gaaalit ko")
        assert "graaabe" not in normalized

    def test_full_pipeline_returns_result(self):
        """``preprocess`` yields a non-empty ``PreprocessResult``."""
        from nlp.preprocessor import PreprocessResult

        sample = "GRABE! Namatay daw ang tatlong tao sa bagong sakit na kumakalat!"
        outcome = self.preprocessor.preprocess(sample)
        assert isinstance(outcome, PreprocessResult)
        assert outcome.char_count > 0
        assert len(outcome.tokens) > 0
# ── LanguageDetector ─────────────────────────────────────────────────────────
class TestLanguageDetector:
    """Checks the label and confidence reported by
    ``nlp.language_detector.LanguageDetector.detect``."""

    def setup_method(self):
        """Create a new ``LanguageDetector`` for the test about to run."""
        from nlp.language_detector import LanguageDetector

        self.detector = LanguageDetector()

    def test_detects_tagalog(self):
        """A Tagalog sentence is labelled Tagalog or Taglish."""
        sentence = (
            "Ang mga mamamayan ay nag-aalala sa bagong batas na isinusulong ng pangulo."
        )
        detection = self.detector.detect(sentence)
        accepted = ("Tagalog", "Taglish")
        assert detection.language in accepted

    def test_detects_english(self):
        """An English sentence is labelled English or Taglish."""
        sentence = (
            "The Supreme Court ruled in favor of the petition filed by the opposition."
        )
        detection = self.detector.detect(sentence)
        accepted = ("English", "Taglish")
        assert detection.language in accepted

    def test_detects_taglish(self):
        """A code-switched sentence gets one of the three known labels."""
        sentence = "Grabe ang news ngayon! The president announced na libre ang lahat!"
        detection = self.detector.detect(sentence)
        # Any of the three labels is tolerated here.
        # NOTE(review): this only rules out "Unknown"; tighten to "Taglish"
        # if the detector is expected to be that precise — confirm.
        accepted = ("Tagalog", "English", "Taglish")
        assert detection.language in accepted

    def test_unknown_for_empty(self):
        """Empty input is reported as "Unknown"."""
        detection = self.detector.detect("")
        assert detection.language == "Unknown"

    def test_confidence_between_0_and_1(self):
        """The reported confidence lies in the closed interval [0, 1]."""
        detection = self.detector.detect("Ang balita ay napakalaki!")
        assert 0.0 <= detection.confidence <= 1.0
# ── ClickbaitDetector ────────────────────────────────────────────────────────
class TestClickbaitDetector:
    """Checks the flag and score produced by
    ``nlp.clickbait.ClickbaitDetector.detect``."""

    def setup_method(self):
        """Create a new ``ClickbaitDetector`` for the test about to run."""
        from nlp.clickbait import ClickbaitDetector

        self.detector = ClickbaitDetector()

    def test_detects_clickbait_all_caps(self):
        """An all-caps sensational headline is flagged and scores above 0.3."""
        headline = "SHOCKING NEWS: GOVERNMENT CAUGHT LYING TO EVERYONE!"
        verdict = self.detector.detect(headline)
        assert verdict.is_clickbait is True
        assert verdict.score > 0.3

    def test_detects_clickbait_tagalog(self):
        """A Tagalog headline with heavy punctuation scores above 0.3."""
        headline = "GRABE!! Natuklasan na ang katotohanan ng bigas scandal!!!"
        verdict = self.detector.detect(headline)
        assert verdict.score > 0.3

    def test_clean_headline_not_clickbait(self):
        """A plain news-style headline is not flagged."""
        headline = (
            "DOH reports 500 new cases as vaccination drive continues in Metro Manila"
        )
        verdict = self.detector.detect(headline)
        assert verdict.is_clickbait is False

    def test_score_between_0_and_1(self):
        """The score lies in the closed interval [0, 1]."""
        verdict = self.detector.detect("Breaking news today")
        assert 0.0 <= verdict.score <= 1.0
# ── TF-IDF Classifier ────────────────────────────────────────────────────────
class TestTFIDFClassifier:
    """Checks the verdict, confidence and feature list returned by
    ``ml.tfidf_classifier.TFIDFClassifier.predict``."""

    def setup_method(self):
        """Build a classifier and call ``train()`` on it before each test."""
        from ml.tfidf_classifier import TFIDFClassifier

        classifier = TFIDFClassifier()
        classifier.train()
        self.clf = classifier

    def test_predict_returns_valid_verdict(self):
        """The verdict for a news-style sentence is one of the listed labels."""
        prediction = self.clf.predict(
            "DOH reports 500 new COVID cases today in Metro Manila"
        )
        # NOTE(review): "Likely Fake" is accepted in
        # test_seed_fake_news_detected but not here — confirm which labels
        # the classifier can actually emit and align the two lists.
        allowed = ("Credible", "Unverified", "Fake")
        assert prediction.verdict in allowed

    def test_confidence_in_valid_range(self):
        """Confidence is expressed on a 0-100 scale."""
        prediction = self.clf.predict(
            "SHOCKING: Government hid the truth about vaccines!"
        )
        assert 0.0 <= prediction.confidence <= 100.0

    def test_triggered_features_are_strings(self):
        """Every entry of ``triggered_features`` is a ``str``."""
        prediction = self.clf.predict(
            "GRABE! Namatay daw ang tatlong tao sa bagong sakit!"
        )
        for feature in prediction.triggered_features:
            assert isinstance(feature, str)

    def test_seed_fake_news_detected(self):
        """An obviously false claim must not come back as "Credible"."""
        prediction = self.clf.predict(
            "CONFIRMED: Philippines to become 51st state of USA in 2026!"
        )
        non_credible = ("Unverified", "Fake", "Likely Fake")
        assert prediction.verdict in non_credible
# ── Scoring Engine (lightweight integration) ─────────────────────────────────
class TestScoringEngine:
    """Integration test — no API keys needed, evidence score defaults to 50."""

    @pytest.mark.asyncio
    async def test_verify_text_returns_response(self):
        """``run_verification`` returns a ``VerificationResponse`` whose
        verdict is set and whose final score is within 0-100."""
        from api.schemas import VerificationResponse
        from scoring.engine import run_verification

        claim = (
            "GRABE! Nakita ko raw namatay ang tatlong tao sa bagong sakit "
            "na kumakalat sa Pilipinas!"
        )
        response = await run_verification(claim, input_type="text")
        assert isinstance(response, VerificationResponse)
        assert response.verdict is not None
        assert 0.0 <= response.final_score <= 100.0

    @pytest.mark.asyncio
    async def test_verify_credible_text(self):
        """A news-style sentence yields a score and a detected language."""
        from scoring.engine import run_verification

        claim = (
            "DOH reports 500 new COVID-19 cases as vaccination drive "
            "continues in Metro Manila"
        )
        response = await run_verification(claim, input_type="text")
        assert response.final_score is not None
        assert response.language is not None

    @pytest.mark.asyncio
    async def test_entities_extracted(self):
        """The response carries an ``entities`` value (not ``None``)."""
        from scoring.engine import run_verification

        claim = (
            "President Marcos announced new policies in Manila regarding "
            "the AFP and PNP."
        )
        response = await run_verification(claim, input_type="text")
        assert response.entities is not None
# ── Phase 5: Domain Credibility ──────────────────────────────────────────────
class TestDomainCredibility:
    """Checks tier lookup, blacklist membership, domain extraction and score
    adjustment from ``evidence.domain_credibility``."""

    def setup_method(self):
        """Expose the module's helpers and the ``DomainTier`` type on ``self``."""
        from evidence.domain_credibility import (
            DomainTier,
            extract_domain,
            is_blacklisted,
            lookup_domain,
        )

        self.lookup = lookup_domain
        self.extract = extract_domain
        self.is_blacklisted = is_blacklisted
        self.DomainTier = DomainTier

    def test_rappler_is_tier1(self):
        """A full rappler.com URL resolves to the CREDIBLE tier."""
        info = self.lookup("https://www.rappler.com/news/something")
        assert info.tier == self.DomainTier.CREDIBLE

    def test_inquirer_is_tier1(self):
        """The bare domain inquirer.net resolves to the CREDIBLE tier."""
        info = self.lookup("inquirer.net")
        assert info.tier == self.DomainTier.CREDIBLE

    def test_known_fake_is_tier4(self):
        """duterte.news resolves to the KNOWN_FAKE tier."""
        info = self.lookup("duterte.news")
        assert info.tier == self.DomainTier.KNOWN_FAKE

    def test_unknown_domain_is_tier3(self):
        """An unlisted domain resolves to the SUSPICIOUS tier."""
        info = self.lookup("some-totally-random-blog.ph")
        assert info.tier == self.DomainTier.SUSPICIOUS

    def test_blacklisted_returns_true(self):
        """maharlikanews.com is reported as blacklisted."""
        flagged = self.is_blacklisted("maharlikanews.com")
        assert flagged is True

    def test_rappler_not_blacklisted(self):
        """rappler.com is reported as not blacklisted."""
        flagged = self.is_blacklisted("rappler.com")
        assert flagged is False

    def test_extract_domain_strips_www(self):
        """``extract_domain`` drops scheme, ``www.`` prefix and path."""
        domain = self.extract("https://www.gmanetwork.com/news/story")
        assert domain == "gmanetwork.com"

    def test_tier1_score_adjustment_positive(self):
        """The score adjustment for rappler.com is strictly positive."""
        info = self.lookup("rappler.com")
        assert info.score_adjustment > 0

    def test_tier4_score_adjustment_negative(self):
        """The score adjustment for pinoyakoblog.com is strictly negative."""
        info = self.lookup("pinoyakoblog.com")
        assert info.score_adjustment < 0
# ── Phase 5: Similarity ──────────────────────────────────────────────────────
class TestSimilarity:
    """Checks the similarity scorers and the article ranking helper from
    ``evidence.similarity``."""

    def setup_method(self):
        """Expose the three similarity helpers on ``self``."""
        from evidence.similarity import (
            _jaccard_similarity,
            compute_similarity,
            rank_articles_by_similarity,
        )

        self.compute = compute_similarity
        self.jaccard = _jaccard_similarity
        self.rank = rank_articles_by_similarity

    def test_identical_texts_score_1(self):
        """Jaccard similarity of a text with itself is exactly 1.0."""
        text = "free vaccines available now"
        assert self.jaccard(text, "free vaccines available now") == 1.0

    def test_unrelated_texts_low_score(self):
        """Two unrelated texts score below 0.2 on Jaccard."""
        overlap = self.jaccard(
            "banana pancakes recipe",
            "supreme court ruling on property tax",
        )
        assert overlap < 0.2

    def test_empty_claim_returns_0(self):
        """An empty claim gives a similarity of 0.0."""
        value = self.compute("", "some article text")
        assert value == 0.0

    def test_score_in_range(self):
        """``compute_similarity`` stays within the closed interval [0, 1]."""
        value = self.compute(
            "government hid truth about vaccines",
            "vaccine rollout delayed by officials",
        )
        assert 0.0 <= value <= 1.0

    def test_rank_articles_sorted_desc(self):
        """Ranked articles carry a ``similarity`` key in descending order."""
        candidates = [
            {"title": "Banana split recipe tips", "description": ""},
            {
                "title": "Government vaccine program expanded",
                "description": "DOH announces rollout",
            },
            {
                "title": "COVID vaccination drive update",
                "description": "Metro Manila sites open",
            },
        ]
        ordered = self.rank("vaccine rollout in Metro Manila", candidates)
        scores = [article["similarity"] for article in ordered]
        descending = sorted(scores, reverse=True)
        assert scores == descending
# ── Phase 5: Stance Detection ────────────────────────────────────────────────
class TestStanceDetector:
    """Checks the stance, confidence and reason returned by
    ``evidence.stance_detector.detect_stance``."""

    def setup_method(self):
        """Expose ``detect_stance`` and the ``Stance`` type on ``self``."""
        from evidence.stance_detector import Stance, detect_stance

        self.detect = detect_stance
        self.Stance = Stance

    def test_refutation_keywords_trigger_refutes(self):
        """A fact-check style title that negates the claim yields REFUTES."""
        # NOTE(review): the "β" in the title looks like an encoding artefact
        # of a dash; it is runtime input, so it is left untouched — confirm.
        outcome = self.detect(
            claim="Government distributed free rice to all families",
            article_title="FACT CHECK: False β No free rice distribution was authorized",
            article_description="Officials confirmed no such program exists",
            similarity=0.55,
        )
        assert outcome.stance == self.Stance.REFUTES

    def test_low_similarity_returns_nei(self):
        """An unrelated article at similarity 0.05 yields NOT_ENOUGH_INFO."""
        outcome = self.detect(
            claim="Earthquake hits Mindanao",
            article_title="Restaurant review: Best adobo in Quezon City",
            article_description="Five star dining experience downtown",
            similarity=0.05,
        )
        assert outcome.stance == self.Stance.NOT_ENOUGH_INFO

    def test_fact_check_domain_returns_refutes(self):
        """An article hosted under a vera-files.org fact-check URL yields REFUTES."""
        outcome = self.detect(
            claim="New law passed by senate",
            article_title="Article about laws",
            article_description="Senate session coverage",
            article_url="https://vera-files.org/fact-check/123",
            similarity=0.40,
        )
        assert outcome.stance == self.Stance.REFUTES

    def test_confidence_in_range(self):
        """The reported confidence lies in the closed interval [0, 1]."""
        outcome = self.detect(
            claim="DOH confirms new disease outbreak",
            article_title="DOH official statement on health alert confirmed",
            article_description="Health officials verified the outbreak in Metro Manila",
            similarity=0.60,
        )
        assert 0.0 <= outcome.confidence <= 1.0

    def test_result_has_reason(self):
        """The result carries a non-empty string ``reason``."""
        outcome = self.detect(
            "Some claim",
            "Some title",
            "Some description",
            similarity=0.30,
        )
        assert isinstance(outcome.reason, str)
        assert len(outcome.reason) > 0
|