File size: 12,948 Bytes
6c9b8f1
 
b1c84b5
 
6c9b8f1
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
b1c84b5
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
"""
PhilVerify β€” Unit Tests
Covers: text preprocessor, language detector, clickbait detector, scoring engine,
        and Phase 5 evidence modules (similarity, stance detection, domain credibility).
Run: pytest tests/ -v
"""
import sys
from pathlib import Path

# Ensure project root is on PYTHONPATH
sys.path.insert(0, str(Path(__file__).parent.parent))

import pytest


# ── TextPreprocessor ──────────────────────────────────────────────────────────

class TestTextPreprocessor:
    def setup_method(self):
        from nlp.preprocessor import TextPreprocessor
        self.preprocessor = TextPreprocessor()

    def test_lowercases_text(self):
        result = self.preprocessor.clean("HELLO WORLD")
        assert result == "hello world"

    def test_strips_urls(self):
        result = self.preprocessor.clean("Check this out https://rappler.com/news/article123")
        assert "https://" not in result
        assert "rappler.com" not in result

    def test_strips_html_tags(self):
        result = self.preprocessor.clean("<p>Hello <b>World</b></p>")
        assert "<" not in result and ">" not in result

    def test_strips_mentions(self):
        result = self.preprocessor.clean("Great post @PresidentPH and @DOH_Philippines!")
        assert "@" not in result

    def test_removes_stopwords(self):
        filtered = self.preprocessor.remove_stopwords(["ang", "fake", "news", "sa", "pilipinas"])
        assert "ang" not in filtered
        assert "fake" in filtered

    def test_normalizes_repeated_chars(self):
        result = self.preprocessor.normalize("graaabe ang gaaalit ko")
        assert "graaabe" not in result

    def test_full_pipeline_returns_result(self):
        from nlp.preprocessor import PreprocessResult
        result = self.preprocessor.preprocess("GRABE! Namatay daw ang tatlong tao sa bagong sakit na kumakalat!")
        assert isinstance(result, PreprocessResult)
        assert result.char_count > 0
        assert len(result.tokens) > 0


# ── LanguageDetector ──────────────────────────────────────────────────────────

class TestLanguageDetector:
    def setup_method(self):
        from nlp.language_detector import LanguageDetector
        self.detector = LanguageDetector()

    def test_detects_tagalog(self):
        result = self.detector.detect(
            "Ang mga mamamayan ay nag-aalala sa bagong batas na isinusulong ng pangulo."
        )
        assert result.language in ("Tagalog", "Taglish")

    def test_detects_english(self):
        result = self.detector.detect(
            "The Supreme Court ruled in favor of the petition filed by the opposition."
        )
        assert result.language in ("English", "Taglish")

    def test_detects_taglish(self):
        result = self.detector.detect(
            "Grabe ang news ngayon! The president announced na libre ang lahat!"
        )
        # Should detect either Taglish or remain consistent
        assert result.language in ("Tagalog", "English", "Taglish")

    def test_unknown_for_empty(self):
        result = self.detector.detect("")
        assert result.language == "Unknown"

    def test_confidence_between_0_and_1(self):
        result = self.detector.detect("Ang balita ay napakalaki!")
        assert 0.0 <= result.confidence <= 1.0


# ── ClickbaitDetector ─────────────────────────────────────────────────────────

class TestClickbaitDetector:
    def setup_method(self):
        from nlp.clickbait import ClickbaitDetector
        self.detector = ClickbaitDetector()

    def test_detects_clickbait_all_caps(self):
        result = self.detector.detect("SHOCKING NEWS: GOVERNMENT CAUGHT LYING TO EVERYONE!")
        assert result.is_clickbait is True
        assert result.score > 0.3

    def test_detects_clickbait_tagalog(self):
        result = self.detector.detect("GRABE!! Natuklasan na ang katotohanan ng bigas scandal!!!")
        assert result.score > 0.3

    def test_clean_headline_not_clickbait(self):
        result = self.detector.detect(
            "DOH reports 500 new cases as vaccination drive continues in Metro Manila"
        )
        assert result.is_clickbait is False

    def test_score_between_0_and_1(self):
        result = self.detector.detect("Breaking news today")
        assert 0.0 <= result.score <= 1.0


# ── TF-IDF Classifier ─────────────────────────────────────────────────────────

class TestTFIDFClassifier:
    def setup_method(self):
        from ml.tfidf_classifier import TFIDFClassifier
        self.clf = TFIDFClassifier()
        self.clf.train()

    def test_predict_returns_valid_verdict(self):
        result = self.clf.predict("DOH reports 500 new COVID cases today in Metro Manila")
        assert result.verdict in ("Credible", "Unverified", "Fake")

    def test_confidence_in_valid_range(self):
        result = self.clf.predict("SHOCKING: Government hid the truth about vaccines!")
        assert 0.0 <= result.confidence <= 100.0

    def test_triggered_features_are_strings(self):
        result = self.clf.predict("GRABE! Namatay daw ang tatlong tao sa bagong sakit!")
        assert all(isinstance(f, str) for f in result.triggered_features)

    def test_seed_fake_news_detected(self):
        result = self.clf.predict("CONFIRMED: Philippines to become 51st state of USA in 2026!")
        # Should not be Credible for obvious fake claim
        assert result.verdict in ("Unverified", "Fake", "Likely Fake")


# ── Scoring Engine (lightweight integration) ──────────────────────────────────

class TestScoringEngine:
    """Integration test β€” no API keys needed, evidence score defaults to 50."""

    @pytest.mark.asyncio
    async def test_verify_text_returns_response(self):
        from scoring.engine import run_verification
        from api.schemas import VerificationResponse

        result = await run_verification(
            "GRABE! Nakita ko raw namatay ang tatlong tao sa bagong sakit na kumakalat sa Pilipinas!",
            input_type="text",
        )
        assert isinstance(result, VerificationResponse)
        assert result.verdict is not None
        assert 0.0 <= result.final_score <= 100.0

    @pytest.mark.asyncio
    async def test_verify_credible_text(self):
        from scoring.engine import run_verification

        result = await run_verification(
            "DOH reports 500 new COVID-19 cases as vaccination drive continues in Metro Manila",
            input_type="text",
        )
        assert result.final_score is not None
        assert result.language is not None

    @pytest.mark.asyncio
    async def test_entities_extracted(self):
        from scoring.engine import run_verification

        result = await run_verification(
            "President Marcos announced new policies in Manila regarding the AFP and PNP.",
            input_type="text",
        )
        assert result.entities is not None


# ── Phase 5: Domain Credibility ───────────────────────────────────────────────

class TestDomainCredibility:
    def setup_method(self):
        from evidence.domain_credibility import lookup_domain, extract_domain, is_blacklisted, DomainTier
        self.lookup = lookup_domain
        self.extract = extract_domain
        self.is_blacklisted = is_blacklisted
        self.DomainTier = DomainTier

    def test_rappler_is_tier1(self):
        result = self.lookup("https://www.rappler.com/news/something")
        assert result.tier == self.DomainTier.CREDIBLE

    def test_inquirer_is_tier1(self):
        result = self.lookup("inquirer.net")
        assert result.tier == self.DomainTier.CREDIBLE

    def test_known_fake_is_tier4(self):
        result = self.lookup("duterte.news")
        assert result.tier == self.DomainTier.KNOWN_FAKE

    def test_unknown_domain_is_tier3(self):
        result = self.lookup("some-totally-random-blog.ph")
        assert result.tier == self.DomainTier.SUSPICIOUS

    def test_blacklisted_returns_true(self):
        assert self.is_blacklisted("maharlikanews.com") is True

    def test_rappler_not_blacklisted(self):
        assert self.is_blacklisted("rappler.com") is False

    def test_extract_domain_strips_www(self):
        assert self.extract("https://www.gmanetwork.com/news/story") == "gmanetwork.com"

    def test_tier1_score_adjustment_positive(self):
        result = self.lookup("rappler.com")
        assert result.score_adjustment > 0

    def test_tier4_score_adjustment_negative(self):
        result = self.lookup("pinoyakoblog.com")
        assert result.score_adjustment < 0


# ── Phase 5: Similarity ───────────────────────────────────────────────────────

class TestSimilarity:
    def setup_method(self):
        from evidence.similarity import compute_similarity, _jaccard_similarity, rank_articles_by_similarity
        self.compute = compute_similarity
        self.jaccard = _jaccard_similarity
        self.rank = rank_articles_by_similarity

    def test_identical_texts_score_1(self):
        score = self.jaccard("free vaccines available now", "free vaccines available now")
        assert score == 1.0

    def test_unrelated_texts_low_score(self):
        score = self.jaccard("banana pancakes recipe", "supreme court ruling on property tax")
        assert score < 0.2

    def test_empty_claim_returns_0(self):
        assert self.compute("", "some article text") == 0.0

    def test_score_in_range(self):
        score = self.compute("government hid truth about vaccines", "vaccine rollout delayed by officials")
        assert 0.0 <= score <= 1.0

    def test_rank_articles_sorted_desc(self):
        articles = [
            {"title": "Banana split recipe tips", "description": ""},
            {"title": "Government vaccine program expanded", "description": "DOH announces rollout"},
            {"title": "COVID vaccination drive update", "description": "Metro Manila sites open"},
        ]
        ranked = self.rank("vaccine rollout in Metro Manila", articles)
        similarities = [a["similarity"] for a in ranked]
        assert similarities == sorted(similarities, reverse=True)


# ── Phase 5: Stance Detection ─────────────────────────────────────────────────

class TestStanceDetector:
    def setup_method(self):
        from evidence.stance_detector import detect_stance, Stance
        self.detect = detect_stance
        self.Stance = Stance

    def test_refutation_keywords_trigger_refutes(self):
        result = self.detect(
            claim="Government distributed free rice to all families",
            article_title="FACT CHECK: False β€” No free rice distribution was authorized",
            article_description="Officials confirmed no such program exists",
            similarity=0.55,
        )
        assert result.stance == self.Stance.REFUTES

    def test_low_similarity_returns_nei(self):
        result = self.detect(
            claim="Earthquake hits Mindanao",
            article_title="Restaurant review: Best adobo in Quezon City",
            article_description="Five star dining experience downtown",
            similarity=0.05,
        )
        assert result.stance == self.Stance.NOT_ENOUGH_INFO

    def test_fact_check_domain_returns_refutes(self):
        result = self.detect(
            claim="New law passed by senate",
            article_title="Article about laws",
            article_description="Senate session coverage",
            article_url="https://vera-files.org/fact-check/123",
            similarity=0.40,
        )
        assert result.stance == self.Stance.REFUTES

    def test_confidence_in_range(self):
        result = self.detect(
            claim="DOH confirms new disease outbreak",
            article_title="DOH official statement on health alert confirmed",
            article_description="Health officials verified the outbreak in Metro Manila",
            similarity=0.60,
        )
        assert 0.0 <= result.confidence <= 1.0

    def test_result_has_reason(self):
        result = self.detect("Some claim", "Some title", "Some description", similarity=0.30)
        assert isinstance(result.reason, str) and len(result.reason) > 0