Spaces:
Running
feat: extension button placement, text extraction, OCR display + ML improvements
Extension:
- Fix verify button placement to appear below Like/Comment/Share bar on all page types
- Fix walk-up logic to find correct post container (message container > article > innerText>100)
- Fix [dir="auto"] filter to work when postElement is a sub-section of the article
- Add obfuscated text detection to skip Facebook's character-by-character spans
- Show full caption text (no truncation) and extracted image in verification modal
- Add IMAGE TEXT (OCR) section to modal using backend ocr_text field
- Fix "Extension context invalidated" error to show refresh prompt instead of retry
- Add webNavigation SPA detection for Facebook pushState navigation
API:
- Add image_url field to TextVerifyRequest — runs Tesseract OCR alongside caption text
- Add ocr_text field to VerificationResponse for image OCR results
ML/NLP:
- Add Naive Bayes, BoW, LDA, and ensemble classifiers
- Add Tagalog RoBERTa fine-tuned model
- Improve preprocessor, NER, claim extractor, and scoring engine
- Add benchmarks page to frontend
Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
- .firebaserc +1 -1
- api/routes/verify.py +16 -1
- api/schemas.py +5 -0
- docs/plans/2026-03-17-extension-ui-redesign.md +104 -0
- evidence/stance_detector.py +61 -4
- extension/background.js +32 -4
- extension/content.css +108 -37
- extension/content.js +578 -234
- extension/manifest.json +5 -1
- extension/popup.html +88 -13
- extension/popup.js +61 -36
- firebase.json +8 -1
- firestore.indexes.json +49 -11
- firestore.rules +14 -5
- frontend/src/App.jsx +2 -0
- frontend/src/components/Navbar.jsx +2 -1
- frontend/src/pages/BenchmarksPage.jsx +400 -0
- frontend/src/pages/VerifyPage.jsx +21 -1
- ml/bow_classifier.py +69 -0
- ml/dataset.py +38 -0
- ml/ensemble_classifier.py +76 -0
- ml/eval.py +172 -0
- ml/lda_analysis.py +182 -0
- ml/models/tagalog_roberta_model/config.json +40 -0
- ml/models/tagalog_roberta_model/tokenizer.json +0 -0
- ml/models/tagalog_roberta_model/tokenizer_config.json +20 -0
- ml/naive_bayes_classifier.py +111 -0
- ml/tagalog_roberta_classifier.py +157 -0
- ml/train_tagalog_roberta.py +287 -0
- ml/train_xlmr.py +7 -1
- ml/xlm_roberta_classifier.py +21 -1
- nlp/claim_extractor.py +59 -53
- nlp/ner.py +14 -8
- nlp/preprocessor.py +57 -10
- requirements.txt +1 -0
- scoring/engine.py +38 -18
- tests/test_improvements.py +409 -0
|
@@ -12,4 +12,4 @@
|
|
| 12 |
}
|
| 13 |
},
|
| 14 |
"etags": {}
|
| 15 |
-
}
|
|
|
|
| 12 |
}
|
| 13 |
},
|
| 14 |
"etags": {}
|
| 15 |
+
}
|
|
@@ -77,11 +77,26 @@ async def _fetch_og_text(url: str) -> str:
|
|
| 77 |
)
|
| 78 |
async def verify_text(body: TextVerifyRequest) -> VerificationResponse:
|
| 79 |
start = time.perf_counter()
|
| 80 |
-
logger.info("verify/text called | chars=%d", len(body.text))
|
| 81 |
try:
|
| 82 |
result = await run_verification(body.text, input_type="text")
|
| 83 |
result.processing_time_ms = round((time.perf_counter() - start) * 1000, 1)
|
| 84 |
result.extracted_text = body.text
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 85 |
return result
|
| 86 |
except Exception as exc:
|
| 87 |
logger.exception("verify/text error: %s", exc)
|
|
|
|
| 77 |
)
|
| 78 |
async def verify_text(body: TextVerifyRequest) -> VerificationResponse:
|
| 79 |
start = time.perf_counter()
|
| 80 |
+
logger.info("verify/text called | chars=%d | has_image=%s", len(body.text), bool(body.image_url))
|
| 81 |
try:
|
| 82 |
result = await run_verification(body.text, input_type="text")
|
| 83 |
result.processing_time_ms = round((time.perf_counter() - start) * 1000, 1)
|
| 84 |
result.extracted_text = body.text
|
| 85 |
+
|
| 86 |
+
# If an image URL was provided, fetch it and run OCR β store result separately
|
| 87 |
+
if body.image_url:
|
| 88 |
+
try:
|
| 89 |
+
import httpx
|
| 90 |
+
async with httpx.AsyncClient(timeout=10) as client:
|
| 91 |
+
img_resp = await client.get(body.image_url)
|
| 92 |
+
if img_resp.status_code == 200:
|
| 93 |
+
ocr = await extract_text_from_image(img_resp.content)
|
| 94 |
+
if ocr:
|
| 95 |
+
result.ocr_text = ocr.strip()
|
| 96 |
+
logger.info("OCR from image_url: %d chars", len(result.ocr_text))
|
| 97 |
+
except Exception as ocr_exc:
|
| 98 |
+
logger.warning("OCR for image_url failed (non-fatal): %s", ocr_exc)
|
| 99 |
+
|
| 100 |
return result
|
| 101 |
except Exception as exc:
|
| 102 |
logger.exception("verify/text error: %s", exc)
|
|
@@ -49,6 +49,7 @@ class DomainTier(int, Enum):
|
|
| 49 |
|
| 50 |
class TextVerifyRequest(BaseModel):
|
| 51 |
text: str = Field(..., min_length=10, max_length=10_000, description="Raw text to verify")
|
|
|
|
| 52 |
|
| 53 |
|
| 54 |
class URLVerifyRequest(BaseModel):
|
|
@@ -71,6 +72,7 @@ class Layer1Result(BaseModel):
|
|
| 71 |
default_factory=list,
|
| 72 |
description="Human-readable list of suspicious features detected",
|
| 73 |
)
|
|
|
|
| 74 |
|
| 75 |
|
| 76 |
class EvidenceSource(BaseModel):
|
|
@@ -78,6 +80,7 @@ class EvidenceSource(BaseModel):
|
|
| 78 |
url: str
|
| 79 |
similarity: float = Field(..., ge=0.0, le=1.0, description="Cosine similarity to input claim")
|
| 80 |
stance: Stance
|
|
|
|
| 81 |
domain_tier: DomainTier
|
| 82 |
published_at: Optional[str] = None
|
| 83 |
source_name: Optional[str] = None
|
|
@@ -88,6 +91,7 @@ class Layer2Result(BaseModel):
|
|
| 88 |
evidence_score: float = Field(..., ge=0.0, le=100.0)
|
| 89 |
sources: list[EvidenceSource] = []
|
| 90 |
claim_used: Optional[str] = Field(None, description="Extracted claim sent to evidence search")
|
|
|
|
| 91 |
|
| 92 |
|
| 93 |
# ββ Main Response βββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ
|
|
@@ -106,6 +110,7 @@ class VerificationResponse(BaseModel):
|
|
| 106 |
input_type: str = "text"
|
| 107 |
processing_time_ms: Optional[float] = None
|
| 108 |
extracted_text: Optional[str] = Field(None, description="Raw text extracted from the URL / image / video for transparency")
|
|
|
|
| 109 |
|
| 110 |
|
| 111 |
# ββ History / Trends ββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ
|
|
|
|
| 49 |
|
| 50 |
class TextVerifyRequest(BaseModel):
|
| 51 |
text: str = Field(..., min_length=10, max_length=10_000, description="Raw text to verify")
|
| 52 |
+
image_url: Optional[str] = Field(None, description="Optional image URL to run OCR on alongside the text")
|
| 53 |
|
| 54 |
|
| 55 |
class URLVerifyRequest(BaseModel):
|
|
|
|
| 72 |
default_factory=list,
|
| 73 |
description="Human-readable list of suspicious features detected",
|
| 74 |
)
|
| 75 |
+
model_tier: Optional[str] = Field(None, description="Classifier used: ensemble | xlmr | tfidf")
|
| 76 |
|
| 77 |
|
| 78 |
class EvidenceSource(BaseModel):
|
|
|
|
| 80 |
url: str
|
| 81 |
similarity: float = Field(..., ge=0.0, le=1.0, description="Cosine similarity to input claim")
|
| 82 |
stance: Stance
|
| 83 |
+
stance_reason: Optional[str] = Field(None, description="NLI entailment or keyword reason for stance")
|
| 84 |
domain_tier: DomainTier
|
| 85 |
published_at: Optional[str] = None
|
| 86 |
source_name: Optional[str] = None
|
|
|
|
| 91 |
evidence_score: float = Field(..., ge=0.0, le=100.0)
|
| 92 |
sources: list[EvidenceSource] = []
|
| 93 |
claim_used: Optional[str] = Field(None, description="Extracted claim sent to evidence search")
|
| 94 |
+
claim_method: Optional[str] = Field(None, description="How the claim was extracted: sentence_scoring | sentence_heuristic | passthrough")
|
| 95 |
|
| 96 |
|
| 97 |
# ββ Main Response βββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ
|
|
|
|
| 110 |
input_type: str = "text"
|
| 111 |
processing_time_ms: Optional[float] = None
|
| 112 |
extracted_text: Optional[str] = Field(None, description="Raw text extracted from the URL / image / video for transparency")
|
| 113 |
+
ocr_text: Optional[str] = Field(None, description="Text extracted from an image via OCR (when image_url was provided alongside text)")
|
| 114 |
|
| 115 |
|
| 116 |
# ββ History / Trends ββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ
|
|
@@ -0,0 +1,104 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# PhilVerify Extension β UI Redesign
|
| 2 |
+
**Date:** 2026-03-17
|
| 3 |
+
|
| 4 |
+
## Goals
|
| 5 |
+
1. Surface new backend fields (`model_tier`, `claim_method`, `stance_reason`) without cluttering the UI
|
| 6 |
+
2. Make the verdict scannable in under 2 seconds across all surfaces
|
| 7 |
+
3. Visual consistency between side panel, inline modal, and history tab
|
| 8 |
+
|
| 9 |
+
## Information Hierarchy
|
| 10 |
+
|
| 11 |
+
Four tiers, consistent across all surfaces:
|
| 12 |
+
|
| 13 |
+
| Tier | Content | Style |
|
| 14 |
+
|------|---------|-------|
|
| 15 |
+
| 1 | Verdict label | Large, bold, verdict color |
|
| 16 |
+
| 2 | Credibility score | Medium weight, verdict color, slightly smaller |
|
| 17 |
+
| 3 | Signals + top source | Normal weight, neutral text |
|
| 18 |
+
| 4 | model_tier, claim_method | 10px monospace, muted #6b7280 |
|
| 19 |
+
|
| 20 |
+
**Visual anchor:** 3px left border in verdict color on every result card.
|
| 21 |
+
|
| 22 |
+
**Theme:** dark newsroom — `#0d0d0d` bg, `#1a1a1a` card surface, `#262626` borders.
|
| 23 |
+
|
| 24 |
+
---
|
| 25 |
+
|
| 26 |
+
## Side Panel Result Card (`popup.js renderResult`)
|
| 27 |
+
|
| 28 |
+
**Top block**
|
| 29 |
+
- 3px left border (verdict color)
|
| 30 |
+
- Verdict label: 20px bold, verdict color
|
| 31 |
+
- Score: same line, right-aligned
|
| 32 |
+
- 1px colored hairline separator below
|
| 33 |
+
|
| 34 |
+
**Middle block**
|
| 35 |
+
- Triggered features: small inline chips (dark bg, verdict-colored border, 10px)
|
| 36 |
+
- Top source: distinct link block with `#1a1a1a` bg, `#262626` border, site name + truncated title + →
|
| 37 |
+
|
| 38 |
+
**Footer block**
|
| 39 |
+
- `border-top: 1px solid #262626`, 8px top padding
|
| 40 |
+
- `MODEL ensemble CLAIM VIA sentence_scoring`
|
| 41 |
+
- 10px monospace, labels `#4b5563`, values `#6b7280`
|
| 42 |
+
|
| 43 |
+
**Bottom**
|
| 44 |
+
- "Open Full Dashboard →" as full-width footer button with `border-top: 1px solid #262626`
|
| 45 |
+
|
| 46 |
+
---
|
| 47 |
+
|
| 48 |
+
## Inline Modal (content.js / content.css)
|
| 49 |
+
|
| 50 |
+
Injected as full-width block below post. Fixed width ~320px. Same left-border spine pattern.
|
| 51 |
+
|
| 52 |
+
```
|
| 53 |
+
βββββββββββββββββββββββββββββββββββββββββ
|
| 54 |
+
β LIKELY FAKE 84% credibility
|
| 55 |
+
β βββββββββββββββββββββββββββββββββββββ
|
| 56 |
+
β Signals: clickbait_title, no_byline
|
| 57 |
+
β Top Source: Rappler β "Claim is falseβ¦" β
|
| 58 |
+
β βββββββββββββββββββββββββββββββββββββ
|
| 59 |
+
β model: ensemble Β· via: sentence_scoring
|
| 60 |
+
βββββββββββββββββββββββββββββββββββββββββ
|
| 61 |
+
```
|
| 62 |
+
|
| 63 |
+
- Line 1: Verdict (bold, verdict color) + score right-aligned
|
| 64 |
+
- Line 2: Hairline separator (verdict color, 30% opacity)
|
| 65 |
+
- Line 3: Signals (up to 3, comma-separated)
|
| 66 |
+
- Line 4: Top source title truncated at 45 chars + →
|
| 67 |
+
- Line 5: Hairline separator
|
| 68 |
+
- Line 6: model_tier · claim_method — 10px monospace, muted
|
| 69 |
+
|
| 70 |
+
- `Γ` dismiss button top-right
|
| 71 |
+
- "Verify this post" button replaced in-place by result block after verification
|
| 72 |
+
|
| 73 |
+
---
|
| 74 |
+
|
| 75 |
+
## History Tab
|
| 76 |
+
|
| 77 |
+
Entry layout (~60px tall per item):
|
| 78 |
+
|
| 79 |
+
```
|
| 80 |
+
βββββββββββββββββββββββββββββββββββββββββββ
|
| 81 |
+
β β LIKELY FAKE 84% ensemble β
|
| 82 |
+
β "Marcos signs new law allowingβ¦" β
|
| 83 |
+
β 2h ago β
|
| 84 |
+
βββββββββββββββββββββββββββββββββββββββββββ
|
| 85 |
+
```
|
| 86 |
+
|
| 87 |
+
- Row 1: Colored dot + verdict chip + score + model_tier (muted monospace, pushed right)
|
| 88 |
+
- Row 2: Text preview (#9ca3af, 12px)
|
| 89 |
+
- Row 3: Timestamp (#6b7280, 10px)
|
| 90 |
+
- Left border: 2px solid verdict color
|
| 91 |
+
- Hover: `background: #1a1a1a`
|
| 92 |
+
|
| 93 |
+
Empty state: centered 32px shield SVG outline (muted) + "No verifications yet." below it.
|
| 94 |
+
|
| 95 |
+
---
|
| 96 |
+
|
| 97 |
+
## Files to Modify
|
| 98 |
+
|
| 99 |
+
| File | Changes |
|
| 100 |
+
|------|---------|
|
| 101 |
+
| `extension/popup.js` | Rewrite `renderResult()`, update `renderHistory()` |
|
| 102 |
+
| `extension/popup.css` | Add `.result-spine`, `.result-footer-meta`, `.result-chip`, update `.history-item` |
|
| 103 |
+
| `extension/content.js` | Update modal HTML template |
|
| 104 |
+
| `extension/content.css` | Update `.pv-badge` / modal styles, add spine + footer-meta |
|
|
@@ -7,10 +7,15 @@ Stance labels:
|
|
| 7 |
Refutes → article content contradicts / debunks the claim
|
| 8 |
Not Enough Info → article is related but not conclusive either way
|
| 9 |
|
| 10 |
-
Strategy (
|
| 11 |
-
|
| 12 |
-
|
| 13 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 14 |
"""
|
| 15 |
import logging
|
| 16 |
import re
|
|
@@ -19,6 +24,28 @@ from enum import Enum
|
|
| 19 |
|
| 20 |
logger = logging.getLogger(__name__)
|
| 21 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 22 |
|
| 23 |
class Stance(str, Enum):
|
| 24 |
SUPPORTS = "Supports"
|
|
@@ -110,6 +137,36 @@ def detect_stance(
|
|
| 110 |
reason=f"Low similarity ({similarity:.2f}) β article not related to claim",
|
| 111 |
)
|
| 112 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 113 |
# ββ Rule 2: Scan for refutation keywords ββββββββββββββββββββββββββββββββββ
|
| 114 |
refutation_hits = _scan_keywords(article_text, _REFUTATION_KEYWORDS)
|
| 115 |
if refutation_hits:
|
|
|
|
| 7 |
Refutes → article content contradicts / debunks the claim
|
| 8 |
Not Enough Info → article is related but not conclusive either way
|
| 9 |
|
| 10 |
+
Strategy (hybrid — NLI model primary, keyword rules as fallback):
|
| 11 |
+
0. Known PH fact-check domain → always Refutes
|
| 12 |
+
1. Similarity floor → too low similarity → NEI
|
| 13 |
+
1.5 NLI entailment check (cross-encoder/nli-MiniLM2-L6-H768) when
|
| 14 |
+
article description is long enough and model is available.
|
| 15 |
+
Uses the claim as the hypothesis and the article text as the premise.
|
| 16 |
+
Falls through to keyword rules if NLI confidence < 0.65.
|
| 17 |
+
2. Keyword scan of title + description for refutation/support signals
|
| 18 |
+
3. Default NEI
|
| 19 |
"""
|
| 20 |
import logging
|
| 21 |
import re
|
|
|
|
| 24 |
|
| 25 |
logger = logging.getLogger(__name__)
|
| 26 |
|
| 27 |
+
# ββ NLI model (lazy-loaded) βββββββββββββββββββββββββββββββββββββββββββββββββββ
|
| 28 |
+
_nli_pipe = None
|
| 29 |
+
_nli_loaded = False
|
| 30 |
+
|
| 31 |
+
|
| 32 |
+
def _get_nli():
|
| 33 |
+
"""Return the zero-shot NLI pipeline, loading it once on first call."""
|
| 34 |
+
global _nli_pipe, _nli_loaded
|
| 35 |
+
if _nli_loaded:
|
| 36 |
+
return _nli_pipe
|
| 37 |
+
try:
|
| 38 |
+
from transformers import pipeline
|
| 39 |
+
_nli_pipe = pipeline(
|
| 40 |
+
"zero-shot-classification",
|
| 41 |
+
model="cross-encoder/nli-MiniLM2-L6-H768",
|
| 42 |
+
)
|
| 43 |
+
logger.info("NLI stance model (nli-MiniLM2-L6-H768) loaded")
|
| 44 |
+
except Exception as e:
|
| 45 |
+
logger.warning("NLI stance model unavailable (%s) β using keyword fallback", e)
|
| 46 |
+
_nli_loaded = True
|
| 47 |
+
return _nli_pipe
|
| 48 |
+
|
| 49 |
|
| 50 |
class Stance(str, Enum):
|
| 51 |
SUPPORTS = "Supports"
|
|
|
|
| 137 |
reason=f"Low similarity ({similarity:.2f}) β article not related to claim",
|
| 138 |
)
|
| 139 |
|
| 140 |
+
# ββ Rule 1.5: NLI entailment β semantically compare claim to article ββββββ
|
| 141 |
+
nli = _get_nli()
|
| 142 |
+
if nli and len(article_description.strip()) > 30:
|
| 143 |
+
try:
|
| 144 |
+
nli_result = nli(
|
| 145 |
+
article_description[:512],
|
| 146 |
+
candidate_labels=["supports the claim", "contradicts the claim", "unrelated"],
|
| 147 |
+
hypothesis_template="This text {}.",
|
| 148 |
+
)
|
| 149 |
+
top_label = nli_result["labels"][0]
|
| 150 |
+
top_score = float(nli_result["scores"][0])
|
| 151 |
+
if top_score >= 0.65:
|
| 152 |
+
if "supports" in top_label:
|
| 153 |
+
return StanceResult(
|
| 154 |
+
stance=Stance.SUPPORTS,
|
| 155 |
+
confidence=round(top_score, 2),
|
| 156 |
+
matched_keywords=[],
|
| 157 |
+
reason=f"NLI entailment ({top_score:.2f}): article supports claim",
|
| 158 |
+
)
|
| 159 |
+
elif "contradicts" in top_label:
|
| 160 |
+
return StanceResult(
|
| 161 |
+
stance=Stance.REFUTES,
|
| 162 |
+
confidence=round(top_score, 2),
|
| 163 |
+
matched_keywords=[],
|
| 164 |
+
reason=f"NLI contradiction ({top_score:.2f}): article contradicts claim",
|
| 165 |
+
)
|
| 166 |
+
# NLI confidence below threshold β fall through to keyword rules
|
| 167 |
+
except Exception as e:
|
| 168 |
+
logger.debug("NLI inference error: %s", e)
|
| 169 |
+
|
| 170 |
# ββ Rule 2: Scan for refutation keywords ββββββββββββββββββββββββββββββββββ
|
| 171 |
refutation_hits = _scan_keywords(article_text, _REFUTATION_KEYWORDS)
|
| 172 |
if refutation_hits:
|
|
@@ -20,7 +20,7 @@ const MAX_HISTORY = 50
|
|
| 20 |
|
| 21 |
// ββ Default settings ββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ
|
| 22 |
const DEFAULT_SETTINGS = {
|
| 23 |
-
apiBase: '
|
| 24 |
autoScan: true, // Automatically scan Facebook feed posts
|
| 25 |
}
|
| 26 |
|
|
@@ -75,6 +75,7 @@ async function setCached(key, result, preview) {
|
|
| 75 |
text_preview: preview.slice(0, 80),
|
| 76 |
verdict: result.verdict,
|
| 77 |
final_score: result.final_score,
|
|
|
|
| 78 |
}
|
| 79 |
const updated = [entry, ...history.filter(h => h.id !== key)].slice(0, MAX_HISTORY)
|
| 80 |
await chrome.storage.local.set({ history: updated })
|
|
@@ -88,15 +89,17 @@ async function verifyText(text, imageUrl) {
|
|
| 88 |
if (hit) return { ...hit, _fromCache: true }
|
| 89 |
|
| 90 |
const { apiBase } = await getSettings()
|
| 91 |
-
// Build payload β include imageUrl for multimodal (text + image) analysis
|
| 92 |
const payload = { text }
|
| 93 |
if (imageUrl && isHttpUrl(imageUrl)) payload.image_url = imageUrl
|
|
|
|
|
|
|
| 94 |
|
| 95 |
const res = await fetch(`${apiBase}/verify/text`, {
|
| 96 |
method: 'POST',
|
| 97 |
headers: { 'Content-Type': 'application/json' },
|
| 98 |
body: JSON.stringify(payload),
|
| 99 |
})
|
|
|
|
| 100 |
if (!res.ok) {
|
| 101 |
const body = await res.json().catch(() => ({}))
|
| 102 |
throw new Error(body.detail ?? `API error ${res.status}`)
|
|
@@ -112,11 +115,14 @@ async function verifyUrl(url) {
|
|
| 112 |
if (hit) return { ...hit, _fromCache: true }
|
| 113 |
|
| 114 |
const { apiBase } = await getSettings()
|
|
|
|
|
|
|
| 115 |
const res = await fetch(`${apiBase}/verify/url`, {
|
| 116 |
method: 'POST',
|
| 117 |
headers: { 'Content-Type': 'application/json' },
|
| 118 |
body: JSON.stringify({ url }),
|
| 119 |
})
|
|
|
|
| 120 |
if (!res.ok) {
|
| 121 |
const body = await res.json().catch(() => ({}))
|
| 122 |
throw new Error(body.detail ?? `API error ${res.status}`)
|
|
@@ -203,13 +209,35 @@ chrome.runtime.onMessage.addListener((msg, _sender, sendResponse) => {
|
|
| 203 |
sendResponse({ ok: false, error: 'Invalid API URL: only http/https allowed' })
|
| 204 |
return false
|
| 205 |
}
|
| 206 |
-
|
| 207 |
-
|
|
|
|
| 208 |
.then(() => sendResponse({ ok: true }))
|
| 209 |
return true
|
| 210 |
}
|
| 211 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 212 |
default:
|
| 213 |
break
|
| 214 |
}
|
| 215 |
})
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 20 |
|
| 21 |
// ββ Default settings ββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ
|
| 22 |
const DEFAULT_SETTINGS = {
|
| 23 |
+
apiBase: 'http://localhost:8000/api',
|
| 24 |
autoScan: true, // Automatically scan Facebook feed posts
|
| 25 |
}
|
| 26 |
|
|
|
|
| 75 |
text_preview: preview.slice(0, 80),
|
| 76 |
verdict: result.verdict,
|
| 77 |
final_score: result.final_score,
|
| 78 |
+
model_tier: result.layer1?.model_tier ?? null,
|
| 79 |
}
|
| 80 |
const updated = [entry, ...history.filter(h => h.id !== key)].slice(0, MAX_HISTORY)
|
| 81 |
await chrome.storage.local.set({ history: updated })
|
|
|
|
| 89 |
if (hit) return { ...hit, _fromCache: true }
|
| 90 |
|
| 91 |
const { apiBase } = await getSettings()
|
|
|
|
| 92 |
const payload = { text }
|
| 93 |
if (imageUrl && isHttpUrl(imageUrl)) payload.image_url = imageUrl
|
| 94 |
+
|
| 95 |
+
console.log('[PhilVerify BG] Calling API:', `${apiBase}/verify/text`, payload)
|
| 96 |
|
| 97 |
const res = await fetch(`${apiBase}/verify/text`, {
|
| 98 |
method: 'POST',
|
| 99 |
headers: { 'Content-Type': 'application/json' },
|
| 100 |
body: JSON.stringify(payload),
|
| 101 |
})
|
| 102 |
+
console.log('[PhilVerify BG] API Response Status:', res.status)
|
| 103 |
if (!res.ok) {
|
| 104 |
const body = await res.json().catch(() => ({}))
|
| 105 |
throw new Error(body.detail ?? `API error ${res.status}`)
|
|
|
|
| 115 |
if (hit) return { ...hit, _fromCache: true }
|
| 116 |
|
| 117 |
const { apiBase } = await getSettings()
|
| 118 |
+
console.log('[PhilVerify BG] Calling API:', `${apiBase}/verify/url`, url)
|
| 119 |
+
|
| 120 |
const res = await fetch(`${apiBase}/verify/url`, {
|
| 121 |
method: 'POST',
|
| 122 |
headers: { 'Content-Type': 'application/json' },
|
| 123 |
body: JSON.stringify({ url }),
|
| 124 |
})
|
| 125 |
+
console.log('[PhilVerify BG] API Response Status:', res.status)
|
| 126 |
if (!res.ok) {
|
| 127 |
const body = await res.json().catch(() => ({}))
|
| 128 |
throw new Error(body.detail ?? `API error ${res.status}`)
|
|
|
|
| 209 |
sendResponse({ ok: false, error: 'Invalid API URL: only http/https allowed' })
|
| 210 |
return false
|
| 211 |
}
|
| 212 |
+
// Merge with existing settings so a partial update doesn't clobber other fields
|
| 213 |
+
getSettings()
|
| 214 |
+
.then(current => chrome.storage.local.set({ settings: { ...current, ...incoming } }))
|
| 215 |
.then(() => sendResponse({ ok: true }))
|
| 216 |
return true
|
| 217 |
}
|
| 218 |
|
| 219 |
+
case 'CHECK_HEALTH': {
|
| 220 |
+
getSettings()
|
| 221 |
+
.then(({ apiBase }) => fetch(`${apiBase}/health`, { signal: AbortSignal.timeout(3000) }))
|
| 222 |
+
.then(res => sendResponse({ ok: res.ok, status: res.status }))
|
| 223 |
+
.catch(e => sendResponse({ ok: false, error: e.message }))
|
| 224 |
+
return true
|
| 225 |
+
}
|
| 226 |
+
|
| 227 |
default:
|
| 228 |
break
|
| 229 |
}
|
| 230 |
})
|
| 231 |
+
|
| 232 |
+
// ββ SPA navigation: re-scan Facebook posts after pushState navigation βββββββββ
|
| 233 |
+
// Facebook is a single-page app β clicking Home/Profile/etc. does a pushState
|
| 234 |
+
// navigation without reloading the page. The content script stays alive but
|
| 235 |
+
// needs to re-scan for new post articles after the page content changes.
|
| 236 |
+
chrome.webNavigation.onHistoryStateUpdated.addListener((details) => {
|
| 237 |
+
if (details.url.includes('facebook.com')) {
|
| 238 |
+
chrome.tabs.sendMessage(details.tabId, { action: 'RE_SCAN_POSTS' }, () => {
|
| 239 |
+
// Suppress "no listener" errors when the content script isn't loaded yet
|
| 240 |
+
if (chrome.runtime.lastError) {}
|
| 241 |
+
})
|
| 242 |
+
}
|
| 243 |
+
})
|
|
@@ -10,8 +10,13 @@
|
|
| 10 |
display: flex;
|
| 11 |
justify-content: flex-end;
|
| 12 |
padding: 4px 12px 8px;
|
| 13 |
-
pointer-events
|
| 14 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 15 |
}
|
| 16 |
|
| 17 |
.pv-verify-btn {
|
|
@@ -139,30 +144,67 @@
|
|
| 139 |
}
|
| 140 |
}
|
| 141 |
|
| 142 |
-
/* ββ
|
| 143 |
-
.pv-
|
| 144 |
-
|
| 145 |
-
|
| 146 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 147 |
background: #141414;
|
| 148 |
border: 1px solid rgba(245, 240, 232, 0.1);
|
| 149 |
-
border-radius:
|
|
|
|
| 150 |
font-family: -apple-system, BlinkMacSystemFont, 'Segoe UI', system-ui, sans-serif;
|
| 151 |
font-size: 11px;
|
| 152 |
color: #f5f0e8;
|
| 153 |
-
|
| 154 |
-
box-shadow: 0 4px 24px rgba(0, 0, 0, 0.5);
|
| 155 |
position: relative;
|
| 156 |
-
|
|
|
|
|
|
|
| 157 |
}
|
| 158 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 159 |
/* β Header */
|
| 160 |
.pv-report-header {
|
| 161 |
display: flex;
|
| 162 |
align-items: center;
|
| 163 |
justify-content: space-between;
|
| 164 |
-
margin-bottom:
|
| 165 |
-
padding-bottom:
|
| 166 |
border-bottom: 1px solid rgba(245, 240, 232, 0.07);
|
| 167 |
}
|
| 168 |
|
|
@@ -182,12 +224,12 @@
|
|
| 182 |
padding: 2px 6px;
|
| 183 |
border-radius: 4px;
|
| 184 |
touch-action: manipulation;
|
| 185 |
-
transition: color 0.15s ease;
|
| 186 |
}
|
| 187 |
|
| 188 |
.pv-report-close:hover {
|
| 189 |
color: #f5f0e8;
|
| 190 |
-
background: rgba(245, 240, 232, 0.
|
| 191 |
}
|
| 192 |
|
| 193 |
.pv-report-close:focus-visible {
|
|
@@ -196,18 +238,17 @@
|
|
| 196 |
|
| 197 |
/* β Verdict row */
|
| 198 |
.pv-report-verdict-row {
|
| 199 |
-
padding:
|
| 200 |
-
margin-bottom:
|
| 201 |
border-left: 3px solid #5c554e;
|
| 202 |
-
border-radius:
|
| 203 |
-
background: rgba(245, 240, 232, 0.03);
|
| 204 |
}
|
| 205 |
|
| 206 |
.pv-report-verdict {
|
| 207 |
-
font-size:
|
| 208 |
font-weight: 800;
|
| 209 |
letter-spacing: -0.01em;
|
| 210 |
-
margin-bottom:
|
| 211 |
}
|
| 212 |
|
| 213 |
.pv-report-score-text {
|
|
@@ -236,7 +277,7 @@
|
|
| 236 |
.pv-confidence-bar-fill {
|
| 237 |
height: 100%;
|
| 238 |
border-radius: 3px;
|
| 239 |
-
transition: width 0.
|
| 240 |
}
|
| 241 |
|
| 242 |
.pv-confidence-bar-value {
|
|
@@ -253,7 +294,7 @@
|
|
| 253 |
display: flex;
|
| 254 |
justify-content: space-between;
|
| 255 |
align-items: center;
|
| 256 |
-
padding:
|
| 257 |
border-bottom: 1px solid rgba(245, 240, 232, 0.05);
|
| 258 |
}
|
| 259 |
|
|
@@ -272,9 +313,9 @@
|
|
| 272 |
color: #a89f94;
|
| 273 |
}
|
| 274 |
|
| 275 |
-
/* β Suspicious signals
|
| 276 |
.pv-report-signals {
|
| 277 |
-
padding:
|
| 278 |
border-bottom: 1px solid rgba(245, 240, 232, 0.05);
|
| 279 |
}
|
| 280 |
|
|
@@ -282,7 +323,7 @@
|
|
| 282 |
display: flex;
|
| 283 |
flex-wrap: wrap;
|
| 284 |
gap: 4px;
|
| 285 |
-
margin-top:
|
| 286 |
}
|
| 287 |
|
| 288 |
.pv-report-tag {
|
|
@@ -298,14 +339,14 @@
|
|
| 298 |
|
| 299 |
/* β Evidence sources */
|
| 300 |
.pv-report-sources {
|
| 301 |
-
padding:
|
| 302 |
border-bottom: 1px solid rgba(245, 240, 232, 0.05);
|
| 303 |
}
|
| 304 |
|
| 305 |
.pv-report-sources-list {
|
| 306 |
list-style: none;
|
| 307 |
padding: 0;
|
| 308 |
-
margin:
|
| 309 |
display: flex;
|
| 310 |
flex-direction: column;
|
| 311 |
gap: 4px;
|
|
@@ -341,9 +382,9 @@
|
|
| 341 |
flex-shrink: 0;
|
| 342 |
}
|
| 343 |
|
| 344 |
-
/* β
|
| 345 |
.pv-report-explanation {
|
| 346 |
-
padding:
|
| 347 |
border-bottom: 1px solid rgba(245, 240, 232, 0.05);
|
| 348 |
}
|
| 349 |
|
|
@@ -351,14 +392,14 @@
|
|
| 351 |
margin: 6px 0 0;
|
| 352 |
font-size: 10px;
|
| 353 |
color: #a89f94;
|
| 354 |
-
line-height: 1.
|
| 355 |
font-style: italic;
|
| 356 |
}
|
| 357 |
|
| 358 |
-
/* β Full
|
| 359 |
.pv-report-full-link {
|
| 360 |
display: block;
|
| 361 |
-
margin-top:
|
| 362 |
text-align: center;
|
| 363 |
color: #dc2626;
|
| 364 |
font-size: 10px;
|
|
@@ -366,12 +407,42 @@
|
|
| 366 |
letter-spacing: 0.08em;
|
| 367 |
text-decoration: none;
|
| 368 |
text-transform: uppercase;
|
| 369 |
-
padding:
|
| 370 |
border: 1px solid rgba(220, 38, 38, 0.3);
|
| 371 |
-
border-radius:
|
| 372 |
transition: background 0.15s ease;
|
| 373 |
}
|
| 374 |
|
| 375 |
.pv-report-full-link:hover {
|
| 376 |
-
background: rgba(220, 38, 38, 0.
|
| 377 |
-
}
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 10 |
display: flex;
|
| 11 |
justify-content: flex-end;
|
| 12 |
padding: 4px 12px 8px;
|
| 13 |
+
/* pointer-events must remain auto β do NOT set to none here */
|
| 14 |
+
}
|
| 15 |
+
|
| 16 |
+
/* When button is injected directly (strategy 1, no wrapper) */
|
| 17 |
+
div > .pv-verify-btn:not(.pv-verify-btn-wrapper .pv-verify-btn) {
|
| 18 |
+
margin-left: 6px;
|
| 19 |
+
vertical-align: middle;
|
| 20 |
}
|
| 21 |
|
| 22 |
.pv-verify-btn {
|
|
|
|
| 144 |
}
|
| 145 |
}
|
| 146 |
|
| 147 |
+
/* ββ Modal overlay βββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ */
|
| 148 |
+
.pv-modal-overlay {
|
| 149 |
+
position: fixed;
|
| 150 |
+
inset: 0;
|
| 151 |
+
z-index: 2147483640;
|
| 152 |
+
display: flex;
|
| 153 |
+
align-items: center;
|
| 154 |
+
justify-content: center;
|
| 155 |
+
padding: 16px;
|
| 156 |
+
background: rgba(0, 0, 0, 0);
|
| 157 |
+
backdrop-filter: blur(0px);
|
| 158 |
+
-webkit-backdrop-filter: blur(0px);
|
| 159 |
+
transition: background 0.2s ease, backdrop-filter 0.2s ease;
|
| 160 |
+
pointer-events: none;
|
| 161 |
+
}
|
| 162 |
+
|
| 163 |
+
.pv-modal-overlay.pv-modal--open {
|
| 164 |
+
background: rgba(0, 0, 0, 0.65);
|
| 165 |
+
backdrop-filter: blur(4px);
|
| 166 |
+
-webkit-backdrop-filter: blur(4px);
|
| 167 |
+
pointer-events: auto;
|
| 168 |
+
}
|
| 169 |
+
|
| 170 |
+
.pv-modal-card {
|
| 171 |
+
width: 100%;
|
| 172 |
+
max-width: 460px;
|
| 173 |
+
max-height: 90vh;
|
| 174 |
+
overflow-y: auto;
|
| 175 |
background: #141414;
|
| 176 |
border: 1px solid rgba(245, 240, 232, 0.1);
|
| 177 |
+
border-radius: 12px;
|
| 178 |
+
padding: 18px 20px;
|
| 179 |
font-family: -apple-system, BlinkMacSystemFont, 'Segoe UI', system-ui, sans-serif;
|
| 180 |
font-size: 11px;
|
| 181 |
color: #f5f0e8;
|
| 182 |
+
box-shadow: 0 24px 64px rgba(0, 0, 0, 0.7), 0 0 0 1px rgba(255,255,255,0.04);
|
|
|
|
| 183 |
position: relative;
|
| 184 |
+
transform: scale(0.94) translateY(12px);
|
| 185 |
+
opacity: 0;
|
| 186 |
+
transition: transform 0.25s cubic-bezier(0.34, 1.56, 0.64, 1), opacity 0.2s ease;
|
| 187 |
}
|
| 188 |
|
| 189 |
+
.pv-modal--open .pv-modal-card {
|
| 190 |
+
transform: scale(1) translateY(0);
|
| 191 |
+
opacity: 1;
|
| 192 |
+
}
|
| 193 |
+
|
| 194 |
+
/* Scrollbar styling for modal card */
|
| 195 |
+
.pv-modal-card::-webkit-scrollbar { width: 4px; }
|
| 196 |
+
.pv-modal-card::-webkit-scrollbar-track { background: transparent; }
|
| 197 |
+
.pv-modal-card::-webkit-scrollbar-thumb { background: rgba(245,240,232,0.15); border-radius: 2px; }
|
| 198 |
+
|
| 199 |
+
/* ββ Report internals (shared between modal & future contexts) βββββββββββββββββ */
|
| 200 |
+
|
| 201 |
/* β Header */
|
| 202 |
.pv-report-header {
|
| 203 |
display: flex;
|
| 204 |
align-items: center;
|
| 205 |
justify-content: space-between;
|
| 206 |
+
margin-bottom: 14px;
|
| 207 |
+
padding-bottom: 10px;
|
| 208 |
border-bottom: 1px solid rgba(245, 240, 232, 0.07);
|
| 209 |
}
|
| 210 |
|
|
|
|
| 224 |
padding: 2px 6px;
|
| 225 |
border-radius: 4px;
|
| 226 |
touch-action: manipulation;
|
| 227 |
+
transition: color 0.15s ease, background 0.15s ease;
|
| 228 |
}
|
| 229 |
|
| 230 |
.pv-report-close:hover {
|
| 231 |
color: #f5f0e8;
|
| 232 |
+
background: rgba(245, 240, 232, 0.07);
|
| 233 |
}
|
| 234 |
|
| 235 |
.pv-report-close:focus-visible {
|
|
|
|
| 238 |
|
| 239 |
/* β Verdict row */
|
| 240 |
.pv-report-verdict-row {
|
| 241 |
+
padding: 12px 14px;
|
| 242 |
+
margin-bottom: 14px;
|
| 243 |
border-left: 3px solid #5c554e;
|
| 244 |
+
border-radius: 4px;
|
|
|
|
| 245 |
}
|
| 246 |
|
| 247 |
.pv-report-verdict {
|
| 248 |
+
font-size: 20px;
|
| 249 |
font-weight: 800;
|
| 250 |
letter-spacing: -0.01em;
|
| 251 |
+
margin-bottom: 3px;
|
| 252 |
}
|
| 253 |
|
| 254 |
.pv-report-score-text {
|
|
|
|
| 277 |
.pv-confidence-bar-fill {
|
| 278 |
height: 100%;
|
| 279 |
border-radius: 3px;
|
| 280 |
+
transition: width 0.6s cubic-bezier(0.4, 0, 0.2, 1);
|
| 281 |
}
|
| 282 |
|
| 283 |
.pv-confidence-bar-value {
|
|
|
|
| 294 |
display: flex;
|
| 295 |
justify-content: space-between;
|
| 296 |
align-items: center;
|
| 297 |
+
padding: 7px 0;
|
| 298 |
border-bottom: 1px solid rgba(245, 240, 232, 0.05);
|
| 299 |
}
|
| 300 |
|
|
|
|
| 313 |
color: #a89f94;
|
| 314 |
}
|
| 315 |
|
| 316 |
+
/* β Suspicious signals */
|
| 317 |
.pv-report-signals {
|
| 318 |
+
padding: 10px 0;
|
| 319 |
border-bottom: 1px solid rgba(245, 240, 232, 0.05);
|
| 320 |
}
|
| 321 |
|
|
|
|
| 323 |
display: flex;
|
| 324 |
flex-wrap: wrap;
|
| 325 |
gap: 4px;
|
| 326 |
+
margin-top: 8px;
|
| 327 |
}
|
| 328 |
|
| 329 |
.pv-report-tag {
|
|
|
|
| 339 |
|
| 340 |
/* β Evidence sources */
|
| 341 |
.pv-report-sources {
|
| 342 |
+
padding: 10px 0;
|
| 343 |
border-bottom: 1px solid rgba(245, 240, 232, 0.05);
|
| 344 |
}
|
| 345 |
|
| 346 |
.pv-report-sources-list {
|
| 347 |
list-style: none;
|
| 348 |
padding: 0;
|
| 349 |
+
margin: 8px 0 0 0;
|
| 350 |
display: flex;
|
| 351 |
flex-direction: column;
|
| 352 |
gap: 4px;
|
|
|
|
| 382 |
flex-shrink: 0;
|
| 383 |
}
|
| 384 |
|
| 385 |
+
/* β Claim analyzed */
|
| 386 |
.pv-report-explanation {
|
| 387 |
+
padding: 10px 0;
|
| 388 |
border-bottom: 1px solid rgba(245, 240, 232, 0.05);
|
| 389 |
}
|
| 390 |
|
|
|
|
| 392 |
margin: 6px 0 0;
|
| 393 |
font-size: 10px;
|
| 394 |
color: #a89f94;
|
| 395 |
+
line-height: 1.6;
|
| 396 |
font-style: italic;
|
| 397 |
}
|
| 398 |
|
| 399 |
+
/* β Full dashboard link */
|
| 400 |
.pv-report-full-link {
|
| 401 |
display: block;
|
| 402 |
+
margin-top: 14px;
|
| 403 |
text-align: center;
|
| 404 |
color: #dc2626;
|
| 405 |
font-size: 10px;
|
|
|
|
| 407 |
letter-spacing: 0.08em;
|
| 408 |
text-decoration: none;
|
| 409 |
text-transform: uppercase;
|
| 410 |
+
padding: 8px;
|
| 411 |
border: 1px solid rgba(220, 38, 38, 0.3);
|
| 412 |
+
border-radius: 6px;
|
| 413 |
transition: background 0.15s ease;
|
| 414 |
}
|
| 415 |
|
| 416 |
.pv-report-full-link:hover {
|
| 417 |
+
background: rgba(220, 38, 38, 0.1);
|
| 418 |
+
}
|
| 419 |
+
|
| 420 |
+
/* ββ Metadata footer (model_tier + claim_method) ββββββββββββββββββββββββββββββ */
|
| 421 |
+
.pv-report-meta-footer {
|
| 422 |
+
display: flex;
|
| 423 |
+
align-items: center;
|
| 424 |
+
gap: 5px;
|
| 425 |
+
padding: 8px 0 4px;
|
| 426 |
+
border-top: 1px solid rgba(245, 240, 232, 0.05);
|
| 427 |
+
margin-top: 4px;
|
| 428 |
+
}
|
| 429 |
+
|
| 430 |
+
.pv-report-meta-label {
|
| 431 |
+
font-size: 8px;
|
| 432 |
+
font-weight: 700;
|
| 433 |
+
letter-spacing: 0.1em;
|
| 434 |
+
color: #5c554e;
|
| 435 |
+
text-transform: uppercase;
|
| 436 |
+
}
|
| 437 |
+
|
| 438 |
+
.pv-report-meta-val {
|
| 439 |
+
font-size: 9px;
|
| 440 |
+
font-family: 'SF Mono', 'Menlo', monospace;
|
| 441 |
+
color: #6b7280;
|
| 442 |
+
}
|
| 443 |
+
|
| 444 |
+
.pv-report-meta-sep {
|
| 445 |
+
color: rgba(245, 240, 232, 0.12);
|
| 446 |
+
font-size: 10px;
|
| 447 |
+
margin: 0 1px;
|
| 448 |
+
}
|
|
@@ -176,15 +176,59 @@
|
|
| 176 |
}
|
| 177 |
}
|
| 178 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 179 |
function extractPostText(post) {
|
| 180 |
expandSeeMore(post)
|
| 181 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 182 |
// Primary selectors β platform-specific, high confidence
|
| 183 |
-
|
| 184 |
-
|
| 185 |
-
|
| 186 |
-
|
| 187 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 188 |
}
|
| 189 |
}
|
| 190 |
|
|
@@ -200,33 +244,81 @@
|
|
| 200 |
return t.slice(0, 2000)
|
| 201 |
}
|
| 202 |
}
|
| 203 |
-
|
| 204 |
-
//
|
| 205 |
for (const el of post.querySelectorAll('[dir="auto"]')) {
|
| 206 |
if (el.closest('[role="navigation"]') || el.closest('header') || el.closest('[data-testid="UFI2Comment"]')) continue
|
| 207 |
-
// Also skip if inside a nested comment article
|
| 208 |
const parentArticle = el.closest('[role="article"]')
|
| 209 |
-
if
|
|
|
|
|
|
|
| 210 |
const t = el.innerText?.trim()
|
| 211 |
-
if (t && t.length >= MIN_TEXT_LENGTH && !t.startsWith('http')) {
|
| 212 |
log('Text extracted via broad [dir="auto"] fallback (filtered)')
|
| 213 |
return t.slice(0, 2000)
|
| 214 |
}
|
| 215 |
}
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 216 |
}
|
| 217 |
|
| 218 |
-
// General fallback: any span with substantial text
|
| 219 |
for (const span of post.querySelectorAll('span')) {
|
| 220 |
const t = span.innerText?.trim()
|
| 221 |
-
if (t && t.length >= MIN_TEXT_LENGTH && !t.startsWith('http')) {
|
| 222 |
// Skip if inside a nested comment article
|
| 223 |
const parentArticle = span.closest('[role="article"]')
|
| 224 |
-
if (parentArticle && parentArticle !== post) continue
|
| 225 |
log('Text extracted via span fallback')
|
| 226 |
return t.slice(0, 2000)
|
| 227 |
}
|
| 228 |
}
|
| 229 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 230 |
log('No text found in post')
|
| 231 |
return null
|
| 232 |
}
|
|
@@ -234,7 +326,18 @@
|
|
| 234 |
function extractPostUrl(post) {
|
| 235 |
for (const sel of (CFG.link ?? [])) {
|
| 236 |
const el = post.querySelector(sel)
|
| 237 |
-
if (el?.href)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 238 |
}
|
| 239 |
return null
|
| 240 |
}
|
|
@@ -249,12 +352,19 @@
|
|
| 249 |
function extractPostImage(post) {
|
| 250 |
if (!CFG.image) return null
|
| 251 |
|
| 252 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 253 |
if (!allImgs.length) { log('No candidate images found'); return null }
|
| 254 |
|
| 255 |
// Build a set of avatar container elements to check ancestry against
|
|
|
|
| 256 |
const avatarContainers = (CFG.avatarContainers ?? []).flatMap(sel =>
|
| 257 |
-
Array.from(
|
| 258 |
)
|
| 259 |
|
| 260 |
const contentImgs = allImgs.filter(img => {
|
|
@@ -287,108 +397,118 @@
|
|
| 287 |
return src
|
| 288 |
}
|
| 289 |
|
| 290 |
-
// ββ Post discovery
|
| 291 |
|
| 292 |
/**
|
| 293 |
-
*
|
| 294 |
-
*
|
| 295 |
-
*
|
| 296 |
-
* - [role="feed"] is a WAI-ARIA landmark that Facebook keeps for accessibility.
|
| 297 |
-
* - Direct children of the feed are always posts (wrapped in <div> containers).
|
| 298 |
-
* - Comments are always deeper nested inside another [role="article"].
|
| 299 |
*
|
| 300 |
-
*
|
| 301 |
-
*
|
| 302 |
-
*
|
| 303 |
-
*
|
| 304 |
*/
|
| 305 |
-
function
|
| 306 |
-
|
| 307 |
-
|
| 308 |
-
|
| 309 |
-
|
| 310 |
-
|
| 311 |
-
const parentArticle = el.parentElement?.closest('[role="article"]')
|
| 312 |
-
if (parentArticle) {
|
| 313 |
-
log('Skipping comment (nested inside parent article)')
|
| 314 |
-
return false
|
| 315 |
-
}
|
| 316 |
-
|
| 317 |
-
// ββ Check 2: Is this article a child of [role="feed"]?
|
| 318 |
-
// Direct children of the feed are always posts.
|
| 319 |
-
const feedAncestor = el.closest('[role="feed"]')
|
| 320 |
-
if (feedAncestor) {
|
| 321 |
-
// This article is inside the feed and NOT nested in another article β post
|
| 322 |
-
return true
|
| 323 |
-
}
|
| 324 |
|
| 325 |
-
|
| 326 |
-
|
| 327 |
-
|
| 328 |
-
|
| 329 |
-
|
| 330 |
-
|
| 331 |
-
|
| 332 |
-
|
| 333 |
-
|
| 334 |
-
|
| 335 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 336 |
}
|
| 337 |
-
|
| 338 |
-
|
| 339 |
-
|
| 340 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 341 |
|
| 342 |
-
|
| 343 |
-
|
| 344 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 345 |
}
|
| 346 |
|
| 347 |
/**
|
| 348 |
-
*
|
| 349 |
-
*
|
| 350 |
-
* Two-pass strategy for Facebook:
|
| 351 |
-
* Pass 1: Find [role="feed"] container β get [role="article"] elements
|
| 352 |
-
* that are direct children of the feed (not nested in other articles)
|
| 353 |
-
* Pass 2: If no feed found (detail pages, etc.), fall back to all
|
| 354 |
-
* [role="article"] elements filtered by isTopLevelPost()
|
| 355 |
-
*
|
| 356 |
-
* For Twitter and other platforms, uses POST_SELECTORS directly.
|
| 357 |
*/
|
| 358 |
function findPosts(root) {
|
| 359 |
-
if (PLATFORM === 'facebook') {
|
| 360 |
-
// ββ Pass 1: Feed-based detection (most reliable)
|
| 361 |
-
const feeds = root.querySelectorAll('[role="feed"]')
|
| 362 |
-
if (feeds.length === 0 && root.getAttribute?.('role') === 'feed') {
|
| 363 |
-
// root itself might be the feed
|
| 364 |
-
const articles = Array.from(root.querySelectorAll('[role="article"]'))
|
| 365 |
-
.filter(el => !el.parentElement?.closest('[role="article"]'))
|
| 366 |
-
if (articles.length) {
|
| 367 |
-
log(`Found ${articles.length} posts via feed (root is feed)`)
|
| 368 |
-
return articles
|
| 369 |
-
}
|
| 370 |
-
}
|
| 371 |
-
for (const feed of feeds) {
|
| 372 |
-
// Get all articles inside this feed that are NOT nested in another article
|
| 373 |
-
const articles = Array.from(feed.querySelectorAll('[role="article"]'))
|
| 374 |
-
.filter(el => !el.parentElement?.closest('[role="article"]'))
|
| 375 |
-
if (articles.length) {
|
| 376 |
-
log(`Found ${articles.length} posts via [role="feed"] container`)
|
| 377 |
-
return articles
|
| 378 |
-
}
|
| 379 |
-
}
|
| 380 |
-
|
| 381 |
-
// ββ Pass 2: No feed container found β detail page or unusual layout
|
| 382 |
-
const allArticles = Array.from(root.querySelectorAll('[role="article"]'))
|
| 383 |
-
const topLevel = allArticles.filter(el => isTopLevelPost(el))
|
| 384 |
-
if (topLevel.length) {
|
| 385 |
-
log(`Found ${topLevel.length} posts via fallback (no feed container)`)
|
| 386 |
-
return topLevel
|
| 387 |
-
}
|
| 388 |
-
return []
|
| 389 |
-
}
|
| 390 |
-
|
| 391 |
-
// Non-Facebook platforms: simple selector matching
|
| 392 |
for (const sel of POST_SELECTORS) {
|
| 393 |
const found = Array.from(root.querySelectorAll(sel))
|
| 394 |
if (found.length) return found
|
|
@@ -408,15 +528,6 @@
|
|
| 408 |
if (post.dataset.philverifyBtn) return
|
| 409 |
post.dataset.philverifyBtn = 'true'
|
| 410 |
|
| 411 |
-
// Note: We do NOT gate on content availability here.
|
| 412 |
-
// Facebook lazy-loads post content via React hydration, so text/images
|
| 413 |
-
// may not be in the DOM yet when this runs. Content is checked at click
|
| 414 |
-
// time (in handleVerifyClick) when everything is fully rendered.
|
| 415 |
-
|
| 416 |
-
// Create wrapper (flex container for right-alignment)
|
| 417 |
-
const wrapper = document.createElement('div')
|
| 418 |
-
wrapper.className = 'pv-verify-btn-wrapper'
|
| 419 |
-
|
| 420 |
// Create the button
|
| 421 |
const btn = document.createElement('button')
|
| 422 |
btn.className = 'pv-verify-btn'
|
|
@@ -443,29 +554,85 @@
|
|
| 443 |
handleVerifyClick(post, btn)
|
| 444 |
})
|
| 445 |
|
| 446 |
-
|
| 447 |
-
|
| 448 |
-
//
|
| 449 |
-
//
|
| 450 |
-
// visible post content, but BEFORE the comments section.
|
| 451 |
-
// On Facebook, we look for the action bar area or similar landmarks.
|
| 452 |
let inserted = false
|
|
|
|
| 453 |
if (PLATFORM === 'facebook') {
|
| 454 |
-
//
|
| 455 |
-
|
| 456 |
-
|
| 457 |
-
if (
|
| 458 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 459 |
inserted = true
|
|
|
|
| 460 |
}
|
| 461 |
}
|
| 462 |
|
| 463 |
-
//
|
| 464 |
if (!inserted) {
|
|
|
|
|
|
|
|
|
|
| 465 |
post.appendChild(wrapper)
|
|
|
|
| 466 |
}
|
| 467 |
-
|
| 468 |
-
log('Verify button injected on post')
|
| 469 |
}
|
| 470 |
|
| 471 |
// ββ Verify click handler ββββββββββββββββββββββββββββββββββββββββββββββββββ
|
|
@@ -490,42 +657,96 @@
|
|
| 490 |
const url = extractPostUrl(post)
|
| 491 |
const image = extractPostImage(post)
|
| 492 |
|
|
|
|
|
|
|
| 493 |
log(`Verify clicked: text=${!!text} (${text?.length ?? 0} chars), url=${!!url}, image=${!!image}`)
|
| 494 |
|
| 495 |
// Determine what to send
|
| 496 |
let inputSummary = ''
|
| 497 |
if (!text && !url && !image) {
|
|
|
|
| 498 |
showErrorReport(post, btn, 'Could not read post content β no text or image found.')
|
| 499 |
return
|
| 500 |
}
|
| 501 |
|
| 502 |
try {
|
| 503 |
let msgPayload
|
|
|
|
| 504 |
|
|
|
|
| 505 |
if (url) {
|
| 506 |
msgPayload = { type: 'VERIFY_URL', url }
|
|
|
|
| 507 |
inputSummary = 'Shared link analyzed'
|
| 508 |
} else if (text && image) {
|
| 509 |
msgPayload = { type: 'VERIFY_TEXT', text, imageUrl: image }
|
|
|
|
| 510 |
inputSummary = 'Caption + image analyzed'
|
| 511 |
} else if (text) {
|
| 512 |
msgPayload = { type: 'VERIFY_TEXT', text }
|
|
|
|
| 513 |
inputSummary = 'Caption text only'
|
| 514 |
} else {
|
| 515 |
msgPayload = { type: 'VERIFY_IMAGE_URL', imageUrl: image }
|
|
|
|
| 516 |
inputSummary = 'Image only (OCR)'
|
| 517 |
}
|
| 518 |
|
| 519 |
-
|
| 520 |
-
|
| 521 |
-
|
| 522 |
-
|
| 523 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 524 |
})
|
| 525 |
-
})
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 526 |
|
| 527 |
log(`Verification result: verdict=${response.verdict}, score=${response.final_score}`)
|
| 528 |
-
|
|
|
|
| 529 |
} catch (err) {
|
| 530 |
warn('Verification failed:', err.message)
|
| 531 |
showErrorReport(post, btn, err.message)
|
|
@@ -534,13 +755,12 @@
|
|
| 534 |
|
| 535 |
// ββ Verification report rendering βββββββββββββββββββββββββββββββββββββββββ
|
| 536 |
|
| 537 |
-
function showVerificationReport(post, btn, result, inputSummary) {
|
| 538 |
// Remove the button
|
| 539 |
btn.remove()
|
| 540 |
|
| 541 |
-
// Remove any existing
|
| 542 |
-
|
| 543 |
-
if (existing) existing.remove()
|
| 544 |
|
| 545 |
const verdict = result.verdict ?? 'Unknown'
|
| 546 |
const color = VERDICT_COLORS[verdict] ?? '#5c554e'
|
|
@@ -553,13 +773,36 @@
|
|
| 553 |
const features = result.layer1?.triggered_features ?? []
|
| 554 |
const cached = result._fromCache ? ' Β· cached' : ''
|
| 555 |
|
| 556 |
-
//
|
| 557 |
-
const
|
| 558 |
-
|
| 559 |
-
|
| 560 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 561 |
|
| 562 |
-
//
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 563 |
const header = document.createElement('div')
|
| 564 |
header.className = 'pv-report-header'
|
| 565 |
|
|
@@ -571,22 +814,17 @@
|
|
| 571 |
closeBtn.className = 'pv-report-close'
|
| 572 |
closeBtn.textContent = 'β'
|
| 573 |
closeBtn.setAttribute('aria-label', 'Close fact-check report')
|
| 574 |
-
closeBtn.addEventListener('click', (e) => {
|
| 575 |
-
e.stopPropagation()
|
| 576 |
-
report.remove()
|
| 577 |
-
// Re-inject the verify button so user can re-verify
|
| 578 |
-
delete post.dataset.philverifyBtn
|
| 579 |
-
injectVerifyButton(post)
|
| 580 |
-
})
|
| 581 |
|
| 582 |
header.appendChild(logo)
|
| 583 |
header.appendChild(closeBtn)
|
| 584 |
-
|
| 585 |
|
| 586 |
-
// β Verdict row
|
| 587 |
const verdictRow = document.createElement('div')
|
| 588 |
verdictRow.className = 'pv-report-verdict-row'
|
| 589 |
verdictRow.style.borderLeftColor = color
|
|
|
|
| 590 |
|
| 591 |
const verdictLabel = document.createElement('div')
|
| 592 |
verdictLabel.className = 'pv-report-verdict'
|
|
@@ -599,7 +837,7 @@
|
|
| 599 |
|
| 600 |
verdictRow.appendChild(verdictLabel)
|
| 601 |
verdictRow.appendChild(scoreText)
|
| 602 |
-
|
| 603 |
|
| 604 |
// β Confidence bar
|
| 605 |
const barWrap = document.createElement('div')
|
|
@@ -614,7 +852,7 @@
|
|
| 614 |
|
| 615 |
const barFill = document.createElement('div')
|
| 616 |
barFill.className = 'pv-confidence-bar-fill'
|
| 617 |
-
barFill.style.width =
|
| 618 |
barFill.style.background = color
|
| 619 |
|
| 620 |
const barValue = document.createElement('span')
|
|
@@ -625,9 +863,9 @@
|
|
| 625 |
barWrap.appendChild(barLabel)
|
| 626 |
barWrap.appendChild(barTrack)
|
| 627 |
barWrap.appendChild(barValue)
|
| 628 |
-
|
| 629 |
|
| 630 |
-
// β Info rows
|
| 631 |
const addInfoRow = (labelText, valueText) => {
|
| 632 |
const row = document.createElement('div')
|
| 633 |
row.className = 'pv-report-row'
|
|
@@ -639,22 +877,65 @@
|
|
| 639 |
val.textContent = valueText
|
| 640 |
row.appendChild(lbl)
|
| 641 |
row.appendChild(val)
|
| 642 |
-
|
| 643 |
}
|
| 644 |
|
| 645 |
addInfoRow('LANGUAGE', safeText(language))
|
| 646 |
addInfoRow('INPUT', safeText(inputSummary))
|
| 647 |
|
| 648 |
-
// β
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 649 |
if (features.length > 0) {
|
| 650 |
const signalsSection = document.createElement('div')
|
| 651 |
signalsSection.className = 'pv-report-signals'
|
| 652 |
-
|
| 653 |
const signalsLabel = document.createElement('span')
|
| 654 |
signalsLabel.className = 'pv-report-label'
|
| 655 |
signalsLabel.textContent = 'SUSPICIOUS SIGNALS'
|
| 656 |
signalsSection.appendChild(signalsLabel)
|
| 657 |
-
|
| 658 |
const tagsWrap = document.createElement('div')
|
| 659 |
tagsWrap.className = 'pv-report-tags'
|
| 660 |
for (const f of features.slice(0, 5)) {
|
|
@@ -664,45 +945,43 @@
|
|
| 664 |
tagsWrap.appendChild(tag)
|
| 665 |
}
|
| 666 |
signalsSection.appendChild(tagsWrap)
|
| 667 |
-
|
| 668 |
}
|
| 669 |
|
| 670 |
// β Evidence sources
|
| 671 |
if (sources.length > 0) {
|
| 672 |
const sourcesSection = document.createElement('div')
|
| 673 |
sourcesSection.className = 'pv-report-sources'
|
| 674 |
-
|
| 675 |
const sourcesLabel = document.createElement('span')
|
| 676 |
sourcesLabel.className = 'pv-report-label'
|
| 677 |
sourcesLabel.textContent = 'EVIDENCE SOURCES'
|
| 678 |
sourcesSection.appendChild(sourcesLabel)
|
| 679 |
-
|
| 680 |
const sourcesList = document.createElement('ul')
|
| 681 |
sourcesList.className = 'pv-report-sources-list'
|
| 682 |
-
|
| 683 |
for (const src of sources.slice(0, 5)) {
|
| 684 |
const li = document.createElement('li')
|
| 685 |
li.className = 'pv-report-source-item'
|
| 686 |
-
|
| 687 |
const link = document.createElement('a')
|
| 688 |
link.href = safeUrl(src.url)
|
| 689 |
link.target = '_blank'
|
| 690 |
link.rel = 'noreferrer'
|
| 691 |
link.className = 'pv-report-source-link'
|
| 692 |
link.textContent = src.title?.slice(0, 60) ?? src.source_name ?? 'View source'
|
| 693 |
-
|
| 694 |
const stance = document.createElement('span')
|
| 695 |
stance.className = 'pv-report-source-stance'
|
| 696 |
stance.textContent = src.stance ?? ''
|
| 697 |
if (src.stance === 'Refutes') stance.style.color = '#dc2626'
|
| 698 |
if (src.stance === 'Supports') stance.style.color = '#16a34a'
|
| 699 |
-
|
|
|
|
|
|
|
|
|
|
| 700 |
li.appendChild(link)
|
| 701 |
li.appendChild(stance)
|
| 702 |
sourcesList.appendChild(li)
|
| 703 |
}
|
| 704 |
sourcesSection.appendChild(sourcesList)
|
| 705 |
-
|
| 706 |
}
|
| 707 |
|
| 708 |
// β Explanation (claim used)
|
|
@@ -717,7 +996,42 @@
|
|
| 717 |
explText.textContent = result.layer2.claim_used
|
| 718 |
explanation.appendChild(explLabel)
|
| 719 |
explanation.appendChild(explText)
|
| 720 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 721 |
}
|
| 722 |
|
| 723 |
// β Full analysis link
|
|
@@ -727,14 +1041,22 @@
|
|
| 727 |
fullLink.target = '_blank'
|
| 728 |
fullLink.rel = 'noreferrer'
|
| 729 |
fullLink.textContent = 'Open Full Dashboard β'
|
| 730 |
-
|
| 731 |
-
|
| 732 |
-
//
|
| 733 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 734 |
}
|
| 735 |
|
| 736 |
function showErrorReport(post, btn, errorMessage) {
|
| 737 |
-
// Remove spinner, restore button as error state
|
| 738 |
btn.classList.remove('pv-verify-btn--loading')
|
| 739 |
btn.classList.add('pv-verify-btn--error')
|
| 740 |
btn.disabled = false
|
|
@@ -744,18 +1066,21 @@
|
|
| 744 |
|
| 745 |
const icon = btn.querySelector('.pv-verify-btn-icon')
|
| 746 |
const label = btn.querySelector('.pv-verify-btn-label')
|
| 747 |
-
if (icon) icon.textContent = 'β οΈ'
|
| 748 |
-
if (label) label.textContent = 'Verification failed β tap to retry'
|
| 749 |
|
| 750 |
-
//
|
| 751 |
-
const
|
| 752 |
-
|
| 753 |
-
|
| 754 |
-
|
| 755 |
-
|
| 756 |
-
|
|
|
|
|
|
|
| 757 |
}
|
| 758 |
|
|
|
|
|
|
|
|
|
|
| 759 |
// Remove old click listeners by replacing element
|
| 760 |
const newBtn = btn.cloneNode(true)
|
| 761 |
btn.replaceWith(newBtn)
|
|
@@ -769,6 +1094,17 @@
|
|
| 769 |
|
| 770 |
// ββ MutationObserver ββββββββββββββββββββββββββββββββββββββββββββββββββββββ
|
| 771 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 772 |
const pendingPosts = new Set()
|
| 773 |
let rafScheduled = false
|
| 774 |
|
|
@@ -786,33 +1122,15 @@
|
|
| 786 |
}
|
| 787 |
}
|
| 788 |
|
| 789 |
-
const observer = new MutationObserver((
|
| 790 |
-
|
| 791 |
-
|
| 792 |
-
|
| 793 |
-
|
| 794 |
-
if (PLATFORM === 'facebook') {
|
| 795 |
-
// Facebook strategy: only process nodes that are inside [role="feed"]
|
| 796 |
-
// or that contain a feed. This prevents processing individual comment
|
| 797 |
-
// nodes that are added dynamically.
|
| 798 |
-
const inFeed = node.closest?.('[role="feed"]') ||
|
| 799 |
-
node.querySelector?.('[role="feed"]') ||
|
| 800 |
-
node.getAttribute?.('role') === 'feed'
|
| 801 |
-
if (!inFeed && node.getAttribute?.('role') === 'article') {
|
| 802 |
-
// An article added outside of a feed β could be a detail page.
|
| 803 |
-
// Only process if isTopLevelPost says it's a post.
|
| 804 |
-
if (isTopLevelPost(node)) {
|
| 805 |
-
scheduleProcess(node)
|
| 806 |
-
}
|
| 807 |
-
continue
|
| 808 |
-
}
|
| 809 |
-
}
|
| 810 |
-
|
| 811 |
-
// Check descendants for posts (findPosts handles feed-based filtering)
|
| 812 |
-
const posts = findPosts(node)
|
| 813 |
-
for (const post of posts) scheduleProcess(post)
|
| 814 |
-
}
|
| 815 |
}
|
|
|
|
|
|
|
|
|
|
| 816 |
})
|
| 817 |
|
| 818 |
// ββ Initialization ββββββββββββββββββββββββββββββββββββββββββββββββββββββββ
|
|
@@ -821,40 +1139,59 @@
|
|
| 821 |
log(`Initializing on ${PLATFORM} (${window.location.hostname})`)
|
| 822 |
|
| 823 |
// Check autoScan setting β controls whether buttons are shown at all
|
| 824 |
-
|
|
|
|
| 825 |
try {
|
| 826 |
-
response = await
|
| 827 |
-
|
| 828 |
-
|
| 829 |
-
|
| 830 |
-
resolve({ autoScan: true })
|
| 831 |
-
}
|
| 832 |
-
|
| 833 |
-
|
| 834 |
-
|
| 835 |
-
})
|
| 836 |
} catch {
|
| 837 |
response = { autoScan: true }
|
| 838 |
}
|
| 839 |
|
| 840 |
log('Settings:', response)
|
| 841 |
if (response?.autoScan === false) {
|
| 842 |
-
log('Auto-scan disabled
|
| 843 |
return
|
| 844 |
}
|
| 845 |
|
| 846 |
-
|
| 847 |
-
|
| 848 |
-
|
| 849 |
-
|
| 850 |
-
|
| 851 |
-
|
| 852 |
-
|
| 853 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 854 |
}
|
| 855 |
|
| 856 |
init()
|
| 857 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 858 |
// ββ Auto-verify news article pages (non-social) ββββββββββββββββββββββββββββ
|
| 859 |
// When the content script runs on a PH news site (not the homepage),
|
| 860 |
// it auto-verifies the current URL and injects a floating verdict banner.
|
|
@@ -863,7 +1200,7 @@
|
|
| 863 |
const url = window.location.href
|
| 864 |
const path = new URL(url).pathname
|
| 865 |
// Skip homepages and section indexes (very short paths like / or /news)
|
| 866 |
-
if (!path || path.length <
|
| 867 |
|
| 868 |
const banner = document.createElement('div')
|
| 869 |
banner.id = 'pv-auto-banner'
|
|
@@ -936,7 +1273,14 @@
|
|
| 936 |
try {
|
| 937 |
const response = await new Promise((resolve, reject) => {
|
| 938 |
chrome.runtime.sendMessage({ type: 'VERIFY_URL', url }, (resp) => {
|
| 939 |
-
if (chrome.runtime.lastError)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 940 |
else if (!resp?.ok) reject(new Error(resp?.error ?? 'Unknown error'))
|
| 941 |
else resolve(resp.result)
|
| 942 |
})
|
|
|
|
| 176 |
}
|
| 177 |
}
|
| 178 |
|
| 179 |
+
/** Detect Facebook's character-obfuscation spans: "s o o p S d e t r n β¦" */
|
| 180 |
+
function isObfuscatedText(text) {
|
| 181 |
+
const tokens = text.split(/\s+/).filter(w => w.length > 0)
|
| 182 |
+
if (tokens.length < 8) return false
|
| 183 |
+
const singleCharCount = tokens.filter(w => w.length === 1).length
|
| 184 |
+
return singleCharCount / tokens.length > 0.5
|
| 185 |
+
}
|
| 186 |
+
|
| 187 |
function extractPostText(post) {
|
| 188 |
expandSeeMore(post)
|
| 189 |
|
| 190 |
+
// ββ Reshare detection βββββββββββββββββββββββββββββββββββββββββββββββββββββ
|
| 191 |
+
// Re-shared Facebook posts have a nested [role="article"] inside the outer
|
| 192 |
+
// post. The sharer's caption lives in the outer [data-ad-comet-preview="message"],
|
| 193 |
+
// while the ORIGINAL post content is inside the nested article.
|
| 194 |
+
// We want to fact-check the original content, not the sharer's commentary.
|
| 195 |
+
if (PLATFORM === 'facebook') {
|
| 196 |
+
const innerArticle = Array.from(post.querySelectorAll('[role="article"]'))
|
| 197 |
+
.find(el => el !== post)
|
| 198 |
+
|
| 199 |
+
if (innerArticle) {
|
| 200 |
+
for (const sel of CFG.text) {
|
| 201 |
+
const el = innerArticle.querySelector(sel)
|
| 202 |
+
const t = el?.innerText?.trim()
|
| 203 |
+
if (t && t.length >= MIN_TEXT_LENGTH) {
|
| 204 |
+
log('Reshared post: extracted original content from nested article via', sel)
|
| 205 |
+
return t.slice(0, 2000)
|
| 206 |
+
}
|
| 207 |
+
}
|
| 208 |
+
for (const el of innerArticle.querySelectorAll('[dir="auto"]')) {
|
| 209 |
+
const t = el.innerText?.trim()
|
| 210 |
+
if (t && t.length >= MIN_TEXT_LENGTH && !t.startsWith('http')) {
|
| 211 |
+
log('Reshared post: extracted original content via dir=auto in nested article')
|
| 212 |
+
return t.slice(0, 2000)
|
| 213 |
+
}
|
| 214 |
+
}
|
| 215 |
+
}
|
| 216 |
+
}
|
| 217 |
+
|
| 218 |
// Primary selectors β platform-specific, high confidence
|
| 219 |
+
// Also search in the nearest article ancestor in case postElement is a sub-section
|
| 220 |
+
const primarySearchRoots = [post]
|
| 221 |
+
if (PLATFORM === 'facebook') {
|
| 222 |
+
const articleAncestor = post.closest?.('[role="article"]')
|
| 223 |
+
if (articleAncestor && articleAncestor !== post) primarySearchRoots.push(articleAncestor)
|
| 224 |
+
}
|
| 225 |
+
for (const root of primarySearchRoots) {
|
| 226 |
+
for (const sel of CFG.text) {
|
| 227 |
+
const el = root.querySelector(sel)
|
| 228 |
+
if (el?.innerText?.trim().length >= MIN_TEXT_LENGTH) {
|
| 229 |
+
log('Text extracted via primary selector:', sel)
|
| 230 |
+
return el.innerText.trim().slice(0, 2000)
|
| 231 |
+
}
|
| 232 |
}
|
| 233 |
}
|
| 234 |
|
|
|
|
| 244 |
return t.slice(0, 2000)
|
| 245 |
}
|
| 246 |
}
|
| 247 |
+
|
| 248 |
+
// Broader [dir="auto"] scan β exclude comments, navs, headers
|
| 249 |
for (const el of post.querySelectorAll('[dir="auto"]')) {
|
| 250 |
if (el.closest('[role="navigation"]') || el.closest('header') || el.closest('[data-testid="UFI2Comment"]')) continue
|
|
|
|
| 251 |
const parentArticle = el.closest('[role="article"]')
|
| 252 |
+
// Skip only if parentArticle is a completely separate subtree from post
|
| 253 |
+
// (i.e., it doesn't contain post). If post is inside parentArticle, that's fine.
|
| 254 |
+
if (parentArticle && parentArticle !== post && !parentArticle.contains(post)) continue
|
| 255 |
const t = el.innerText?.trim()
|
| 256 |
+
if (t && t.length >= MIN_TEXT_LENGTH && !t.startsWith('http') && !isObfuscatedText(t)) {
|
| 257 |
log('Text extracted via broad [dir="auto"] fallback (filtered)')
|
| 258 |
return t.slice(0, 2000)
|
| 259 |
}
|
| 260 |
}
|
| 261 |
+
|
| 262 |
+
// Last resort for Facebook: walk UP the DOM from post to find the article,
|
| 263 |
+
// then collect all [dir="auto"] text from that full article.
|
| 264 |
+
// This handles cases where postElement is only a sub-section of the full post.
|
| 265 |
+
const fullArticle = post.closest?.('[role="article"]') ?? post
|
| 266 |
+
if (fullArticle !== post) {
|
| 267 |
+
for (const el of fullArticle.querySelectorAll('[dir="auto"]')) {
|
| 268 |
+
if (el.closest('[role="navigation"]') || el.closest('header')) continue
|
| 269 |
+
const t = el.innerText?.trim()
|
| 270 |
+
if (t && t.length >= MIN_TEXT_LENGTH && !t.startsWith('http')) {
|
| 271 |
+
log('Text extracted via full-article [dir="auto"] walk-up')
|
| 272 |
+
return t.slice(0, 2000)
|
| 273 |
+
}
|
| 274 |
+
}
|
| 275 |
+
// Combine all short [dir="auto"] fragments from the full article
|
| 276 |
+
const combined = Array.from(fullArticle.querySelectorAll('[dir="auto"]'))
|
| 277 |
+
.map(el => el.innerText?.trim())
|
| 278 |
+
.filter(t => t && t.length > 5 && !t.startsWith('http'))
|
| 279 |
+
.join(' ')
|
| 280 |
+
if (combined.length >= MIN_TEXT_LENGTH) {
|
| 281 |
+
log('Text extracted by combining dir=auto fragments in full article')
|
| 282 |
+
return combined.slice(0, 2000)
|
| 283 |
+
}
|
| 284 |
+
}
|
| 285 |
+
|
| 286 |
+
// Combine all short [dir="auto"] fragments in the current post element
|
| 287 |
+
const allDirAuto = Array.from(post.querySelectorAll('[dir="auto"]'))
|
| 288 |
+
.map(el => el.innerText?.trim())
|
| 289 |
+
.filter(t => t && t.length > 5 && !t.startsWith('http'))
|
| 290 |
+
.join(' ')
|
| 291 |
+
if (allDirAuto.length >= MIN_TEXT_LENGTH) {
|
| 292 |
+
log('Text extracted by combining dir=auto fragments')
|
| 293 |
+
return allDirAuto.slice(0, 2000)
|
| 294 |
+
}
|
| 295 |
}
|
| 296 |
|
| 297 |
+
// General fallback: any span with substantial text (skip obfuscated char-spans)
|
| 298 |
for (const span of post.querySelectorAll('span')) {
|
| 299 |
const t = span.innerText?.trim()
|
| 300 |
+
if (t && t.length >= MIN_TEXT_LENGTH && !t.startsWith('http') && !isObfuscatedText(t)) {
|
| 301 |
// Skip if inside a nested comment article
|
| 302 |
const parentArticle = span.closest('[role="article"]')
|
| 303 |
+
if (parentArticle && parentArticle !== post && !parentArticle.contains(post)) continue
|
| 304 |
log('Text extracted via span fallback')
|
| 305 |
return t.slice(0, 2000)
|
| 306 |
}
|
| 307 |
}
|
| 308 |
|
| 309 |
+
// Walk UP the DOM and try the full article β covers cases where postElement
|
| 310 |
+
// is a small sub-section that doesn't contain the text itself
|
| 311 |
+
const ancestor = post.closest?.('[role="article"]')
|
| 312 |
+
if (ancestor && ancestor !== post) {
|
| 313 |
+
for (const span of ancestor.querySelectorAll('span')) {
|
| 314 |
+
const t = span.innerText?.trim()
|
| 315 |
+
if (t && t.length >= MIN_TEXT_LENGTH && !t.startsWith('http') && !isObfuscatedText(t)) {
|
| 316 |
+
log('Text extracted via ancestor span walk-up')
|
| 317 |
+
return t.slice(0, 2000)
|
| 318 |
+
}
|
| 319 |
+
}
|
| 320 |
+
}
|
| 321 |
+
|
| 322 |
log('No text found in post')
|
| 323 |
return null
|
| 324 |
}
|
|
|
|
| 326 |
function extractPostUrl(post) {
|
| 327 |
for (const sel of (CFG.link ?? [])) {
|
| 328 |
const el = post.querySelector(sel)
|
| 329 |
+
if (el?.href) {
|
| 330 |
+
const url = CFG.unwrapUrl(el)
|
| 331 |
+
// Skip common internal Facebook/Twitter links that aren't actually shared external content
|
| 332 |
+
if (PLATFORM === 'facebook') {
|
| 333 |
+
const u = url.toLowerCase()
|
| 334 |
+
if (u.includes('facebook.com') && !u.includes('l.php')) {
|
| 335 |
+
// Probably a profile link or internal post link, ignore as "URL input"
|
| 336 |
+
continue
|
| 337 |
+
}
|
| 338 |
+
}
|
| 339 |
+
return url
|
| 340 |
+
}
|
| 341 |
}
|
| 342 |
return null
|
| 343 |
}
|
|
|
|
| 352 |
function extractPostImage(post) {
|
| 353 |
if (!CFG.image) return null
|
| 354 |
|
| 355 |
+
// Search in post, then fall back to the nearest article ancestor if nothing found.
|
| 356 |
+
// postElement from the walk-up may only wrap the message text, not the image.
|
| 357 |
+
let allImgs = Array.from(post.querySelectorAll(CFG.image))
|
| 358 |
+
if (!allImgs.length && PLATFORM === 'facebook') {
|
| 359 |
+
const articleAncestor = post.closest?.('[role="article"]')
|
| 360 |
+
if (articleAncestor) allImgs = Array.from(articleAncestor.querySelectorAll(CFG.image))
|
| 361 |
+
}
|
| 362 |
if (!allImgs.length) { log('No candidate images found'); return null }
|
| 363 |
|
| 364 |
// Build a set of avatar container elements to check ancestry against
|
| 365 |
+
const imgSearchRoot = post.closest?.('[role="article"]') ?? post
|
| 366 |
const avatarContainers = (CFG.avatarContainers ?? []).flatMap(sel =>
|
| 367 |
+
Array.from(imgSearchRoot.querySelectorAll(sel))
|
| 368 |
)
|
| 369 |
|
| 370 |
const contentImgs = allImgs.filter(img => {
|
|
|
|
| 397 |
return src
|
| 398 |
}
|
| 399 |
|
| 400 |
+
// ── Post discovery ────────────────────────────────────────────────────────

/**
 * Facebook: scan the whole document for stable per-post anchor buttons
 * ("Actions for this post" / "Hide post" / "Edit post" / …), walk up from
 * each anchor to the enclosing post container, and inject a verify button.
 *
 * Why anchor buttons instead of [role="feed"] / [role="article"] detection:
 * - Facebook's WAI-ARIA feed/article structure changes frequently.
 * - These header buttons are rendered on every post and never on comments
 *   (per the label checks below), so they are a reliable entry point.
 *
 * A second, supplementary pass scans top-level [role="article"] elements to
 * cover profile/group pages where no anchor button matched.
 */
function addButtonsToFacebookPosts() {
  // Anchor buttons that appear in EVERY post header (home feed and profile
  // pages). "Actions for this post" is the three-dot actions button.
  // NOTE(review): the "hide post" lowercase variant is presumably for older
  // markup — confirm it still occurs in the wild.
  const hideButtons = document.querySelectorAll(
    '[aria-label="Actions for this post"], [aria-label="Hide post"], [aria-label="hide post"], [aria-label="Hide or report this"], [aria-label="Edit post"], [aria-label="Edit memory"]'
  )

  let added = 0
  hideButtons.forEach((hideBtn) => {
    // The walk-up starts two levels above the anchor button.
    const btnContainer = hideBtn.parentElement
    const btnGrandparent = btnContainer?.parentElement
    if (!btnContainer || !btnGrandparent) return

    // Skip if we already injected a button under this container.
    if (btnGrandparent.querySelector('.pv-verify-btn')) return

    // Walk up from btnGrandparent to find the post container.
    // Priority: element wrapping the message > content-bearing article >
    // first ancestor with innerText > 100 chars (fallback only).
    // We don't stop on innerText alone because the header grandparent often
    // has that much text without containing the post body — keep walking.
    let postElement = null
    let innerTextFallback = null
    let el = btnGrandparent
    while (el && el !== document.body) {
      // Best match: element that directly wraps the post message.
      if (el.querySelector('[data-ad-rendering-role="story_message"], [data-ad-comet-preview="message"]')) {
        postElement = el; break
      }
      // Second best: an article/ARTICLE with actual content (non-skeleton).
      if ((el.getAttribute('role') === 'article' || el.tagName === 'ARTICLE') &&
          (el.innerText?.length ?? 0) > 100) {
        postElement = el; break
      }
      // Remember the first text-heavy ancestor as a fallback, but keep
      // walking in case a better structural match exists higher up.
      if (!innerTextFallback && (el.innerText?.length ?? 0) > 100) {
        innerTextFallback = el
      }
      el = el.parentElement
    }
    if (!postElement) postElement = innerTextFallback ?? btnGrandparent

    // A post nested inside another article is a comment or a reshared
    // inner post — never inject on those.
    if (postElement.parentElement?.closest('[role="article"]')) return

    // Skip if injectVerifyButton already marked this post.
    if (postElement.dataset.philverifyBtn) return

    const hideBtnLabel = hideBtn.getAttribute('aria-label')
    // "Actions for this post" and "Hide or report this" exist only in post
    // headers, never on comments. Profile-page posts lack
    // [data-ad-comet-preview], so skip the content check for these anchors
    // and delegate placement straight to injectVerifyButton (which puts the
    // button in the Like/Comment/Share action bar).
    if (hideBtnLabel === 'Actions for this post' || hideBtnLabel === 'Hide or report this') {
      injectVerifyButton(postElement)
      added++
      return
    }

    // For the remaining anchor labels (Hide post, Edit post, Edit memory):
    // require a post message container. These labels only exist on home-feed
    // posts, which always carry [data-ad-comet-preview="message"].
    if (!postElement.querySelector(
      '[data-ad-comet-preview="message"], [data-ad-rendering-role="story_message"]'
    )) return

    // Delegate to injectVerifyButton so placement uses the action bar on all
    // page types — avoids the button being hidden in the post header area.
    injectVerifyButton(postElement)
    added++
  })

  if (added > 0) log(`Added ${added} verify button(s) via hide-post anchor`)

  // ── Supplementary scan: article-based (profile pages, group pages, etc.)
  // Both profile posts AND comments are [role="article"] on Facebook.
  // Posts are top-level (no parent article); comments are nested inside
  // posts — the nesting check below distinguishes them.
  // Note: a previous comment-injection bug came from an [aria-label="Remove"]
  // anchor in the button pass (since removed), not from this scan.
  let supplementaryAdded = 0
  document.querySelectorAll('[role="article"]').forEach(article => {
    if (article.dataset.philverifyBtn) return
    if (article.parentElement?.closest('[role="article"]')) return
    // Profile-page [role="article"] elements can be permanent loading
    // skeletons with no real content; only inject when a message container
    // is actually present.
    if (PLATFORM === 'facebook' && !article.querySelector(
      '[data-ad-comet-preview="message"], [data-ad-rendering-role="story_message"]'
    )) return
    injectVerifyButton(article)
    supplementaryAdded++
  })
  if (supplementaryAdded > 0) log(`Added ${supplementaryAdded} verify button(s) via article scan`)
}
|
| 507 |
|
| 508 |
/**
|
| 509 |
+
* For Twitter and news sites: use the original selector-based approach.
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 510 |
*/
|
| 511 |
function findPosts(root) {
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 512 |
for (const sel of POST_SELECTORS) {
|
| 513 |
const found = Array.from(root.querySelectorAll(sel))
|
| 514 |
if (found.length) return found
|
|
|
|
| 528 |
if (post.dataset.philverifyBtn) return
|
| 529 |
post.dataset.philverifyBtn = 'true'
|
| 530 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 531 |
// Create the button
|
| 532 |
const btn = document.createElement('button')
|
| 533 |
btn.className = 'pv-verify-btn'
|
|
|
|
| 554 |
handleVerifyClick(post, btn)
|
| 555 |
})
|
| 556 |
|
| 557 |
+
// ββ Insertion strategy βββββββββββββββββββββββββββββββββββββββββββββββββββ
|
| 558 |
+
// Strategy 1 (most reliable β same anchor as classmate's working extension):
|
| 559 |
+
// The "hide post" β button is stable across Facebook layout changes.
|
| 560 |
+
// Insert the verify button next to it in the post header.
|
|
|
|
|
|
|
| 561 |
let inserted = false
|
| 562 |
+
|
| 563 |
if (PLATFORM === 'facebook') {
|
| 564 |
+
// Strategy: Look for the action row (Like / Comment / Share)
|
| 565 |
+
// Use the Like button as anchor β present on ALL post types (home feed + profile)
|
| 566 |
+
// postElement from walk-up may be a sub-section, so also search the nearest article ancestor.
|
| 567 |
+
if (!inserted) {
|
| 568 |
+
const searchRoot = post.closest('[role="article"]') ?? post
|
| 569 |
+
const likeBtn =
|
| 570 |
+
searchRoot.querySelector('[aria-label="Like"], [aria-label^="Like:"]') ??
|
| 571 |
+
post.querySelector('[aria-label="Like"], [aria-label^="Like:"]')
|
| 572 |
+
const actionBar =
|
| 573 |
+
likeBtn?.closest('[role="toolbar"]') ??
|
| 574 |
+
likeBtn?.closest('[role="group"]') ??
|
| 575 |
+
searchRoot.querySelector('[role="toolbar"]') ??
|
| 576 |
+
searchRoot.querySelector('[aria-label*="Comment"]')?.closest('div:not([role="article"])')
|
| 577 |
+
if (actionBar?.parentElement) {
|
| 578 |
+
const wrapper = document.createElement('div')
|
| 579 |
+
wrapper.className = 'pv-verify-btn-wrapper'
|
| 580 |
+
wrapper.appendChild(btn)
|
| 581 |
+
actionBar.parentElement.insertBefore(wrapper, actionBar.nextSibling)
|
| 582 |
+
inserted = true
|
| 583 |
+
log('Verify button injected after action bar')
|
| 584 |
+
}
|
| 585 |
+
}
|
| 586 |
+
|
| 587 |
+
// Strategy 3: Insert after [data-ad-comet-preview] text block
|
| 588 |
+
if (!inserted) {
|
| 589 |
+
const msgBlock =
|
| 590 |
+
post.querySelector('[data-ad-comet-preview="message"]') ??
|
| 591 |
+
post.querySelector('[data-testid="post_message"]')
|
| 592 |
+
if (msgBlock?.parentElement) {
|
| 593 |
+
const wrapper = document.createElement('div')
|
| 594 |
+
wrapper.className = 'pv-verify-btn-wrapper'
|
| 595 |
+
wrapper.appendChild(btn)
|
| 596 |
+
msgBlock.parentElement.insertBefore(wrapper, msgBlock.nextSibling)
|
| 597 |
+
inserted = true
|
| 598 |
+
log('Verify button injected after message block')
|
| 599 |
+
}
|
| 600 |
+
}
|
| 601 |
+
}
|
| 602 |
+
|
| 603 |
+
// Twitter: insert after tweet text block
|
| 604 |
+
if (!inserted && PLATFORM === 'twitter') {
|
| 605 |
+
const tweetText = post.querySelector('[data-testid="tweetText"]')
|
| 606 |
+
if (tweetText?.parentElement) {
|
| 607 |
+
const wrapper = document.createElement('div')
|
| 608 |
+
wrapper.className = 'pv-verify-btn-wrapper'
|
| 609 |
+
wrapper.appendChild(btn)
|
| 610 |
+
tweetText.parentElement.insertBefore(wrapper, tweetText.nextSibling)
|
| 611 |
+
inserted = true
|
| 612 |
+
}
|
| 613 |
+
}
|
| 614 |
+
|
| 615 |
+
// News sites: inject after the h1 headline so the button is visible without scrolling
|
| 616 |
+
if (!inserted && PLATFORM === 'news') {
|
| 617 |
+
const h1 = post.querySelector('h1')
|
| 618 |
+
if (h1?.parentElement) {
|
| 619 |
+
const wrapper = document.createElement('div')
|
| 620 |
+
wrapper.className = 'pv-verify-btn-wrapper'
|
| 621 |
+
wrapper.appendChild(btn)
|
| 622 |
+
h1.parentElement.insertBefore(wrapper, h1.nextSibling)
|
| 623 |
inserted = true
|
| 624 |
+
log('Verify button injected after h1 headline')
|
| 625 |
}
|
| 626 |
}
|
| 627 |
|
| 628 |
+
// Final fallback: append a wrapped button directly to the post
|
| 629 |
if (!inserted) {
|
| 630 |
+
const wrapper = document.createElement('div')
|
| 631 |
+
wrapper.className = 'pv-verify-btn-wrapper'
|
| 632 |
+
wrapper.appendChild(btn)
|
| 633 |
post.appendChild(wrapper)
|
| 634 |
+
log('Verify button injected via fallback (appended to post)')
|
| 635 |
}
|
|
|
|
|
|
|
| 636 |
}
|
| 637 |
|
| 638 |
// ββ Verify click handler ββββββββββββββββββββββββββββββββββββββββββββββββββ
|
|
|
|
| 657 |
const url = extractPostUrl(post)
|
| 658 |
const image = extractPostImage(post)
|
| 659 |
|
| 660 |
+
console.log('[PhilVerify] Extracted:', { text, url, image })
|
| 661 |
+
|
| 662 |
log(`Verify clicked: text=${!!text} (${text?.length ?? 0} chars), url=${!!url}, image=${!!image}`)
|
| 663 |
|
| 664 |
// Determine what to send
|
| 665 |
let inputSummary = ''
|
| 666 |
if (!text && !url && !image) {
|
| 667 |
+
console.warn('[PhilVerify] Extraction failed: No content found.')
|
| 668 |
showErrorReport(post, btn, 'Could not read post content β no text or image found.')
|
| 669 |
return
|
| 670 |
}
|
| 671 |
|
| 672 |
try {
|
| 673 |
let msgPayload
|
| 674 |
+
let usedType = ''
|
| 675 |
|
| 676 |
+
// Start by attempting URL verification if present
|
| 677 |
if (url) {
|
| 678 |
msgPayload = { type: 'VERIFY_URL', url }
|
| 679 |
+
usedType = 'URL'
|
| 680 |
inputSummary = 'Shared link analyzed'
|
| 681 |
} else if (text && image) {
|
| 682 |
msgPayload = { type: 'VERIFY_TEXT', text, imageUrl: image }
|
| 683 |
+
usedType = 'TEXT'
|
| 684 |
inputSummary = 'Caption + image analyzed'
|
| 685 |
} else if (text) {
|
| 686 |
msgPayload = { type: 'VERIFY_TEXT', text }
|
| 687 |
+
usedType = 'TEXT'
|
| 688 |
inputSummary = 'Caption text only'
|
| 689 |
} else {
|
| 690 |
msgPayload = { type: 'VERIFY_IMAGE_URL', imageUrl: image }
|
| 691 |
+
usedType = 'IMAGE'
|
| 692 |
inputSummary = 'Image only (OCR)'
|
| 693 |
}
|
| 694 |
|
| 695 |
+
console.log(`[PhilVerify] Attempting ${usedType} verification:`, msgPayload)
|
| 696 |
+
|
| 697 |
+
let response
|
| 698 |
+
try {
|
| 699 |
+
response = await new Promise((resolve, reject) => {
|
| 700 |
+
chrome.runtime.sendMessage(msgPayload, (resp) => {
|
| 701 |
+
if (chrome.runtime.lastError) {
|
| 702 |
+
const msg = chrome.runtime.lastError.message ?? ''
|
| 703 |
+
reject(new Error(
|
| 704 |
+
msg.includes('Extension context invalidated')
|
| 705 |
+
? 'Extension was reloaded β please refresh the page to re-activate PhilVerify.'
|
| 706 |
+
: msg
|
| 707 |
+
))
|
| 708 |
+
}
|
| 709 |
+
else if (!resp?.ok) reject(new Error(resp?.error ?? 'Unknown error'))
|
| 710 |
+
else resolve(resp.result)
|
| 711 |
+
})
|
| 712 |
})
|
| 713 |
+
} catch (err) {
|
| 714 |
+
// FALLBACK LOGIC: If URL verification failed but we have text, try verifying the text instead
|
| 715 |
+
if (usedType === 'URL' && text && text.length >= MIN_TEXT_LENGTH) {
|
| 716 |
+
warn('URL verification failed, falling back to text verification:', err.message)
|
| 717 |
+
|
| 718 |
+
if (image) {
|
| 719 |
+
msgPayload = { type: 'VERIFY_TEXT', text, imageUrl: image }
|
| 720 |
+
inputSummary = 'Caption + image analyzed (fallback)'
|
| 721 |
+
} else {
|
| 722 |
+
msgPayload = { type: 'VERIFY_TEXT', text }
|
| 723 |
+
inputSummary = 'Caption text only (fallback)'
|
| 724 |
+
}
|
| 725 |
+
|
| 726 |
+
console.log('[PhilVerify] Fallback attempt (TEXT):', msgPayload)
|
| 727 |
+
response = await new Promise((resolve, reject) => {
|
| 728 |
+
chrome.runtime.sendMessage(msgPayload, (resp) => {
|
| 729 |
+
if (chrome.runtime.lastError) {
|
| 730 |
+
const msg = chrome.runtime.lastError.message ?? ''
|
| 731 |
+
reject(new Error(
|
| 732 |
+
msg.includes('Extension context invalidated')
|
| 733 |
+
? 'Extension was reloaded β please refresh the page to re-activate PhilVerify.'
|
| 734 |
+
: msg
|
| 735 |
+
))
|
| 736 |
+
}
|
| 737 |
+
else if (!resp?.ok) reject(new Error(resp?.error ?? 'Unknown error'))
|
| 738 |
+
else resolve(resp.result)
|
| 739 |
+
})
|
| 740 |
+
})
|
| 741 |
+
} else {
|
| 742 |
+
// Re-throw if no fallback possible
|
| 743 |
+
throw err
|
| 744 |
+
}
|
| 745 |
+
}
|
| 746 |
|
| 747 |
log(`Verification result: verdict=${response.verdict}, score=${response.final_score}`)
|
| 748 |
+
const extractedText = usedType === 'URL' ? url : (usedType === 'TEXT' ? text : null)
|
| 749 |
+
showVerificationReport(post, btn, response, inputSummary, extractedText, image)
|
| 750 |
} catch (err) {
|
| 751 |
warn('Verification failed:', err.message)
|
| 752 |
showErrorReport(post, btn, err.message)
|
|
|
|
| 755 |
|
| 756 |
// ββ Verification report rendering βββββββββββββββββββββββββββββββββββββββββ
|
| 757 |
|
| 758 |
+
function showVerificationReport(post, btn, result, inputSummary, extractedText, extractedImage) {
|
| 759 |
// Remove the button
|
| 760 |
btn.remove()
|
| 761 |
|
| 762 |
+
// Remove any existing modal
|
| 763 |
+
document.getElementById('pv-modal-overlay')?.remove()
|
|
|
|
| 764 |
|
| 765 |
const verdict = result.verdict ?? 'Unknown'
|
| 766 |
const color = VERDICT_COLORS[verdict] ?? '#5c554e'
|
|
|
|
| 773 |
const features = result.layer1?.triggered_features ?? []
|
| 774 |
const cached = result._fromCache ? ' Β· cached' : ''
|
| 775 |
|
| 776 |
+
// ββ Backdrop overlay
|
| 777 |
+
const overlay = document.createElement('div')
|
| 778 |
+
overlay.id = 'pv-modal-overlay'
|
| 779 |
+
overlay.className = 'pv-modal-overlay'
|
| 780 |
+
overlay.setAttribute('role', 'dialog')
|
| 781 |
+
overlay.setAttribute('aria-modal', 'true')
|
| 782 |
+
overlay.setAttribute('aria-label', 'PhilVerify fact-check report')
|
| 783 |
+
|
| 784 |
+
function closeModal() {
|
| 785 |
+
overlay.classList.remove('pv-modal--open')
|
| 786 |
+
overlay.addEventListener('transitionend', () => {
|
| 787 |
+
overlay.remove()
|
| 788 |
+
delete post.dataset.philverifyBtn
|
| 789 |
+
addButtonsToFacebookPosts()
|
| 790 |
+
}, { once: true })
|
| 791 |
+
}
|
| 792 |
|
| 793 |
+
// Click outside card = close
|
| 794 |
+
overlay.addEventListener('click', (e) => {
|
| 795 |
+
if (e.target === overlay) closeModal()
|
| 796 |
+
})
|
| 797 |
+
// Escape key = close
|
| 798 |
+
const onKey = (e) => { if (e.key === 'Escape') { closeModal(); document.removeEventListener('keydown', onKey) } }
|
| 799 |
+
document.addEventListener('keydown', onKey)
|
| 800 |
+
|
| 801 |
+
// ββ Modal card
|
| 802 |
+
const card = document.createElement('div')
|
| 803 |
+
card.className = 'pv-modal-card'
|
| 804 |
+
|
| 805 |
+
// β Header
|
| 806 |
const header = document.createElement('div')
|
| 807 |
header.className = 'pv-report-header'
|
| 808 |
|
|
|
|
| 814 |
closeBtn.className = 'pv-report-close'
|
| 815 |
closeBtn.textContent = 'β'
|
| 816 |
closeBtn.setAttribute('aria-label', 'Close fact-check report')
|
| 817 |
+
closeBtn.addEventListener('click', (e) => { e.stopPropagation(); closeModal() })
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 818 |
|
| 819 |
header.appendChild(logo)
|
| 820 |
header.appendChild(closeBtn)
|
| 821 |
+
card.appendChild(header)
|
| 822 |
|
| 823 |
+
// β Verdict row
|
| 824 |
const verdictRow = document.createElement('div')
|
| 825 |
verdictRow.className = 'pv-report-verdict-row'
|
| 826 |
verdictRow.style.borderLeftColor = color
|
| 827 |
+
verdictRow.style.background = bg
|
| 828 |
|
| 829 |
const verdictLabel = document.createElement('div')
|
| 830 |
verdictLabel.className = 'pv-report-verdict'
|
|
|
|
| 837 |
|
| 838 |
verdictRow.appendChild(verdictLabel)
|
| 839 |
verdictRow.appendChild(scoreText)
|
| 840 |
+
card.appendChild(verdictRow)
|
| 841 |
|
| 842 |
// β Confidence bar
|
| 843 |
const barWrap = document.createElement('div')
|
|
|
|
| 852 |
|
| 853 |
const barFill = document.createElement('div')
|
| 854 |
barFill.className = 'pv-confidence-bar-fill'
|
| 855 |
+
barFill.style.width = '0'
|
| 856 |
barFill.style.background = color
|
| 857 |
|
| 858 |
const barValue = document.createElement('span')
|
|
|
|
| 863 |
barWrap.appendChild(barLabel)
|
| 864 |
barWrap.appendChild(barTrack)
|
| 865 |
barWrap.appendChild(barValue)
|
| 866 |
+
card.appendChild(barWrap)
|
| 867 |
|
| 868 |
+
// β Info rows
|
| 869 |
const addInfoRow = (labelText, valueText) => {
|
| 870 |
const row = document.createElement('div')
|
| 871 |
row.className = 'pv-report-row'
|
|
|
|
| 877 |
val.textContent = valueText
|
| 878 |
row.appendChild(lbl)
|
| 879 |
row.appendChild(val)
|
| 880 |
+
card.appendChild(row)
|
| 881 |
}
|
| 882 |
|
| 883 |
addInfoRow('LANGUAGE', safeText(language))
|
| 884 |
addInfoRow('INPUT', safeText(inputSummary))
|
| 885 |
|
| 886 |
+
// β Image analyzed (thumbnail + OCR text)
|
| 887 |
+
if (extractedImage) {
|
| 888 |
+
const imgSection = document.createElement('div')
|
| 889 |
+
imgSection.className = 'pv-report-explanation'
|
| 890 |
+
const imgLabel = document.createElement('span')
|
| 891 |
+
imgLabel.className = 'pv-report-label'
|
| 892 |
+
imgLabel.textContent = 'IMAGE ANALYZED'
|
| 893 |
+
const img = document.createElement('img')
|
| 894 |
+
img.src = extractedImage
|
| 895 |
+
img.alt = 'Extracted post image'
|
| 896 |
+
img.style.cssText = 'width:100%;border-radius:6px;margin-top:6px;display:block;'
|
| 897 |
+
imgSection.appendChild(imgLabel)
|
| 898 |
+
imgSection.appendChild(img)
|
| 899 |
+
|
| 900 |
+
// OCR text extracted from the image
|
| 901 |
+
if (result.ocr_text) {
|
| 902 |
+
const ocrLabel = document.createElement('span')
|
| 903 |
+
ocrLabel.className = 'pv-report-label'
|
| 904 |
+
ocrLabel.style.marginTop = '8px'
|
| 905 |
+
ocrLabel.textContent = 'IMAGE TEXT (OCR)'
|
| 906 |
+
const ocrPara = document.createElement('p')
|
| 907 |
+
ocrPara.className = 'pv-report-explanation-text'
|
| 908 |
+
ocrPara.textContent = safeText(result.ocr_text)
|
| 909 |
+
imgSection.appendChild(ocrLabel)
|
| 910 |
+
imgSection.appendChild(ocrPara)
|
| 911 |
+
}
|
| 912 |
+
|
| 913 |
+
card.appendChild(imgSection)
|
| 914 |
+
}
|
| 915 |
+
|
| 916 |
+
// β Caption / text analyzed (full text, no truncation)
|
| 917 |
+
if (extractedText) {
|
| 918 |
+
const textSection = document.createElement('div')
|
| 919 |
+
textSection.className = 'pv-report-explanation'
|
| 920 |
+
const textLabel = document.createElement('span')
|
| 921 |
+
textLabel.className = 'pv-report-label'
|
| 922 |
+
textLabel.textContent = 'CAPTION TEXT'
|
| 923 |
+
const textPara = document.createElement('p')
|
| 924 |
+
textPara.className = 'pv-report-explanation-text'
|
| 925 |
+
textPara.textContent = safeText(extractedText)
|
| 926 |
+
textSection.appendChild(textLabel)
|
| 927 |
+
textSection.appendChild(textPara)
|
| 928 |
+
card.appendChild(textSection)
|
| 929 |
+
}
|
| 930 |
+
|
| 931 |
+
// β Signals
|
| 932 |
if (features.length > 0) {
|
| 933 |
const signalsSection = document.createElement('div')
|
| 934 |
signalsSection.className = 'pv-report-signals'
|
|
|
|
| 935 |
const signalsLabel = document.createElement('span')
|
| 936 |
signalsLabel.className = 'pv-report-label'
|
| 937 |
signalsLabel.textContent = 'SUSPICIOUS SIGNALS'
|
| 938 |
signalsSection.appendChild(signalsLabel)
|
|
|
|
| 939 |
const tagsWrap = document.createElement('div')
|
| 940 |
tagsWrap.className = 'pv-report-tags'
|
| 941 |
for (const f of features.slice(0, 5)) {
|
|
|
|
| 945 |
tagsWrap.appendChild(tag)
|
| 946 |
}
|
| 947 |
signalsSection.appendChild(tagsWrap)
|
| 948 |
+
card.appendChild(signalsSection)
|
| 949 |
}
|
| 950 |
|
| 951 |
// β Evidence sources
|
| 952 |
if (sources.length > 0) {
|
| 953 |
const sourcesSection = document.createElement('div')
|
| 954 |
sourcesSection.className = 'pv-report-sources'
|
|
|
|
| 955 |
const sourcesLabel = document.createElement('span')
|
| 956 |
sourcesLabel.className = 'pv-report-label'
|
| 957 |
sourcesLabel.textContent = 'EVIDENCE SOURCES'
|
| 958 |
sourcesSection.appendChild(sourcesLabel)
|
|
|
|
| 959 |
const sourcesList = document.createElement('ul')
|
| 960 |
sourcesList.className = 'pv-report-sources-list'
|
|
|
|
| 961 |
for (const src of sources.slice(0, 5)) {
|
| 962 |
const li = document.createElement('li')
|
| 963 |
li.className = 'pv-report-source-item'
|
|
|
|
| 964 |
const link = document.createElement('a')
|
| 965 |
link.href = safeUrl(src.url)
|
| 966 |
link.target = '_blank'
|
| 967 |
link.rel = 'noreferrer'
|
| 968 |
link.className = 'pv-report-source-link'
|
| 969 |
link.textContent = src.title?.slice(0, 60) ?? src.source_name ?? 'View source'
|
|
|
|
| 970 |
const stance = document.createElement('span')
|
| 971 |
stance.className = 'pv-report-source-stance'
|
| 972 |
stance.textContent = src.stance ?? ''
|
| 973 |
if (src.stance === 'Refutes') stance.style.color = '#dc2626'
|
| 974 |
if (src.stance === 'Supports') stance.style.color = '#16a34a'
|
| 975 |
+
if (src.stance_reason) {
|
| 976 |
+
stance.title = src.stance_reason
|
| 977 |
+
stance.style.cursor = 'help'
|
| 978 |
+
}
|
| 979 |
li.appendChild(link)
|
| 980 |
li.appendChild(stance)
|
| 981 |
sourcesList.appendChild(li)
|
| 982 |
}
|
| 983 |
sourcesSection.appendChild(sourcesList)
|
| 984 |
+
card.appendChild(sourcesSection)
|
| 985 |
}
|
| 986 |
|
| 987 |
// β Explanation (claim used)
|
|
|
|
| 996 |
explText.textContent = result.layer2.claim_used
|
| 997 |
explanation.appendChild(explLabel)
|
| 998 |
explanation.appendChild(explText)
|
| 999 |
+
card.appendChild(explanation)
|
| 1000 |
+
}
|
| 1001 |
+
|
| 1002 |
+
// β Metadata footer (model tier + claim method)
|
| 1003 |
+
const modelTier = result.layer1?.model_tier
|
| 1004 |
+
const claimMethod = result.layer2?.claim_method
|
| 1005 |
+
if (modelTier || claimMethod) {
|
| 1006 |
+
const metaFooter = document.createElement('div')
|
| 1007 |
+
metaFooter.className = 'pv-report-meta-footer'
|
| 1008 |
+
if (modelTier) {
|
| 1009 |
+
const lbl = document.createElement('span')
|
| 1010 |
+
lbl.className = 'pv-report-meta-label'
|
| 1011 |
+
lbl.textContent = 'MODEL'
|
| 1012 |
+
const val = document.createElement('span')
|
| 1013 |
+
val.className = 'pv-report-meta-val'
|
| 1014 |
+
val.textContent = modelTier
|
| 1015 |
+
metaFooter.appendChild(lbl)
|
| 1016 |
+
metaFooter.appendChild(val)
|
| 1017 |
+
}
|
| 1018 |
+
if (modelTier && claimMethod) {
|
| 1019 |
+
const sep = document.createElement('span')
|
| 1020 |
+
sep.className = 'pv-report-meta-sep'
|
| 1021 |
+
sep.textContent = 'Β·'
|
| 1022 |
+
metaFooter.appendChild(sep)
|
| 1023 |
+
}
|
| 1024 |
+
if (claimMethod) {
|
| 1025 |
+
const lbl = document.createElement('span')
|
| 1026 |
+
lbl.className = 'pv-report-meta-label'
|
| 1027 |
+
lbl.textContent = 'VIA'
|
| 1028 |
+
const val = document.createElement('span')
|
| 1029 |
+
val.className = 'pv-report-meta-val'
|
| 1030 |
+
val.textContent = claimMethod
|
| 1031 |
+
metaFooter.appendChild(lbl)
|
| 1032 |
+
metaFooter.appendChild(val)
|
| 1033 |
+
}
|
| 1034 |
+
card.appendChild(metaFooter)
|
| 1035 |
}
|
| 1036 |
|
| 1037 |
// β Full analysis link
|
|
|
|
| 1041 |
fullLink.target = '_blank'
|
| 1042 |
fullLink.rel = 'noreferrer'
|
| 1043 |
fullLink.textContent = 'Open Full Dashboard β'
|
| 1044 |
+
card.appendChild(fullLink)
|
| 1045 |
+
|
| 1046 |
+
// Assemble and show
|
| 1047 |
+
overlay.appendChild(card)
|
| 1048 |
+
document.body.appendChild(overlay)
|
| 1049 |
+
|
| 1050 |
+
// Trigger animation
|
| 1051 |
+
requestAnimationFrame(() => overlay.classList.add('pv-modal--open'))
|
| 1052 |
+
|
| 1053 |
+
// Animate the confidence bar fill
|
| 1054 |
+
setTimeout(() => {
|
| 1055 |
+
barFill.style.width = `${confidence}%`
|
| 1056 |
+
}, 300)
|
| 1057 |
}
|
| 1058 |
|
| 1059 |
function showErrorReport(post, btn, errorMessage) {
|
|
|
|
| 1060 |
btn.classList.remove('pv-verify-btn--loading')
|
| 1061 |
btn.classList.add('pv-verify-btn--error')
|
| 1062 |
btn.disabled = false
|
|
|
|
| 1066 |
|
| 1067 |
const icon = btn.querySelector('.pv-verify-btn-icon')
|
| 1068 |
const label = btn.querySelector('.pv-verify-btn-label')
|
|
|
|
|
|
|
| 1069 |
|
| 1070 |
+
// Extension was reloaded β retrying is useless, user must refresh the tab
|
| 1071 |
+
const needsRefresh = errorMessage.includes('Extension was reloaded') ||
|
| 1072 |
+
errorMessage.includes('Extension context invalidated')
|
| 1073 |
+
|
| 1074 |
+
if (needsRefresh) {
|
| 1075 |
+
if (icon) icon.textContent = 'π'
|
| 1076 |
+
if (label) label.textContent = 'Extension updated β refresh page'
|
| 1077 |
+
btn.disabled = true // No point retrying; force refresh
|
| 1078 |
+
return
|
| 1079 |
}
|
| 1080 |
|
| 1081 |
+
if (icon) icon.textContent = 'β οΈ'
|
| 1082 |
+
if (label) label.textContent = 'Verification failed β tap to retry'
|
| 1083 |
+
|
| 1084 |
// Remove old click listeners by replacing element
|
| 1085 |
const newBtn = btn.cloneNode(true)
|
| 1086 |
btn.replaceWith(newBtn)
|
|
|
|
| 1094 |
|
| 1095 |
// ββ MutationObserver ββββββββββββββββββββββββββββββββββββββββββββββββββββββ
|
| 1096 |
|
| 1097 |
+
// Facebook adds posts through infinite scroll, so DOM mutations arrive in
// bursts. Coalesce them: every call resets a 150 ms timer, and only the
// final call in a burst actually triggers a full rescan.
let fbDebounceTimer = null
function scheduleFacebookScan() {
  if (fbDebounceTimer) {
    clearTimeout(fbDebounceTimer)
  }
  fbDebounceTimer = setTimeout(function runDebouncedScan() {
    // Clear the handle first so a mutation arriving during the scan
    // schedules a fresh timer rather than clearing a stale one.
    fbDebounceTimer = null
    addButtonsToFacebookPosts()
  }, 150)
}
|
| 1106 |
+
|
| 1107 |
+
// For Twitter/news: RAF-batched per-post injection
|
| 1108 |
const pendingPosts = new Set()
|
| 1109 |
let rafScheduled = false
|
| 1110 |
|
|
|
|
| 1122 |
}
|
| 1123 |
}
|
| 1124 |
|
| 1125 |
+
// Route DOM mutations to the right discovery strategy for the platform.
const observer = new MutationObserver(() => {
  if (PLATFORM === 'facebook') {
    // Facebook anchors are located by a global document query, so the
    // mutated subtree gives no advantage — schedule a debounced full rescan.
    scheduleFacebookScan()
    return
  }
  // Twitter / news: selector-based discovery; hand each post found in the
  // document to the RAF-batched processor.
  findPosts(document.body).forEach((candidate) => scheduleProcess(candidate))
})
|
| 1135 |
|
| 1136 |
// ββ Initialization ββββββββββββββββββββββββββββββββββββββββββββββββββββββββ
|
|
|
|
| 1139 |
log(`Initializing on ${PLATFORM} (${window.location.hostname})`)
|
| 1140 |
|
| 1141 |
// Check autoScan setting β controls whether buttons are shown at all
|
| 1142 |
+
// Use a short timeout so we don't block if background worker is asleep
|
| 1143 |
+
let response = { autoScan: true }
|
| 1144 |
try {
|
| 1145 |
+
response = await Promise.race([
|
| 1146 |
+
new Promise((resolve) => {
|
| 1147 |
+
chrome.runtime.sendMessage({ type: 'GET_SETTINGS' }, (r) => {
|
| 1148 |
+
if (chrome.runtime.lastError) resolve({ autoScan: true })
|
| 1149 |
+
else resolve(r ?? { autoScan: true })
|
| 1150 |
+
})
|
| 1151 |
+
}),
|
| 1152 |
+
new Promise((resolve) => setTimeout(() => resolve({ autoScan: true }), 1500)),
|
| 1153 |
+
])
|
|
|
|
| 1154 |
} catch {
|
| 1155 |
response = { autoScan: true }
|
| 1156 |
}
|
| 1157 |
|
| 1158 |
log('Settings:', response)
|
| 1159 |
if (response?.autoScan === false) {
|
| 1160 |
+
log('Auto-scan disabled β no verify buttons will be shown')
|
| 1161 |
return
|
| 1162 |
}
|
| 1163 |
|
| 1164 |
+
if (PLATFORM === 'facebook') {
|
| 1165 |
+
// Initial scan + watch for new posts via infinite scroll
|
| 1166 |
+
addButtonsToFacebookPosts()
|
| 1167 |
+
observer.observe(document.body, { childList: true, subtree: true })
|
| 1168 |
+
log('Facebook mode: watching for new posts via hide-post button anchor')
|
| 1169 |
+
} else {
|
| 1170 |
+
// Twitter / news sites: selector-based
|
| 1171 |
+
const existing = findPosts(document.body)
|
| 1172 |
+
log(`Found ${existing.length} existing posts`)
|
| 1173 |
+
for (const post of existing) scheduleProcess(post)
|
| 1174 |
+
observer.observe(document.body, { childList: true, subtree: true })
|
| 1175 |
+
log('MutationObserver started')
|
| 1176 |
+
// News article pages: also show auto-verify banner at top of page
|
| 1177 |
+
if (PLATFORM === 'news') autoVerifyPage()
|
| 1178 |
+
}
|
| 1179 |
}
|
| 1180 |
|
| 1181 |
init()
|
| 1182 |
|
| 1183 |
+
// ── SPA navigation listener ───────────────────────────────────────────────
// Facebook is a single-page app and never does full page loads. background.js
// watches pushState navigations on facebook.com via the webNavigation API and
// fires RE_SCAN_POSTS so profile pages, group pages, etc. get scanned after
// in-app navigation.
chrome.runtime.onMessage.addListener((msg) => {
  if (msg.action !== 'RE_SCAN_POSTS') return
  log('SPA navigation detected, re-scanning for posts...')
  // Give Facebook a moment to finish rendering the new route before scanning.
  setTimeout(addButtonsToFacebookPosts, 500)
})
|
| 1194 |
+
|
| 1195 |
// ββ Auto-verify news article pages (non-social) ββββββββββββββββββββββββββββ
|
| 1196 |
// When the content script runs on a PH news site (not the homepage),
|
| 1197 |
// it auto-verifies the current URL and injects a floating verdict banner.
|
|
|
|
| 1200 |
const url = window.location.href
|
| 1201 |
const path = new URL(url).pathname
|
| 1202 |
// Skip homepages and section indexes (very short paths like / or /news)
|
| 1203 |
+
if (!path || path.length < 5 || path.split('/').filter(Boolean).length < 1) return
|
| 1204 |
|
| 1205 |
const banner = document.createElement('div')
|
| 1206 |
banner.id = 'pv-auto-banner'
|
|
|
|
| 1273 |
try {
|
| 1274 |
const response = await new Promise((resolve, reject) => {
|
| 1275 |
chrome.runtime.sendMessage({ type: 'VERIFY_URL', url }, (resp) => {
|
| 1276 |
+
if (chrome.runtime.lastError) {
|
| 1277 |
+
const msg = chrome.runtime.lastError.message ?? ''
|
| 1278 |
+
reject(new Error(
|
| 1279 |
+
msg.includes('Extension context invalidated')
|
| 1280 |
+
? 'Extension was reloaded β please refresh the page to re-activate PhilVerify.'
|
| 1281 |
+
: msg
|
| 1282 |
+
))
|
| 1283 |
+
}
|
| 1284 |
else if (!resp?.ok) reject(new Error(resp?.error ?? 'Unknown error'))
|
| 1285 |
else resolve(resp.result)
|
| 1286 |
})
|
|
@@ -8,7 +8,9 @@
|
|
| 8 |
"storage",
|
| 9 |
"activeTab",
|
| 10 |
"scripting",
|
| 11 |
-
"sidePanel"
|
|
|
|
|
|
|
| 12 |
],
|
| 13 |
|
| 14 |
"side_panel": {
|
|
@@ -18,8 +20,10 @@
|
|
| 18 |
"host_permissions": [
|
| 19 |
"https://www.facebook.com/*",
|
| 20 |
"https://facebook.com/*",
|
|
|
|
| 21 |
"https://x.com/*",
|
| 22 |
"https://twitter.com/*",
|
|
|
|
| 23 |
"https://philverify.web.app/*",
|
| 24 |
"http://localhost:8000/*"
|
| 25 |
],
|
|
|
|
| 8 |
"storage",
|
| 9 |
"activeTab",
|
| 10 |
"scripting",
|
| 11 |
+
"sidePanel",
|
| 12 |
+
"tabs",
|
| 13 |
+
"webNavigation"
|
| 14 |
],
|
| 15 |
|
| 16 |
"side_panel": {
|
|
|
|
| 20 |
"host_permissions": [
|
| 21 |
"https://www.facebook.com/*",
|
| 22 |
"https://facebook.com/*",
|
| 23 |
+
"https://*.fbcdn.net/*",
|
| 24 |
"https://x.com/*",
|
| 25 |
"https://twitter.com/*",
|
| 26 |
+
"https://pbs.twimg.com/*",
|
| 27 |
"https://philverify.web.app/*",
|
| 28 |
"http://localhost:8000/*"
|
| 29 |
],
|
|
@@ -142,19 +142,6 @@
|
|
| 142 |
.btn-verify:focus-visible { outline: 2px solid var(--accent-cyan); outline-offset: 2px; }
|
| 143 |
|
| 144 |
/* ββ Result card βββββββββββββββββββββββββββββββββββ */
|
| 145 |
-
.result {
|
| 146 |
-
margin-top: 10px;
|
| 147 |
-
padding: 10px 12px;
|
| 148 |
-
background: var(--bg-surface);
|
| 149 |
-
border: 1px solid var(--border);
|
| 150 |
-
border-radius: 3px;
|
| 151 |
-
}
|
| 152 |
-
.result-verdict {
|
| 153 |
-
font-size: 15px;
|
| 154 |
-
font-weight: 800;
|
| 155 |
-
letter-spacing: -0.01em;
|
| 156 |
-
margin-bottom: 4px;
|
| 157 |
-
}
|
| 158 |
.result-score {
|
| 159 |
font-size: 10px;
|
| 160 |
color: var(--text-muted);
|
|
@@ -368,6 +355,94 @@
|
|
| 368 |
margin-top: 6px;
|
| 369 |
height: 14px;
|
| 370 |
}
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 371 |
</style>
|
| 372 |
</head>
|
| 373 |
<body>
|
|
|
|
| 142 |
.btn-verify:focus-visible { outline: 2px solid var(--accent-cyan); outline-offset: 2px; }
|
| 143 |
|
| 144 |
/* ββ Result card βββββββββββββββββββββββββββββββββββ */
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 145 |
.result-score {
|
| 146 |
font-size: 10px;
|
| 147 |
color: var(--text-muted);
|
|
|
|
| 355 |
margin-top: 6px;
|
| 356 |
height: 14px;
|
| 357 |
}
|
| 358 |
+
|
| 359 |
+
/* ββ Result card β spine layout ββββββββββββββββββββ */
|
| 360 |
+
.result {
|
| 361 |
+
margin-top: 10px;
|
| 362 |
+
padding: 0;
|
| 363 |
+
background: var(--bg-surface);
|
| 364 |
+
border: 1px solid var(--border);
|
| 365 |
+
border-radius: 3px;
|
| 366 |
+
overflow: hidden;
|
| 367 |
+
}
|
| 368 |
+
.result-body {
|
| 369 |
+
padding: 10px 12px 0;
|
| 370 |
+
}
|
| 371 |
+
.result-top {
|
| 372 |
+
display: flex;
|
| 373 |
+
justify-content: space-between;
|
| 374 |
+
align-items: baseline;
|
| 375 |
+
margin-bottom: 6px;
|
| 376 |
+
}
|
| 377 |
+
.result-verdict {
|
| 378 |
+
font-size: 20px;
|
| 379 |
+
font-weight: 800;
|
| 380 |
+
letter-spacing: -0.01em;
|
| 381 |
+
}
|
| 382 |
+
.result-score {
|
| 383 |
+
font-size: 10px;
|
| 384 |
+
color: var(--text-muted);
|
| 385 |
+
font-family: var(--font-mono);
|
| 386 |
+
}
|
| 387 |
+
.result-hairline {
|
| 388 |
+
height: 1px;
|
| 389 |
+
opacity: 0.3;
|
| 390 |
+
margin-bottom: 8px;
|
| 391 |
+
}
|
| 392 |
+
.result-chips {
|
| 393 |
+
display: flex;
|
| 394 |
+
flex-wrap: wrap;
|
| 395 |
+
gap: 3px;
|
| 396 |
+
}
|
| 397 |
+
.result-chip {
|
| 398 |
+
padding: 2px 6px;
|
| 399 |
+
background: rgba(0,0,0,0.4);
|
| 400 |
+
border: 1px solid;
|
| 401 |
+
border-radius: 2px;
|
| 402 |
+
font-size: 9px;
|
| 403 |
+
font-family: var(--font-mono);
|
| 404 |
+
letter-spacing: 0.03em;
|
| 405 |
+
}
|
| 406 |
+
.result-meta-footer {
|
| 407 |
+
display: flex;
|
| 408 |
+
align-items: center;
|
| 409 |
+
gap: 5px;
|
| 410 |
+
padding: 6px 12px;
|
| 411 |
+
border-top: 1px solid var(--border);
|
| 412 |
+
margin-top: 8px;
|
| 413 |
+
}
|
| 414 |
+
.result-meta-label {
|
| 415 |
+
font-size: 8px;
|
| 416 |
+
font-weight: 700;
|
| 417 |
+
letter-spacing: 0.1em;
|
| 418 |
+
color: var(--text-muted);
|
| 419 |
+
text-transform: uppercase;
|
| 420 |
+
}
|
| 421 |
+
.result-meta-val {
|
| 422 |
+
font-size: 9px;
|
| 423 |
+
font-family: var(--font-mono);
|
| 424 |
+
color: #6b7280;
|
| 425 |
+
}
|
| 426 |
+
.result-meta-sep {
|
| 427 |
+
color: var(--border);
|
| 428 |
+
font-size: 10px;
|
| 429 |
+
}
|
| 430 |
+
|
| 431 |
+
/* ββ History β spine + hover ββββββββββββββββββββββββ */
|
| 432 |
+
.history-item:hover { background: #1a1a1a; }
|
| 433 |
+
.history-model {
|
| 434 |
+
margin-left: auto;
|
| 435 |
+
font-size: 8px;
|
| 436 |
+
font-family: var(--font-mono);
|
| 437 |
+
color: #6b7280;
|
| 438 |
+
}
|
| 439 |
+
|
| 440 |
+
/* ββ Empty state icon βββββββββββββββββββββββββββββββ */
|
| 441 |
+
.state-empty-icon {
|
| 442 |
+
display: block;
|
| 443 |
+
margin: 0 auto 8px;
|
| 444 |
+
opacity: 0.25;
|
| 445 |
+
}
|
| 446 |
</style>
|
| 447 |
</head>
|
| 448 |
<body>
|
|
@@ -33,8 +33,11 @@ function safeUrl(url) {
|
|
| 33 |
} catch { return '#' }
|
| 34 |
}
|
| 35 |
function msg(obj) {
|
| 36 |
-
return new Promise(resolve => {
|
| 37 |
-
chrome.runtime.sendMessage(obj,
|
|
|
|
|
|
|
|
|
|
| 38 |
})
|
| 39 |
}
|
| 40 |
|
|
@@ -57,39 +60,60 @@ function isUrl(s) {
|
|
| 57 |
function renderResult(result, container) {
|
| 58 |
const color = VERDICT_COLORS[result.verdict] ?? '#5c554e'
|
| 59 |
const topSource = result.layer2?.sources?.[0]
|
|
|
|
|
|
|
|
|
|
|
|
|
| 60 |
|
| 61 |
container.innerHTML = `
|
| 62 |
-
<div class="result" role="status" aria-live="polite">
|
| 63 |
-
<div class="result-
|
| 64 |
-
|
| 65 |
-
|
| 66 |
-
|
| 67 |
-
<
|
| 68 |
-
|
| 69 |
-
|
| 70 |
-
|
| 71 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 72 |
</div>
|
| 73 |
-
${
|
| 74 |
-
<div class="result-
|
| 75 |
-
<span class="result-label">
|
| 76 |
-
<span class="result-
|
|
|
|
| 77 |
</div>` : ''}
|
| 78 |
-
${topSource ? `
|
| 79 |
-
<div class="result-source">
|
| 80 |
-
<div class="result-label" style="margin-bottom:4px;">Top Source</div>
|
| 81 |
-
<a href="${safeUrl(topSource.url)}" target="_blank" rel="noreferrer">${safeText(topSource.title?.slice(0, 55) ?? topSource.source_name ?? 'View')} β</a>
|
| 82 |
-
</div>` : ''}
|
| 83 |
-
<a class="open-full" href="https://philverify.web.app" target="_blank" rel="noreferrer">
|
| 84 |
-
Open Full Dashboard β
|
| 85 |
-
</a>
|
| 86 |
</div>
|
| 87 |
`
|
| 88 |
}
|
| 89 |
|
| 90 |
function renderHistory(entries, container) {
|
| 91 |
if (!entries.length) {
|
| 92 |
-
container.innerHTML =
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 93 |
return
|
| 94 |
}
|
| 95 |
container.innerHTML = `
|
|
@@ -97,10 +121,11 @@ function renderHistory(entries, container) {
|
|
| 97 |
${entries.map(e => {
|
| 98 |
const color = VERDICT_COLORS[e.verdict] ?? '#5c554e'
|
| 99 |
return `
|
| 100 |
-
<li class="history-item" role="listitem">
|
| 101 |
<div class="history-item-top">
|
| 102 |
<span class="history-verdict" style="background:${color}22;color:${color};border:1px solid ${color}4d;">${safeText(e.verdict)}</span>
|
| 103 |
<span class="history-score">${Math.round(e.final_score)}%</span>
|
|
|
|
| 104 |
</div>
|
| 105 |
<div class="history-preview">${safeText(e.text_preview || 'β')}</div>
|
| 106 |
<div class="history-time">${timeAgo(e.timestamp)}</div>
|
|
@@ -221,19 +246,19 @@ async function checkApiStatus() {
|
|
| 221 |
const dot = document.getElementById('api-status-dot')
|
| 222 |
const label = document.getElementById('api-status-label')
|
| 223 |
try {
|
| 224 |
-
|
| 225 |
-
const
|
| 226 |
-
if (
|
| 227 |
-
dot.style.background
|
| 228 |
-
label.style.color
|
| 229 |
-
label.textContent
|
| 230 |
} else {
|
| 231 |
-
throw new Error(`${
|
| 232 |
}
|
| 233 |
} catch {
|
| 234 |
-
dot.style.background
|
| 235 |
-
label.style.color
|
| 236 |
-
label.textContent
|
| 237 |
}
|
| 238 |
}
|
| 239 |
|
|
|
|
| 33 |
} catch { return '#' }
|
| 34 |
}
|
| 35 |
function msg(obj) {
|
| 36 |
+
return new Promise((resolve, reject) => {
|
| 37 |
+
chrome.runtime.sendMessage(obj, (resp) => {
|
| 38 |
+
if (chrome.runtime.lastError) reject(new Error(chrome.runtime.lastError.message))
|
| 39 |
+
else resolve(resp)
|
| 40 |
+
})
|
| 41 |
})
|
| 42 |
}
|
| 43 |
|
|
|
|
| 60 |
function renderResult(result, container) {
|
| 61 |
const color = VERDICT_COLORS[result.verdict] ?? '#5c554e'
|
| 62 |
const topSource = result.layer2?.sources?.[0]
|
| 63 |
+
const features = result.layer1?.triggered_features ?? []
|
| 64 |
+
const modelTier = result.layer1?.model_tier
|
| 65 |
+
const claimMethod = result.layer2?.claim_method
|
| 66 |
+
const hasFooter = modelTier || claimMethod
|
| 67 |
|
| 68 |
container.innerHTML = `
|
| 69 |
+
<div class="result" role="status" aria-live="polite" style="border-left:3px solid ${color}">
|
| 70 |
+
<div class="result-body">
|
| 71 |
+
<div class="result-top">
|
| 72 |
+
<div class="result-verdict" style="color:${color}">${safeText(result.verdict)}</div>
|
| 73 |
+
<div class="result-score">${Math.round(result.final_score)}%${result._fromCache ? ' Β· cached' : ''}</div>
|
| 74 |
+
</div>
|
| 75 |
+
<div class="result-hairline" style="background:${color}"></div>
|
| 76 |
+
<div class="result-row">
|
| 77 |
+
<span class="result-label">Language</span>
|
| 78 |
+
<span class="result-val">${safeText(result.language ?? 'β')}</span>
|
| 79 |
+
</div>
|
| 80 |
+
<div class="result-row">
|
| 81 |
+
<span class="result-label">Confidence</span>
|
| 82 |
+
<span class="result-val" style="color:${color}">${result.confidence?.toFixed(1)}%</span>
|
| 83 |
+
</div>
|
| 84 |
+
${features.length ? `
|
| 85 |
+
<div class="result-row">
|
| 86 |
+
<span class="result-label">Signals</span>
|
| 87 |
+
<span class="result-chips">${features.slice(0, 3).map(f => `<span class="result-chip" style="border-color:${color}55;color:${color}">${safeText(f)}</span>`).join('')}</span>
|
| 88 |
+
</div>` : ''}
|
| 89 |
+
${topSource ? `
|
| 90 |
+
<div class="result-source">
|
| 91 |
+
<div class="result-label" style="margin-bottom:4px;">Top Source</div>
|
| 92 |
+
<a href="${safeUrl(topSource.url)}" target="_blank" rel="noreferrer">${safeText(topSource.title?.slice(0, 55) ?? topSource.source_name ?? 'View')} β</a>
|
| 93 |
+
</div>` : ''}
|
| 94 |
+
<a class="open-full" href="https://philverify.web.app" target="_blank" rel="noreferrer">
|
| 95 |
+
Open Full Dashboard β
|
| 96 |
+
</a>
|
| 97 |
</div>
|
| 98 |
+
${hasFooter ? `
|
| 99 |
+
<div class="result-meta-footer">
|
| 100 |
+
${modelTier ? `<span class="result-meta-label">MODEL</span><span class="result-meta-val">${safeText(modelTier)}</span>` : ''}
|
| 101 |
+
${modelTier && claimMethod ? '<span class="result-meta-sep">Β·</span>' : ''}
|
| 102 |
+
${claimMethod ? `<span class="result-meta-label">VIA</span><span class="result-meta-val">${safeText(claimMethod)}</span>` : ''}
|
| 103 |
</div>` : ''}
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 104 |
</div>
|
| 105 |
`
|
| 106 |
}
|
| 107 |
|
| 108 |
function renderHistory(entries, container) {
|
| 109 |
if (!entries.length) {
|
| 110 |
+
container.innerHTML = `
|
| 111 |
+
<div class="state-empty">
|
| 112 |
+
<svg class="state-empty-icon" width="32" height="32" viewBox="0 0 24 24" fill="none" stroke="currentColor" stroke-width="1.5" aria-hidden="true">
|
| 113 |
+
<path d="M12 22s8-4 8-10V5l-8-3-8 3v7c0 6 8 10 8 10z"/>
|
| 114 |
+
</svg>
|
| 115 |
+
No verifications yet.
|
| 116 |
+
</div>`
|
| 117 |
return
|
| 118 |
}
|
| 119 |
container.innerHTML = `
|
|
|
|
| 121 |
${entries.map(e => {
|
| 122 |
const color = VERDICT_COLORS[e.verdict] ?? '#5c554e'
|
| 123 |
return `
|
| 124 |
+
<li class="history-item" role="listitem" style="border-left:2px solid ${color}">
|
| 125 |
<div class="history-item-top">
|
| 126 |
<span class="history-verdict" style="background:${color}22;color:${color};border:1px solid ${color}4d;">${safeText(e.verdict)}</span>
|
| 127 |
<span class="history-score">${Math.round(e.final_score)}%</span>
|
| 128 |
+
${e.model_tier ? `<span class="history-model">${safeText(e.model_tier)}</span>` : ''}
|
| 129 |
</div>
|
| 130 |
<div class="history-preview">${safeText(e.text_preview || 'β')}</div>
|
| 131 |
<div class="history-time">${timeAgo(e.timestamp)}</div>
|
|
|
|
| 246 |
const dot = document.getElementById('api-status-dot')
|
| 247 |
const label = document.getElementById('api-status-label')
|
| 248 |
try {
|
| 249 |
+
// Route through the service worker so the fetch uses the correct host_permissions
|
| 250 |
+
const resp = await msg({ type: 'CHECK_HEALTH' })
|
| 251 |
+
if (resp?.ok) {
|
| 252 |
+
dot.style.background = 'var(--credible)'
|
| 253 |
+
label.style.color = 'var(--credible)'
|
| 254 |
+
label.textContent = 'ONLINE'
|
| 255 |
} else {
|
| 256 |
+
throw new Error(resp?.error ?? `HTTP ${resp?.status}`)
|
| 257 |
}
|
| 258 |
} catch {
|
| 259 |
+
dot.style.background = 'var(--fake)'
|
| 260 |
+
label.style.color = 'var(--fake)'
|
| 261 |
+
label.textContent = 'OFFLINE'
|
| 262 |
}
|
| 263 |
}
|
| 264 |
|
|
@@ -1,5 +1,7 @@
|
|
| 1 |
{
|
| 2 |
"firestore": {
|
|
|
|
|
|
|
| 3 |
"rules": "firestore.rules",
|
| 4 |
"indexes": "firestore.indexes.json"
|
| 5 |
},
|
|
@@ -34,5 +36,10 @@
|
|
| 34 |
]
|
| 35 |
}
|
| 36 |
]
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 37 |
}
|
| 38 |
-
}
|
|
|
|
| 1 |
{
|
| 2 |
"firestore": {
|
| 3 |
+
"database": "(default)",
|
| 4 |
+
"location": "asia-southeast1",
|
| 5 |
"rules": "firestore.rules",
|
| 6 |
"indexes": "firestore.indexes.json"
|
| 7 |
},
|
|
|
|
| 36 |
]
|
| 37 |
}
|
| 38 |
]
|
| 39 |
+
},
|
| 40 |
+
"auth": {
|
| 41 |
+
"providers": {
|
| 42 |
+
"emailPassword": true
|
| 43 |
+
}
|
| 44 |
}
|
| 45 |
+
}
|
|
@@ -1,13 +1,51 @@
|
|
| 1 |
{
|
| 2 |
-
|
| 3 |
-
|
| 4 |
-
|
| 5 |
-
|
| 6 |
-
|
| 7 |
-
|
| 8 |
-
|
| 9 |
-
|
| 10 |
-
|
| 11 |
-
]
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 12 |
"fieldOverrides": []
|
| 13 |
-
}
|
|
|
|
| 1 |
{
|
| 2 |
+
// Example (Standard Edition):
|
| 3 |
+
//
|
| 4 |
+
// "indexes": [
|
| 5 |
+
// {
|
| 6 |
+
// "collectionGroup": "widgets",
|
| 7 |
+
// "queryScope": "COLLECTION",
|
| 8 |
+
// "fields": [
|
| 9 |
+
// { "fieldPath": "foo", "arrayConfig": "CONTAINS" },
|
| 10 |
+
// { "fieldPath": "bar", "mode": "DESCENDING" }
|
| 11 |
+
// ]
|
| 12 |
+
// },
|
| 13 |
+
//
|
| 14 |
+
// "fieldOverrides": [
|
| 15 |
+
// {
|
| 16 |
+
// "collectionGroup": "widgets",
|
| 17 |
+
// "fieldPath": "baz",
|
| 18 |
+
// "indexes": [
|
| 19 |
+
// { "order": "ASCENDING", "queryScope": "COLLECTION" }
|
| 20 |
+
// ]
|
| 21 |
+
// },
|
| 22 |
+
// ]
|
| 23 |
+
// ]
|
| 24 |
+
//
|
| 25 |
+
// Example (Enterprise Edition):
|
| 26 |
+
//
|
| 27 |
+
// "indexes": [
|
| 28 |
+
// {
|
| 29 |
+
// "collectionGroup": "reviews",
|
| 30 |
+
// "queryScope": "COLLECTION_GROUP",
|
| 31 |
+
// "apiScope": "MONGODB_COMPATIBLE_API",
|
| 32 |
+
// "density": "DENSE",
|
| 33 |
+
// "multikey": false,
|
| 34 |
+
// "fields": [
|
| 35 |
+
// { "fieldPath": "baz", "mode": "ASCENDING" }
|
| 36 |
+
// ]
|
| 37 |
+
// },
|
| 38 |
+
// {
|
| 39 |
+
// "collectionGroup": "items",
|
| 40 |
+
// "queryScope": "COLLECTION_GROUP",
|
| 41 |
+
// "apiScope": "MONGODB_COMPATIBLE_API",
|
| 42 |
+
// "density": "SPARSE_ANY",
|
| 43 |
+
// "multikey": true,
|
| 44 |
+
// "fields": [
|
| 45 |
+
// { "fieldPath": "baz", "mode": "ASCENDING" }
|
| 46 |
+
// ]
|
| 47 |
+
// },
|
| 48 |
+
// ]
|
| 49 |
+
"indexes": [],
|
| 50 |
"fieldOverrides": []
|
| 51 |
+
}
|
|
@@ -1,9 +1,18 @@
|
|
| 1 |
-
rules_version
|
|
|
|
| 2 |
service cloud.firestore {
|
| 3 |
match /databases/{database}/documents {
|
| 4 |
-
match /
|
| 5 |
-
|
| 6 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 7 |
}
|
| 8 |
}
|
| 9 |
-
}
|
|
|
|
| 1 |
+
rules_version='2'
|
| 2 |
+
|
| 3 |
service cloud.firestore {
|
| 4 |
match /databases/{database}/documents {
|
| 5 |
+
match /{document=**} {
|
| 6 |
+
// This rule allows anyone with your database reference to view, edit,
|
| 7 |
+
// and delete all data in your database. It is useful for getting
|
| 8 |
+
// started, but it is configured to expire after 30 days because it
|
| 9 |
+
// leaves your app open to attackers. At that time, all client
|
| 10 |
+
// requests to your database will be denied.
|
| 11 |
+
//
|
| 12 |
+
// Make sure to write security rules for your app before that time, or
|
| 13 |
+
// else all client requests to your database will be denied until you
|
| 14 |
+
// update your rules.
|
| 15 |
+
allow read, write: if request.time < timestamp.date(2026, 4, 14);
|
| 16 |
}
|
| 17 |
}
|
| 18 |
+
}
|
|
@@ -4,6 +4,7 @@ import LandingPage from './pages/LandingPage.jsx'
|
|
| 4 |
import VerifyPage from './pages/VerifyPage.jsx'
|
| 5 |
import HistoryPage from './pages/HistoryPage.jsx'
|
| 6 |
import TrendsPage from './pages/TrendsPage.jsx'
|
|
|
|
| 7 |
|
| 8 |
/** Shared horizontal constraint β all pages + navbar use this */
|
| 9 |
export const PAGE_MAX_W = 960
|
|
@@ -47,6 +48,7 @@ export default function App() {
|
|
| 47 |
<Route path="/verify" element={<VerifyPage />} />
|
| 48 |
<Route path="/history" element={<HistoryPage />} />
|
| 49 |
<Route path="/trends" element={<TrendsPage />} />
|
|
|
|
| 50 |
</Routes>
|
| 51 |
</div>
|
| 52 |
</div>
|
|
|
|
| 4 |
import VerifyPage from './pages/VerifyPage.jsx'
|
| 5 |
import HistoryPage from './pages/HistoryPage.jsx'
|
| 6 |
import TrendsPage from './pages/TrendsPage.jsx'
|
| 7 |
+
import BenchmarksPage from './pages/BenchmarksPage.jsx'
|
| 8 |
|
| 9 |
/** Shared horizontal constraint β all pages + navbar use this */
|
| 10 |
export const PAGE_MAX_W = 960
|
|
|
|
| 48 |
<Route path="/verify" element={<VerifyPage />} />
|
| 49 |
<Route path="/history" element={<HistoryPage />} />
|
| 50 |
<Route path="/trends" element={<TrendsPage />} />
|
| 51 |
+
<Route path="/benchmarks" element={<BenchmarksPage />} />
|
| 52 |
</Routes>
|
| 53 |
</div>
|
| 54 |
</div>
|
|
@@ -1,5 +1,5 @@
|
|
| 1 |
import { NavLink, Link } from 'react-router-dom'
|
| 2 |
-
import { Radar, Clock, TrendingUp, ShieldCheck, Home } from 'lucide-react'
|
| 3 |
import { PAGE_STYLE } from '../App.jsx'
|
| 4 |
|
| 5 |
const NAV_LINKS = [
|
|
@@ -7,6 +7,7 @@ const NAV_LINKS = [
|
|
| 7 |
{ to: '/verify', icon: ShieldCheck, label: 'Verify' },
|
| 8 |
{ to: '/history', icon: Clock, label: 'History' },
|
| 9 |
{ to: '/trends', icon: TrendingUp, label: 'Trends' },
|
|
|
|
| 10 |
]
|
| 11 |
|
| 12 |
export default function Navbar() {
|
|
|
|
| 1 |
import { NavLink, Link } from 'react-router-dom'
|
| 2 |
+
import { Radar, Clock, TrendingUp, ShieldCheck, Home, BarChart2 } from 'lucide-react'
|
| 3 |
import { PAGE_STYLE } from '../App.jsx'
|
| 4 |
|
| 5 |
const NAV_LINKS = [
|
|
|
|
| 7 |
{ to: '/verify', icon: ShieldCheck, label: 'Verify' },
|
| 8 |
{ to: '/history', icon: Clock, label: 'History' },
|
| 9 |
{ to: '/trends', icon: TrendingUp, label: 'Trends' },
|
| 10 |
+
{ to: '/benchmarks', icon: BarChart2, label: 'Benchmarks' },
|
| 11 |
]
|
| 12 |
|
| 13 |
export default function Navbar() {
|
|
@@ -0,0 +1,400 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import { useState } from 'react'
|
| 2 |
+
import {
|
| 3 |
+
BarChart, Bar, XAxis, YAxis, CartesianGrid, Tooltip,
|
| 4 |
+
ResponsiveContainer, Cell, ReferenceLine,
|
| 5 |
+
} from 'recharts'
|
| 6 |
+
import { PAGE_STYLE } from '../App.jsx'
|
| 7 |
+
|
| 8 |
+
// ββ Eval results (from python -m ml.eval, seed=42, 79 train / 21 val) βββββββββ
|
| 9 |
+
const MODELS = [
|
| 10 |
+
{
|
| 11 |
+
name: 'BoW + LogReg',
|
| 12 |
+
shortName: 'BoW+LR',
|
| 13 |
+
accuracy: 52.4,
|
| 14 |
+
tier: 'classical',
|
| 15 |
+
lecture: 'Lecture 3',
|
| 16 |
+
note: 'CountVectorizer loses TF weighting β raw counts hurt precision on short headlines',
|
| 17 |
+
},
|
| 18 |
+
{
|
| 19 |
+
name: 'BoW + LogReg + Lemma',
|
| 20 |
+
shortName: 'BoW+LR+L',
|
| 21 |
+
accuracy: 52.4,
|
| 22 |
+
tier: 'classical',
|
| 23 |
+
lecture: 'Lectures 2β3',
|
| 24 |
+
note: 'No change from non-lemmatized β WordNet is English-biased; Tagalog tokens unchanged',
|
| 25 |
+
},
|
| 26 |
+
{
|
| 27 |
+
name: 'TF-IDF + LogReg',
|
| 28 |
+
shortName: 'TFIDF+LR',
|
| 29 |
+
accuracy: 61.9,
|
| 30 |
+
tier: 'classical',
|
| 31 |
+
lecture: 'Lecture 3',
|
| 32 |
+
note: 'Sublinear TF weighting reduces dominance of high-frequency terms; best classical model',
|
| 33 |
+
},
|
| 34 |
+
{
|
| 35 |
+
name: 'TF-IDF + NB',
|
| 36 |
+
shortName: 'TFIDF+NB',
|
| 37 |
+
accuracy: 42.9,
|
| 38 |
+
tier: 'classical',
|
| 39 |
+
lecture: 'Lectures 5β6',
|
| 40 |
+
note: 'Feature independence assumption breaks on 79 samples; noisy probability estimates',
|
| 41 |
+
},
|
| 42 |
+
{
|
| 43 |
+
name: 'TF-IDF + NB + Lemma',
|
| 44 |
+
shortName: 'NB+Lemma',
|
| 45 |
+
accuracy: 42.9,
|
| 46 |
+
tier: 'classical',
|
| 47 |
+
lecture: 'Lectures 2, 5β6',
|
| 48 |
+
note: 'Lemmatization again neutral β confirms English-biased lemmatizer finding',
|
| 49 |
+
},
|
| 50 |
+
{
|
| 51 |
+
name: 'LDA + LogReg',
|
| 52 |
+
shortName: 'LDA+LR',
|
| 53 |
+
accuracy: 42.9,
|
| 54 |
+
tier: 'classical',
|
| 55 |
+
lecture: 'Lecture 7',
|
| 56 |
+
note: '5 topics over 79 documents is too few for stable topic distributions',
|
| 57 |
+
},
|
| 58 |
+
{
|
| 59 |
+
name: 'XLM-RoBERTa',
|
| 60 |
+
shortName: 'XLM-R',
|
| 61 |
+
accuracy: 90.5,
|
| 62 |
+
tier: 'transformer',
|
| 63 |
+
lecture: 'Transfer Learning',
|
| 64 |
+
note: 'Pretrained on 100+ languages including Filipino; fine-tuned on combined dataset',
|
| 65 |
+
},
|
| 66 |
+
{
|
| 67 |
+
name: 'Tagalog-RoBERTa',
|
| 68 |
+
shortName: 'TL-R',
|
| 69 |
+
accuracy: 95.2,
|
| 70 |
+
tier: 'transformer',
|
| 71 |
+
lecture: 'Transfer Learning',
|
| 72 |
+
note: 'Pretrained on TLUnified Filipino corpus; higher recall on Tagalog/Taglish posts',
|
| 73 |
+
},
|
| 74 |
+
{
|
| 75 |
+
name: 'Ensemble',
|
| 76 |
+
shortName: 'Ensemble',
|
| 77 |
+
accuracy: 100.0,
|
| 78 |
+
tier: 'ensemble',
|
| 79 |
+
lecture: 'Ensemble Methods',
|
| 80 |
+
note: 'Soft-vote average of XLM-R + Tagalog-RoBERTa logits; 100% on 21-sample holdout',
|
| 81 |
+
},
|
| 82 |
+
]
|
| 83 |
+
|
| 84 |
+
const TIER_COLOR = {
|
| 85 |
+
classical: '#d97706', // gold
|
| 86 |
+
transformer: '#06b6d4', // cyan
|
| 87 |
+
ensemble: '#16a34a', // green
|
| 88 |
+
}
|
| 89 |
+
|
| 90 |
+
const TIER_LABEL = {
|
| 91 |
+
classical: 'Classical ML',
|
| 92 |
+
transformer: 'Transformer',
|
| 93 |
+
ensemble: 'Ensemble',
|
| 94 |
+
}
|
| 95 |
+
|
| 96 |
+
const FINDINGS = [
|
| 97 |
+
{
|
| 98 |
+
lecture: 'Lecture 3',
|
| 99 |
+
title: 'TF-IDF > Bag of Words',
|
| 100 |
+
body: 'TF-IDF sublinear weighting outperforms raw BoW counts by +9.5%. Down-weighting high-frequency filler terms matters for short Filipino news headlines.',
|
| 101 |
+
color: '#d97706',
|
| 102 |
+
},
|
| 103 |
+
{
|
| 104 |
+
lecture: 'Lectures 5β6',
|
| 105 |
+
title: 'Naive Bayes struggles at small scale',
|
| 106 |
+
body: 'MultinomialNB reaches only 42.9% β 19pp below LogReg. Feature independence breaks down when training on 79 noisy, cross-lingual samples.',
|
| 107 |
+
color: '#d97706',
|
| 108 |
+
},
|
| 109 |
+
{
|
| 110 |
+
lecture: 'Lecture 7',
|
| 111 |
+
title: 'LDA needs more documents',
|
| 112 |
+
body: '5 topics over 79 training texts yields unstable distributions. Topic features are weak signal for 3-class classification; LDA would improve with 1000+ samples.',
|
| 113 |
+
color: '#d97706',
|
| 114 |
+
},
|
| 115 |
+
{
|
| 116 |
+
lecture: 'Lectures 2aβ2c',
|
| 117 |
+
title: 'Lemmatization: neutral on Tagalog',
|
| 118 |
+
body: 'Zero accuracy change with WordNet lemmatization. English-biased lemmatizers return Tagalog tokens unchanged β confirms the tool is a no-op on Filipino text.',
|
| 119 |
+
color: '#06b6d4',
|
| 120 |
+
},
|
| 121 |
+
]
|
| 122 |
+
|
| 123 |
+
// ββ Custom tooltip βββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ
|
| 124 |
+
function ChartTooltip({ active, payload }) {
|
| 125 |
+
if (!active || !payload?.length) return null
|
| 126 |
+
const d = payload[0].payload
|
| 127 |
+
return (
|
| 128 |
+
<div style={{
|
| 129 |
+
background: 'var(--bg-elevated)',
|
| 130 |
+
border: '1px solid var(--border-light)',
|
| 131 |
+
borderRadius: 4,
|
| 132 |
+
padding: '10px 14px',
|
| 133 |
+
fontFamily: 'var(--font-mono)',
|
| 134 |
+
fontSize: 11,
|
| 135 |
+
color: 'var(--text-primary)',
|
| 136 |
+
maxWidth: 240,
|
| 137 |
+
}}>
|
| 138 |
+
<div style={{ fontWeight: 700, marginBottom: 4 }}>{d.name}</div>
|
| 139 |
+
<div style={{ color: TIER_COLOR[d.tier], marginBottom: 6 }}>
|
| 140 |
+
{d.accuracy.toFixed(1)}% accuracy
|
| 141 |
+
</div>
|
| 142 |
+
<div style={{ color: 'var(--text-muted)', fontSize: 10, lineHeight: 1.5 }}>{d.note}</div>
|
| 143 |
+
</div>
|
| 144 |
+
)
|
| 145 |
+
}
|
| 146 |
+
|
| 147 |
+
// ββ Tier legend pill βββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ
|
| 148 |
+
function TierPill({ tier }) {
|
| 149 |
+
return (
|
| 150 |
+
<span style={{
|
| 151 |
+
display: 'inline-block',
|
| 152 |
+
padding: '2px 8px',
|
| 153 |
+
borderRadius: 2,
|
| 154 |
+
fontSize: 9,
|
| 155 |
+
fontFamily: 'var(--font-mono)',
|
| 156 |
+
fontWeight: 700,
|
| 157 |
+
letterSpacing: '0.06em',
|
| 158 |
+
textTransform: 'uppercase',
|
| 159 |
+
background: `${TIER_COLOR[tier]}18`,
|
| 160 |
+
color: TIER_COLOR[tier],
|
| 161 |
+
border: `1px solid ${TIER_COLOR[tier]}40`,
|
| 162 |
+
}}>
|
| 163 |
+
{TIER_LABEL[tier]}
|
| 164 |
+
</span>
|
| 165 |
+
)
|
| 166 |
+
}
|
| 167 |
+
|
| 168 |
+
export default function BenchmarksPage() {
|
| 169 |
+
const [activeRow, setActiveRow] = useState(null)
|
| 170 |
+
|
| 171 |
+
return (
|
| 172 |
+
<main style={{ ...PAGE_STYLE, paddingTop: 48, paddingBottom: 80 }}>
|
| 173 |
+
|
| 174 |
+
{/* ββ Header βββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ */}
|
| 175 |
+
<div className="fade-up-1" style={{ marginBottom: 40 }}>
|
| 176 |
+
<div style={{
|
| 177 |
+
fontFamily: 'var(--font-mono)',
|
| 178 |
+
fontSize: 10,
|
| 179 |
+
letterSpacing: '0.14em',
|
| 180 |
+
color: 'var(--accent-red)',
|
| 181 |
+
textTransform: 'uppercase',
|
| 182 |
+
marginBottom: 10,
|
| 183 |
+
}}>
|
| 184 |
+
ML Course β Model Comparison
|
| 185 |
+
</div>
|
| 186 |
+
<h1 style={{
|
| 187 |
+
fontFamily: 'var(--font-display)',
|
| 188 |
+
fontWeight: 800,
|
| 189 |
+
fontSize: 32,
|
| 190 |
+
letterSpacing: '-0.02em',
|
| 191 |
+
color: 'var(--text-primary)',
|
| 192 |
+
marginBottom: 12,
|
| 193 |
+
}}>
|
| 194 |
+
Model Benchmarks
|
| 195 |
+
</h1>
|
| 196 |
+
<p style={{
|
| 197 |
+
fontFamily: 'var(--font-body)',
|
| 198 |
+
fontSize: 14,
|
| 199 |
+
color: 'var(--text-secondary)',
|
| 200 |
+
lineHeight: 1.7,
|
| 201 |
+
maxWidth: 560,
|
| 202 |
+
}}>
|
| 203 |
+
Comparison of 9 classifier variants on a 21-sample holdout from the
|
| 204 |
+
handcrafted PhilVerify dataset (79 train / 21 val, seed 42). Classical
|
| 205 |
+
models trained in-session; transformer checkpoints fine-tuned on the
|
| 206 |
+
full combined dataset.
|
| 207 |
+
</p>
|
| 208 |
+
</div>
|
| 209 |
+
|
| 210 |
+
{/* ββ Key findings βββββββββββββββββββββββββββββββββββββββββββββββββββββ */}
|
| 211 |
+
<div className="fade-up-2" style={{ marginBottom: 48 }}>
|
| 212 |
+
<h2 style={{
|
| 213 |
+
fontFamily: 'var(--font-display)',
|
| 214 |
+
fontWeight: 700,
|
| 215 |
+
fontSize: 11,
|
| 216 |
+
letterSpacing: '0.12em',
|
| 217 |
+
textTransform: 'uppercase',
|
| 218 |
+
color: 'var(--text-muted)',
|
| 219 |
+
marginBottom: 16,
|
| 220 |
+
}}>
|
| 221 |
+
Key Findings
|
| 222 |
+
</h2>
|
| 223 |
+
<div style={{ display: 'grid', gridTemplateColumns: 'repeat(auto-fit, minmax(210px, 1fr))', gap: 12 }}>
|
| 224 |
+
{FINDINGS.map((f) => (
|
| 225 |
+
<div key={f.title} className="card" style={{ padding: '16px 18px' }}>
|
| 226 |
+
<div style={{
|
| 227 |
+
fontFamily: 'var(--font-mono)',
|
| 228 |
+
fontSize: 9,
|
| 229 |
+
letterSpacing: '0.1em',
|
| 230 |
+
textTransform: 'uppercase',
|
| 231 |
+
color: f.color,
|
| 232 |
+
marginBottom: 6,
|
| 233 |
+
}}>
|
| 234 |
+
{f.lecture}
|
| 235 |
+
</div>
|
| 236 |
+
<div style={{
|
| 237 |
+
fontFamily: 'var(--font-display)',
|
| 238 |
+
fontWeight: 700,
|
| 239 |
+
fontSize: 13,
|
| 240 |
+
color: 'var(--text-primary)',
|
| 241 |
+
marginBottom: 8,
|
| 242 |
+
lineHeight: 1.3,
|
| 243 |
+
}}>
|
| 244 |
+
{f.title}
|
| 245 |
+
</div>
|
| 246 |
+
<p style={{
|
| 247 |
+
fontFamily: 'var(--font-body)',
|
| 248 |
+
fontSize: 11,
|
| 249 |
+
color: 'var(--text-secondary)',
|
| 250 |
+
lineHeight: 1.6,
|
| 251 |
+
margin: 0,
|
| 252 |
+
}}>
|
| 253 |
+
{f.body}
|
| 254 |
+
</p>
|
| 255 |
+
</div>
|
| 256 |
+
))}
|
| 257 |
+
</div>
|
| 258 |
+
</div>
|
| 259 |
+
|
| 260 |
+
{/* ββ Bar chart ββββββββββββββββββββββββββββββββββββββββββββββββββββββββ */}
|
| 261 |
+
<div className="fade-up-3 card" style={{ padding: '24px 20px', marginBottom: 32 }}>
|
| 262 |
+
<div style={{ display: 'flex', alignItems: 'center', justifyContent: 'space-between', marginBottom: 20 }}>
|
| 263 |
+
<h2 style={{
|
| 264 |
+
fontFamily: 'var(--font-display)',
|
| 265 |
+
fontWeight: 700,
|
| 266 |
+
fontSize: 13,
|
| 267 |
+
letterSpacing: '0.06em',
|
| 268 |
+
color: 'var(--text-primary)',
|
| 269 |
+
margin: 0,
|
| 270 |
+
}}>
|
| 271 |
+
Accuracy by Model
|
| 272 |
+
</h2>
|
| 273 |
+
<div style={{ display: 'flex', gap: 12 }}>
|
| 274 |
+
{Object.entries(TIER_LABEL).map(([tier, label]) => (
|
| 275 |
+
<div key={tier} style={{ display: 'flex', alignItems: 'center', gap: 5 }}>
|
| 276 |
+
<span style={{ width: 8, height: 8, borderRadius: 2, background: TIER_COLOR[tier], display: 'inline-block' }} />
|
| 277 |
+
<span style={{ fontFamily: 'var(--font-mono)', fontSize: 9, color: 'var(--text-muted)', letterSpacing: '0.06em' }}>
|
| 278 |
+
{label.toUpperCase()}
|
| 279 |
+
</span>
|
| 280 |
+
</div>
|
| 281 |
+
))}
|
| 282 |
+
</div>
|
| 283 |
+
</div>
|
| 284 |
+
<ResponsiveContainer width="100%" height={280}>
|
| 285 |
+
<BarChart
|
| 286 |
+
data={MODELS}
|
| 287 |
+
layout="vertical"
|
| 288 |
+
margin={{ top: 0, right: 40, left: 8, bottom: 0 }}
|
| 289 |
+
>
|
| 290 |
+
<CartesianGrid horizontal={false} stroke="rgba(245,240,232,0.04)" />
|
| 291 |
+
<XAxis
|
| 292 |
+
type="number"
|
| 293 |
+
domain={[0, 100]}
|
| 294 |
+
tickFormatter={v => `${v}%`}
|
| 295 |
+
tick={{ fontSize: 9, fontFamily: 'var(--font-mono)', fill: 'var(--text-muted)' }}
|
| 296 |
+
tickLine={false}
|
| 297 |
+
axisLine={false}
|
| 298 |
+
/>
|
| 299 |
+
<YAxis
|
| 300 |
+
type="category"
|
| 301 |
+
dataKey="shortName"
|
| 302 |
+
width={72}
|
| 303 |
+
tick={{ fontSize: 9, fontFamily: 'var(--font-mono)', fill: 'var(--text-secondary)' }}
|
| 304 |
+
tickLine={false}
|
| 305 |
+
axisLine={false}
|
| 306 |
+
/>
|
| 307 |
+
<Tooltip content={<ChartTooltip />} cursor={{ fill: 'rgba(245,240,232,0.03)' }} />
|
| 308 |
+
<ReferenceLine x={61.9} stroke="rgba(217,119,6,0.3)" strokeDasharray="3 3" label={{ value: 'Classical ceiling', position: 'top', fontSize: 8, fontFamily: 'var(--font-mono)', fill: '#d97706' }} />
|
| 309 |
+
<Bar dataKey="accuracy" radius={[0, 2, 2, 0]} maxBarSize={20}>
|
| 310 |
+
{MODELS.map((m) => (
|
| 311 |
+
<Cell key={m.name} fill={TIER_COLOR[m.tier]} fillOpacity={activeRow === m.name ? 1 : 0.75} />
|
| 312 |
+
))}
|
| 313 |
+
</Bar>
|
| 314 |
+
</BarChart>
|
| 315 |
+
</ResponsiveContainer>
|
| 316 |
+
</div>
|
| 317 |
+
|
| 318 |
+
{/* ββ Full results table βββββββββββββββββββββββββββββββββββββββββββββββ */}
|
| 319 |
+
<div className="fade-up-4 card" style={{ overflow: 'hidden' }}>
|
| 320 |
+
<div style={{ padding: '18px 20px 12px', borderBottom: '1px solid var(--border)' }}>
|
| 321 |
+
<h2 style={{
|
| 322 |
+
fontFamily: 'var(--font-display)',
|
| 323 |
+
fontWeight: 700,
|
| 324 |
+
fontSize: 13,
|
| 325 |
+
letterSpacing: '0.06em',
|
| 326 |
+
color: 'var(--text-primary)',
|
| 327 |
+
margin: 0,
|
| 328 |
+
}}>
|
| 329 |
+
Full Results
|
| 330 |
+
</h2>
|
| 331 |
+
</div>
|
| 332 |
+
<table style={{ width: '100%', borderCollapse: 'collapse' }}>
|
| 333 |
+
<thead>
|
| 334 |
+
<tr style={{ borderBottom: '1px solid var(--border)' }}>
|
| 335 |
+
{['Model', 'Accuracy', 'Tier', 'Lecture', 'Note'].map(h => (
|
| 336 |
+
<th key={h} style={{
|
| 337 |
+
padding: '8px 16px',
|
| 338 |
+
textAlign: h === 'Accuracy' ? 'right' : 'left',
|
| 339 |
+
fontFamily: 'var(--font-mono)',
|
| 340 |
+
fontSize: 9,
|
| 341 |
+
fontWeight: 700,
|
| 342 |
+
letterSpacing: '0.1em',
|
| 343 |
+
textTransform: 'uppercase',
|
| 344 |
+
color: 'var(--text-muted)',
|
| 345 |
+
}}>
|
| 346 |
+
{h}
|
| 347 |
+
</th>
|
| 348 |
+
))}
|
| 349 |
+
</tr>
|
| 350 |
+
</thead>
|
| 351 |
+
<tbody>
|
| 352 |
+
{MODELS.map((m, i) => (
|
| 353 |
+
<tr
|
| 354 |
+
key={m.name}
|
| 355 |
+
onMouseEnter={() => setActiveRow(m.name)}
|
| 356 |
+
onMouseLeave={() => setActiveRow(null)}
|
| 357 |
+
style={{
|
| 358 |
+
borderBottom: i < MODELS.length - 1 ? '1px solid var(--border)' : 'none',
|
| 359 |
+
background: activeRow === m.name ? 'var(--bg-elevated)' : 'transparent',
|
| 360 |
+
transition: 'background 0.1s',
|
| 361 |
+
borderLeft: `3px solid ${activeRow === m.name ? TIER_COLOR[m.tier] : 'transparent'}`,
|
| 362 |
+
}}
|
| 363 |
+
>
|
| 364 |
+
<td style={{ padding: '10px 16px', fontFamily: 'var(--font-display)', fontSize: 12, fontWeight: 600, color: 'var(--text-primary)' }}>
|
| 365 |
+
{m.name}
|
| 366 |
+
</td>
|
| 367 |
+
<td style={{ padding: '10px 16px', textAlign: 'right', fontFamily: 'var(--font-mono)', fontSize: 13, fontWeight: 700, color: TIER_COLOR[m.tier] }}>
|
| 368 |
+
{m.accuracy.toFixed(1)}%
|
| 369 |
+
</td>
|
| 370 |
+
<td style={{ padding: '10px 16px' }}>
|
| 371 |
+
<TierPill tier={m.tier} />
|
| 372 |
+
</td>
|
| 373 |
+
<td style={{ padding: '10px 16px', fontFamily: 'var(--font-mono)', fontSize: 10, color: 'var(--text-muted)' }}>
|
| 374 |
+
{m.lecture}
|
| 375 |
+
</td>
|
| 376 |
+
<td style={{ padding: '10px 16px', fontFamily: 'var(--font-body)', fontSize: 11, color: 'var(--text-secondary)', lineHeight: 1.5, maxWidth: 260 }}>
|
| 377 |
+
{m.note}
|
| 378 |
+
</td>
|
| 379 |
+
</tr>
|
| 380 |
+
))}
|
| 381 |
+
</tbody>
|
| 382 |
+
</table>
|
| 383 |
+
</div>
|
| 384 |
+
|
| 385 |
+
{/* ββ Footer note ββββββββββββββββββββββββββββββββββββββββββββββββββββββ */}
|
| 386 |
+
<p className="fade-up-5" style={{
|
| 387 |
+
marginTop: 20,
|
| 388 |
+
fontFamily: 'var(--font-mono)',
|
| 389 |
+
fontSize: 10,
|
| 390 |
+
color: 'var(--text-muted)',
|
| 391 |
+
lineHeight: 1.6,
|
| 392 |
+
}}>
|
| 393 |
+
* Val set is 21 samples from a handcrafted 100-sample dataset β ensemble 100% reflects
|
| 394 |
+
near-zero variance on a small holdout, not production accuracy. Transformer models were
|
| 395 |
+
trained on the larger combined dataset; classical models trained on the 79-sample split.
|
| 396 |
+
</p>
|
| 397 |
+
|
| 398 |
+
</main>
|
| 399 |
+
)
|
| 400 |
+
}
|
|
@@ -894,6 +894,20 @@ export default function VerifyPage() {
|
|
| 894 |
verdict={result.layer1?.verdict}
|
| 895 |
score={result.layer1?.confidence}
|
| 896 |
delay={0}>
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 897 |
<p className="text-xs mt-2" style={{ color: 'var(--text-secondary)', fontFamily: 'var(--font-body)', lineHeight: 1.6 }}>
|
| 898 |
{mlConfidenceExplanation(result.layer1?.confidence || 0, result.layer1?.verdict)}
|
| 899 |
</p>
|
|
@@ -923,6 +937,11 @@ export default function VerifyPage() {
|
|
| 923 |
<p className="text-xs mt-3" style={{ color: 'var(--text-muted)', fontFamily: 'var(--font-body)', lineHeight: 1.6 }}>
|
| 924 |
<span style={{ color: 'var(--text-secondary)' }}>Claim searched: </span>
|
| 925 |
"{result.layer2?.claim_used || 'No claim extracted'}"
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 926 |
</p>
|
| 927 |
</LayerCard>
|
| 928 |
</div>
|
|
@@ -1002,7 +1021,8 @@ export default function VerifyPage() {
|
|
| 1002 |
<span className="text-xs tabular" style={{ color: 'var(--text-muted)' }}>
|
| 1003 |
{src.source_name || src.source}
|
| 1004 |
</span>
|
| 1005 |
-
<span className="text-xs tabular"
|
|
|
|
| 1006 |
{src.stance}
|
| 1007 |
</span>
|
| 1008 |
<span className="text-xs tabular" style={{ color: 'var(--text-muted)' }}>
|
|
|
|
| 894 |
verdict={result.layer1?.verdict}
|
| 895 |
score={result.layer1?.confidence}
|
| 896 |
delay={0}>
|
| 897 |
+
{result.layer1?.model_tier && (
|
| 898 |
+
<span style={{
|
| 899 |
+
display: 'inline-block', fontSize: '0.62rem', padding: '2px 6px',
|
| 900 |
+
borderRadius: 3, marginTop: 6, marginBottom: 2,
|
| 901 |
+
background: result.layer1.model_tier === 'ensemble' ? 'rgba(217,119,6,0.12)' :
|
| 902 |
+
result.layer1.model_tier === 'xlmr' ? 'rgba(6,182,212,0.12)' : 'rgba(255,255,255,0.06)',
|
| 903 |
+
color: result.layer1.model_tier === 'ensemble' ? 'var(--accent-gold)' :
|
| 904 |
+
result.layer1.model_tier === 'xlmr' ? 'var(--accent-cyan)' : 'var(--text-muted)',
|
| 905 |
+
fontFamily: 'var(--font-mono)', textTransform: 'uppercase', letterSpacing: '0.08em',
|
| 906 |
+
border: '1px solid currentColor', opacity: 0.85,
|
| 907 |
+
}}>
|
| 908 |
+
{result.layer1.model_tier}
|
| 909 |
+
</span>
|
| 910 |
+
)}
|
| 911 |
<p className="text-xs mt-2" style={{ color: 'var(--text-secondary)', fontFamily: 'var(--font-body)', lineHeight: 1.6 }}>
|
| 912 |
{mlConfidenceExplanation(result.layer1?.confidence || 0, result.layer1?.verdict)}
|
| 913 |
</p>
|
|
|
|
| 937 |
<p className="text-xs mt-3" style={{ color: 'var(--text-muted)', fontFamily: 'var(--font-body)', lineHeight: 1.6 }}>
|
| 938 |
<span style={{ color: 'var(--text-secondary)' }}>Claim searched: </span>
|
| 939 |
"{result.layer2?.claim_used || 'No claim extracted'}"
|
| 940 |
+
{result.layer2?.claim_method && (
|
| 941 |
+
<span style={{ marginLeft: 6, fontSize: '0.62rem', fontFamily: 'var(--font-mono)', opacity: 0.55 }}>
|
| 942 |
+
[{result.layer2.claim_method}]
|
| 943 |
+
</span>
|
| 944 |
+
)}
|
| 945 |
</p>
|
| 946 |
</LayerCard>
|
| 947 |
</div>
|
|
|
|
| 1021 |
<span className="text-xs tabular" style={{ color: 'var(--text-muted)' }}>
|
| 1022 |
{src.source_name || src.source}
|
| 1023 |
</span>
|
| 1024 |
+
<span className="text-xs tabular" title={src.stance_reason || src.stance}
|
| 1025 |
+
style={{ color: stanceColor, fontFamily: 'var(--font-display)', letterSpacing: '0.06em', cursor: src.stance_reason ? 'help' : 'default' }}>
|
| 1026 |
{src.stance}
|
| 1027 |
</span>
|
| 1028 |
<span className="text-xs tabular" style={{ color: 'var(--text-muted)' }}>
|
|
@@ -0,0 +1,69 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""
|
| 2 |
+
PhilVerify β Bag of Words + Logistic Regression Classifier (Layer 1)
|
| 3 |
+
|
| 4 |
+
CountVectorizer (BoW) with LogisticRegression. Identical to TFIDFClassifier except
|
| 5 |
+
for the vectorizer β this isolates the BoW vs TF-IDF comparison in eval.py.
|
| 6 |
+
Supports optional WordNet lemmatization.
|
| 7 |
+
"""
|
| 8 |
+
import logging
|
| 9 |
+
|
| 10 |
+
from ml.naive_bayes_classifier import _lemmatize_tokens
|
| 11 |
+
from ml.tfidf_classifier import Layer1Result
|
| 12 |
+
|
| 13 |
+
logger = logging.getLogger(__name__)
|
| 14 |
+
|
| 15 |
+
|
| 16 |
+
class BoWClassifier:
    """
    Bag-of-Words (CountVectorizer) + LogisticRegression classifier.

    Mirrors TFIDFClassifier in every respect except the vectorizer, so that
    eval.py can isolate the BoW vs TF-IDF comparison.

    Args:
        train_samples: list[Sample] from ml.dataset. If None, the full
            100-sample dataset is used.
        lemmatize: apply WordNet lemmatization before vectorization.
    """

    # Label-id -> human-readable verdict string.
    _LABELS = {0: "Credible", 1: "Unverified", 2: "Likely Fake"}

    def __init__(self, train_samples=None, lemmatize: bool = False):
        # sklearn imported lazily so the module stays importable without it.
        from sklearn.feature_extraction.text import CountVectorizer
        from sklearn.linear_model import LogisticRegression

        self._lemmatize = lemmatize

        if train_samples is None:
            from ml.dataset import get_dataset
            train_samples = get_dataset()

        corpus = [self._preprocess(sample.text) for sample in train_samples]
        targets = [sample.label for sample in train_samples]

        # Unigrams + bigrams, capped vocabulary — same settings as TF-IDF twin.
        self._vectorizer = CountVectorizer(ngram_range=(1, 2), max_features=1000)
        matrix = self._vectorizer.fit_transform(corpus)

        self._clf = LogisticRegression(max_iter=500, C=1.0, random_state=42)
        self._clf.fit(matrix, targets)
        logger.info(
            "BoWClassifier trained on %d samples (lemmatize=%s)",
            len(corpus), lemmatize,
        )

    def _preprocess(self, text: str) -> str:
        """Lowercase the text; optionally lemmatize (set at construction)."""
        lowered = text.lower()
        if not self._lemmatize:
            return lowered
        return " ".join(_lemmatize_tokens(lowered.split()))

    def predict(self, text: str) -> Layer1Result:
        """Classify *text*; return verdict, confidence %, and top BoW features."""
        vec = self._vectorizer.transform([self._preprocess(text)])
        probabilities = self._clf.predict_proba(vec)[0]
        label_id = int(self._clf.predict(vec)[0])

        # Explainability: the 5 most frequent n-grams present in the input.
        counts = vec.toarray()[0]
        vocab = self._vectorizer.get_feature_names_out()
        ranked = counts.argsort()[-5:][::-1]
        features = [vocab[i] for i in ranked if counts[i] > 0]

        return Layer1Result(
            verdict=self._LABELS[label_id],
            confidence=round(float(max(probabilities)) * 100, 1),
            triggered_features=features,
        )
|
|
@@ -10,6 +10,7 @@ Languages: English, Filipino/Tagalog, Taglish (code-switched)
|
|
| 10 |
"""
|
| 11 |
|
| 12 |
from __future__ import annotations
|
|
|
|
| 13 |
from dataclasses import dataclass
|
| 14 |
|
| 15 |
LABEL_NAMES = {0: "Credible", 1: "Unverified", 2: "Likely Fake"}
|
|
@@ -199,3 +200,40 @@ def class_weights(samples: list[Sample]) -> list[float]:
|
|
| 199 |
for i in range(NUM_LABELS):
|
| 200 |
weights.append(total / (NUM_LABELS * max(counts[i], 1)))
|
| 201 |
return weights
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 10 |
"""
|
| 11 |
|
| 12 |
from __future__ import annotations
|
| 13 |
+
import random as _random
|
| 14 |
from dataclasses import dataclass
|
| 15 |
|
| 16 |
LABEL_NAMES = {0: "Credible", 1: "Unverified", 2: "Likely Fake"}
|
|
|
|
| 200 |
for i in range(NUM_LABELS):
|
| 201 |
weights.append(total / (NUM_LABELS * max(counts[i], 1)))
|
| 202 |
return weights
|
| 203 |
+
|
| 204 |
+
|
| 205 |
+
# ββ Easy Data Augmentation (EDA) ββββββββββββββββββββββββββββββββββββββββββββββ
|
| 206 |
+
|
| 207 |
+
def _random_deletion(words: list[str], p: float = 0.12) -> list[str]:
|
| 208 |
+
"""Randomly delete each word with probability p."""
|
| 209 |
+
if len(words) == 1:
|
| 210 |
+
return words
|
| 211 |
+
kept = [w for w in words if _random.random() > p]
|
| 212 |
+
return kept if kept else [_random.choice(words)]
|
| 213 |
+
|
| 214 |
+
|
| 215 |
+
def _random_swap(words: list[str], n: int = 1) -> list[str]:
|
| 216 |
+
"""Randomly swap n pairs of adjacent words."""
|
| 217 |
+
out = words[:]
|
| 218 |
+
for _ in range(n):
|
| 219 |
+
i, j = _random.sample(range(len(out)), 2)
|
| 220 |
+
out[i], out[j] = out[j], out[i]
|
| 221 |
+
return out
|
| 222 |
+
|
| 223 |
+
|
| 224 |
+
def augment_samples(samples: list[Sample], seed: int = 42) -> list[Sample]:
    """
    Build augmented variants of *samples* via random deletion and random swap.

    Only the new variants are returned (the originals are NOT included);
    the caller decides whether to combine them. Each sample with at least
    4 words yields two variants (one deletion + one swap), so the output
    holds up to 2x the input size.
    """
    _random.seed(seed)  # deterministic augmentation for reproducible runs
    variants: list[Sample] = []
    for sample in samples:
        tokens = sample.text.split()
        if len(tokens) < 4:
            # Too short to augment meaningfully — skip.
            continue
        # RNG call order matters for reproducibility: deletion first, then swap.
        deleted_text = " ".join(_random_deletion(tokens[:]))
        swapped_text = " ".join(_random_swap(tokens[:]))
        variants.append(Sample(deleted_text, sample.label))
        variants.append(Sample(swapped_text, sample.label))
    return variants
|
|
@@ -0,0 +1,76 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""
|
| 2 |
+
PhilVerify β Ensemble Classifier (Layer 1)
|
| 3 |
+
|
| 4 |
+
Averages softmax probabilities from XLMRobertaClassifier and
|
| 5 |
+
TagalogRobertaClassifier, then returns a single Layer1Result.
|
| 6 |
+
|
| 7 |
+
When only one classifier is passed the ensemble degrades gracefully
|
| 8 |
+
to that single model (no averaging needed, no performance penalty).
|
| 9 |
+
"""
|
| 10 |
+
from __future__ import annotations
|
| 11 |
+
|
| 12 |
+
import logging
|
| 13 |
+
|
| 14 |
+
from ml.xlm_roberta_classifier import Layer1Result
|
| 15 |
+
|
| 16 |
+
logger = logging.getLogger(__name__)
|
| 17 |
+
|
| 18 |
+
LABEL_NAMES = {0: "Credible", 1: "Unverified", 2: "Likely Fake"}
|
| 19 |
+
|
| 20 |
+
|
| 21 |
+
class EnsembleClassifier:
    """
    Soft-voting ensemble over one or more classifiers that expose
    predict_probs(text) -> (probs_tensor, attentions, input_ids).

    With a single classifier the ensemble degrades gracefully to that
    model (averaging one tensor is a no-op, no performance penalty).

    Triggered features come from whichever member reported the highest
    individual confidence (the most "sure" model).
    """

    def __init__(self, classifiers: list) -> None:
        if not classifiers:
            raise ValueError("EnsembleClassifier requires at least one classifier")
        self._classifiers = classifiers

    def predict(self, text: str) -> Layer1Result:
        """Average member probabilities and return a combined Layer1Result."""
        import torch

        prob_tensors = []
        attention_info = []
        for member in self._classifiers:
            try:
                probs, attentions, input_ids = member.predict_probs(text)
            except Exception as exc:
                # A failing member is skipped, not fatal for the ensemble.
                logger.warning("Classifier %s failed during ensemble: %s", member, exc)
                continue
            prob_tensors.append(probs)
            attention_info.append((attentions, input_ids, member))

        if not prob_tensors:
            # Every member failed — fall back to a neutral Unverified verdict.
            return Layer1Result(verdict="Unverified", confidence=33.3, triggered_features=[])

        # Soft vote: mean of the members' probability distributions.
        averaged = torch.stack(prob_tensors).mean(dim=0)  # (num_labels,)
        winner = int(averaged.argmax().item())
        confidence = round(float(averaged[winner].item()) * 100, 1)
        verdict = LABEL_NAMES[winner]

        # Salient tokens from the single most-confident member that supports them.
        triggered: list[str] = []
        best_seen = -1.0
        for probs, (attentions, input_ids, member) in zip(prob_tensors, attention_info):
            member_conf = float(probs.max().item())
            if member_conf > best_seen and hasattr(member, "_salient_tokens") and attentions:
                best_seen = member_conf
                triggered = member._salient_tokens(input_ids, attentions)

        logger.debug(
            "Ensemble (%d classifiers): %s %.1f%%", len(prob_tensors), verdict, confidence
        )
        return Layer1Result(
            verdict=verdict,
            confidence=confidence,
            triggered_features=triggered,
        )
|
|
@@ -0,0 +1,172 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""
|
| 2 |
+
Evaluate all PhilVerify classifiers on the held-out validation split.
|
| 3 |
+
|
| 4 |
+
Prints per-class precision/recall/F1, confusion matrix, and a side-by-side
|
| 5 |
+
accuracy summary for all model variants:
|
| 6 |
+
|
| 7 |
+
Classical (trained on train split):
|
| 8 |
+
BoW + LogReg
|
| 9 |
+
BoW + LogReg + Lemma
|
| 10 |
+
TF-IDF + LogReg (legacy SEED_DATA baseline)
|
| 11 |
+
TF-IDF + NB
|
| 12 |
+
TF-IDF + NB + Lemma
|
| 13 |
+
LDA features + LogReg
|
| 14 |
+
|
| 15 |
+
Transformer (loaded from saved checkpoints):
|
| 16 |
+
XLM-RoBERTa
|
| 17 |
+
Tagalog-RoBERTa
|
| 18 |
+
Ensemble (XLM-R + Tagalog-RoBERTa)
|
| 19 |
+
|
| 20 |
+
Usage:
|
| 21 |
+
cd PhilVerify
|
| 22 |
+
python -m ml.eval
|
| 23 |
+
python -m ml.eval --seed 42 --train-ratio 0.8 --skip-lda-analysis
|
| 24 |
+
"""
|
| 25 |
+
import argparse
|
| 26 |
+
import logging
|
| 27 |
+
|
| 28 |
+
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
|
| 29 |
+
|
| 30 |
+
from ml.bow_classifier import BoWClassifier
|
| 31 |
+
from ml.dataset import LABEL_NAMES, get_split
|
| 32 |
+
from ml.ensemble_classifier import EnsembleClassifier
|
| 33 |
+
from ml.lda_analysis import LDAFeatureClassifier, run_topic_analysis
|
| 34 |
+
from ml.naive_bayes_classifier import NaiveBayesClassifier
|
| 35 |
+
from ml.tagalog_roberta_classifier import TagalogRobertaClassifier
|
| 36 |
+
from ml.tfidf_classifier import TFIDFClassifier
|
| 37 |
+
from ml.xlm_roberta_classifier import ModelNotFoundError, XLMRobertaClassifier
|
| 38 |
+
|
| 39 |
+
logging.basicConfig(level=logging.INFO, format="%(levelname)s: %(message)s")
|
| 40 |
+
logger = logging.getLogger(__name__)
|
| 41 |
+
|
| 42 |
+
LABEL_LIST = [LABEL_NAMES[i] for i in sorted(LABEL_NAMES)]
|
| 43 |
+
|
| 44 |
+
|
| 45 |
+
def evaluate_classifier(name: str, clf, samples: list) -> dict:
    """
    Evaluate one classifier on *samples* and print a full diagnostic report.

    Prints per-class precision/recall/F1 and a confusion matrix, then
    returns {"name": ..., "accuracy": ...} for the summary table.
    """
    truths = [LABEL_NAMES[s.label] for s in samples]
    preds = [clf.predict(s.text).verdict for s in samples]

    banner = "=" * 62
    print(f"\n{banner}")
    print(f"  {name}")
    print(banner)
    print(classification_report(truths, preds, labels=LABEL_LIST, zero_division=0))

    print("Confusion matrix (rows = true, cols = predicted):")
    print(f"  {'':14}", " ".join(f"{lbl[:6]:>6}" for lbl in LABEL_LIST))
    matrix = confusion_matrix(truths, preds, labels=LABEL_LIST)
    for row_name, row in zip(LABEL_LIST, matrix):
        print(f"  {row_name:<14}", " ".join(f"{cell:>6}" for cell in row))

    return {"name": name, "accuracy": accuracy_score(truths, preds)}
|
| 65 |
+
|
| 66 |
+
|
| 67 |
+
def main() -> None:
    """
    CLI entry point: train classical baselines on the train split, load
    transformer checkpoints, evaluate everything on the validation split,
    and print a side-by-side accuracy summary.
    """
    parser = argparse.ArgumentParser(description="Evaluate PhilVerify classifiers")
    parser.add_argument("--seed", type=int, default=42,
                        help="Random seed (must match training seed)")
    parser.add_argument("--train-ratio", type=float, default=0.8,
                        help="Train split ratio (must match training)")
    parser.add_argument("--skip-lda-analysis", action="store_true",
                        help="Skip the LDA topic analysis printout")
    args = parser.parse_args()

    train_samples, val_samples = get_split(train_ratio=args.train_ratio, seed=args.seed)
    logger.info(
        "Train: %d samples | Val: %d samples (seed=%d, train_ratio=%.1f)",
        len(train_samples), len(val_samples), args.seed, args.train_ratio,
    )

    # Informational LDA topic report, printed before the classifier comparison.
    if not args.skip_lda_analysis:
        run_topic_analysis(train_samples)

    results: list[dict] = []

    # Classical baselines — all trained on train_samples for a fair comparison.
    # The legacy TF-IDF model trains on its internal SEED_DATA instead and is
    # included for reference only.
    classical = [
        ("BoW + LogReg", lambda: BoWClassifier(train_samples)),
        ("BoW + LogReg + Lemma", lambda: BoWClassifier(train_samples, lemmatize=True)),
        ("TF-IDF + LogReg [legacy SEED_DATA]", lambda: TFIDFClassifier()),
        ("TF-IDF + NB", lambda: NaiveBayesClassifier(train_samples)),
        ("TF-IDF + NB + Lemma", lambda: NaiveBayesClassifier(train_samples, lemmatize=True)),
        ("LDA features + LogReg", lambda: LDAFeatureClassifier(train_samples)),
    ]
    for label, build in classical:
        results.append(evaluate_classifier(label, build(), val_samples))

    # Transformer models — skipped gracefully when a checkpoint is missing.
    xlmr = None
    try:
        xlmr = XLMRobertaClassifier()
        results.append(evaluate_classifier("XLM-RoBERTa", xlmr, val_samples))
    except ModelNotFoundError:
        logger.warning("XLM-RoBERTa checkpoint not found — skipping")

    tl = None
    try:
        tl = TagalogRobertaClassifier()
        results.append(evaluate_classifier("Tagalog-RoBERTa", tl, val_samples))
    except ModelNotFoundError:
        logger.warning("Tagalog-RoBERTa checkpoint not found — skipping")

    if xlmr is not None and tl is not None:
        results.append(evaluate_classifier(
            "Ensemble (XLM-R + Tagalog-RoBERTa)", EnsembleClassifier([xlmr, tl]), val_samples
        ))

    # Summary table: classical section, blank separator, transformer section.
    rule = "=" * 62
    print(f"\n{rule}")
    print("  Summary")
    print(rule)
    print(f"  {'Model':<44} {'Accuracy':>8}")
    print(f"  {'-'*44} {'-'*8}")

    separator_printed = False
    for r in results:
        is_transformer = any(
            kw in r["name"] for kw in ("XLM", "RoBERTa", "Tagalog", "Ensemble")
        )
        if is_transformer and not separator_printed:
            print()  # blank separator between classical and transformer sections
            separator_printed = True
        print(f"  {r['name']:<44} {r['accuracy'] * 100:>7.1f}%")

    best = max(results, key=lambda r: r["accuracy"])
    print(f"\n  Best: {best['name']} ({best['accuracy'] * 100:.1f}%)")
    print()
|
| 169 |
+
|
| 170 |
+
|
| 171 |
+
# Allow running the evaluation suite directly via `python -m ml.eval`.
if __name__ == "__main__":
    main()
|
|
@@ -0,0 +1,182 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""
|
| 2 |
+
PhilVerify β LDA Topic Analysis + LDA Feature Classifier (Layer 1)
|
| 3 |
+
|
| 4 |
+
Two responsibilities:
|
| 5 |
+
|
| 6 |
+
1. run_topic_analysis(samples, n_topics)
|
| 7 |
+
Fits LDA on training texts, prints top-N words per topic and the dominant
|
| 8 |
+
topic distribution per class (Credible / Unverified / Likely Fake).
|
| 9 |
+
Call directly to explore what topics the model discovers.
|
| 10 |
+
|
| 11 |
+
2. LDAFeatureClassifier
|
| 12 |
+
Concatenates LDA topic distribution features with TF-IDF features and feeds
|
| 13 |
+
the combined vector into LogisticRegression. Same predict() interface as
|
| 14 |
+
TFIDFClassifier β slots directly into eval.py.
|
| 15 |
+
|
| 16 |
+
Usage:
|
| 17 |
+
python -m ml.lda_analysis # standalone topic analysis
|
| 18 |
+
python -m ml.eval # compare LDAFeatureClassifier against others
|
| 19 |
+
"""
|
| 20 |
+
import logging
|
| 21 |
+
|
| 22 |
+
import numpy as np
|
| 23 |
+
import scipy.sparse as sp
|
| 24 |
+
|
| 25 |
+
from ml.dataset import LABEL_NAMES, get_split
|
| 26 |
+
from ml.naive_bayes_classifier import _lemmatize_tokens
|
| 27 |
+
from ml.tfidf_classifier import Layer1Result
|
| 28 |
+
|
| 29 |
+
logger = logging.getLogger(__name__)
|
| 30 |
+
|
| 31 |
+
_LABELS = {0: "Credible", 1: "Unverified", 2: "Likely Fake"}
|
| 32 |
+
|
| 33 |
+
|
| 34 |
+
# ββ Standalone topic analysis ββββββββββββββββββββββββββββββββββββββββββββββββββ
|
| 35 |
+
|
| 36 |
+
def run_topic_analysis(
    samples,
    n_topics: int = 5,
    n_top_words: int = 10,
) -> None:
    """
    Fit LDA on *samples* and print an exploratory report:
      - the top *n_top_words* words of each discovered topic
      - the two dominant topics (by mean distribution) per class label
    """
    from sklearn.decomposition import LatentDirichletAllocation
    from sklearn.feature_extraction.text import CountVectorizer

    corpus = [s.text.lower() for s in samples]
    labels = [s.label for s in samples]

    # LDA expects raw term counts, not TF-IDF weights.
    vectorizer = CountVectorizer(max_features=500, stop_words="english")
    counts = vectorizer.fit_transform(corpus)
    vocab = vectorizer.get_feature_names_out()

    lda = LatentDirichletAllocation(
        n_components=n_topics, random_state=42, max_iter=30, learning_method="batch"
    )
    doc_topics = lda.fit_transform(counts)  # shape: (n_samples, n_topics)

    banner = "=" * 62
    print(f"\n{banner}")
    print(f"  LDA Topic Analysis ({n_topics} topics, {len(samples)} samples)")
    print(banner)

    # Top words per topic, highest weight first.
    for topic_no, weights in enumerate(lda.components_, start=1):
        strongest = weights.argsort()[-n_top_words:][::-1]
        print(f"\n  Topic {topic_no}: {', '.join(vocab[j] for j in strongest)}")

    print("\n  Per-class dominant topics:")
    for label_id, label_name in sorted(LABEL_NAMES.items()):
        member_rows = [i for i, lbl in enumerate(labels) if lbl == label_id]
        if not member_rows:
            continue
        mean_dist = doc_topics[member_rows].mean(axis=0)
        top_two = mean_dist.argsort()[-2:][::-1]
        topic_str = "  ".join(f"T{t + 1}:{mean_dist[t]:.2f}" for t in top_two)
        print(f"    {label_name:<14} {topic_str}")
|
| 80 |
+
|
| 81 |
+
|
| 82 |
+
# ββ LDA Feature Classifier βββββββββββββββββββββββββββββββββββββββββββββββββββββ
|
| 83 |
+
|
| 84 |
+
class LDAFeatureClassifier:
    """
    Combines TF-IDF features with an LDA topic distribution and feeds the
    concatenation into a LogisticRegression.

    Feature vector = sparse_hstack([tfidf_features, lda_topic_distribution])

    Args:
        train_samples: list[Sample]. If None, uses the full 100-sample dataset.
        n_topics: number of LDA topics (default 5).
        lemmatize: apply WordNet lemmatization before vectorization.
    """

    def __init__(self, train_samples=None, n_topics: int = 5, lemmatize: bool = False):
        from sklearn.decomposition import LatentDirichletAllocation
        from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
        from sklearn.linear_model import LogisticRegression

        self._lemmatize = lemmatize
        self._n_topics = n_topics

        if train_samples is None:
            from ml.dataset import get_dataset
            train_samples = get_dataset()

        corpus = [self._preprocess(s.text) for s in train_samples]
        y = [s.label for s in train_samples]

        # TF-IDF branch of the feature vector.
        self._tfidf = TfidfVectorizer(
            ngram_range=(1, 2), max_features=1000, sublinear_tf=True
        )
        tfidf_mat = self._tfidf.fit_transform(corpus)

        # LDA branch (operates on raw term counts, not TF-IDF weights).
        self._count_vec = CountVectorizer(max_features=500)
        count_mat = self._count_vec.fit_transform(corpus)
        self._lda = LatentDirichletAllocation(
            n_components=n_topics, random_state=42, max_iter=30, learning_method="batch"
        )
        topic_mat = self._lda.fit_transform(count_mat)  # dense (n_samples, n_topics)

        # Stack the sparse TF-IDF matrix with the (sparsified) dense topic matrix.
        features = sp.hstack([tfidf_mat, sp.csr_matrix(topic_mat)])

        self._clf = LogisticRegression(max_iter=500, C=1.0, random_state=42)
        self._clf.fit(features, y)
        logger.info(
            "LDAFeatureClassifier trained on %d samples (n_topics=%d, lemmatize=%s)",
            len(corpus), n_topics, lemmatize,
        )

    def _preprocess(self, text: str) -> str:
        """Lowercase the text, optionally lemmatizing each whitespace token."""
        lowered = text.lower()
        if not self._lemmatize:
            return lowered
        return " ".join(_lemmatize_tokens(lowered.split()))

    def predict(self, text: str) -> Layer1Result:
        """Classify *text* and report top TF-IDF terms plus the dominant LDA topic."""
        cleaned = self._preprocess(text)
        tfidf_vec = self._tfidf.transform([cleaned])
        topic_vec = self._lda.transform(self._count_vec.transform([cleaned]))  # (1, n_topics)
        features = sp.hstack([tfidf_vec, sp.csr_matrix(topic_vec)])

        pred_label = int(self._clf.predict(features)[0])
        confidence = round(float(max(self._clf.predict_proba(features)[0])) * 100, 1)

        # Highest-weighted TF-IDF terms present in this document.
        names = self._tfidf.get_feature_names_out()
        scores = tfidf_vec.toarray()[0]
        ranked = scores.argsort()[-4:][::-1]
        triggered = [names[i] for i in ranked if scores[i] > 0]

        # Lead the explanation with the document's dominant topic label.
        dominant = int(topic_vec[0].argmax()) + 1
        triggered.insert(0, f"lda_topic_{dominant}")

        return Layer1Result(
            verdict=_LABELS[pred_label],
            confidence=confidence,
            triggered_features=triggered[:5],
        )
|
| 168 |
+
|
| 169 |
+
|
| 170 |
+
# ββ Direct run βββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ
|
| 171 |
+
|
| 172 |
+
if __name__ == "__main__":
    import argparse

    # Standalone topic-analysis entry point.
    cli = argparse.ArgumentParser(description="LDA topic analysis on PhilVerify dataset")
    cli.add_argument("--n-topics", type=int, default=5)
    cli.add_argument("--n-top-words", type=int, default=10)
    cli.add_argument("--seed", type=int, default=42)
    opts = cli.parse_args()

    # Only the training split feeds the topic model; the val split is unused here.
    analysis_samples, _ = get_split(seed=opts.seed)
    run_topic_analysis(analysis_samples, n_topics=opts.n_topics, n_top_words=opts.n_top_words)
|
@@ -0,0 +1,40 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"add_cross_attention": false,
|
| 3 |
+
"architectures": [
|
| 4 |
+
"RobertaForSequenceClassification"
|
| 5 |
+
],
|
| 6 |
+
"attention_probs_dropout_prob": 0.1,
|
| 7 |
+
"bos_token_id": 0,
|
| 8 |
+
"classifier_dropout": null,
|
| 9 |
+
"dtype": "float32",
|
| 10 |
+
"eos_token_id": 2,
|
| 11 |
+
"gradient_checkpointing": false,
|
| 12 |
+
"hidden_act": "gelu",
|
| 13 |
+
"hidden_dropout_prob": 0.1,
|
| 14 |
+
"hidden_size": 768,
|
| 15 |
+
"id2label": {
|
| 16 |
+
"0": "Credible",
|
| 17 |
+
"1": "Unverified",
|
| 18 |
+
"2": "Likely Fake"
|
| 19 |
+
},
|
| 20 |
+
"initializer_range": 0.02,
|
| 21 |
+
"intermediate_size": 3072,
|
| 22 |
+
"is_decoder": false,
|
| 23 |
+
"label2id": {
|
| 24 |
+
"Credible": 0,
|
| 25 |
+
"Likely Fake": 2,
|
| 26 |
+
"Unverified": 1
|
| 27 |
+
},
|
| 28 |
+
"layer_norm_eps": 1e-05,
|
| 29 |
+
"max_position_embeddings": 514,
|
| 30 |
+
"model_type": "roberta",
|
| 31 |
+
"num_attention_heads": 12,
|
| 32 |
+
"num_hidden_layers": 12,
|
| 33 |
+
"pad_token_id": 1,
|
| 34 |
+
"position_embedding_type": "absolute",
|
| 35 |
+
"tie_word_embeddings": true,
|
| 36 |
+
"transformers_version": "5.3.0",
|
| 37 |
+
"type_vocab_size": 1,
|
| 38 |
+
"use_cache": true,
|
| 39 |
+
"vocab_size": 30000
|
| 40 |
+
}
|
|
The diff for this file is too large to render.
See raw diff
|
|
|
|
@@ -0,0 +1,20 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"add_prefix_space": false,
|
| 3 |
+
"backend": "tokenizers",
|
| 4 |
+
"bos_token": "<s>",
|
| 5 |
+
"cls_token": "<s>",
|
| 6 |
+
"eos_token": "</s>",
|
| 7 |
+
"errors": "replace",
|
| 8 |
+
"is_local": false,
|
| 9 |
+
"mask_token": "<mask>",
|
| 10 |
+
"max_length": 512,
|
| 11 |
+
"model_max_length": 1000000000000000019884624838656,
|
| 12 |
+
"pad_token": "<pad>",
|
| 13 |
+
"sep_token": "</s>",
|
| 14 |
+
"stride": 0,
|
| 15 |
+
"tokenizer_class": "RobertaTokenizer",
|
| 16 |
+
"trim_offsets": true,
|
| 17 |
+
"truncation_side": "right",
|
| 18 |
+
"truncation_strategy": "longest_first",
|
| 19 |
+
"unk_token": "<unk>"
|
| 20 |
+
}
|
|
@@ -0,0 +1,111 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""
|
| 2 |
+
PhilVerify β TF-IDF + Naive Bayes Classifier (Layer 1)
|
| 3 |
+
|
| 4 |
+
MultinomialNB with TF-IDF features. Trains on the provided sample split so that
|
| 5 |
+
eval comparisons are fair (same train/val split as transformer models).
|
| 6 |
+
Supports optional WordNet lemmatization to measure its effect on Filipino/Taglish text.
|
| 7 |
+
"""
|
| 8 |
+
import logging
|
| 9 |
+
|
| 10 |
+
logger = logging.getLogger(__name__)
|
| 11 |
+
|
| 12 |
+
|
| 13 |
+
def _lemmatize_tokens(tokens: list[str]) -> list[str]:
|
| 14 |
+
"""
|
| 15 |
+
Lemmatize tokens with POS-aware WordNet lemmatization.
|
| 16 |
+
Downloads required NLTK data on first call. Falls back to identity on any error.
|
| 17 |
+
Note: WordNet is English-biased β Tagalog tokens are returned unchanged.
|
| 18 |
+
"""
|
| 19 |
+
try:
|
| 20 |
+
import nltk
|
| 21 |
+
from nltk.corpus import wordnet
|
| 22 |
+
from nltk.stem import WordNetLemmatizer
|
| 23 |
+
|
| 24 |
+
for resource, path in [
|
| 25 |
+
("wordnet", "corpora/wordnet"),
|
| 26 |
+
("averaged_perceptron_tagger_eng", "taggers/averaged_perceptron_tagger_eng"),
|
| 27 |
+
]:
|
| 28 |
+
try:
|
| 29 |
+
nltk.data.find(path)
|
| 30 |
+
except LookupError:
|
| 31 |
+
nltk.download(resource, quiet=True)
|
| 32 |
+
|
| 33 |
+
def _wn_pos(tag: str) -> str:
|
| 34 |
+
if tag.startswith("J"):
|
| 35 |
+
return wordnet.ADJ
|
| 36 |
+
if tag.startswith("V"):
|
| 37 |
+
return wordnet.VERB
|
| 38 |
+
if tag.startswith("R"):
|
| 39 |
+
return wordnet.ADV
|
| 40 |
+
return wordnet.NOUN
|
| 41 |
+
|
| 42 |
+
lemmatizer = WordNetLemmatizer()
|
| 43 |
+
tagged = nltk.pos_tag(tokens)
|
| 44 |
+
return [lemmatizer.lemmatize(w, _wn_pos(t)) for w, t in tagged]
|
| 45 |
+
except Exception as exc:
|
| 46 |
+
logger.debug("Lemmatization skipped (%s) β returning raw tokens", exc)
|
| 47 |
+
return tokens
|
| 48 |
+
|
| 49 |
+
|
| 50 |
+
# Import shared result type
|
| 51 |
+
from ml.tfidf_classifier import Layer1Result # noqa: E402
|
| 52 |
+
|
| 53 |
+
|
| 54 |
+
class NaiveBayesClassifier:
    """
    TF-IDF + MultinomialNB classifier. Same predict() interface as TFIDFClassifier.

    Args:
        train_samples: list[Sample] from ml.dataset. If None, uses the full 100-sample dataset.
        lemmatize: apply WordNet lemmatization before vectorization.
    """

    _LABELS = {0: "Credible", 1: "Unverified", 2: "Likely Fake"}

    def __init__(self, train_samples=None, lemmatize: bool = False):
        from sklearn.feature_extraction.text import TfidfVectorizer
        from sklearn.naive_bayes import MultinomialNB

        self._lemmatize = lemmatize

        if train_samples is None:
            from ml.dataset import get_dataset
            train_samples = get_dataset()

        corpus = [self._preprocess(s.text) for s in train_samples]
        y = [s.label for s in train_samples]

        # Uni+bigram TF-IDF, capped vocabulary, log-scaled term frequency.
        self._vectorizer = TfidfVectorizer(
            ngram_range=(1, 2),
            max_features=1000,
            sublinear_tf=True,
        )
        doc_term = self._vectorizer.fit_transform(corpus)

        self._clf = MultinomialNB(alpha=1.0)
        self._clf.fit(doc_term, y)
        logger.info(
            "NaiveBayesClassifier trained on %d samples (lemmatize=%s)",
            len(corpus), lemmatize,
        )

    def _preprocess(self, text: str) -> str:
        """Lowercase the text, optionally lemmatizing each whitespace token."""
        lowered = text.lower()
        if not self._lemmatize:
            return lowered
        return " ".join(_lemmatize_tokens(lowered.split()))

    def predict(self, text: str) -> Layer1Result:
        """Classify *text*, surfacing its strongest TF-IDF terms as evidence."""
        vec = self._vectorizer.transform([self._preprocess(text)])
        pred_label = int(self._clf.predict(vec)[0])
        confidence = round(float(max(self._clf.predict_proba(vec)[0])) * 100, 1)

        names = self._vectorizer.get_feature_names_out()
        scores = vec.toarray()[0]
        ranked = scores.argsort()[-5:][::-1]
        triggered = [names[i] for i in ranked if scores[i] > 0]

        return Layer1Result(
            verdict=self._LABELS[pred_label],
            confidence=confidence,
            triggered_features=triggered,
        )
|
|
@@ -0,0 +1,157 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""
|
| 2 |
+
PhilVerify β Tagalog-RoBERTa Sequence Classifier (Layer 1)
|
| 3 |
+
|
| 4 |
+
Fine-tuned on Philippine misinformation data using jcblaise/roberta-tagalog-base
|
| 5 |
+
as the backbone. This model was pre-trained on TLUnified β a large, topically-
|
| 6 |
+
varied Filipino corpus β and shows +4.47% average accuracy gain over prior
|
| 7 |
+
Filipino models on classification tasks.
|
| 8 |
+
|
| 9 |
+
Drop-in replacement for XLMRobertaClassifier β same predict() interface.
|
| 10 |
+
Checkpoint: ml/models/tagalog_roberta_model/ (populated by train_tagalog_roberta.py).
|
| 11 |
+
Raises ModelNotFoundError if checkpoint missing so the engine falls back gracefully.
|
| 12 |
+
"""
|
| 13 |
+
from __future__ import annotations
|
| 14 |
+
|
| 15 |
+
import logging
|
| 16 |
+
from dataclasses import dataclass, field
|
| 17 |
+
from pathlib import Path
|
| 18 |
+
|
| 19 |
+
from ml.xlm_roberta_classifier import Layer1Result, ModelNotFoundError
|
| 20 |
+
|
| 21 |
+
logger = logging.getLogger(__name__)
|
| 22 |
+
|
| 23 |
+
MODEL_DIR = Path(__file__).parent / "models" / "tagalog_roberta_model"
|
| 24 |
+
LABEL_NAMES = {0: "Credible", 1: "Unverified", 2: "Likely Fake"}
|
| 25 |
+
NUM_LABELS = 3
|
| 26 |
+
MAX_LENGTH = 256
|
| 27 |
+
|
| 28 |
+
|
| 29 |
+
class TagalogRobertaClassifier:
    """
    jcblaise/roberta-tagalog-base fine-tuned for misinformation classification.

    Loading is lazy: the model is not loaded until the first call to predict().
    Raises ModelNotFoundError on instantiation if the checkpoint is missing.
    """

    def __init__(self) -> None:
        if not MODEL_DIR.exists():
            raise ModelNotFoundError(
                f"Tagalog-RoBERTa checkpoint not found at {MODEL_DIR}. "
                "Run `python ml/train_tagalog_roberta.py` to fine-tune the model first."
            )
        self._tokenizer = None
        self._model = None

    # ── Lazy load ──────────────────────────────────────────────────────────────

    def _ensure_loaded(self) -> None:
        """Load tokenizer + model on first use and place the model on the best device."""
        if self._model is not None:
            return
        from transformers import AutoTokenizer, AutoModelForSequenceClassification
        logger.info("Loading Tagalog-RoBERTa from %s …", MODEL_DIR)
        self._tokenizer = AutoTokenizer.from_pretrained(str(MODEL_DIR))
        self._model = AutoModelForSequenceClassification.from_pretrained(
            str(MODEL_DIR),
            num_labels=NUM_LABELS,
        )
        # Fix: actually move the model to the selected device. Previously the
        # log line reported a device while the model silently stayed on CPU.
        self._model.to(self._device)
        self._model.eval()
        logger.info("Tagalog-RoBERTa loaded — device: %s", self._device)

    @property
    def _device(self) -> str:
        """Preferred inference device: MPS > CUDA > CPU."""
        try:
            import torch
            if torch.backends.mps.is_available():
                return "mps"
            if torch.cuda.is_available():
                return "cuda"
        except Exception:
            pass
        return "cpu"

    # ── Saliency: attention-based token importance ─────────────────────────────

    def _salient_tokens(self, input_ids, attentions, n: int = 5) -> list[str]:
        """
        Rank input tokens by mean CLS attention in the last layer and return up
        to *n* unique alphabetic words (length >= 3), most-attended first.
        """
        last_layer_attn = attentions[-1]
        # Mean over heads of attention from the CLS position to every token.
        cls_attn = last_layer_attn[0, :, 0, :].mean(0)
        seq_len = cls_attn.shape[-1]
        tokens = self._tokenizer.convert_ids_to_tokens(
            input_ids[0].tolist()[:seq_len]
        )
        scored = []
        for tok, score in zip(tokens, cls_attn.tolist()):
            if tok in ("<s>", "</s>", "<pad>", "<unk>"):
                continue
            # Strip subword markers: "Ġ" (BPE/RobertaTokenizer) and "▁" (SentencePiece).
            clean = tok.lstrip("Ġ▁").strip()
            if len(clean) >= 3 and clean.isalpha():
                scored.append((clean, score))

        # De-duplicate case-insensitively, preserving attention order.
        seen: set[str] = set()
        result = []
        for word, _ in sorted(scored, key=lambda x: x[1], reverse=True):
            if word.lower() not in seen:
                seen.add(word.lower())
                result.append(word)
                if len(result) >= n:
                    break
        return result

    # ── Internal forward pass ──────────────────────────────────────────────────

    def _forward(self, text: str):
        """Tokenize *text*, run the model, and return (probs, attentions, input_ids).

        Probabilities are moved back to CPU so callers get device-agnostic tensors.
        """
        import torch
        encoding = self._tokenizer(
            text,
            truncation=True,
            max_length=MAX_LENGTH,
            return_tensors="pt",
        )
        # Fix: inputs must live on the same device as the model.
        device = next(self._model.parameters()).device
        encoding = {k: v.to(device) for k, v in encoding.items()}
        with torch.no_grad():
            outputs = self._model(
                input_ids=encoding["input_ids"],
                attention_mask=encoding["attention_mask"],
                output_attentions=True,
            )
        probs = torch.softmax(outputs.logits[0], dim=-1).cpu()
        return probs, outputs.attentions, encoding["input_ids"]

    # ── Public API ─────────────────────────────────────────────────────────────

    def predict(self, text: str) -> Layer1Result:
        """Classify *text* into Credible / Unverified / Likely Fake."""
        self._ensure_loaded()
        probs, attentions, input_ids = self._forward(text)
        pred_label = int(probs.argmax().item())
        confidence = round(float(probs[pred_label].item()) * 100, 1)

        # SDPA attention implementations don't return attentions; fall back to [].
        triggered = self._salient_tokens(input_ids, attentions) if attentions else []

        return Layer1Result(
            verdict=LABEL_NAMES[pred_label],
            confidence=confidence,
            triggered_features=triggered,
        )

    def predict_probs(self, text: str):
        """Return raw softmax probability tensor for ensemble averaging."""
        self._ensure_loaded()
        return self._forward(text)
|
|
@@ -0,0 +1,287 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
#!/usr/bin/env python3
|
| 2 |
+
"""
|
| 3 |
+
PhilVerify β Tagalog-RoBERTa Fine-tuning Script
|
| 4 |
+
|
| 5 |
+
Fine-tunes jcblaise/roberta-tagalog-base on the PhilVerify labeled dataset.
|
| 6 |
+
The model was pre-trained on TLUnified, a large Filipino corpus, and
|
| 7 |
+
outperforms XLM-RoBERTa-base on Tagalog classification by ~4.47% accuracy.
|
| 8 |
+
|
| 9 |
+
Saves the checkpoint to ml/models/tagalog_roberta_model/ for use by
|
| 10 |
+
TagalogRobertaClassifier and the EnsembleClassifier.
|
| 11 |
+
|
| 12 |
+
Usage:
|
| 13 |
+
cd PhilVerify/
|
| 14 |
+
source venv/bin/activate
|
| 15 |
+
python ml/train_tagalog_roberta.py [--epochs N] [--lr FLOAT] [--batch-size N]
|
| 16 |
+
|
| 17 |
+
Typical runtime (CPU, MacBook M1): ~8β12 minutes for 5 epochs
|
| 18 |
+
Typical runtime (GPU/MPS): ~1β2 minutes
|
| 19 |
+
"""
|
| 20 |
+
from __future__ import annotations
|
| 21 |
+
|
| 22 |
+
import argparse
|
| 23 |
+
import logging
|
| 24 |
+
import sys
|
| 25 |
+
import time
|
| 26 |
+
from pathlib import Path
|
| 27 |
+
|
| 28 |
+
sys.path.insert(0, str(Path(__file__).parent.parent))
|
| 29 |
+
|
| 30 |
+
logging.basicConfig(
|
| 31 |
+
level=logging.INFO,
|
| 32 |
+
format="%(asctime)s %(levelname)-8s %(message)s",
|
| 33 |
+
datefmt="%H:%M:%S",
|
| 34 |
+
)
|
| 35 |
+
logger = logging.getLogger(__name__)
|
| 36 |
+
|
| 37 |
+
OUTPUT_DIR = Path(__file__).parent / "models" / "tagalog_roberta_model"
BASE_MODEL = "jcblaise/roberta-tagalog-base"
MAX_LENGTH = 256


# ── PyTorch Dataset ────────────────────────────────────────────────────────────

class PhilVerifyDataset:
    """Map-style dataset: tokenizes all samples up-front for DataLoader batching."""

    def __init__(self, samples, tokenizer) -> None:
        import torch
        # Eagerly encode every sample once; padding to a fixed length keeps
        # batch tensors uniform.
        self.encodings = tokenizer(
            [s.text for s in samples],
            truncation=True,
            padding="max_length",
            max_length=MAX_LENGTH,
            return_tensors="pt",
        )
        self.labels = torch.tensor([s.label for s in samples], dtype=torch.long)

    def __len__(self) -> int:
        return len(self.labels)

    def __getitem__(self, idx: int):
        item = {key: self.encodings[key][idx] for key in ("input_ids", "attention_mask")}
        item["labels"] = self.labels[idx]
        return item
|
| 65 |
+
|
| 66 |
+
|
| 67 |
+
# ββ Freeze helpers ββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ
|
| 68 |
+
|
| 69 |
+
def freeze_lower_layers(model, keep_top_n: int = 2) -> int:
    """
    Freeze the embeddings and all but the top *keep_top_n* encoder layers.

    Returns the number of frozen parameter elements (sum of numel()).
    """
    total_layers = len(model.roberta.encoder.layer)
    cutoff = total_layers - keep_top_n

    # Everything below the cutoff — plus the embeddings — is frozen; only the
    # top layers and the classifier head keep gradients.
    frozen_modules = list(model.roberta.encoder.layer[:cutoff])
    frozen_modules.append(model.roberta.embeddings)

    frozen = 0
    for module in frozen_modules:
        for param in module.parameters():
            param.requires_grad = False
            frozen += param.numel()

    logger.info(
        "Frozen %d / %d encoder layers (keeping top %d + classifier head). "
        "%d params frozen.",
        cutoff, total_layers, keep_top_n, frozen,
    )
    return frozen
|
| 90 |
+
|
| 91 |
+
|
| 92 |
+
# ββ Metrics βββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ
|
| 93 |
+
|
| 94 |
+
def evaluate(model, loader, device) -> dict:
    """Run *model* over *loader* and return {"loss", "accuracy"} (accuracy in %)."""
    import torch
    model.eval()
    criterion = torch.nn.CrossEntropyLoss()
    predictions: list[int] = []
    targets: list[int] = []
    loss_sum = 0.0
    batches = 0

    with torch.no_grad():
        for batch in loader:
            batch = {name: tensor.to(device) for name, tensor in batch.items()}
            gold = batch["labels"]
            outputs = model(
                input_ids=batch["input_ids"],
                attention_mask=batch["attention_mask"],
            )
            loss_sum += criterion(outputs.logits, gold).item()
            predictions.extend(outputs.logits.argmax(dim=-1).cpu().tolist())
            targets.extend(gold.cpu().tolist())
            batches += 1

    hits = sum(p == t for p, t in zip(predictions, targets))
    # max(..., 1) guards the empty-loader case against division by zero.
    return {
        "loss": round(loss_sum / max(batches, 1), 4),
        "accuracy": round(hits / max(len(targets), 1) * 100, 1),
    }
|
| 122 |
+
|
| 123 |
+
|
| 124 |
+
# ββ Main training loop ββββββββββββββββββββββββββββββββββββββββββββββββββββββββ
|
| 125 |
+
|
| 126 |
+
def train(
    epochs: int = 5,
    lr: float = 2e-5,
    batch_size: int = 8,
    freeze: bool = True,
    keep_top_n: int = 2,
    seed: int = 42,
) -> None:
    """
    Fine-tune jcblaise/roberta-tagalog-base on the PhilVerify split.

    Args:
        epochs: number of passes over the (augmented) training set.
        lr: AdamW learning rate for the unfrozen parameters.
        batch_size: DataLoader batch size for both train and val.
        freeze: if True, freeze embeddings + lower encoder layers.
        keep_top_n: number of top encoder layers left trainable when freezing.
        seed: torch manual seed; also passed to the split and augmentation.

    Side effects: saves the best-accuracy checkpoint to OUTPUT_DIR via _save().
    """
    import torch
    from torch.utils.data import DataLoader
    from transformers import AutoTokenizer, AutoModelForSequenceClassification
    from ml.combined_dataset import get_split, class_weights, LABEL_NAMES, NUM_LABELS
    from ml.dataset import augment_samples

    # ── Reproducibility ────────────────────────────────────────────────────────
    torch.manual_seed(seed)

    # Device preference: Apple MPS > CUDA > CPU.
    if torch.backends.mps.is_available():
        device = torch.device("mps")
    elif torch.cuda.is_available():
        device = torch.device("cuda")
    else:
        device = torch.device("cpu")
    logger.info("Device: %s", device)

    # ── Data ───────────────────────────────────────────────────────────────────
    # Augmentation is applied to the training split only; val stays untouched.
    train_samples, val_samples = get_split(train_ratio=0.8, seed=seed)
    aug = augment_samples(train_samples, seed=seed)
    train_samples = train_samples + aug
    # NOTE: train_samples now includes aug, so "original" = total - augmented.
    logger.info(
        "Dataset: %d train (%d original + %d augmented) / %d val",
        len(train_samples), len(train_samples) - len(aug), len(aug), len(val_samples),
    )

    logger.info("Loading tokenizer: %s …", BASE_MODEL)
    tokenizer = AutoTokenizer.from_pretrained(BASE_MODEL)

    train_ds = PhilVerifyDataset(train_samples, tokenizer)
    val_ds = PhilVerifyDataset(val_samples, tokenizer)

    train_loader = DataLoader(train_ds, batch_size=batch_size, shuffle=True)
    val_loader = DataLoader(val_ds, batch_size=batch_size, shuffle=False)

    logger.info("Loading model: %s …", BASE_MODEL)
    model = AutoModelForSequenceClassification.from_pretrained(
        BASE_MODEL,
        num_labels=NUM_LABELS,
        id2label=LABEL_NAMES,
        label2id={v: k for k, v in LABEL_NAMES.items()},
    )
    if freeze:
        freeze_lower_layers(model, keep_top_n=keep_top_n)
    model.to(device)

    total_params = sum(p.numel() for p in model.parameters())
    trainable_params = sum(p.numel() for p in model.parameters() if p.requires_grad)
    logger.info(
        "Parameters: %d total / %d trainable (%.1f%%)",
        total_params, trainable_params, trainable_params / total_params * 100,
    )

    # Class-weighted loss counters label imbalance in the (augmented) train set.
    # The validation loss in evaluate() is intentionally unweighted.
    weights = torch.tensor(
        class_weights(train_samples), dtype=torch.float
    ).to(device)
    logger.info("Class weights: %s", [round(w, 3) for w in weights.tolist()])
    loss_fn = torch.nn.CrossEntropyLoss(weight=weights)

    # Only the unfrozen parameters are handed to the optimizer.
    optimizer = torch.optim.AdamW(
        filter(lambda p: p.requires_grad, model.parameters()),
        lr=lr,
        weight_decay=0.01,
    )

    total_steps = epochs * len(train_loader)
    warmup_steps = max(1, total_steps // 10)

    def lr_lambda(step: int) -> float:
        # Linear warmup for the first ~10% of steps, then linear decay with a
        # floor of 5% of the base learning rate.
        if step < warmup_steps:
            return step / warmup_steps
        progress = (step - warmup_steps) / max(total_steps - warmup_steps, 1)
        return max(0.05, 1.0 - progress)

    scheduler = torch.optim.lr_scheduler.LambdaLR(optimizer, lr_lambda)

    best_val_acc = 0.0
    best_epoch = 0
    global_step = 0

    for epoch in range(1, epochs + 1):
        model.train()
        epoch_loss = 0.0
        t0 = time.time()

        for batch in train_loader:
            batch = {k: v.to(device) for k, v in batch.items()}
            labels = batch["labels"]
            optimizer.zero_grad()
            outputs = model(
                input_ids=batch["input_ids"],
                attention_mask=batch["attention_mask"],
            )
            loss = loss_fn(outputs.logits, labels)
            loss.backward()
            # Clip only the trainable parameters' gradients to stabilize updates.
            torch.nn.utils.clip_grad_norm_(
                filter(lambda p: p.requires_grad, model.parameters()), 1.0
            )
            optimizer.step()
            scheduler.step()
            epoch_loss += loss.item()
            global_step += 1

        avg_loss = epoch_loss / max(len(train_loader), 1)
        val_metrics = evaluate(model, val_loader, device)
        elapsed = time.time() - t0

        logger.info(
            "Epoch %d/%d  train_loss=%.4f  val_loss=%.4f  val_acc=%.1f%%  (%.0fs)",
            epoch, epochs, avg_loss,
            val_metrics["loss"], val_metrics["accuracy"], elapsed,
        )

        # ">=": on ties the later epoch wins, so the saved checkpoint is the
        # most-trained model achieving the best accuracy.
        if val_metrics["accuracy"] >= best_val_acc:
            best_val_acc = val_metrics["accuracy"]
            best_epoch = epoch
            _save(model, tokenizer)

    logger.info(
        "Training complete. Best val_acc=%.1f%% at epoch %d. Saved → %s",
        best_val_acc, best_epoch, OUTPUT_DIR,
    )
|
| 254 |
+
|
| 255 |
+
|
| 256 |
+
def _save(model, tokenizer) -> None:
    """Persist the current model + tokenizer checkpoint into OUTPUT_DIR."""
    OUTPUT_DIR.mkdir(parents=True, exist_ok=True)
    for artifact in (model, tokenizer):
        artifact.save_pretrained(str(OUTPUT_DIR))
    logger.info("Checkpoint saved to %s", OUTPUT_DIR)
|
| 261 |
+
|
| 262 |
+
|
| 263 |
+
# ββ CLI βββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ
|
| 264 |
+
|
| 265 |
+
def parse_args() -> argparse.Namespace:
    """Build and parse the fine-tuning CLI options."""
    parser = argparse.ArgumentParser(
        description="Fine-tune jcblaise/roberta-tagalog-base for PhilVerify",
    )
    parser.add_argument("--epochs", type=int, default=5, help="Training epochs (default: 5)")
    parser.add_argument("--lr", type=float, default=2e-5, help="Learning rate (default: 2e-5)")
    parser.add_argument("--batch-size", type=int, default=8, help="Batch size (default: 8)")
    parser.add_argument("--keep-top-n", type=int, default=2, help="Unfrozen encoder layers (default: 2)")
    parser.add_argument("--no-freeze", action="store_true", help="Train all layers")
    parser.add_argument("--seed", type=int, default=42, help="Random seed (default: 42)")
    return parser.parse_args()
|
| 276 |
+
|
| 277 |
+
|
| 278 |
+
if __name__ == "__main__":
    cli = parse_args()
    train(
        epochs=cli.epochs,
        lr=cli.lr,
        batch_size=cli.batch_size,
        freeze=not cli.no_freeze,
        keep_top_n=cli.keep_top_n,
        seed=cli.seed,
    )
|
|
@@ -138,6 +138,7 @@ def train(
|
|
| 138 |
from torch.utils.data import DataLoader
|
| 139 |
from transformers import AutoTokenizer, AutoModelForSequenceClassification
|
| 140 |
from ml.combined_dataset import get_split, class_weights, LABEL_NAMES, NUM_LABELS
|
|
|
|
| 141 |
|
| 142 |
# ββ Reproducibility βββββββββββββββββββββββββββββββββββββββββββββββββββββββ
|
| 143 |
torch.manual_seed(seed)
|
|
@@ -153,7 +154,12 @@ def train(
|
|
| 153 |
|
| 154 |
# ββ Data ββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ
|
| 155 |
train_samples, val_samples = get_split(train_ratio=0.8, seed=seed)
|
| 156 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 157 |
|
| 158 |
logger.info("Loading tokenizer: %s β¦", BASE_MODEL)
|
| 159 |
tokenizer = AutoTokenizer.from_pretrained(BASE_MODEL)
|
|
|
|
| 138 |
from torch.utils.data import DataLoader
|
| 139 |
from transformers import AutoTokenizer, AutoModelForSequenceClassification
|
| 140 |
from ml.combined_dataset import get_split, class_weights, LABEL_NAMES, NUM_LABELS
|
| 141 |
+
from ml.dataset import augment_samples
|
| 142 |
|
| 143 |
# ββ Reproducibility βββββββββββββββββββββββββββββββββββββββββββββββββββββββ
|
| 144 |
torch.manual_seed(seed)
|
|
|
|
| 154 |
|
| 155 |
# ββ Data ββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ
|
| 156 |
train_samples, val_samples = get_split(train_ratio=0.8, seed=seed)
|
| 157 |
+
aug = augment_samples(train_samples, seed=seed)
|
| 158 |
+
train_samples = train_samples + aug
|
| 159 |
+
logger.info(
|
| 160 |
+
"Dataset: %d train (%d original + %d augmented) / %d val",
|
| 161 |
+
len(train_samples), len(train_samples) - len(aug), len(aug), len(val_samples),
|
| 162 |
+
)
|
| 163 |
|
| 164 |
logger.info("Loading tokenizer: %s β¦", BASE_MODEL)
|
| 165 |
tokenizer = AutoTokenizer.from_pretrained(BASE_MODEL)
|
|
@@ -136,6 +136,25 @@ class XLMRobertaClassifier:
|
|
| 136 |
|
| 137 |
# ββ Public API (same interface as TFIDFClassifier) ββββββββββββββββββββββββ
|
| 138 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 139 |
def predict(self, text: str) -> Layer1Result:
|
| 140 |
self._ensure_loaded()
|
| 141 |
import torch
|
|
@@ -162,7 +181,8 @@ class XLMRobertaClassifier:
|
|
| 162 |
confidence = round(float(probs[pred_label].item()) * 100, 1)
|
| 163 |
verdict = LABEL_NAMES[pred_label]
|
| 164 |
|
| 165 |
-
|
|
|
|
| 166 |
|
| 167 |
return Layer1Result(
|
| 168 |
verdict=verdict,
|
|
|
|
| 136 |
|
| 137 |
# ββ Public API (same interface as TFIDFClassifier) ββββββββββββββββββββββββ
|
| 138 |
|
| 139 |
+
def predict_probs(self, text: str):
|
| 140 |
+
"""Return raw softmax probability tensor for ensemble averaging."""
|
| 141 |
+
self._ensure_loaded()
|
| 142 |
+
import torch
|
| 143 |
+
|
| 144 |
+
encoding = self._tokenizer(
|
| 145 |
+
text,
|
| 146 |
+
truncation=True,
|
| 147 |
+
max_length=MAX_LENGTH,
|
| 148 |
+
return_tensors="pt",
|
| 149 |
+
)
|
| 150 |
+
with torch.no_grad():
|
| 151 |
+
outputs = self._model(
|
| 152 |
+
input_ids=encoding["input_ids"],
|
| 153 |
+
attention_mask=encoding["attention_mask"],
|
| 154 |
+
output_attentions=True,
|
| 155 |
+
)
|
| 156 |
+
return torch.softmax(outputs.logits[0], dim=-1), outputs.attentions, encoding["input_ids"]
|
| 157 |
+
|
| 158 |
def predict(self, text: str) -> Layer1Result:
|
| 159 |
self._ensure_loaded()
|
| 160 |
import torch
|
|
|
|
| 181 |
confidence = round(float(probs[pred_label].item()) * 100, 1)
|
| 182 |
verdict = LABEL_NAMES[pred_label]
|
| 183 |
|
| 184 |
+
# SDPA attention doesn't return attentions; fallback to empty
|
| 185 |
+
triggered = self._salient_tokens(input_ids, outputs.attentions) if outputs.attentions else []
|
| 186 |
|
| 187 |
return Layer1Result(
|
| 188 |
verdict=verdict,
|
|
@@ -1,8 +1,15 @@
|
|
| 1 |
"""
|
| 2 |
PhilVerify β Claim Extractor
|
| 3 |
Extracts the key falsifiable claim from noisy social media text.
|
| 4 |
-
|
| 5 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 6 |
"""
|
| 7 |
import re
|
| 8 |
import logging
|
|
@@ -12,73 +19,72 @@ logger = logging.getLogger(__name__)
|
|
| 12 |
|
| 13 |
_SENTENCE_SPLIT = re.compile(r"(?<=[.!?])\s+")
|
| 14 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 15 |
|
| 16 |
@dataclass
|
| 17 |
class ClaimResult:
|
| 18 |
claim: str
|
| 19 |
-
method: str # "
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 20 |
|
| 21 |
|
| 22 |
class ClaimExtractor:
|
| 23 |
"""
|
| 24 |
-
Extracts the single most falsifiable claim from input text
|
| 25 |
-
|
|
|
|
| 26 |
|
| 27 |
-
|
| 28 |
-
|
| 29 |
-
toward extracting assertions rather than summaries.
|
| 30 |
"""
|
| 31 |
|
| 32 |
-
_TASK_PREFIX = "Extract the main factual claim: "
|
| 33 |
-
|
| 34 |
-
def __init__(self):
|
| 35 |
-
self._pipe = None
|
| 36 |
-
self._loaded = False
|
| 37 |
-
|
| 38 |
-
def _load_model(self):
|
| 39 |
-
if self._loaded:
|
| 40 |
-
return
|
| 41 |
-
try:
|
| 42 |
-
from transformers import pipeline
|
| 43 |
-
self._pipe = pipeline(
|
| 44 |
-
"summarization",
|
| 45 |
-
model="sshleifer/distilbart-cnn-6-6",
|
| 46 |
-
max_length=80,
|
| 47 |
-
min_length=10,
|
| 48 |
-
do_sample=False,
|
| 49 |
-
)
|
| 50 |
-
logger.info("Claim extractor model loaded (distilbart-cnn-6-6)")
|
| 51 |
-
except Exception as e:
|
| 52 |
-
logger.warning("Summarization model not available (%s) β using sentence heuristic", e)
|
| 53 |
-
self._loaded = True
|
| 54 |
-
|
| 55 |
-
def _sentence_heuristic(self, text: str) -> str:
|
| 56 |
-
"""Return the first 1-2 sentences as the claim (fast fallback)."""
|
| 57 |
-
sentences = _SENTENCE_SPLIT.split(text.strip())
|
| 58 |
-
candidates = [s.strip() for s in sentences if len(s.strip()) > 20]
|
| 59 |
-
if not candidates:
|
| 60 |
-
return text[:200].strip()
|
| 61 |
-
return " ".join(candidates[:2])
|
| 62 |
-
|
| 63 |
def extract(self, text: str) -> ClaimResult:
|
| 64 |
-
self._load_model()
|
| 65 |
-
|
| 66 |
if not text or len(text.strip()) < 20:
|
| 67 |
return ClaimResult(claim=text.strip(), method="passthrough")
|
| 68 |
|
| 69 |
-
|
| 70 |
-
|
| 71 |
-
|
| 72 |
-
|
| 73 |
-
|
| 74 |
-
|
| 75 |
-
|
| 76 |
-
|
| 77 |
-
|
| 78 |
-
|
| 79 |
-
|
| 80 |
|
|
|
|
| 81 |
return ClaimResult(
|
| 82 |
-
claim=
|
| 83 |
method="sentence_heuristic",
|
| 84 |
)
|
|
|
|
| 1 |
"""
|
| 2 |
PhilVerify β Claim Extractor
|
| 3 |
Extracts the key falsifiable claim from noisy social media text.
|
| 4 |
+
|
| 5 |
+
Strategy: sentence scoring based on presence of named entities,
|
| 6 |
+
verbs, dates, and numbers β no heavy model required.
|
| 7 |
+
|
| 8 |
+
Filipino fake news headlines almost always embed the checkworthy
|
| 9 |
+
assertion in a sentence that contains a specific number/date + person/org
|
| 10 |
+
name + an attribution verb (sinabi, ayon, announced, confirmed, etc.).
|
| 11 |
+
Scoring these signals finds the right sentence faster and more reliably
|
| 12 |
+
than a summarization model that was trained on English news compression.
|
| 13 |
"""
|
| 14 |
import re
|
| 15 |
import logging
|
|
|
|
| 19 |
|
| 20 |
_SENTENCE_SPLIT = re.compile(r"(?<=[.!?])\s+")
|
| 21 |
|
| 22 |
+
# Numbers, percentages, or month names signal a specific, verifiable claim
|
| 23 |
+
_DATE_OR_NUM = re.compile(
|
| 24 |
+
r"\b(\d[\d,.%]*"
|
| 25 |
+
r"|(?:January|February|March|April|May|June|July|August|"
|
| 26 |
+
r"September|October|November|December)"
|
| 27 |
+
r"|(?:Enero|Pebrero|Marso|Abril|Mayo|Hunyo|Hulyo|Agosto|"
|
| 28 |
+
r"Setyembre|Oktubre|Nobyembre|Disyembre))\b",
|
| 29 |
+
re.IGNORECASE,
|
| 30 |
+
)
|
| 31 |
+
|
| 32 |
+
# Attribution / assertion verbs in English and Filipino
|
| 33 |
+
_VERB_PATTERN = re.compile(
|
| 34 |
+
r"\b(is|are|was|were|has|have|had|will|would"
|
| 35 |
+
r"|said|says|announced|confirmed|reported|claims|showed"
|
| 36 |
+
r"|found|revealed|arrested|killed|died|signed|approved|ordered"
|
| 37 |
+
r"|sinabi|ipinahayag|inanunsyo|kinumpirma|ayon|nagpahayag"
|
| 38 |
+
r"|inihayag|iniutos|nagsabi|ipinag-utos)\b",
|
| 39 |
+
re.IGNORECASE,
|
| 40 |
+
)
|
| 41 |
+
|
| 42 |
|
| 43 |
@dataclass
|
| 44 |
class ClaimResult:
|
| 45 |
claim: str
|
| 46 |
+
method: str # "sentence_scoring" | "sentence_heuristic"
|
| 47 |
+
|
| 48 |
+
|
| 49 |
+
def _score_sentence(sent: str) -> float:
|
| 50 |
+
"""Score a sentence by how likely it is to contain a falsifiable claim."""
|
| 51 |
+
score = 0.0
|
| 52 |
+
if _DATE_OR_NUM.search(sent):
|
| 53 |
+
score += 2.0
|
| 54 |
+
score += min(3.0, len(_VERB_PATTERN.findall(sent)) * 1.0)
|
| 55 |
+
if len(sent) > 30:
|
| 56 |
+
score += 1.0
|
| 57 |
+
return score
|
| 58 |
|
| 59 |
|
| 60 |
class ClaimExtractor:
|
| 61 |
"""
|
| 62 |
+
Extracts the single most falsifiable claim from input text using
|
| 63 |
+
sentence scoring. No heavy model required β spaCy already loaded
|
| 64 |
+
for NER; this module uses only stdlib regex.
|
| 65 |
|
| 66 |
+
The highest-scoring sentence (by date/number + verb density) is
|
| 67 |
+
returned as the claim for downstream NewsAPI evidence retrieval.
|
|
|
|
| 68 |
"""
|
| 69 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 70 |
def extract(self, text: str) -> ClaimResult:
|
|
|
|
|
|
|
| 71 |
if not text or len(text.strip()) < 20:
|
| 72 |
return ClaimResult(claim=text.strip(), method="passthrough")
|
| 73 |
|
| 74 |
+
sentences = [s.strip() for s in _SENTENCE_SPLIT.split(text.strip())]
|
| 75 |
+
candidates = [s for s in sentences if len(s) > 15]
|
| 76 |
+
|
| 77 |
+
if not candidates:
|
| 78 |
+
return ClaimResult(claim=text[:200].strip(), method="sentence_heuristic")
|
| 79 |
+
|
| 80 |
+
scored = [(s, _score_sentence(s)) for s in candidates]
|
| 81 |
+
best_sent, best_score = max(scored, key=lambda x: x[1])
|
| 82 |
+
|
| 83 |
+
if best_score > 0:
|
| 84 |
+
return ClaimResult(claim=best_sent, method="sentence_scoring")
|
| 85 |
|
| 86 |
+
# All scores zero β fall back to first two sentences
|
| 87 |
return ClaimResult(
|
| 88 |
+
claim=" ".join(candidates[:2]),
|
| 89 |
method="sentence_heuristic",
|
| 90 |
)
|
|
@@ -46,8 +46,9 @@ class NERResult:
|
|
| 46 |
|
| 47 |
class EntityExtractor:
|
| 48 |
"""
|
| 49 |
-
NER using
|
| 50 |
-
Falls back to
|
|
|
|
| 51 |
"""
|
| 52 |
|
| 53 |
def __init__(self):
|
|
@@ -58,12 +59,17 @@ class EntityExtractor:
|
|
| 58 |
if self._loaded:
|
| 59 |
return
|
| 60 |
try:
|
| 61 |
-
import
|
| 62 |
-
self._nlp =
|
| 63 |
-
logger.info("
|
| 64 |
-
except Exception
|
| 65 |
-
|
| 66 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 67 |
self._loaded = True
|
| 68 |
|
| 69 |
def _hint_based_extract(self, text: str) -> NERResult:
|
|
|
|
| 46 |
|
| 47 |
class EntityExtractor:
|
| 48 |
"""
|
| 49 |
+
NER using calamanCy (tl_calamancy_lg) for Tagalog-aware entity extraction.
|
| 50 |
+
Falls back to spaCy en_core_web_sm, then to regex-based hint extraction.
|
| 51 |
+
calamanCy uses the same spaCy doc.ents interface so extract() is unchanged.
|
| 52 |
"""
|
| 53 |
|
| 54 |
def __init__(self):
|
|
|
|
| 59 |
if self._loaded:
|
| 60 |
return
|
| 61 |
try:
|
| 62 |
+
import calamancy
|
| 63 |
+
self._nlp = calamancy.load("tl_calamancy_lg")
|
| 64 |
+
logger.info("calamanCy tl_calamancy_lg loaded")
|
| 65 |
+
except Exception:
|
| 66 |
+
try:
|
| 67 |
+
import spacy
|
| 68 |
+
self._nlp = spacy.load("en_core_web_sm")
|
| 69 |
+
logger.info("spaCy en_core_web_sm loaded (calamancy unavailable)")
|
| 70 |
+
except Exception as e:
|
| 71 |
+
logger.warning("spaCy not available (%s) β using hint-based NER", e)
|
| 72 |
+
self._nlp = None
|
| 73 |
self._loaded = True
|
| 74 |
|
| 75 |
def _hint_based_extract(self, text: str) -> NERResult:
|
|
@@ -57,6 +57,7 @@ class PreprocessResult:
|
|
| 57 |
normalized: str
|
| 58 |
tokens: list[str] = field(default_factory=list)
|
| 59 |
filtered_tokens: list[str] = field(default_factory=list)
|
|
|
|
| 60 |
char_count: int = 0
|
| 61 |
word_count: int = 0
|
| 62 |
|
|
@@ -66,18 +67,62 @@ class TextPreprocessor:
|
|
| 66 |
Multi-step text cleaner for Tagalog / English / Taglish content.
|
| 67 |
|
| 68 |
Pipeline:
|
| 69 |
-
1. strip_html
|
| 70 |
-
2. strip_urls
|
| 71 |
-
3. strip_mentions
|
| 72 |
-
4. strip_hashtags
|
| 73 |
-
5. strip_emojis
|
| 74 |
-
6. lowercase
|
| 75 |
-
7. normalize_chars
|
| 76 |
-
8. strip_punct
|
| 77 |
-
9. tokenize
|
| 78 |
-
10. remove_stopwords
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 79 |
"""
|
| 80 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 81 |
def clean(self, text: str) -> str:
|
| 82 |
"""Steps 1-6: structural cleaning."""
|
| 83 |
text = _HTML_TAG_PATTERN.sub(" ", text)
|
|
@@ -113,12 +158,14 @@ class TextPreprocessor:
|
|
| 113 |
normalized = self.normalize(cleaned)
|
| 114 |
tokens = self.tokenize(normalized)
|
| 115 |
filtered = self.remove_stopwords(tokens)
|
|
|
|
| 116 |
return PreprocessResult(
|
| 117 |
original=text,
|
| 118 |
cleaned=cleaned,
|
| 119 |
normalized=normalized,
|
| 120 |
tokens=tokens,
|
| 121 |
filtered_tokens=filtered,
|
|
|
|
| 122 |
char_count=len(normalized),
|
| 123 |
word_count=len(tokens),
|
| 124 |
)
|
|
|
|
| 57 |
normalized: str
|
| 58 |
tokens: list[str] = field(default_factory=list)
|
| 59 |
filtered_tokens: list[str] = field(default_factory=list)
|
| 60 |
+
lemmatized_tokens: list[str] = field(default_factory=list)
|
| 61 |
char_count: int = 0
|
| 62 |
word_count: int = 0
|
| 63 |
|
|
|
|
| 67 |
Multi-step text cleaner for Tagalog / English / Taglish content.
|
| 68 |
|
| 69 |
Pipeline:
|
| 70 |
+
1. strip_html β remove HTML tags
|
| 71 |
+
2. strip_urls β remove hyperlinks
|
| 72 |
+
3. strip_mentions β remove @user
|
| 73 |
+
4. strip_hashtags β remove #tag text (keep token)
|
| 74 |
+
5. strip_emojis β remove Unicode emoji
|
| 75 |
+
6. lowercase β normalize case
|
| 76 |
+
7. normalize_chars β collapse repeated chars, excessive !??
|
| 77 |
+
8. strip_punct β remove punctuation except apostrophe
|
| 78 |
+
9. tokenize β split on whitespace
|
| 79 |
+
10. remove_stopwords β drop EN + TL stopwords
|
| 80 |
+
11. lemmatize β WordNet lemmatization (opt-in, English-biased;
|
| 81 |
+
Tagalog tokens are returned unchanged)
|
| 82 |
+
|
| 83 |
+
Args:
|
| 84 |
+
lemmatize: if True, step 11 is applied and lemmatized_tokens is populated.
|
| 85 |
+
Off by default β transformer models handle subword tokenization
|
| 86 |
+
themselves and do not benefit from lemmatization.
|
| 87 |
"""
|
| 88 |
|
| 89 |
+
def __init__(self, lemmatize: bool = False):
|
| 90 |
+
self.lemmatize = lemmatize
|
| 91 |
+
|
| 92 |
+
def _lemmatize_tokens(self, tokens: list[str]) -> list[str]:
|
| 93 |
+
"""
|
| 94 |
+
POS-aware WordNet lemmatization. Downloads NLTK data on first call.
|
| 95 |
+
Falls back to identity on any error (e.g. missing corpus).
|
| 96 |
+
"""
|
| 97 |
+
try:
|
| 98 |
+
import nltk
|
| 99 |
+
from nltk.corpus import wordnet
|
| 100 |
+
from nltk.stem import WordNetLemmatizer
|
| 101 |
+
|
| 102 |
+
for resource, path in [
|
| 103 |
+
("wordnet", "corpora/wordnet"),
|
| 104 |
+
("averaged_perceptron_tagger_eng", "taggers/averaged_perceptron_tagger_eng"),
|
| 105 |
+
]:
|
| 106 |
+
try:
|
| 107 |
+
nltk.data.find(path)
|
| 108 |
+
except LookupError:
|
| 109 |
+
nltk.download(resource, quiet=True)
|
| 110 |
+
|
| 111 |
+
def _wn_pos(tag: str) -> str:
|
| 112 |
+
if tag.startswith("J"):
|
| 113 |
+
return wordnet.ADJ
|
| 114 |
+
if tag.startswith("V"):
|
| 115 |
+
return wordnet.VERB
|
| 116 |
+
if tag.startswith("R"):
|
| 117 |
+
return wordnet.ADV
|
| 118 |
+
return wordnet.NOUN
|
| 119 |
+
|
| 120 |
+
lemmatizer = WordNetLemmatizer()
|
| 121 |
+
tagged = nltk.pos_tag(tokens)
|
| 122 |
+
return [lemmatizer.lemmatize(w, _wn_pos(t)) for w, t in tagged]
|
| 123 |
+
except Exception:
|
| 124 |
+
return tokens
|
| 125 |
+
|
| 126 |
def clean(self, text: str) -> str:
|
| 127 |
"""Steps 1-6: structural cleaning."""
|
| 128 |
text = _HTML_TAG_PATTERN.sub(" ", text)
|
|
|
|
| 158 |
normalized = self.normalize(cleaned)
|
| 159 |
tokens = self.tokenize(normalized)
|
| 160 |
filtered = self.remove_stopwords(tokens)
|
| 161 |
+
lemmatized = self._lemmatize_tokens(filtered) if self.lemmatize else []
|
| 162 |
return PreprocessResult(
|
| 163 |
original=text,
|
| 164 |
cleaned=cleaned,
|
| 165 |
normalized=normalized,
|
| 166 |
tokens=tokens,
|
| 167 |
filtered_tokens=filtered,
|
| 168 |
+
lemmatized_tokens=lemmatized,
|
| 169 |
char_count=len(normalized),
|
| 170 |
word_count=len(tokens),
|
| 171 |
)
|
|
@@ -15,6 +15,7 @@ sentence-transformers==3.3.1
|
|
| 15 |
scikit-learn==1.5.2
|
| 16 |
safetensors>=0.4.3 # Faster, safer model serialisation (used by transformers)
|
| 17 |
spacy==3.8.2
|
|
|
|
| 18 |
langdetect==1.0.9
|
| 19 |
nltk==3.9.1
|
| 20 |
|
|
|
|
| 15 |
scikit-learn==1.5.2
|
| 16 |
safetensors>=0.4.3 # Faster, safer model serialisation (used by transformers)
|
| 17 |
spacy==3.8.2
|
| 18 |
+
calamancy>=0.2.0 # Tagalog NER (calamanCy tl_calamancy_lg)
|
| 19 |
langdetect==1.0.9
|
| 20 |
nltk==3.9.1
|
| 21 |
|
|
@@ -81,6 +81,7 @@ async def run_verification(
|
|
| 81 |
from nlp.clickbait import ClickbaitDetector
|
| 82 |
from nlp.claim_extractor import ClaimExtractor
|
| 83 |
from evidence.news_fetcher import fetch_evidence, compute_similarity
|
|
|
|
| 84 |
|
| 85 |
# ββ Step 1: Preprocess ββββββββββββββββββββββββββββββββββββββββββββββββββββ
|
| 86 |
preprocessor = _get_nlp("preprocessor", TextPreprocessor)
|
|
@@ -103,26 +104,41 @@ async def run_verification(
|
|
| 103 |
claim_result = claim_extractor.extract(proc.cleaned)
|
| 104 |
|
| 105 |
# ββ Step 7: Layer 1 β ML Classifier ββββββββββββββββββββββββββββββββββββββ
|
| 106 |
-
#
|
| 107 |
-
#
|
| 108 |
-
|
|
|
|
|
|
|
| 109 |
try:
|
| 110 |
from ml.xlm_roberta_classifier import XLMRobertaClassifier, ModelNotFoundError
|
| 111 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 112 |
except ModelNotFoundError:
|
| 113 |
logger.info("XLM-RoBERTa checkpoint not found β falling back to TF-IDF baseline")
|
| 114 |
-
from ml.tfidf_classifier import TFIDFClassifier
|
| 115 |
-
def _make_tfidf():
|
| 116 |
-
c = TFIDFClassifier(); c.train(); return c
|
| 117 |
-
classifier = _get_nlp("tfidf_classifier", _make_tfidf)
|
| 118 |
-
model_tier = "tfidf"
|
| 119 |
except Exception as exc:
|
| 120 |
logger.warning("XLM-RoBERTa load failed (%s) β falling back to TF-IDF", exc)
|
|
|
|
|
|
|
| 121 |
from ml.tfidf_classifier import TFIDFClassifier
|
| 122 |
-
def _make_tfidf():
|
| 123 |
c = TFIDFClassifier(); c.train(); return c
|
| 124 |
classifier = _get_nlp("tfidf_classifier", _make_tfidf)
|
| 125 |
-
model_tier = "tfidf"
|
| 126 |
|
| 127 |
l1 = classifier.predict(proc.cleaned)
|
| 128 |
logger.debug("Layer-1 (%s): %s %.1f%%", model_tier, l1.verdict, l1.confidence)
|
|
@@ -137,6 +153,7 @@ async def run_verification(
|
|
| 137 |
verdict=Verdict(l1.verdict),
|
| 138 |
confidence=l1.confidence,
|
| 139 |
triggered_features=l1.triggered_features,
|
|
|
|
| 140 |
)
|
| 141 |
|
| 142 |
# ββ Step 8: Layer 2 β Evidence Retrieval ββββββββββββββββββββββββββββββββββ
|
|
@@ -170,19 +187,21 @@ async def run_verification(
|
|
| 170 |
domain = (art.get("source", {}) or {}).get("name", "unknown").lower()
|
| 171 |
tier = get_domain_tier(domain)
|
| 172 |
|
| 173 |
-
|
| 174 |
-
|
| 175 |
-
|
| 176 |
-
|
| 177 |
-
|
| 178 |
-
|
| 179 |
-
|
|
|
|
| 180 |
|
| 181 |
evidence_sources.append(EvidenceSource(
|
| 182 |
title=art.get("title", ""),
|
| 183 |
url=art.get("url", ""),
|
| 184 |
similarity=sim,
|
| 185 |
stance=stance,
|
|
|
|
| 186 |
domain_tier=tier or DomainTier.SUSPICIOUS,
|
| 187 |
published_at=art.get("publishedAt"),
|
| 188 |
source_name=art.get("source", {}).get("name"),
|
|
@@ -208,6 +227,7 @@ async def run_verification(
|
|
| 208 |
evidence_score=round(evidence_score, 1),
|
| 209 |
sources=evidence_sources,
|
| 210 |
claim_used=claim_result.claim,
|
|
|
|
| 211 |
)
|
| 212 |
|
| 213 |
# ββ Step 9: Final Score βββββββββββββββββββββββββββββββββββββββββββββββββββ
|
|
|
|
| 81 |
from nlp.clickbait import ClickbaitDetector
|
| 82 |
from nlp.claim_extractor import ClaimExtractor
|
| 83 |
from evidence.news_fetcher import fetch_evidence, compute_similarity
|
| 84 |
+
from evidence.stance_detector import detect_stance as _detect_stance
|
| 85 |
|
| 86 |
# ββ Step 1: Preprocess ββββββββββββββββββββββββββββββββββββββββββββββββββββ
|
| 87 |
preprocessor = _get_nlp("preprocessor", TextPreprocessor)
|
|
|
|
| 104 |
claim_result = claim_extractor.extract(proc.cleaned)
|
| 105 |
|
| 106 |
# ββ Step 7: Layer 1 β ML Classifier ββββββββββββββββββββββββββββββββββββββ
|
| 107 |
+
# Priority: Ensemble (XLM-R + Tagalog-RoBERTa) β XLM-R alone β TF-IDF.
|
| 108 |
+
# Tagalog-RoBERTa requires its own fine-tuned checkpoint; if missing the
|
| 109 |
+
# engine silently falls back to XLM-R only without breaking anything.
|
| 110 |
+
model_tier = "tfidf"
|
| 111 |
+
classifier = None
|
| 112 |
try:
|
| 113 |
from ml.xlm_roberta_classifier import XLMRobertaClassifier, ModelNotFoundError
|
| 114 |
+
from ml.tagalog_roberta_classifier import TagalogRobertaClassifier
|
| 115 |
+
from ml.ensemble_classifier import EnsembleClassifier
|
| 116 |
+
|
| 117 |
+
xlmr = _get_nlp("xlmr_classifier", XLMRobertaClassifier)
|
| 118 |
+
members = [xlmr]
|
| 119 |
+
model_tier = "xlmr"
|
| 120 |
+
|
| 121 |
+
try:
|
| 122 |
+
tl = _get_nlp("tagalog_classifier", TagalogRobertaClassifier)
|
| 123 |
+
members.append(tl)
|
| 124 |
+
model_tier = "ensemble"
|
| 125 |
+
except ModelNotFoundError:
|
| 126 |
+
logger.info("Tagalog-RoBERTa checkpoint not found β using XLM-R only")
|
| 127 |
+
except Exception as exc:
|
| 128 |
+
logger.warning("Tagalog-RoBERTa load failed (%s) β using XLM-R only", exc)
|
| 129 |
+
|
| 130 |
+
classifier = EnsembleClassifier(members)
|
| 131 |
+
|
| 132 |
except ModelNotFoundError:
|
| 133 |
logger.info("XLM-RoBERTa checkpoint not found β falling back to TF-IDF baseline")
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 134 |
except Exception as exc:
|
| 135 |
logger.warning("XLM-RoBERTa load failed (%s) β falling back to TF-IDF", exc)
|
| 136 |
+
|
| 137 |
+
if classifier is None:
|
| 138 |
from ml.tfidf_classifier import TFIDFClassifier
|
| 139 |
+
def _make_tfidf():
|
| 140 |
c = TFIDFClassifier(); c.train(); return c
|
| 141 |
classifier = _get_nlp("tfidf_classifier", _make_tfidf)
|
|
|
|
| 142 |
|
| 143 |
l1 = classifier.predict(proc.cleaned)
|
| 144 |
logger.debug("Layer-1 (%s): %s %.1f%%", model_tier, l1.verdict, l1.confidence)
|
|
|
|
| 153 |
verdict=Verdict(l1.verdict),
|
| 154 |
confidence=l1.confidence,
|
| 155 |
triggered_features=l1.triggered_features,
|
| 156 |
+
model_tier=model_tier,
|
| 157 |
)
|
| 158 |
|
| 159 |
# ββ Step 8: Layer 2 β Evidence Retrieval ββββββββββββββββββββββββββββββββββ
|
|
|
|
| 187 |
domain = (art.get("source", {}) or {}).get("name", "unknown").lower()
|
| 188 |
tier = get_domain_tier(domain)
|
| 189 |
|
| 190 |
+
stance_result = _detect_stance(
|
| 191 |
+
claim=claim_result.claim,
|
| 192 |
+
article_title=art.get("title", ""),
|
| 193 |
+
article_description=art.get("description", "") or "",
|
| 194 |
+
article_url=art.get("url", ""),
|
| 195 |
+
similarity=sim,
|
| 196 |
+
)
|
| 197 |
+
stance = Stance(stance_result.stance.value)
|
| 198 |
|
| 199 |
evidence_sources.append(EvidenceSource(
|
| 200 |
title=art.get("title", ""),
|
| 201 |
url=art.get("url", ""),
|
| 202 |
similarity=sim,
|
| 203 |
stance=stance,
|
| 204 |
+
stance_reason=stance_result.reason,
|
| 205 |
domain_tier=tier or DomainTier.SUSPICIOUS,
|
| 206 |
published_at=art.get("publishedAt"),
|
| 207 |
source_name=art.get("source", {}).get("name"),
|
|
|
|
| 227 |
evidence_score=round(evidence_score, 1),
|
| 228 |
sources=evidence_sources,
|
| 229 |
claim_used=claim_result.claim,
|
| 230 |
+
claim_method=claim_result.method,
|
| 231 |
)
|
| 232 |
|
| 233 |
# ββ Step 9: Final Score βββββββββββββββββββββββββββββββββββββββββββββββββββ
|
|
@@ -0,0 +1,409 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""
|
| 2 |
+
Tests for the 5 NLP pipeline improvements:
|
| 3 |
+
1. calamanCy NER fallback chain
|
| 4 |
+
2. Tagalog-RoBERTa classifier (ModelNotFoundError)
|
| 5 |
+
3. EnsembleClassifier
|
| 6 |
+
4. EDA augmentation
|
| 7 |
+
5. Sentence-scoring ClaimExtractor
|
| 8 |
+
6. NLI stance detection (Rule 1.5)
|
| 9 |
+
"""
|
| 10 |
+
import sys
|
| 11 |
+
from pathlib import Path
|
| 12 |
+
from unittest.mock import patch, MagicMock
|
| 13 |
+
|
| 14 |
+
import pytest
|
| 15 |
+
|
| 16 |
+
# Ensure project root is on path
|
| 17 |
+
sys.path.insert(0, str(Path(__file__).parent.parent))
|
| 18 |
+
|
| 19 |
+
|
| 20 |
+
# ββ Helpers βββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ
|
| 21 |
+
|
| 22 |
+
def _make_sample(text: str, label: int = 0):
|
| 23 |
+
from ml.dataset import Sample
|
| 24 |
+
return Sample(text=text, label=label)
|
| 25 |
+
|
| 26 |
+
|
| 27 |
+
# ββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ
|
| 28 |
+
# Part 1 β EDA Augmentation
|
| 29 |
+
# ββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ
|
| 30 |
+
|
| 31 |
+
class TestEDAugmentation:
|
| 32 |
+
def test_empty_input_returns_empty(self):
|
| 33 |
+
from ml.dataset import augment_samples
|
| 34 |
+
assert augment_samples([]) == []
|
| 35 |
+
|
| 36 |
+
def test_augment_produces_two_variants_per_sample(self):
|
| 37 |
+
from ml.dataset import augment_samples
|
| 38 |
+
samples = [_make_sample("DOH confirms 500 new COVID cases today", 0)]
|
| 39 |
+
aug = augment_samples(samples, seed=42)
|
| 40 |
+
# One deletion + one swap variant per sample
|
| 41 |
+
assert len(aug) == 2
|
| 42 |
+
|
| 43 |
+
def test_augmented_labels_match_originals(self):
|
| 44 |
+
from ml.dataset import augment_samples
|
| 45 |
+
samples = [
|
| 46 |
+
_make_sample("Senate passes new bill on health care reform", 0),
|
| 47 |
+
_make_sample("SHOCKING truth about vaccines hidden by government", 2),
|
| 48 |
+
]
|
| 49 |
+
aug = augment_samples(samples, seed=42)
|
| 50 |
+
orig_labels = {s.label for s in samples}
|
| 51 |
+
for a in aug:
|
| 52 |
+
assert a.label in orig_labels
|
| 53 |
+
|
| 54 |
+
def test_short_samples_skipped(self):
|
| 55 |
+
from ml.dataset import augment_samples
|
| 56 |
+
samples = [
|
| 57 |
+
_make_sample("ok", 1), # 1 word β too short
|
| 58 |
+
_make_sample("fake news", 2), # 2 words β too short
|
| 59 |
+
]
|
| 60 |
+
aug = augment_samples(samples, seed=42)
|
| 61 |
+
assert aug == []
|
| 62 |
+
|
| 63 |
+
def test_augmented_texts_differ_from_original(self):
|
| 64 |
+
from ml.dataset import augment_samples
|
| 65 |
+
original = "GRABE sinabi ng DOH na 200 bata ang nagkasakit sa bagong virus"
|
| 66 |
+
samples = [_make_sample(original, 2)]
|
| 67 |
+
aug = augment_samples(samples, seed=99)
|
| 68 |
+
# At least one variant should differ
|
| 69 |
+
assert any(a.text != original for a in aug)
|
| 70 |
+
|
| 71 |
+
def test_augment_triples_training_set_size(self):
|
| 72 |
+
from ml.dataset import get_split, augment_samples
|
| 73 |
+
train, _ = get_split()
|
| 74 |
+
aug = augment_samples(train, seed=42)
|
| 75 |
+
# aug should be at most 2Γ train size (some short samples may be skipped)
|
| 76 |
+
assert len(aug) >= len(train)
|
| 77 |
+
assert len(aug) <= 2 * len(train)
|
| 78 |
+
|
| 79 |
+
def test_augmented_samples_are_non_empty(self):
|
| 80 |
+
from ml.dataset import augment_samples
|
| 81 |
+
samples = [_make_sample("The senator confirmed signing the new law today", 0)]
|
| 82 |
+
aug = augment_samples(samples, seed=42)
|
| 83 |
+
for a in aug:
|
| 84 |
+
assert len(a.text.strip()) > 0
|
| 85 |
+
|
| 86 |
+
|
| 87 |
+
# ββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ
|
| 88 |
+
# Part 2 β Sentence-scoring ClaimExtractor
|
| 89 |
+
# ββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ
|
| 90 |
+
|
| 91 |
+
class TestClaimExtractor:
|
| 92 |
+
def test_instantiates_without_loading_model(self):
|
| 93 |
+
"""New ClaimExtractor has no lazy model loading at all."""
|
| 94 |
+
from nlp.claim_extractor import ClaimExtractor
|
| 95 |
+
ce = ClaimExtractor()
|
| 96 |
+
# No _pipe, no _loaded attributes
|
| 97 |
+
assert not hasattr(ce, '_pipe')
|
| 98 |
+
assert not hasattr(ce, '_loaded')
|
| 99 |
+
|
| 100 |
+
def test_passthrough_for_short_text(self):
|
| 101 |
+
from nlp.claim_extractor import ClaimExtractor
|
| 102 |
+
result = ClaimExtractor().extract("hi")
|
| 103 |
+
assert result.method == "passthrough"
|
| 104 |
+
assert result.claim == "hi"
|
| 105 |
+
|
| 106 |
+
def test_sentence_scoring_method_on_informative_sentence(self):
|
| 107 |
+
from nlp.claim_extractor import ClaimExtractor
|
| 108 |
+
# Has a date, a verb, and named org β should score high
|
| 109 |
+
text = "GRABE! Sinabi ng DOH noong Martes na 200 bata ang nagkasakit sa bagong virus sa Maynila."
|
| 110 |
+
result = ClaimExtractor().extract(text)
|
| 111 |
+
# Should pick the DOH sentence, not all text or just "GRABE!"
|
| 112 |
+
assert result.method == "sentence_scoring"
|
| 113 |
+
assert "DOH" in result.claim or "200" in result.claim
|
| 114 |
+
|
| 115 |
+
def test_heuristic_fallback_when_no_scored_sentences(self):
|
| 116 |
+
from nlp.claim_extractor import ClaimExtractor
|
| 117 |
+
# Text with no dates, no numbers, no verbs
|
| 118 |
+
text = "Wow amazing incredible unbelievable spectacular incomprehensible."
|
| 119 |
+
result = ClaimExtractor().extract(text)
|
| 120 |
+
assert result.method in ("sentence_heuristic", "sentence_scoring")
|
| 121 |
+
|
| 122 |
+
def test_returns_claim_result_dataclass(self):
|
| 123 |
+
from nlp.claim_extractor import ClaimExtractor, ClaimResult
|
| 124 |
+
result = ClaimExtractor().extract("The president signed the new healthcare law today.")
|
| 125 |
+
assert isinstance(result, ClaimResult)
|
| 126 |
+
assert isinstance(result.claim, str)
|
| 127 |
+
assert isinstance(result.method, str)
|
| 128 |
+
|
| 129 |
+
def test_picks_specific_sentence_over_clickbait_opener(self):
    """The specific factual sentence (5,000 deaths) should beat the "OMG!" opener."""
    from nlp.claim_extractor import ClaimExtractor

    post = "OMG! Natuklasan ng mga siyentipiko na 5,000 tao ang namatay dahil sa bagong sakit ngayong Enero."
    extracted = ClaimExtractor().extract(post)
    assert (
        "5,000" in extracted.claim
        or "siyentipiko" in extracted.claim
        or extracted.method == "sentence_scoring"
    )
# ββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ
|
| 138 |
+
# Part 3 β TagalogRobertaClassifier
|
| 139 |
+
# ββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ
|
| 140 |
+
|
| 141 |
+
class TestTagalogRobertaClassifier:
    """Checkpoint-loading errors and error-class sharing for the Tagalog model."""

    def test_raises_model_not_found_when_checkpoint_missing(self, tmp_path, monkeypatch):
        """ModelNotFoundError raised when checkpoint directory doesn't exist."""
        import ml.tagalog_roberta_classifier as mod

        # Point the module at a directory that was never created.
        monkeypatch.setattr(mod, "MODEL_DIR", tmp_path / "nonexistent_model")
        with pytest.raises(mod.ModelNotFoundError):
            mod.TagalogRobertaClassifier()

    def test_model_not_found_is_subclass_of_file_not_found(self):
        """The error can also be caught as a plain FileNotFoundError."""
        from ml.xlm_roberta_classifier import ModelNotFoundError

        assert issubclass(ModelNotFoundError, FileNotFoundError)

    def test_shares_same_model_not_found_error(self):
        """Engine catches ModelNotFoundError from xlm_roberta_classifier;
        the tagalog module re-uses the very same class, so one except clause
        handles both models."""
        from ml.tagalog_roberta_classifier import ModelNotFoundError as tagalog_err
        from ml.xlm_roberta_classifier import ModelNotFoundError as xlm_err

        assert tagalog_err is xlm_err
+
# ββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ
|
| 162 |
+
# Part 4 β EnsembleClassifier
|
| 163 |
+
# ββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ
|
| 164 |
+
|
| 165 |
+
class TestEnsembleClassifier:
    """Probability averaging and failure handling in EnsembleClassifier.

    Fix: four test methods carried a dead `import torch` — torch is only
    needed inside `_make_stub`, so the redundant imports are removed.
    """

    def _make_stub(self, probs_list: list[float]):
        """Return a stub classifier whose predict_probs returns fixed probabilities."""
        import torch

        stub = MagicMock()
        # predict_probs contract observed here: (probability tensor, extra, extra).
        stub.predict_probs.return_value = (
            torch.tensor(probs_list, dtype=torch.float32),
            None,
            None,
        )
        stub._salient_tokens = MagicMock(return_value=["token1"])
        return stub

    def test_raises_value_error_for_empty_list(self):
        """An ensemble with no member classifiers is rejected at construction."""
        from ml.ensemble_classifier import EnsembleClassifier

        with pytest.raises(ValueError):
            EnsembleClassifier([])

    def test_single_classifier_returns_its_prediction(self):
        """With one member, the ensemble echoes that member's verdict/confidence."""
        from ml.ensemble_classifier import EnsembleClassifier

        stub = self._make_stub([0.7, 0.2, 0.1])
        ens = EnsembleClassifier([stub])
        result = ens.predict("any text")
        assert result.verdict == "Credible"
        assert abs(result.confidence - 70.0) < 1.0

    def test_two_classifiers_averages_probabilities(self):
        """Member probability vectors are averaged element-wise before arg-max."""
        from ml.ensemble_classifier import EnsembleClassifier

        # First:   [0.8, 0.1, 0.1] -> Credible 80%
        # Second:  [0.4, 0.5, 0.1] -> Unverified 50%
        # Average: [0.6, 0.3, 0.1] -> Credible 60%
        stub1 = self._make_stub([0.8, 0.1, 0.1])
        stub2 = self._make_stub([0.4, 0.5, 0.1])
        ens = EnsembleClassifier([stub1, stub2])
        result = ens.predict("test text")
        assert result.verdict == "Credible"
        assert abs(result.confidence - 60.0) < 1.5

    def test_failing_classifier_gracefully_skipped(self):
        """A member whose predict_probs raises is skipped, not fatal."""
        from ml.ensemble_classifier import EnsembleClassifier

        good = self._make_stub([0.1, 0.1, 0.8])  # Likely Fake
        bad = MagicMock()
        bad.predict_probs.side_effect = RuntimeError("model failed")
        ens = EnsembleClassifier([good, bad])
        result = ens.predict("test text")
        # Should still get a result from the good classifier
        assert result.verdict == "Likely Fake"

    def test_all_classifiers_failing_returns_unverified_neutral(self):
        """When every member fails, the ensemble returns a neutral Unverified."""
        from ml.ensemble_classifier import EnsembleClassifier

        bad = MagicMock()
        bad.predict_probs.side_effect = RuntimeError("fail")
        ens = EnsembleClassifier([bad])
        result = ens.predict("test")
        assert result.verdict == "Unverified"
        assert result.confidence == 33.3

    def test_result_has_correct_type(self):
        """predict() returns a Layer1Result with a list of triggered features."""
        from ml.ensemble_classifier import EnsembleClassifier
        from ml.xlm_roberta_classifier import Layer1Result

        stub = self._make_stub([0.5, 0.3, 0.2])
        ens = EnsembleClassifier([stub])
        result = ens.predict("test")
        assert isinstance(result, Layer1Result)
        assert isinstance(result.triggered_features, list)
+
# ββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ
|
| 237 |
+
# Part 5 β NLI Stance Detection
|
| 238 |
+
# ββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ
|
| 239 |
+
|
| 240 |
+
class TestNLIStanceDetector:
    """NLI-backed stance detection plus its keyword-rule fallback path."""

    def _reset_nli_cache(self):
        """Reset the module-level NLI singleton between tests."""
        import evidence.stance_detector as mod

        mod._nli_pipe = None
        mod._nli_loaded = False

    def test_falls_through_to_keywords_when_nli_unavailable(self):
        """When NLI model can't be loaded, keyword rules still work."""
        import evidence.stance_detector as mod
        from evidence.stance_detector import Stance

        self._reset_nli_cache()
        with patch.object(mod, '_get_nli', return_value=None):
            result = mod.detect_stance(
                claim="Vaccines are safe",
                article_title="Fact check: COVID vaccines proven effective",
                article_description="Experts confirm vaccines are safe and effective after extensive testing.",
                article_url="",
                similarity=0.7,
            )
        # "confirmed" in article -> Supports keyword rule; above all, no crash.
        assert result.stance in (Stance.SUPPORTS, Stance.NOT_ENOUGH_INFO, Stance.REFUTES)

    def test_nli_supports_high_confidence(self):
        """When NLI returns 'supports' at >= 0.65, stance is SUPPORTS with NLI reason."""
        import evidence.stance_detector as mod
        from evidence.stance_detector import Stance

        self._reset_nli_cache()
        fake_pipe = MagicMock()
        fake_pipe.return_value = {
            "labels": ["supports the claim", "contradicts the claim", "unrelated"],
            "scores": [0.82, 0.12, 0.06],
        }
        with patch.object(mod, '_get_nli', return_value=fake_pipe):
            result = mod.detect_stance(
                claim="Government confirmed 500 new cases",
                article_title="Government says 500 new cases recorded",
                article_description="Officials confirmed today that 500 new cases were recorded nationwide.",
                similarity=0.75,
            )
        assert result.stance == Stance.SUPPORTS
        assert "NLI" in result.reason

    def test_nli_contradicts_high_confidence(self):
        """When NLI returns 'contradicts' at >= 0.65, stance is REFUTES with NLI reason."""
        import evidence.stance_detector as mod
        from evidence.stance_detector import Stance

        self._reset_nli_cache()
        fake_pipe = MagicMock()
        fake_pipe.return_value = {
            "labels": ["contradicts the claim", "supports the claim", "unrelated"],
            "scores": [0.78, 0.15, 0.07],
        }
        with patch.object(mod, '_get_nli', return_value=fake_pipe):
            result = mod.detect_stance(
                claim="There is no evidence of fraud",
                article_title="Evidence of widespread fraud found",
                article_description="Investigators found extensive evidence of fraud in the election.",
                similarity=0.6,
            )
        assert result.stance == Stance.REFUTES
        assert "NLI" in result.reason

    def test_nli_low_confidence_falls_through_to_keywords(self):
        """NLI confidence below 0.65 falls through to the keyword rules."""
        import evidence.stance_detector as mod
        from evidence.stance_detector import Stance

        self._reset_nli_cache()
        fake_pipe = MagicMock()
        fake_pipe.return_value = {
            "labels": ["supports the claim", "contradicts the claim", "unrelated"],
            "scores": [0.45, 0.35, 0.20],  # below 0.65 threshold
        }
        with patch.object(mod, '_get_nli', return_value=fake_pipe):
            result = mod.detect_stance(
                claim="Senator is guilty of corruption",
                article_title="Fact check: False claim about senator",
                article_description="This claim has been debunked by multiple fact-checkers.",
                similarity=0.5,
            )
        # Keyword "debunked" should trigger REFUTES
        assert result.stance == Stance.REFUTES

    def test_short_description_skips_nli(self):
        """Article description shorter than 30 chars: NLI skipped, no error."""
        import evidence.stance_detector as mod

        self._reset_nli_cache()
        fake_pipe = MagicMock()
        with patch.object(mod, '_get_nli', return_value=fake_pipe):
            mod.detect_stance(
                claim="Some claim",
                article_title="Short article",
                article_description="Short.",  # <30 chars
                similarity=0.5,
            )
        # The NLI pipeline must never have been invoked.
        fake_pipe.assert_not_called()
+
# ββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ
|
| 341 |
+
# Part 6 β calamanCy NER Fallback Chain
|
| 342 |
+
# ββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ
|
| 343 |
+
|
| 344 |
+
class TestCalamanCyNERFallback:
|
| 345 |
+
def _fresh_extractor(self):
    """Reload nlp.ner and hand back a brand-new, not-yet-loaded EntityExtractor."""
    import importlib
    import nlp.ner as ner_module

    importlib.reload(ner_module)
    return ner_module.EntityExtractor()
def test_falls_back_to_spacy_when_calamancy_missing(self, monkeypatch):
    """When calamancy import fails, _load_model falls back to spaCy (or None).

    The real loader is replaced with a stand-in that simulates an
    ImportError from calamancy and then attempts the spaCy fallback,
    mirroring the production fallback chain.

    Fix: removed the dead local ``original_load`` (it was assigned from
    ``extractor._load_model.__func__`` but never used).  The ``monkeypatch``
    fixture parameter is kept to preserve the test signature.
    """
    import types

    import nlp.ner as mod

    extractor = mod.EntityExtractor()
    extractor._loaded = False  # force reload

    def patched_load(self):
        # Simulate "calamancy not installed", then try the spaCy fallback.
        self._loaded = True
        try:
            raise ImportError("No module named 'calamancy'")
        except ImportError:
            try:
                import spacy
                self._nlp = spacy.load("en_core_web_sm")
            except Exception:
                self._nlp = None

    extractor._load_model = types.MethodType(patched_load, extractor)
    extractor._load_model()
    # Either spaCy loaded successfully or we fell back to _nlp = None;
    # in both cases the extractor must mark itself as loaded.
    assert extractor._loaded is True
def test_hint_based_fallback_when_both_unavailable(self):
    """When both calamancy and spaCy fail, hint-based NER still works."""
    import nlp.ner as mod

    extractor = mod.EntityExtractor()
    extractor._loaded = True
    extractor._nlp = None  # force hint-based path

    entities = extractor.extract("Sinabi ni Marcos sa Davao tungkol sa DOH")
    for bucket in (entities.persons, entities.organizations, entities.locations):
        assert isinstance(bucket, list)
    # The hint tables should at least pick up "Marcos" as a person.
    assert any("Marcos" in person for person in entities.persons)
def test_ner_result_method_reflects_path(self):
    """method field on NERResult reflects which extraction path was used."""
    import nlp.ner as mod

    extractor = mod.EntityExtractor()
    extractor._loaded = True
    extractor._nlp = None

    outcome = extractor._hint_based_extract("Marcos is in Manila with DOH")
    assert outcome.method == "hints"
def test_extract_with_no_model_returns_ner_result(self):
|
| 403 |
+
from nlp.ner import EntityExtractor, NERResult
|
| 404 |
+
e = EntityExtractor()
|
| 405 |
+
e._loaded = True
|
| 406 |
+
e._nlp = None
|
| 407 |
+
result = e.extract("DOH confirmed 500 cases in Cebu on January 2026")
|
| 408 |
+
assert isinstance(result, NERResult)
|
| 409 |
+
assert len(result.dates) > 0 # Should find "January 2026"
|