File size: 12,192 Bytes
b1c84b5
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
"""
ml/data_sources/base.py
Abstract base class and shared utilities for PhilVerify data source adapters.

Provides:
  - NormalizedSample  : canonical dataclass for all ingested samples
  - DataSource        : ABC that every source adapter must implement
  - clean_text        : HTML-strip + Unicode normalization + whitespace collapse
  - detect_language   : langdetect wrapper returning "tl" / "en" / "mixed"
  - domain_to_credibility_score : looks up domain tier from domain_credibility.json
  - binary_to_three_class       : maps raw dataset labels to {0, 1, 2}

Label schema
------------
  0 β†’ Credible
  1 β†’ Unverified
  2 β†’ Likely Fake
"""

from __future__ import annotations

import json
import logging
import re
import unicodedata
from abc import ABC, abstractmethod
from dataclasses import dataclass, field
from pathlib import Path
from typing import ClassVar

logger = logging.getLogger(__name__)

# ---------------------------------------------------------------------------
# Default path: ml/data_sources/ β†’ ml/ β†’ PhilVerify/ β†’ domain_credibility.json
# ---------------------------------------------------------------------------
# Three ``.parent`` hops climb from this file (ml/data_sources/base.py) up to
# the project root, where domain_credibility.json is expected to live.
_DEFAULT_CREDIBILITY_JSON: Path = (
    Path(__file__).parent.parent.parent / "domain_credibility.json"
)

# Module-level cache so the JSON file is only read from disk once per process.
# Keyed by str(path), so alternate JSON locations get independent entries.
_credibility_cache: dict[str, dict] = {}


# ---------------------------------------------------------------------------
# Dataclass
# ---------------------------------------------------------------------------


@dataclass
class NormalizedSample:
    """One article or headline mapped onto PhilVerify's canonical schema.

    Attributes
    ----------
    text:
        Cleaned article text or headline.
    label:
        Integer class in {0, 1, 2} (Credible / Unverified / Likely Fake).
    source:
        Dataset identifier, e.g. ``"jcblaise/fake_news_filipino"``.
    language:
        Language code: ``"tl"``, ``"en"``, or ``"mixed"``.
    original_label:
        Raw upstream label string (e.g. ``"fake"``, ``"real"``,
        ``"pants-fire"``), preserved for debugging and auditing.
    confidence:
        How confident the label mapping is, in [0.0, 1.0].  Defaults to
        ``1.0`` for unambiguous remappings; heuristic or model-assisted
        mappings should use lower values.
    """

    text: str
    label: int
    source: str
    language: str
    original_label: str
    confidence: float = 1.0

    def __post_init__(self) -> None:
        # Validate eagerly so a malformed sample fails at construction time.
        if self.label not in (0, 1, 2):
            raise ValueError(
                f"label must be 0, 1, or 2; got {self.label!r}"
            )
        if not 0.0 <= self.confidence <= 1.0:
            raise ValueError(
                f"confidence must be in [0.0, 1.0]; got {self.confidence!r}"
            )


# ---------------------------------------------------------------------------
# Abstract base class
# ---------------------------------------------------------------------------


class DataSource(ABC):
    """Base interface for PhilVerify data source adapters.

    A concrete adapter implements :attr:`source_name` and :meth:`fetch`;
    consumers should call :meth:`load`, which adds progress logging and
    converts any fetch failure into an empty result.

    Class Attributes
    ----------------
    LABEL_NAMES:
        Human-readable name for each integer label.
    """

    LABEL_NAMES: ClassVar[dict[int, str]] = {
        0: "Credible",
        1: "Unverified",
        2: "Likely Fake",
    }

    # -- Abstract interface --------------------------------------------------

    @property
    @abstractmethod
    def source_name(self) -> str:
        """Stable, unique identifier for this data source.

        Use ``"<owner>/<dataset>"`` for HuggingFace datasets (for example
        ``"jcblaise/fake_news_filipino"``), or a descriptive slug for
        scraped / local sources.
        """

    @abstractmethod
    def fetch(self) -> list[NormalizedSample]:
        """Download or load the raw data and return normalized samples.

        May perform network I/O.  Exceptions must be allowed to
        propagate; catching them is :meth:`load`'s responsibility.

        Returns
        -------
        list[NormalizedSample]
            Every sample extracted from this source after normalization.
        """

    # -- Concrete helpers ----------------------------------------------------

    def load(self) -> list[NormalizedSample]:
        """Invoke :meth:`fetch` with logging and graceful failure handling.

        A failing source yields ``[]`` instead of raising, so a single
        broken adapter cannot abort a multi-source pipeline.

        Returns
        -------
        list[NormalizedSample]
            Normalized samples, or ``[]`` on failure.
        """
        logger.info("Loading data source: %s", self.source_name)
        try:
            # Success log stays inside the try so any unexpected error
            # (including from a misbehaving fetch result) is still caught.
            samples = self.fetch()
            logger.info(
                "Loaded %d samples from %s", len(samples), self.source_name
            )
            return samples
        except Exception:  # noqa: BLE001
            logger.warning(
                "Failed to load data source '%s'. Returning empty list.",
                self.source_name,
                exc_info=True,
            )
            return []


# ---------------------------------------------------------------------------
# NLP utility functions
# ---------------------------------------------------------------------------


# Precompiled patterns: tag stripper and whitespace collapser.
_HTML_TAG_RE = re.compile(r"<[^>]+>", re.UNICODE)
_WHITESPACE_RE = re.compile(r"\s+", re.UNICODE)
# Cleaned results shorter than this are treated as empty.
_MIN_TEXT_LENGTH = 10


def clean_text(text: str) -> str:
    """Normalize raw (possibly HTML-laden) text for downstream tokenization.

    The pipeline, in order:

    1. Replace HTML / XML tags with spaces via regex (no third-party HTML
       parser needed).
    2. Apply Unicode NFC normalization (canonical composition of combining
       characters).
    3. Collapse runs of whitespace (spaces, tabs, newlines) to a single
       ASCII space.
    4. Trim leading and trailing whitespace.
    5. Return ``""`` if the result is shorter than ``_MIN_TEXT_LENGTH``
       characters, to avoid feeding near-empty strings to the model.

    Parameters
    ----------
    text:
        Raw text, possibly containing HTML markup.

    Returns
    -------
    str
        Cleaned text, or ``""`` if the input is empty or the cleaned
        result is too short.
    """
    if not text:
        return ""

    without_tags = _HTML_TAG_RE.sub(" ", text)
    normalized = unicodedata.normalize("NFC", without_tags)
    collapsed = _WHITESPACE_RE.sub(" ", normalized).strip()

    return collapsed if len(collapsed) >= _MIN_TEXT_LENGTH else ""


def detect_language(text: str) -> str:
    """Classify *text* as Tagalog, English, or mixed/other.

    Delegates to ``langdetect`` when it is installed in the environment.

    Returns
    -------
    str
        ``"tl"`` for Filipino/Tagalog, ``"en"`` for English, and
        ``"mixed"`` for any other detected language, on detection
        failure, or when ``langdetect`` is unavailable.
    """
    # Import lazily so the module works without langdetect installed.
    try:
        from langdetect import detect  # type: ignore[import-untyped]
        from langdetect.lang_detect_exception import (  # type: ignore[import-untyped]
            LangDetectException,
        )
    except ImportError:
        logger.warning(
            "langdetect is not installed; defaulting language to 'mixed'."
        )
        return "mixed"

    try:
        code = detect(text)
    except LangDetectException:
        return "mixed"

    return code if code in ("tl", "en") else "mixed"


def domain_to_credibility_score(
    domain: str,
    credibility_json_path: Path = _DEFAULT_CREDIBILITY_JSON,
) -> int:
    """Return the credibility tier score for *domain*.

    The tier lists come from ``domain_credibility.json``, read from disk
    once per path and cached in ``_credibility_cache`` afterwards.

    +---------+-------+---------------------------+
    | Tier    | Score | Meaning                   |
    +=========+=======+===========================+
    | tier1   |   100 | High-credibility outlet   |
    +---------+-------+---------------------------+
    | tier2   |    50 | Mainstream / mid-tier     |
    +---------+-------+---------------------------+
    | tier3   |    25 | Low-credibility           |
    +---------+-------+---------------------------+
    | tier4   |     0 | Known misinformation site |
    +---------+-------+---------------------------+
    | unknown |    50 | Domain not found (default)|
    +---------+-------+---------------------------+

    Parameters
    ----------
    domain:
        Bare domain name, e.g. ``"rappler.com"``.
    credibility_json_path:
        Path to ``domain_credibility.json``.  Defaults to the file at the
        PhilVerify project root.

    Returns
    -------
    int
        Credibility score for the domain.
    """
    cache_key = str(credibility_json_path)

    if cache_key not in _credibility_cache:
        try:
            with credibility_json_path.open(encoding="utf-8") as handle:
                _credibility_cache[cache_key] = json.load(handle)
        except (FileNotFoundError, json.JSONDecodeError):
            # A missing/corrupt file is cached as {} so every domain falls
            # through to the default score without re-reading the disk.
            logger.warning(
                "Could not load domain_credibility.json from %s; "
                "all domains will receive a default score of 50.",
                credibility_json_path,
            )
            _credibility_cache[cache_key] = {}

    tier_lists: dict = _credibility_cache[cache_key]

    # Check tier membership from most to least credible.
    for tier_name, tier_score in (
        ("tier1", 100),
        ("tier2", 50),
        ("tier3", 25),
        ("tier4", 0),
    ):
        if domain in tier_lists.get(tier_name, []):
            return tier_score

    # Domain not found β†’ treat as tier2 / unknown
    return 50


# Canonical lowercase label sets, hoisted so they are built once per process
# rather than on every call.  Inputs are lowercased before lookup, so
# "FALSE", "False", and "false" all match.
_FAKE_LABELS: frozenset[str] = frozenset({"fake", "0", "false", "pants-fire"})
_TRUE_LABELS: frozenset[str] = frozenset({"real", "1", "true"})


def binary_to_three_class(
    raw_label: str,
    domain: str | None,
    credibility_json_path: Path | None = None,
) -> int:
    """Map a raw dataset label string to PhilVerify's three-class schema.

    Matching is case-insensitive and ignores surrounding whitespace (a
    backward-compatible generalization: every previously accepted label
    maps exactly as before).

    Label mapping rules
    -------------------
    * ``"fake"`` / ``"0"`` / ``"false"`` / ``"pants-fire"``
      β†’ **2** (Likely Fake)

    * ``"real"`` / ``"1"`` / ``"true"``
      β†’ credibility-aware decision:

      - domain score β‰₯ 40 β†’ **0** (Credible)
      - domain score <  40 β†’ **1** (Unverified, low-credibility domain)

    * ``"mostly-true"``
      β†’ **0** (Credible)

    * ``"half-true"`` / ``"barely-true"``
      β†’ **1** (Unverified)

    * *anything else*
      β†’ **1** (Unverified, safe default)

    Parameters
    ----------
    raw_label:
        The label string exactly as it appears in the upstream dataset.
    domain:
        The publisher domain used for credibility lookup when the raw label
        indicates truth.  Pass ``None`` to skip domain lookup (score β†’ 50).
    credibility_json_path:
        Path to ``domain_credibility.json``.  ``None`` (the default)
        resolves lazily to the file at the PhilVerify project root.

    Returns
    -------
    int
        An integer in ``{0, 1, 2}``.
    """
    label = raw_label.strip().lower()

    if label in _FAKE_LABELS:
        return 2

    if label in _TRUE_LABELS:
        if domain:
            path = credibility_json_path or _DEFAULT_CREDIBILITY_JSON
            score = domain_to_credibility_score(domain, path)
        else:
            score = 50  # neutral default when no domain is available

        # The original spec mapped both the >=75 and >=40 score bands to
        # Credible, so a single threshold expresses the same decision.
        return 0 if score >= 40 else 1

    if label == "mostly-true":
        return 0

    if label in {"half-true", "barely-true"}:
        return 1

    # Default: treat as Unverified
    return 1