philverify-api / ml /data_sources /isot_dataset.py
Ryan Christian D. Deniega
fix: cold start 502, favicon, verify state persistence
b1c84b5
"""
ml/data_sources/isot_dataset.py
================================
PhilVerify adapter for the ISOT Fake News Dataset.
The ISOT dataset consists of two CSV files (``True.csv`` and ``Fake.csv``)
that must be placed locally by the user. ``True.csv`` contains Reuters
articles labelled as real; ``Fake.csv`` contains articles flagged as
fabricated. This adapter maps those two classes to PhilVerify's three-class
schema (class 1 / Unverified is intentionally absent from this binary source).
Label mapping
-------------
True.csv → 0 Credible (confidence 1.00)
Fake.csv → 2 Likely Fake (confidence 1.00)
Usage
-----
from ml.data_sources.isot_dataset import ISOTDataset
from pathlib import Path
ds = ISOTDataset() # default data_dir
# ds = ISOTDataset(data_dir=Path("/custom/path"))
samples = ds.fetch()
Download
--------
https://www.kaggle.com/datasets/clmentbisaillon/fake-and-real-news-dataset
Place True.csv and Fake.csv in:
ml/data/raw/isot/
References
----------
Ahmed H., Traore I., Saad S. (2018).
Detecting opinion spam and fake news using text n-gram features.
Digital Investigation, 27, 244–258.
"""
from __future__ import annotations
import logging
import random
from pathlib import Path
from typing import Optional
from tqdm import tqdm
from .base import DataSource, NormalizedSample, clean_text
try:
import pandas as pd # type: ignore[import-untyped]
except ImportError: # pragma: no cover
pd = None # type: ignore[assignment] # lazy; guarded in _load_csv
logger = logging.getLogger(__name__)
# ---------------------------------------------------------------------------
# Constants
# ---------------------------------------------------------------------------
#: Expected CSV filenames inside data_dir (joined onto ``data_dir`` by
#: ``ISOTDataset.fetch``).
_TRUE_CSV = "True.csv"
_FAKE_CSV = "Fake.csv"
#: Kaggle download URL shown in the warning when files are absent.
#: NOTE(review): ``ISOTDataset._auto_download`` fetches the kagglehub handle
#: "csmalarkodi/isot-fake-news-dataset", which is a different Kaggle dataset
#: than this URL points to — confirm both carry the same True.csv / Fake.csv.
_KAGGLE_URL = (
"https://www.kaggle.com/datasets/clmentbisaillon/fake-and-real-news-dataset"
)
#: Minimum cleaned-text length (chars) below which a sample is discarded.
#: Applied after ``clean_text`` in ``ISOTDataset._process_row``.
_MIN_TEXT_LEN = 10
# ---------------------------------------------------------------------------
# Adapter
# ---------------------------------------------------------------------------
class ISOTDataset(DataSource):
    """PhilVerify adapter for the local ISOT Fake News Dataset.

    Parameters
    ----------
    max_samples:
        Total number of samples to return.  The cap is split evenly between
        ``True.csv`` (class 0) and ``Fake.csv`` (class 2): each contributes
        at most ``max(1, max_samples // 2)`` rows, i.e. at least one row per
        class is always requested.  Defaults to ``2000``.
    data_dir:
        Directory that contains ``True.csv`` and ``Fake.csv``.  Defaults to
        ``<project_root>/ml/data/raw/isot/``.

    Examples
    --------
    >>> ds = ISOTDataset(max_samples=200)
    >>> samples = ds.fetch()  # returns [] if CSVs not found
    """

    def __init__(
        self,
        max_samples: int = 2000,
        data_dir: Optional[Path] = None,
    ) -> None:
        self.max_samples = max_samples
        # Path(...) is a no-op for Path inputs and lets callers pass a str.
        self.data_dir: Path = (
            Path(data_dir)
            if data_dir is not None
            else Path(__file__).parent.parent / "data" / "raw" / "isot"
        )

    # -- DataSource interface ------------------------------------------------

    @property
    def source_name(self) -> str:
        """Dataset identifier."""
        return "isot"

    def fetch(self) -> list[NormalizedSample]:
        """Load ISOT CSVs, normalise text, and return capped samples.

        If either CSV is missing, a kagglehub auto-download is attempted
        first.  When the files still cannot be found this method emits a
        warning with download instructions and returns an empty list rather
        than raising an exception.

        Returns
        -------
        list[NormalizedSample]
            Normalised English samples labelled 0 (Credible) or 2 (Likely
            Fake), shuffled with a fixed seed so repeated runs agree.
        """
        true_path = self.data_dir / _TRUE_CSV
        fake_path = self.data_dir / _FAKE_CSV

        # ---- Existence check — try kagglehub auto-download if needed --------
        missing = [p for p in (true_path, fake_path) if not p.is_file()]
        if missing:
            logger.info("[isot] CSV files not found locally — attempting kagglehub auto-download …")
            self._auto_download(self.data_dir)
            # Re-check: the download may have supplied none, one, or both files.
            missing = [p for p in (true_path, fake_path) if not p.is_file()]
            if missing:
                self._warn_missing(missing)
                return []

        # ---- Load, clean, cap ----------------------------------------------
        # max(1, ...) guarantees each class is asked for at least one row.
        per_class_cap = max(1, self.max_samples // 2)

        logger.info("[isot] Loading %s …", true_path)
        print(f"[isot] Loading {true_path} …")
        true_samples = self._load_csv(
            path=true_path,
            label=0,
            original_label="real",
            confidence=1.00,
            cap=per_class_cap,
        )

        logger.info("[isot] Loading %s …", fake_path)
        print(f"[isot] Loading {fake_path} …")
        fake_samples = self._load_csv(
            path=fake_path,
            label=2,
            original_label="fake",
            confidence=1.00,
            cap=per_class_cap,
        )

        print(f"[isot] True.csv → {len(true_samples)} samples (class 0 Credible)")
        print(f"[isot] Fake.csv → {len(fake_samples)} samples (class 2 Likely Fake)")
        logger.info(
            "[isot] Loaded %d credible + %d fake = %d total.",
            len(true_samples),
            len(fake_samples),
            len(true_samples) + len(fake_samples),
        )

        samples = true_samples + fake_samples
        # Interleave the two classes; the fixed seed keeps the order reproducible.
        rng = random.Random(42)
        rng.shuffle(samples)

        self.log_class_distribution(samples)
        return samples

    # -- Private helpers -----------------------------------------------------

    def _load_csv(
        self,
        path: Path,
        label: int,
        original_label: str,
        confidence: float,
        cap: int,
    ) -> list[NormalizedSample]:
        """Read one ISOT CSV and return up to *cap* NormalizedSamples.

        The text fed to the model is the concatenation of the ``title`` and
        ``text`` columns (``"<title> <text>"``).  Rows are shuffled with a
        fixed seed before capping, and at most ``cap * 3`` rows are examined
        so that rows rejected by :meth:`_process_row` can be replaced.

        Parameters
        ----------
        path:
            Path to the CSV file.
        label:
            Integer PhilVerify class for all rows in this file.
        original_label:
            Raw label string preserved in :class:`NormalizedSample`.
        confidence:
            Annotation confidence (1.0 for both ISOT splits).
        cap:
            Maximum number of samples to return from this file.

        Returns
        -------
        list[NormalizedSample]
            Empty when the file cannot be read.

        Raises
        ------
        ImportError
            If ``pandas`` is not installed.
        """
        # Import separately from the read so that only a genuinely missing
        # pandas produces the "install pandas" message.
        try:
            import pandas as _pd  # noqa: PLC0415
        except ImportError as exc:
            raise ImportError(
                "The 'pandas' package is required to load ISOT. "
                "Install it with: pip install pandas"
            ) from exc

        # Any read/parse failure is reported and skips this file instead of
        # aborting the caller.
        try:
            df = _pd.read_csv(path, dtype=str)
        except Exception as exc:  # noqa: BLE001
            logger.error("[isot] Failed to read %s: %s", path, exc)
            print(f"[isot] ERROR: could not read {path}: {exc}")
            return []

        # ---- Validate expected columns -------------------------------------
        for col in ("title", "text"):
            if col not in df.columns:
                logger.warning(
                    "[isot] Expected column '%s' not found in %s. "
                    "Available columns: %s",
                    col,
                    path.name,
                    list(df.columns),
                )
                # Substitute an empty column so row processing still works;
                # rows then rely on the other column alone.
                df[col] = ""

        # ---- Shuffle within class before capping ---------------------------
        rng = random.Random(42)
        indices = list(range(len(df)))
        rng.shuffle(indices)
        df = df.iloc[indices].reset_index(drop=True)

        # ---- Build samples -------------------------------------------------
        samples: list[NormalizedSample] = []
        for _, row in tqdm(
            df.head(cap * 3).iterrows(),  # oversample then trim after filtering
            total=min(cap * 3, len(df)),
            desc=f"[isot] {path.name}",
            leave=False,
        ):
            sample = self._process_row(
                row=row,
                label=label,
                original_label=original_label,
                confidence=confidence,
            )
            if sample is not None:
                samples.append(sample)
            if len(samples) >= cap:
                break
        return samples

    @staticmethod
    def _cell_text(value: object) -> str:
        """Return *value* as stripped text, mapping missing cells to ``""``.

        pandas represents empty CSV cells as NaN (or ``pd.NA``), both of which
        would otherwise be stringified into the sample text.
        """
        if isinstance(value, str):
            return value.strip()
        if value is None:
            return ""
        try:
            # NaN is the one value that compares unequal to itself.
            if value != value:
                return ""
        except (TypeError, ValueError):
            # pd.NA refuses truth-testing; treat it as missing as well.
            return ""
        return str(value).strip()

    @staticmethod
    def _process_row(
        row: "pd.Series",
        label: int,
        original_label: str,
        confidence: float,
    ) -> Optional[NormalizedSample]:
        """Convert a single DataFrame row to a :class:`NormalizedSample`.

        Combines the ``title`` and ``text`` columns, cleans the result, and
        returns ``None`` if the cleaned text is empty or shorter than
        ``_MIN_TEXT_LEN`` characters.

        Parameters
        ----------
        row:
            A pandas Series representing one CSV row.
        label:
            Integer PhilVerify class.
        original_label:
            Raw label string (``"real"`` or ``"fake"``).
        confidence:
            Annotation confidence.

        Returns
        -------
        NormalizedSample | None
        """
        title = ISOTDataset._cell_text(row.get("title"))
        body = ISOTDataset._cell_text(row.get("text"))

        # Combine title + body; a space separation is sufficient.  When one
        # side is empty, use the other on its own (no stray space).
        raw_text = f"{title} {body}" if title and body else (title or body)
        text = clean_text(raw_text)
        if not text or len(text) < _MIN_TEXT_LEN:
            return None

        return NormalizedSample(
            text=text,
            label=label,
            source="isot",
            language="en",
            original_label=original_label,
            confidence=confidence,
        )

    @staticmethod
    def _auto_download(data_dir: Path) -> None:
        """Attempt to download ISOT CSVs via ``kagglehub``.

        Calls ``kagglehub.dataset_download("csmalarkodi/isot-fake-news-dataset")``
        and copies ``True.csv`` / ``Fake.csv`` into *data_dir*.  Every failure
        (missing package, download error, filesystem error) is logged and
        swallowed so the caller can fall back to the manual download message.

        Parameters
        ----------
        data_dir:
            Destination directory where the CSVs should end up.
        """
        import shutil

        handle = "csmalarkodi/isot-fake-news-dataset"

        try:
            import kagglehub  # type: ignore[import-untyped]
        except ImportError:
            logger.warning(
                "[isot] 'kagglehub' not installed — run: pip install kagglehub. "
                "Falling back to manual download."
            )
            return

        try:
            logger.info("[isot] kagglehub: downloading %s …", handle)
            dl_path = Path(kagglehub.dataset_download(handle))
            logger.info("[isot] kagglehub download path: %s", dl_path)
        except Exception as exc:  # noqa: BLE001 - best-effort download
            logger.warning("[isot] kagglehub download failed: %s", exc)
            return

        try:
            data_dir.mkdir(parents=True, exist_ok=True)
        except OSError as exc:
            logger.warning("[isot] Could not create %s: %s", data_dir, exc)
            return

        for target_name in (_TRUE_CSV, _FAKE_CSV):
            try:
                # Search recursively — kagglehub may nest files in a subdirectory
                src = next(
                    (p for p in dl_path.rglob(target_name) if p.is_file()),
                    None,
                )
                if src is None:
                    logger.warning("[isot] '%s' not found in kagglehub download.", target_name)
                    continue

                dst = data_dir / target_name
                if dst.exists():
                    logger.info("[isot] '%s' already exists — skipping copy.", target_name)
                    continue

                shutil.copy2(src, dst)
            except OSError as exc:
                # Permission/disk errors must not escape: fetch() reports the
                # file as missing instead.
                logger.warning(
                    "[isot] Could not copy '%s' into %s: %s",
                    target_name,
                    data_dir,
                    exc,
                )
                continue
            logger.info("[isot] Copied %s → %s", src, dst)

    def _warn_missing(self, missing: list[Path]) -> None:
        """Emit a clear, actionable warning when CSV files are absent.

        The message is both printed and sent to the module logger.
        """
        missing_names = ", ".join(p.name for p in missing)
        msg = (
            f"\n{'=' * 60}\n"
            f"[isot] WARNING: ISOT dataset file(s) not found: {missing_names}\n"
            f"\n"
            f"Download the dataset from Kaggle:\n"
            f" {_KAGGLE_URL}\n"
            f"\n"
            f"Then place True.csv and Fake.csv in:\n"
            f" {self.data_dir}\n"
            f"\n"
            f"The ISOT source will be skipped for this run.\n"
            f"{'=' * 60}\n"
        )
        print(msg)
        logger.warning(msg)

    # Local print-based implementation of log_class_distribution.
    # NOTE(review): presumably overrides a version on DataSource — confirm.
    def log_class_distribution(self, samples: list[NormalizedSample]) -> None:
        """Print class frequencies for the fetched sample list.

        All three PhilVerify classes are listed, including class 1
        (Unverified), which this binary source never produces.
        """
        label_names = {0: "Credible", 1: "Unverified", 2: "Likely Fake"}
        counts: dict[int, int] = {0: 0, 1: 0, 2: 0}
        for s in samples:
            counts[s.label] = counts.get(s.label, 0) + 1

        total = len(samples)
        print(f"[{self.source_name}] Class distribution ({total} total):")
        for lbl, name in label_names.items():
            n = counts.get(lbl, 0)
            # Guard the percentage against an empty sample list.
            pct = 100 * n / total if total else 0.0
            print(f"[{self.source_name}] {lbl} {name:<15} {n:>5} ({pct:.1f}%)")
# ---------------------------------------------------------------------------
# Smoke test
# ---------------------------------------------------------------------------
if __name__ == "__main__":
    # Manual smoke test: fetch a small sample and print a preview.
    import sys

    # Send log records to stdout so they interleave with the print output.
    logging.basicConfig(
        level=logging.INFO,
        format="%(asctime)s %(levelname)-8s %(message)s",
        stream=sys.stdout,
    )

    print("=" * 60)
    print("ISOTDataset — smoke test")
    print("=" * 60)

    ds = ISOTDataset(max_samples=200)
    samples = ds.fetch()

    if samples:
        print(f"\nReturned {len(samples)} samples.")
        print("\nFirst 5 samples:")
        for i, s in enumerate(samples[:5]):
            print(
                f" [{i}] label={s.label} conf={s.confidence:.2f} "
                f"orig={s.original_label!r:>6} text={s.text[:80]!r}"
            )
    else:
        # fetch() returns [] when the CSVs are missing or unreadable.
        print(
            "\nNo samples returned. "
            "Place True.csv and Fake.csv under ml/data/raw/isot/ and re-run."
        )