""" CSV Labeled Data Loader Expected CSV columns (minimum): - text (string) OR claim (string) - evidence (string) - label (int or string: Đúng/Sai, True/False, Supported/Refuted) Optional: - timestamp (ISO string or unix seconds) """ from datetime import datetime, timezone import pandas as pd from loguru import logger class CSVLabeledLoader: def __init__(self, csv_path: str): self.csv_path = csv_path def load(self) -> pd.DataFrame: df = pd.read_csv(self.csv_path) if "text" not in df.columns and "claim" in df.columns: df = df.rename(columns={"claim": "text"}) required_cols = {"text", "evidence", "label"} if not required_cols.issubset(df.columns): missing = ", ".join(sorted(required_cols - set(df.columns))) raise ValueError( f"CSV must contain columns: text (or claim), evidence, label. Missing: {missing}" ) df = df.copy() df["evidence"] = df["evidence"].fillna("").astype(str) # Handle evidence that looks like a list string: "['item1', 'item2']" def parse_evidence(ev): ev_str = str(ev).strip() if ev_str.startswith("[") and ev_str.endswith("]"): try: import ast parsed = ast.literal_eval(ev_str) if isinstance(parsed, list): # Use ||| as separator for clear article boundaries return "|||".join(str(item) for item in parsed) except (ValueError, SyntaxError): pass return ev_str df["evidence"] = df["evidence"].apply(parse_evidence) # Normalize labels to stable 3-class IDs: # 0=Đúng (true), 1=Sai (false), 2=NEI (not enough info) label_map = { # Positive/support variants (ID: 0) "ĐÚNG": 0, "DUNG": 0, "TRUE": 0, "SUPPORTED": 0, "LEGIT": 0, "LEGITIMATE": 0, "0": 0, # Negative/refuted variants (ID: 1) "SAI": 1, "FALSE": 1, "REFUTED": 1, "SCAM": 1, "1": 1, # Not-enough-information variants (ID: 2) "NEI": 2, "NOT ENOUGH INFO": 2, "NOT ENOUGH INFORMATION": 2, "INSUFFICIENT": 2, "2": 2, } labels = df["label"] if pd.api.types.is_bool_dtype(labels): df["label"] = labels.map({True: 0, False: 1}) else: label_numeric = pd.to_numeric(labels, errors="coerce") label_numeric = label_numeric.where(label_numeric.isin([0, 1, 2])) label_str = labels.astype(str).str.strip().str.upper() label_mapped = label_str.map(label_map) df["label"] = label_numeric.fillna(label_mapped) unmapped = df["label"].isna().sum() if unmapped > 0: logger.warning( f"{unmapped} labels could not be mapped. Defaulting to NEI class (2)." ) df["label"] = df["label"].fillna(2) df["label"] = df["label"].astype(int) if "timestamp" in df.columns: df["timestamp"] = df["timestamp"].apply(self._parse_timestamp) return df def _parse_timestamp(self, value): if pd.isna(value): return datetime.now(timezone.utc) if isinstance(value, (int, float)): return datetime.fromtimestamp(float(value), tz=timezone.utc) try: return datetime.fromisoformat(str(value)) except Exception: return datetime.now(timezone.utc)