Spaces:
Sleeping
Sleeping
File size: 3,718 Bytes
398a289 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 | """
CSV Labeled Data Loader
Expected CSV columns (minimum):
- text (string) OR claim (string)
- evidence (string)
- label (int or string: Đúng/Sai, True/False, Supported/Refuted)
Optional:
- timestamp (ISO string or unix seconds)
"""
from datetime import datetime, timezone
import pandas as pd
from loguru import logger
class CSVLabeledLoader:
def __init__(self, csv_path: str):
self.csv_path = csv_path
def load(self) -> pd.DataFrame:
df = pd.read_csv(self.csv_path)
if "text" not in df.columns and "claim" in df.columns:
df = df.rename(columns={"claim": "text"})
required_cols = {"text", "evidence", "label"}
if not required_cols.issubset(df.columns):
missing = ", ".join(sorted(required_cols - set(df.columns)))
raise ValueError(
f"CSV must contain columns: text (or claim), evidence, label. Missing: {missing}"
)
df = df.copy()
df["evidence"] = df["evidence"].fillna("").astype(str)
# Handle evidence that looks like a list string: "['item1', 'item2']"
def parse_evidence(ev):
ev_str = str(ev).strip()
if ev_str.startswith("[") and ev_str.endswith("]"):
try:
import ast
parsed = ast.literal_eval(ev_str)
if isinstance(parsed, list):
# Use ||| as separator for clear article boundaries
return "|||".join(str(item) for item in parsed)
except (ValueError, SyntaxError):
pass
return ev_str
df["evidence"] = df["evidence"].apply(parse_evidence)
# Normalize labels to stable 3-class IDs:
# 0=Đúng (true), 1=Sai (false), 2=NEI (not enough info)
label_map = {
# Positive/support variants (ID: 0)
"ĐÚNG": 0,
"DUNG": 0,
"TRUE": 0,
"SUPPORTED": 0,
"LEGIT": 0,
"LEGITIMATE": 0,
"0": 0,
# Negative/refuted variants (ID: 1)
"SAI": 1,
"FALSE": 1,
"REFUTED": 1,
"SCAM": 1,
"1": 1,
# Not-enough-information variants (ID: 2)
"NEI": 2,
"NOT ENOUGH INFO": 2,
"NOT ENOUGH INFORMATION": 2,
"INSUFFICIENT": 2,
"2": 2,
}
labels = df["label"]
if pd.api.types.is_bool_dtype(labels):
df["label"] = labels.map({True: 0, False: 1})
else:
label_numeric = pd.to_numeric(labels, errors="coerce")
label_numeric = label_numeric.where(label_numeric.isin([0, 1, 2]))
label_str = labels.astype(str).str.strip().str.upper()
label_mapped = label_str.map(label_map)
df["label"] = label_numeric.fillna(label_mapped)
unmapped = df["label"].isna().sum()
if unmapped > 0:
logger.warning(
f"{unmapped} labels could not be mapped. Defaulting to NEI class (2)."
)
df["label"] = df["label"].fillna(2)
df["label"] = df["label"].astype(int)
if "timestamp" in df.columns:
df["timestamp"] = df["timestamp"].apply(self._parse_timestamp)
return df
def _parse_timestamp(self, value):
if pd.isna(value):
return datetime.now(timezone.utc)
if isinstance(value, (int, float)):
return datetime.fromtimestamp(float(value), tz=timezone.utc)
try:
return datetime.fromisoformat(str(value))
except Exception:
return datetime.now(timezone.utc)
|