File size: 3,718 Bytes
398a289
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
"""
CSV Labeled Data Loader

Expected CSV columns (minimum):
  - text (string) OR claim (string)
  - evidence (string)
  - label (int 0/1/2 or case-insensitive string: Đúng/Sai/NEI, True/False,
    Supported/Refuted); normalized to 0=Đúng, 1=Sai, 2=NEI

Optional:
  - timestamp (ISO string or unix seconds)
"""

from datetime import datetime, timezone

import pandas as pd
from loguru import logger


class CSVLabeledLoader:
    """Load a labeled claim/evidence CSV into a normalized DataFrame.

    The loader renames a ``claim`` column to ``text`` when ``text`` is absent,
    flattens list-like evidence strings, maps labels onto three stable class
    IDs (0=Đúng/true, 1=Sai/false, 2=NEI) and, when a ``timestamp`` column is
    present, converts each value to a timezone-aware ``datetime``.
    """

    # Separator placed between evidence items so article boundaries stay visible.
    _EVIDENCE_SEPARATOR = "|||"

    # Upper-cased label spellings -> stable 3-class IDs.
    _LABEL_MAP = {
        # Positive/support variants (ID: 0)
        "ĐÚNG": 0,
        "DUNG": 0,
        "TRUE": 0,
        "SUPPORTED": 0,
        "LEGIT": 0,
        "LEGITIMATE": 0,
        "0": 0,
        # Negative/refuted variants (ID: 1)
        "SAI": 1,
        "FALSE": 1,
        "REFUTED": 1,
        "SCAM": 1,
        "1": 1,
        # Not-enough-information variants (ID: 2)
        "NEI": 2,
        "NOT ENOUGH INFO": 2,
        "NOT ENOUGH INFORMATION": 2,
        "INSUFFICIENT": 2,
        "2": 2,
    }

    def __init__(self, csv_path: str):
        """Remember the CSV location; nothing is read until ``load`` is called."""
        self.csv_path = csv_path

    def load(self) -> pd.DataFrame:
        """Read the CSV and return a DataFrame with normalized columns.

        Returns:
            A DataFrame with at least ``text``, ``evidence`` (str) and
            ``label`` (int in {0, 1, 2}). ``timestamp``, if present, holds
            timezone-aware datetimes.

        Raises:
            ValueError: if ``text`` (or ``claim``), ``evidence`` or ``label``
                is missing from the CSV header.
        """
        df = pd.read_csv(self.csv_path)
        if "text" not in df.columns and "claim" in df.columns:
            df = df.rename(columns={"claim": "text"})

        required_cols = {"text", "evidence", "label"}
        missing_cols = required_cols - set(df.columns)
        if missing_cols:
            missing = ", ".join(sorted(missing_cols))
            raise ValueError(
                f"CSV must contain columns: text (or claim), evidence, label. Missing: {missing}"
            )

        df = df.copy()
        df["evidence"] = (
            df["evidence"].fillna("").astype(str).apply(self._parse_evidence)
        )

        df["label"] = self._normalize_labels(df["label"])
        unmapped = df["label"].isna().sum()
        if unmapped > 0:
            logger.warning(
                f"{unmapped} labels could not be mapped. Defaulting to NEI class (2)."
            )
            df["label"] = df["label"].fillna(2)
        df["label"] = df["label"].astype(int)

        if "timestamp" in df.columns:
            df["timestamp"] = df["timestamp"].apply(self._parse_timestamp)

        return df

    @classmethod
    def _parse_evidence(cls, ev) -> str:
        """Flatten evidence that looks like a list literal: "['a', 'b']" -> "a|||b".

        Anything that is not a parseable Python list literal is returned
        stripped but otherwise unchanged.
        """
        # Local import: the module does not import ``ast`` at file level.
        import ast

        ev_str = str(ev).strip()
        if not (ev_str.startswith("[") and ev_str.endswith("]")):
            return ev_str
        try:
            parsed = ast.literal_eval(ev_str)
        except (ValueError, SyntaxError, TypeError, MemoryError, RecursionError):
            # literal_eval documents all of these for malformed input; bracketed
            # free text must never abort the whole load.
            return ev_str
        if isinstance(parsed, list):
            return cls._EVIDENCE_SEPARATOR.join(str(item) for item in parsed)
        return ev_str

    @classmethod
    def _normalize_labels(cls, labels: pd.Series) -> pd.Series:
        """Map raw labels to class IDs 0/1/2; unmappable entries become NaN."""
        if pd.api.types.is_bool_dtype(labels):
            return labels.map({True: 0, False: 1})

        # NFC-normalize so decomposed Vietnamese input still matches "ĐÚNG".
        label_str = labels.astype(str).str.normalize("NFC").str.strip().str.upper()
        label_mapped = label_str.map(cls._LABEL_MAP)

        label_numeric = pd.to_numeric(labels, errors="coerce")
        label_numeric = label_numeric.where(label_numeric.isin([0, 1, 2]))

        # The textual mapping takes precedence: to_numeric coerces a Python
        # ``True`` inside an object column to 1, which would flip it to the
        # "Sai" class. Numeric parsing only fills what the table missed
        # (e.g. "1.0" or float-typed columns).
        return label_mapped.fillna(label_numeric)

    def _parse_timestamp(self, value):
        """Convert one timestamp cell to a timezone-aware ``datetime``.

        Accepts unix seconds (number or numeric string) and ISO 8601 strings.
        ISO strings without an offset are assumed to be UTC so the column never
        mixes naive and aware values. Missing or unparseable values fall back
        to the current UTC time.
        """
        if pd.isna(value):
            return datetime.now(timezone.utc)
        try:
            if isinstance(value, (int, float)):
                return datetime.fromtimestamp(float(value), tz=timezone.utc)

            text = str(value).strip()
            if text.endswith(("Z", "z")):
                # fromisoformat only accepts a trailing "Z" from Python 3.11 on.
                text = text[:-1] + "+00:00"
            try:
                parsed = datetime.fromisoformat(text)
            except ValueError:
                # Not ISO: unix seconds arrive as strings in mixed columns.
                parsed = datetime.fromtimestamp(float(text), tz=timezone.utc)
        except (ValueError, OverflowError, OSError):
            return datetime.now(timezone.utc)

        if parsed.tzinfo is None:
            parsed = parsed.replace(tzinfo=timezone.utc)
        return parsed