Spaces:

ktoan911
/

Fake_Crypto_Claim_Detector

Sleeping

App Files Files Community

Fake_Crypto_Claim_Detector / src /data_process /csv_loader.py

ktoan911

Upload folder using huggingface_hub

398a289 verified about 1 month ago

raw

history blame contribute delete

3.72 kB

	"""
	CSV Labeled Data Loader

	Expected CSV columns (minimum):
	- text (string) OR claim (string)
	- evidence (string)
	- label (int 0/1/2 or case-insensitive string such as Đúng/Sai, True/False,
	  Supported/Refuted, NEI); normalized to 0=true, 1=false, 2=not enough info

	Optional:
	- timestamp (ISO string or unix seconds)
	"""

	from datetime import datetime, timezone

	import pandas as pd
	from loguru import logger


	class CSVLabeledLoader:
	    """Load a labelled claim/evidence CSV into a normalized DataFrame.

	    The CSV must provide ``text`` (or ``claim``), ``evidence`` and ``label``
	    columns; an optional ``timestamp`` column is parsed into timezone-aware
	    datetimes. Labels are normalized to integer class IDs:
	    0 = true, 1 = false, 2 = not enough info.
	    """

	    # Separator placed between evidence items when an evidence cell holds a
	    # Python-style list literal, so article boundaries stay recoverable.
	    _EVIDENCE_SEPARATOR = "|||"

	    # Upper-cased label spellings -> stable 3-class IDs.
	    _LABEL_IDS = {
	        # Positive/support variants (ID: 0)
	        "ĐÚNG": 0,
	        "DUNG": 0,
	        "TRUE": 0,
	        "SUPPORTED": 0,
	        "LEGIT": 0,
	        "LEGITIMATE": 0,
	        "0": 0,
	        # Negative/refuted variants (ID: 1)
	        "SAI": 1,
	        "FALSE": 1,
	        "REFUTED": 1,
	        "SCAM": 1,
	        "1": 1,
	        # Not-enough-information variants (ID: 2)
	        "NEI": 2,
	        "NOT ENOUGH INFO": 2,
	        "NOT ENOUGH INFORMATION": 2,
	        "INSUFFICIENT": 2,
	        "2": 2,
	    }

	    def __init__(self, csv_path: str):
	        """Remember the CSV location; nothing is read until ``load()``.

	        Args:
	            csv_path: Value handed unchanged to ``pandas.read_csv``.
	        """
	        self.csv_path = csv_path

	    def load(self) -> pd.DataFrame:
	        """Read the CSV and return it with normalized columns.

	        Returns:
	            A DataFrame whose ``text`` column exists (``claim`` is renamed
	            when ``text`` is absent), whose ``evidence`` is a string (list
	            literals are flattened with ``|||``), whose ``label`` is an int
	            in {0, 1, 2}, and whose ``timestamp`` (if present) holds
	            timezone-aware datetimes.

	        Raises:
	            ValueError: If any of the required columns is missing.
	        """
	        df = pd.read_csv(self.csv_path)
	        if "text" not in df.columns and "claim" in df.columns:
	            df = df.rename(columns={"claim": "text"})

	        required_cols = {"text", "evidence", "label"}
	        missing_cols = required_cols.difference(df.columns)
	        if missing_cols:
	            missing = ", ".join(sorted(missing_cols))
	            raise ValueError(
	                f"CSV must contain columns: text (or claim), evidence, label. Missing: {missing}"
	            )

	        # Empty cells become "" so downstream code always sees a string.
	        df["evidence"] = (
	            df["evidence"].fillna("").astype(str).apply(self._parse_evidence)
	        )

	        labels = self._normalize_labels(df["label"])
	        unmapped = labels.isna().sum()
	        if unmapped > 0:
	            logger.warning(
	                f"{unmapped} labels could not be mapped. Defaulting to NEI class (2)."
	            )
	            labels = labels.fillna(2)
	        df["label"] = labels.astype(int)

	        if "timestamp" in df.columns:
	            df["timestamp"] = df["timestamp"].apply(self._parse_timestamp)

	        return df

	    @classmethod
	    def _parse_evidence(cls, ev):
	        """Flatten an evidence cell that looks like ``"['a', 'b']"``.

	        Cells that are not a valid list literal are returned stripped but
	        otherwise unchanged.
	        """
	        import ast

	        ev_str = str(ev).strip()
	        if not (ev_str.startswith("[") and ev_str.endswith("]")):
	            return ev_str
	        try:
	            parsed = ast.literal_eval(ev_str)
	        except (ValueError, SyntaxError, TypeError, MemoryError, RecursionError):
	            # Bracketed free text (not a literal): keep it verbatim rather
	            # than failing the whole load on one malformed cell.
	            return ev_str
	        if isinstance(parsed, list):
	            return cls._EVIDENCE_SEPARATOR.join(str(item) for item in parsed)
	        return ev_str

	    @classmethod
	    def _normalize_labels(cls, labels: pd.Series) -> pd.Series:
	        """Map raw labels to class IDs; unmappable entries come back as NaN.

	        Every value goes through its text form first, so booleans are matched
	        as ``TRUE``/``FALSE`` (0/1) instead of being coerced to the numbers
	        1/0, which would invert their meaning. Text that is not a known
	        spelling is then tried as a number (e.g. ``"1.0"``) and accepted only
	        when it equals 0, 1 or 2.
	        """
	        label_str = (
	            labels.astype(str).str.normalize("NFC").str.strip().str.upper()
	        )
	        mapped = label_str.map(cls._LABEL_IDS)
	        numeric = pd.to_numeric(label_str, errors="coerce")
	        numeric = numeric.where(numeric.isin([0, 1, 2]))
	        return mapped.fillna(numeric)

	    def _parse_timestamp(self, value):
	        """Convert one timestamp cell into a timezone-aware ``datetime``.

	        Numbers and purely numeric text are read as Unix seconds (UTC).
	        Anything else is parsed as an ISO 8601 string; a trailing ``Z`` is
	        accepted, and a result without an offset is assumed to be UTC so the
	        column never mixes naive and aware values. Missing or unparseable
	        input falls back to the current UTC time (best effort).
	        """
	        if pd.isna(value):
	            return datetime.now(timezone.utc)
	        try:
	            if isinstance(value, (int, float)):
	                return datetime.fromtimestamp(float(value), tz=timezone.utc)
	            text = str(value).strip()
	            try:
	                seconds = float(text)
	            except ValueError:
	                seconds = None
	            if seconds is not None:
	                return datetime.fromtimestamp(seconds, tz=timezone.utc)
	            if text[-1:] in ("Z", "z"):
	                # fromisoformat() only understands "Z" from Python 3.11 on.
	                text = text[:-1] + "+00:00"
	            parsed = datetime.fromisoformat(text)
	        except (ValueError, OverflowError, OSError):
	            return datetime.now(timezone.utc)
	        if parsed.tzinfo is None:
	            parsed = parsed.replace(tzinfo=timezone.utc)
	        return parsed