"""
Dataset loader: reads Dataset_Clean.csv and returns tokenized HuggingFace DatasetDict splits.
"""
from pathlib import Path

import pandas as pd
from datasets import Dataset, DatasetDict
from sklearn.model_selection import train_test_split
from transformers import AutoTokenizer

from src.data.preprocessing import clean_text

LABEL2ID = {"True": 0, "Fake": 1, "Satire": 2, "Bias": 3}
ID2LABEL = {v: k for k, v in LABEL2ID.items()}
DEFAULT_CSV = Path(__file__).parents[2] / "data" / "processed" / "Dataset_Clean.csv"
MAX_LENGTH = 256
VAL_SPLIT = 0.10
TEST_SPLIT = 0.10
RANDOM_SEED = 42


def load_dataframe(csv_path: str | Path = DEFAULT_CSV) -> pd.DataFrame:
    """Load and clean Dataset_Clean.csv. Returns a DataFrame with columns: text, label (int)."""
    df = pd.read_csv(csv_path, low_memory=False)

    # Normalize label text and keep only the four known classes.
    df["label_text"] = df["label_text"].astype(str).str.strip().str.capitalize()
    df = df[df["label_text"].isin(LABEL2ID)].copy()

    # Prefer the article body; fall back to the title when the body is very short.
    df["content"] = df["content"].fillna("").astype(str)
    df["title"] = df["title"].fillna("").astype(str)
    df["text"] = df.apply(
        lambda r: r["content"] if len(r["content"]) > 30 else r["title"], axis=1
    )
    df["text"] = df["text"].apply(clean_text)

    # Drop rows whose cleaned text is still too short to be useful.
    df = df[df["text"].str.len() > 10].copy()
    df["label"] = df["label_text"].map(LABEL2ID).astype(int)

    print(f"[dataset] Loaded {len(df):,} rows")
    print(f"[dataset] Label distribution:\n{df['label_text'].value_counts().to_string()}\n")
    return df[["text", "label"]].reset_index(drop=True)


def make_splits(df: pd.DataFrame) -> tuple[pd.DataFrame, pd.DataFrame, pd.DataFrame]:
    """Stratified train / val / test split."""
    # Two-stage split: carve off the combined val+test pool, then divide it into
    # val and test, stratifying on the label both times so class ratios are preserved.
    train_df, temp_df = train_test_split(
        df, test_size=VAL_SPLIT + TEST_SPLIT, stratify=df["label"], random_state=RANDOM_SEED
    )
    val_df, test_df = train_test_split(
        temp_df, test_size=TEST_SPLIT / (VAL_SPLIT + TEST_SPLIT),
        stratify=temp_df["label"], random_state=RANDOM_SEED
    )
    print(f"[dataset] Train: {len(train_df):,} Val: {len(val_df):,} Test: {len(test_df):,}")
    return train_df, val_df, test_df


def tokenize_dataset(dataset_dict: DatasetDict, tokenizer_name: str, max_length: int = MAX_LENGTH) -> DatasetDict:
    """Tokenize all splits."""
    tokenizer = AutoTokenizer.from_pretrained(tokenizer_name)

    def _tokenize(batch):
        # Pad/truncate every example to a fixed length so tensors stack cleanly.
        return tokenizer(batch["text"], padding="max_length", truncation=True, max_length=max_length)

    tokenized = dataset_dict.map(
        _tokenize, batched=True, batch_size=512, remove_columns=["text"], desc="Tokenizing"
    )
    tokenized.set_format("torch")
    return tokenized
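
# Note: padding="max_length" fixes every example at max_length tokens. A common
# alternative (illustrative only, not wired in here) is dynamic per-batch padding
# with a collator, which pads only to the longest example in each batch:
#
#   from transformers import AutoTokenizer, DataCollatorWithPadding
#   collator = DataCollatorWithPadding(tokenizer=AutoTokenizer.from_pretrained("distilbert-base-uncased"))
#   # then tokenize with truncation only and pass data_collator=collator to the Trainer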


def build_dataset(
    csv_path: str | Path = DEFAULT_CSV,
    tokenizer_name: str = "distilbert-base-uncased",
    max_length: int = MAX_LENGTH,
) -> DatasetDict:
    """Full pipeline: CSV → cleaned DataFrame → HuggingFace DatasetDict → tokenized splits."""
    df = load_dataframe(csv_path)
    train_df, val_df, test_df = make_splits(df)
    raw = DatasetDict({
        "train": Dataset.from_pandas(train_df, preserve_index=False),
        "validation": Dataset.from_pandas(val_df, preserve_index=False),
        "test": Dataset.from_pandas(test_df, preserve_index=False),
    })
    return tokenize_dataset(raw, tokenizer_name, max_length)
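
# Usage sketch (illustrative only): the tokenized splits returned by build_dataset()
# plug straight into a transformers Trainer. The model choice and TrainingArguments
# below are assumptions for this example, not something this module defines.
#
#   from transformers import AutoModelForSequenceClassification, Trainer, TrainingArguments
#
#   ds = build_dataset()
#   model = AutoModelForSequenceClassification.from_pretrained(
#       "distilbert-base-uncased",
#       num_labels=len(LABEL2ID), id2label=ID2LABEL, label2id=LABEL2ID,
#   )
#   trainer = Trainer(
#       model=model,
#       args=TrainingArguments(output_dir="checkpoints", num_train_epochs=3),
#       train_dataset=ds["train"],
#       eval_dataset=ds["validation"],
#   )
#   trainer.train()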


if __name__ == "__main__":
    ds = build_dataset()
    print(ds)
    print("Sample:", ds["train"][0])