# TruthCheck — src/data/download_datasets.py
# Author: adnaan05 — "Initial commit for Hugging Face Space" (commit 469c254)
import os
import pandas as pd
import requests
import zipfile
from pathlib import Path
import logging
from tqdm import tqdm
import json
# import kaggle
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)


class DatasetDownloader:
    """Prepare the Kaggle fake-news and LIAR datasets for training.

    Reads raw files from ``<project>/data/raw``, writes cleaned CSVs to
    ``<project>/data/processed``, and merges them into one labeled corpus.
    Label convention throughout: 0 = real/true, 1 = fake/false.
    """

    def __init__(self):
        # Project root is three levels up from this file (src/data/<file>).
        self.project_root = Path(__file__).parent.parent.parent
        self.raw_data_dir = self.project_root / "data" / "raw"
        self.processed_data_dir = self.project_root / "data" / "processed"

        # Create directories if they don't exist.
        self.raw_data_dir.mkdir(parents=True, exist_ok=True)
        self.processed_data_dir.mkdir(parents=True, exist_ok=True)

    def process_kaggle_dataset(self):
        """Process the Kaggle dataset (Fake.csv / True.csv) into one labeled CSV.

        Writes ``kaggle_processed.csv`` to the processed-data directory.
        Logs an error and returns early if either raw file is missing.
        """
        logger.info("Processing Kaggle dataset...")

        fake_path = self.raw_data_dir / "Fake.csv"
        true_path = self.raw_data_dir / "True.csv"
        # Guard against missing raw files (consistent with process_liar)
        # instead of crashing with FileNotFoundError.
        if not fake_path.exists() or not true_path.exists():
            logger.error("Kaggle dataset not found!")
            return

        fake_df = pd.read_csv(fake_path)
        true_df = pd.read_csv(true_path)

        # Add binary labels: 1 for fake, 0 for real.
        fake_df['label'] = 1
        true_df['label'] = 0

        combined_df = pd.concat([fake_df, true_df], ignore_index=True)
        combined_df.to_csv(self.processed_data_dir / "kaggle_processed.csv", index=False)
        logger.info(f"Saved {len(combined_df)} articles from Kaggle dataset")

    def process_liar(self):
        """Process the LIAR dataset (liar/train.tsv) into a binary-labeled CSV.

        Writes ``liar_processed.csv`` with columns
        (text, label, subject, speaker, party).
        Logs an error and returns early if the raw file is missing.
        """
        logger.info("Processing LIAR dataset...")

        liar_file = self.raw_data_dir / "liar" / "train.tsv"
        if not liar_file.exists():
            logger.error("LIAR dataset not found!")
            return

        # LIAR's TSV ships without a header row; assign the published schema.
        df = pd.read_csv(liar_file, sep='\t', header=None)
        df.columns = [
            'id', 'label', 'statement', 'subject', 'speaker',
            'job_title', 'state_info', 'party_affiliation',
            'barely_true', 'false', 'half_true', 'mostly_true',
            'pants_on_fire', 'venue'
        ]

        # Collapse LIAR's six-way truthfulness scale to binary:
        # 0 for (mostly) true, 1 for (mostly) false.
        label_map = {
            'true': 0,
            'mostly-true': 0,
            'half-true': 0,
            'barely-true': 1,
            'false': 1,
            'pants-fire': 1
        }
        df['label'] = df['label'].map(label_map)
        # .map() turns any label outside label_map into NaN; drop those rows
        # so the output contains only clean integer 0/1 labels.
        df = df.dropna(subset=['label'])
        df['label'] = df['label'].astype(int)

        # Keep only the columns used downstream, renamed for consistency
        # with the Kaggle output.
        df = df[['statement', 'label', 'subject', 'speaker', 'party_affiliation']]
        df.columns = ['text', 'label', 'subject', 'speaker', 'party']

        df.to_csv(self.processed_data_dir / "liar_processed.csv", index=False)
        logger.info(f"Saved {len(df)} articles from LIAR dataset")

    def combine_datasets(self):
        """Merge the processed Kaggle and LIAR CSVs into ``combined_dataset.csv``.

        Keeps only the shared (text, label) columns. Logs an error and
        returns early if either processed file is missing (e.g. because a
        processing step bailed out on missing raw data).
        """
        logger.info("Combining datasets...")

        kaggle_path = self.processed_data_dir / "kaggle_processed.csv"
        liar_path = self.processed_data_dir / "liar_processed.csv"
        # Either processing step may have returned early; don't crash here.
        if not kaggle_path.exists() or not liar_path.exists():
            logger.error("Processed datasets not found! Run the processing steps first.")
            return

        kaggle_df = pd.read_csv(kaggle_path)
        liar_df = pd.read_csv(liar_path)

        combined_df = pd.concat([
            kaggle_df[['text', 'label']],
            liar_df[['text', 'label']]
        ], ignore_index=True)

        combined_df.to_csv(self.processed_data_dir / "combined_dataset.csv", index=False)
        logger.info(f"Combined dataset contains {len(combined_df)} articles")
def main():
    """Entry point: run every dataset-preparation stage in order."""
    prep = DatasetDownloader()

    # Process each raw dataset first, then merge the processed outputs.
    for stage in (prep.process_kaggle_dataset, prep.process_liar, prep.combine_datasets):
        stage()

    logger.info("Dataset preparation completed!")


if __name__ == "__main__":
    main()