AITextDetector / scripts /download_kagglehub.py
ChauHPham's picture
Upload folder using huggingface_hub
25faba3 verified
"""
Download Kaggle datasets directly using kagglehub (no API token needed!)
Usage:
python scripts/download_kagglehub.py
# Or download specific dataset:
python scripts/download_kagglehub.py --dataset shamimhasan8/ai-vs-human-text-dataset
"""
import os
import kagglehub
import pandas as pd
import glob
from pathlib import Path
import argparse
DATA_DIR = os.path.join(os.path.dirname(__file__), "..", "data")
os.makedirs(DATA_DIR, exist_ok=True)
def download_dataset(dataset_slug: str, output_name: str = None):
"""
Download a Kaggle dataset using kagglehub.
Args:
dataset_slug: Kaggle dataset slug (e.g., "shamimhasan8/ai-vs-human-text-dataset")
output_name: Optional name for the output CSV file
"""
print(f"πŸ“₯ Downloading dataset: {dataset_slug}")
print(" (No API token needed with kagglehub!)")
# Download dataset - returns path to downloaded files
path = kagglehub.dataset_download(dataset_slug)
print(f"βœ… Downloaded to: {path}")
# Find all CSV files in the downloaded directory
csv_files = list(Path(path).glob("*.csv"))
if not csv_files:
print(f"⚠️ No CSV files found in {path}")
print(f" Files found: {list(Path(path).iterdir())}")
return None
print(f"\nπŸ“Š Found {len(csv_files)} CSV file(s):")
for csv_file in csv_files:
print(f" - {csv_file.name}")
# If multiple CSVs, try to find the main one or merge them
if len(csv_files) == 1:
main_csv = csv_files[0]
else:
# Look for common names
main_csv = None
for csv_file in csv_files:
name_lower = csv_file.name.lower()
if any(keyword in name_lower for keyword in ['train', 'main', 'dataset', 'data']):
main_csv = csv_file
break
if not main_csv:
# Use the largest CSV
main_csv = max(csv_files, key=lambda p: p.stat().st_size)
print(f" Using largest file: {main_csv.name}")
# Copy to data directory
output_path = os.path.join(DATA_DIR, output_name or main_csv.name)
# Read and save (this also normalizes the file)
print(f"\nπŸ“ Processing and saving to: {output_path}")
df = pd.read_csv(main_csv)
print(f" Rows: {len(df):,}")
print(f" Columns: {list(df.columns)}")
df.to_csv(output_path, index=False)
print(f"βœ… Saved to: {output_path}")
# If there are other CSVs, mention them
other_csvs = [f for f in csv_files if f != main_csv]
if other_csvs:
print(f"\nπŸ’‘ Other CSV files available in {path}:")
for csv_file in other_csvs:
print(f" - {csv_file.name}")
print(f" You can manually copy them to {DATA_DIR} if needed")
return output_path
def main():
parser = argparse.ArgumentParser(description="Download Kaggle datasets using kagglehub")
parser.add_argument(
"--dataset",
default="shamimhasan8/ai-vs-human-text-dataset",
help="Kaggle dataset slug (default: shamimhasan8/ai-vs-human-text-dataset)"
)
parser.add_argument(
"--output",
help="Output filename (default: uses dataset filename)"
)
args = parser.parse_args()
output_path = download_dataset(args.dataset, args.output)
if output_path:
print(f"\n🎯 Next steps:")
print(f" 1. Update configs/default.yaml: data_path: {output_path}")
print(f" 2. Or use: python scripts/run_train.py --data {output_path}")
print(f"\nπŸ’‘ Tip: Use scripts/sample_dataset.py to create smaller subsets for testing")
if __name__ == "__main__":
main()