#!/bin/bash
# Download Kaggle datasets for Human-Pattern Anti-AI training.
#
# Requires: pip install kaggle
# Setup:    Place kaggle.json API key at ~/.kaggle/kaggle.json
# Get key:  kaggle.com → Account → Create New API Token
#
# Run: bash scripts/download_kaggle_datasets.sh
#
# All data paths below are relative to the current working directory, so
# data/raw/ is created under whatever directory the script is invoked from.

set -euo pipefail

#######################################
# Download and unzip one Kaggle dataset unless its marker file is present.
# Arguments:
#   $1 - label used in progress messages (e.g. "Dataset 1")
#   $2 - Kaggle dataset slug in "owner/name" form
#   $3 - directory the archive is downloaded and unzipped into
#   $4 - file whose presence means the dataset was already fetched
# Outputs:
#   Progress messages on stdout; diagnostics on stderr.
# Returns:
#   0 on success or skip; non-zero if the kaggle CLI is missing or the
#   download fails (which aborts the script under `set -e`).
#######################################
fetch_dataset() {
  local label=$1
  local slug=$2
  local dest_dir=$3
  local marker=$4

  echo "Downloading: ${slug}..."

  if [[ -f "$marker" ]]; then
    echo " ✓ ${label} already exists"
    return 0
  fi

  # Checked only when a download is actually needed, so a re-run with all
  # data already in place still succeeds on a machine without the CLI.
  if ! command -v kaggle >/dev/null 2>&1; then
    echo " ✗ kaggle CLI not found; install it with: pip install kaggle" >&2
    return 1
  fi

  kaggle datasets download -d "$slug" -p "$dest_dir" --unzip
  echo " ✓ ${label} downloaded"

  # The skip check above keys on $marker. If the unzipped archive does not
  # contain that file, every later run will download again, so say so.
  # Non-fatal: the download itself succeeded.
  if [[ ! -f "$marker" ]]; then
    echo " ! ${marker} not found after unzip; check the file names in ${dest_dir}" >&2
  fi
}

main() {
  mkdir -p data/raw/shanegerami data/raw/starblasters8

  echo "=== Downloading Kaggle Datasets ==="
  echo ""

  # Dataset 1: AI vs Human Text (500K essays)
  fetch_dataset "Dataset 1" \
    "shanegerami/ai-vs-human-text" \
    "data/raw/shanegerami" \
    "data/raw/shanegerami/train_essays.csv"
  echo ""

  # Dataset 2: Human vs LLM Text Corpus (800K, 63 LLMs)
  fetch_dataset "Dataset 2" \
    "starblasters8/human-vs-llm-text-corpus" \
    "data/raw/starblasters8" \
    "data/raw/starblasters8/data.parquet"
  echo ""

  echo "=== Kaggle datasets download complete ==="
  echo "Dataset 1 (CSV): data/raw/shanegerami/train_essays.csv"
  echo "Dataset 2 (Parquet): data/raw/starblasters8/data.parquet"
}

main "$@"