#!/usr/bin/env bash
#
# Download Kaggle datasets for Human-Pattern Anti-AI training.
#
# Requires: pip install kaggle
# Setup:    Place kaggle.json API key at ~/.kaggle/kaggle.json
# Get key:  kaggle.com → Account → Create New API Token
#
# Run:      bash scripts/download_kaggle_datasets.sh
#
# All data paths are relative to the current working directory, so the
# command above is meant to be run from the directory that should hold
# data/raw/.

set -euo pipefail

readonly RAW_DIR="data/raw"

# Print an error message to stderr and abort the script.
die() {
  printf 'error: %s\n' "$*" >&2
  exit 1
}

#######################################
# Download and unzip one Kaggle dataset unless its marker file is
# already present.
# Arguments:
#   $1 - human-readable label used in status messages (e.g. "Dataset 1")
#   $2 - Kaggle dataset slug, "<owner>/<dataset>"
#   $3 - destination directory
#   $4 - file name inside the destination whose presence means
#        "already downloaded"
# Outputs:
#   progress on stdout, warnings/errors on stderr
# Returns:
#   0 on success; exits the script if the kaggle CLI is missing or the
#   download fails
#######################################
download_dataset() {
  local label=$1
  local slug=$2
  local dest=$3
  local marker=$4

  echo "Downloading: ${slug}..."

  if [[ -f "${dest}/${marker}" ]]; then
    echo " ✓ ${label} already exists"
    return 0
  fi

  # Only require the CLI when a download is actually needed, so a fully
  # cached run still succeeds on a machine without kaggle installed.
  command -v kaggle >/dev/null 2>&1 \
    || die "kaggle CLI not found; install it with: pip install kaggle"

  kaggle datasets download -d "${slug}" -p "${dest}" --unzip \
    || die "download of ${slug} failed (check network access and the Kaggle API key)"

  # The marker name is hard-coded here; if the archive ships a different
  # file name, the skip check above never matches and every run would
  # re-download. Surface that instead of reporting silent success.
  if [[ ! -f "${dest}/${marker}" ]]; then
    printf 'warning: %s not found after unzipping %s; the archive may use a different file name\n' \
      "${dest}/${marker}" "${slug}" >&2
  fi

  echo " ✓ ${label} downloaded"
}

main() {
  mkdir -p "${RAW_DIR}/shanegerami" "${RAW_DIR}/starblasters8"

  echo "=== Downloading Kaggle Datasets ==="
  echo ""

  # Dataset 1: AI vs Human Text (500K essays)
  download_dataset "Dataset 1" "shanegerami/ai-vs-human-text" \
    "${RAW_DIR}/shanegerami" "train_essays.csv"
  echo ""

  # Dataset 2: Human vs LLM Text Corpus (800K, 63 LLMs)
  download_dataset "Dataset 2" "starblasters8/human-vs-llm-text-corpus" \
    "${RAW_DIR}/starblasters8" "data.parquet"
  echo ""

  echo "=== Kaggle datasets download complete ==="
  echo "Dataset 1 (CSV): ${RAW_DIR}/shanegerami/train_essays.csv"
  echo "Dataset 2 (Parquet): ${RAW_DIR}/starblasters8/data.parquet"
}

main "$@"