File size: 1,311 Bytes
#!/bin/bash
# Download Kaggle datasets for Human-Pattern Anti-AI training
# Requires: pip install kaggle
# Setup: Place kaggle.json API key at ~/.kaggle/kaggle.json
# Get key: kaggle.com → Account → Create New API Token
#
# Run: bash scripts/download_kaggle_datasets.sh
# Strict mode: abort on a failing command (-e), on use of an unset variable
# (-u), and when any stage of a pipeline fails (pipefail).
set -euo pipefail

# Print an error message to stderr and terminate the script with status 1.
die() {
  printf 'ERROR: %s\n' "$*" >&2
  exit 1
}

#######################################
# Download and unzip one Kaggle dataset unless its marker file is present.
# Arguments:
#   $1 - label used in status messages (e.g. "Dataset 1")
#   $2 - Kaggle dataset slug in "<owner>/<dataset>" form
#   $3 - destination directory passed to `kaggle ... -p`
#   $4 - file name inside $3 whose presence means "already downloaded"
# Outputs:
#   Status lines on stdout; a warning on stderr when the marker file is
#   still absent after a download.
# Returns:
#   0 on success. A failing `kaggle` invocation aborts the whole script
#   because of `set -e`.
#######################################
fetch_dataset() {
  local label=$1
  local slug=$2
  local dest=$3
  local marker=$4

  echo "Downloading: ${slug}..."
  if [[ -f "${dest}/${marker}" ]]; then
    echo " ✓ ${label} already exists"
    return 0
  fi

  kaggle datasets download -d "${slug}" -p "${dest}" --unzip
  echo " ✓ ${label} downloaded"

  # The skip check above keys on this exact file name. If the archive did not
  # contain it, every later run would download again without saying why, so
  # surface that here. This is only a warning: the exit status is unchanged.
  if [[ ! -f "${dest}/${marker}" ]]; then
    printf 'WARNING: %s not found after download; check the file names in %s\n' \
      "${dest}/${marker}" "${dest}" >&2
  fi
}

# Entry point: verify the CLI is available, create the target directories,
# then fetch both datasets and print the summary.
main() {
  # Fail early with an actionable message instead of a bare
  # "command not found" after directories have already been created.
  command -v kaggle >/dev/null 2>&1 \
    || die "kaggle CLI not found on PATH (install with: pip install kaggle)"

  # Paths are relative to the current working directory.
  mkdir -p data/raw/shanegerami data/raw/starblasters8

  echo "=== Downloading Kaggle Datasets ==="
  echo ""

  # Dataset 1: AI vs Human Text (500K essays)
  fetch_dataset "Dataset 1" "shanegerami/ai-vs-human-text" \
    "data/raw/shanegerami" "train_essays.csv"
  echo ""

  # Dataset 2: Human vs LLM Text Corpus (800K, 63 LLMs)
  fetch_dataset "Dataset 2" "starblasters8/human-vs-llm-text-corpus" \
    "data/raw/starblasters8" "data.parquet"
  echo ""

  echo "=== Kaggle datasets download complete ==="
  echo "Dataset 1 (CSV): data/raw/shanegerami/train_essays.csv"
  echo "Dataset 2 (Parquet): data/raw/starblasters8/data.parquet"
}

main "$@"
|