rewrite / scripts /download_kaggle_datasets.sh
morpheuslord's picture
Add files using upload-large-folder tool
3df5819 verified
#!/bin/bash
# Download Kaggle datasets for Human-Pattern Anti-AI training
# Requires: pip install kaggle
# Setup: Place kaggle.json API key at ~/.kaggle/kaggle.json
# Get key: kaggle.com → Account → Create New API Token
#
# Run: bash scripts/download_kaggle_datasets.sh
# Strict mode: abort on any command failure (-e), on use of an unset
# variable (-u), and when any stage of a pipeline fails (pipefail).
set -euo pipefail

# Download targets. All paths are relative to the current working directory.
# Each *_FILE is the file whose presence marks the dataset as already fetched.
# NOTE(review): the *_FILE names are assumed to match what each Kaggle archive
# unpacks to — confirm against the dataset contents.
readonly DATASET1_SLUG="shanegerami/ai-vs-human-text"
readonly DATASET1_DIR="data/raw/shanegerami"
readonly DATASET1_FILE="train_essays.csv"
readonly DATASET2_SLUG="starblasters8/human-vs-llm-text-corpus"
readonly DATASET2_DIR="data/raw/starblasters8"
readonly DATASET2_FILE="data.parquet"

#######################################
# Print an error message to stderr and exit with status 1.
# Arguments: $* - message text
#######################################
die() {
  printf 'Error: %s\n' "$*" >&2
  exit 1
}

#######################################
# Download and unzip one Kaggle dataset unless its marker file is present.
# Arguments:
#   $1 - label used in progress messages (e.g. "Dataset 1")
#   $2 - Kaggle dataset slug, "owner/name"
#   $3 - destination directory
#   $4 - file name inside $3 whose presence means "already downloaded"
# Outputs:
#   Progress messages on stdout; warnings and errors on stderr.
# Returns:
#   0 on success or when the dataset already exists. Exits the script if the
#   kaggle CLI is missing or the download command fails.
#######################################
fetch_dataset() {
  local -r label=$1 slug=$2 dest=$3 marker=$4

  echo "Downloading: ${slug}..."
  if [[ -f "${dest}/${marker}" ]]; then
    echo " ✓ ${label} already exists"
    return 0
  fi

  # Checked lazily so a re-run with all data present works without the CLI.
  command -v kaggle >/dev/null 2>&1 \
    || die "kaggle CLI not found on PATH (install with: pip install kaggle)"

  kaggle datasets download -d "${slug}" -p "${dest}" --unzip
  echo " ✓ ${label} downloaded"

  # Non-fatal: if the archive did not contain the expected file, the skip
  # check above will never match and every run will download again.
  if [[ ! -f "${dest}/${marker}" ]]; then
    printf 'Warning: expected file %s was not found after download\n' \
      "${dest}/${marker}" >&2
  fi
}

#######################################
# Create the target directories, fetch both datasets, and print a summary.
#######################################
main() {
  mkdir -p "${DATASET1_DIR}" "${DATASET2_DIR}"

  echo "=== Downloading Kaggle Datasets ==="
  echo ""

  # Dataset 1: AI vs Human Text (500K essays)
  fetch_dataset "Dataset 1" "${DATASET1_SLUG}" "${DATASET1_DIR}" "${DATASET1_FILE}"
  echo ""

  # Dataset 2: Human vs LLM Text Corpus (800K, 63 LLMs)
  fetch_dataset "Dataset 2" "${DATASET2_SLUG}" "${DATASET2_DIR}" "${DATASET2_FILE}"
  echo ""

  echo "=== Kaggle datasets download complete ==="
  echo "Dataset 1 (CSV): ${DATASET1_DIR}/${DATASET1_FILE}"
  echo "Dataset 2 (Parquet): ${DATASET2_DIR}/${DATASET2_FILE}"
}

main "$@"