File size: 1,311 Bytes
3df5819
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
#!/bin/bash
# Download Kaggle datasets for Human-Pattern Anti-AI training
# Requires: pip install kaggle
# Setup:    Place kaggle.json API key at ~/.kaggle/kaggle.json
# Get key:  kaggle.com → Account → Create New API Token
#
# Run: bash scripts/download_kaggle_datasets.sh

# Strict mode: abort on any failing command, on use of an unset variable, and
# when any stage of a pipeline fails.
set -euo pipefail

# All paths are relative to the current working directory.
readonly DS1_SLUG="shanegerami/ai-vs-human-text"
readonly DS1_DIR="data/raw/shanegerami"
readonly DS1_FILE="train_essays.csv"

readonly DS2_SLUG="starblasters8/human-vs-llm-text-corpus"
readonly DS2_DIR="data/raw/starblasters8"
readonly DS2_FILE="data.parquet"

#######################################
# Download and unzip one Kaggle dataset unless its marker file is present.
# Arguments:
#   $1 - label used in progress messages (e.g. "Dataset 1")
#   $2 - Kaggle dataset slug, "<owner>/<dataset>"
#   $3 - destination directory
#   $4 - file name inside the destination that marks the dataset as present
# Outputs:
#   Progress messages on stdout; errors and warnings on stderr.
# Returns:
#   0 on success or when the dataset is already present. Exits the script
#   with status 1 if the kaggle CLI is needed but not installed; a failing
#   download aborts the script through `set -e`.
#######################################
fetch_dataset() {
    local label=$1 slug=$2 dest=$3 marker=$4

    echo "Downloading: ${slug}..."
    if [[ -f "${dest}/${marker}" ]]; then
        echo "  ✓ ${label} already exists"
        return 0
    fi

    # Only require the CLI when a download is actually needed, so a fully
    # cached run still succeeds on a machine without kaggle installed.
    if ! command -v kaggle >/dev/null 2>&1; then
        printf 'Error: kaggle CLI not found. Install it with: pip install kaggle\n' >&2
        exit 1
    fi

    kaggle datasets download -d "${slug}" -p "${dest}" --unzip
    echo "  ✓ ${label} downloaded"

    # The skip check above relies on this exact file name. If the archive
    # does not contain it, every later run would download the dataset again,
    # so surface the mismatch instead of staying silent.
    if [[ ! -f "${dest}/${marker}" ]]; then
        printf '  ! Warning: expected file %s not found after unzip; check the contents of %s\n' \
            "${dest}/${marker}" "${dest}" >&2
    fi
}

mkdir -p "${DS1_DIR}" "${DS2_DIR}"

echo "=== Downloading Kaggle Datasets ==="
echo ""

# Dataset 1: AI vs Human Text (500K essays)
fetch_dataset "Dataset 1" "${DS1_SLUG}" "${DS1_DIR}" "${DS1_FILE}"

echo ""

# Dataset 2: Human vs LLM Text Corpus (800K, 63 LLMs)
fetch_dataset "Dataset 2" "${DS2_SLUG}" "${DS2_DIR}" "${DS2_FILE}"

echo ""
echo "=== Kaggle datasets download complete ==="
echo "Dataset 1 (CSV):     ${DS1_DIR}/${DS1_FILE}"
echo "Dataset 2 (Parquet): ${DS2_DIR}/${DS2_FILE}"