File size: 2,150 Bytes
444cefa
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
#!/bin/bash
# ═══════════════════════════════════════════════════════════════
# RAE Dataset Generation Script
# ═══════════════════════════════════════════════════════════════
# 
# Generates RAE-structured training data.
#
# Usage:
#   ./scripts/generate_dataset.sh                    # Template mode (no API)
#   ./scripts/generate_dataset.sh --use-api          # API mode (requires ANTHROPIC_API_KEY)
#   ./scripts/generate_dataset.sh --use-api --large  # Large dataset
# ═══════════════════════════════════════════════════════════════

# Strict mode: abort on command failure, on use of an unset variable,
# and when any stage of a pipeline fails.
set -o errexit -o nounset -o pipefail

# Absolute directory holding this script, and its parent directory.
SCRIPT_DIR=$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)
PROJECT_DIR=$(dirname "$SCRIPT_DIR")

# Work from the parent directory so the relative paths used below
# (src/..., data/...) do not depend on the caller's working directory.
cd "$PROJECT_DIR"

# Parse args
#
# Recognized options:
#   --use-api   forward --use_api to the generator (API mode)
#   --large     use 5 variations per problem instead of the default 2
# Any other argument is reported on stderr and skipped, so a typo such as
# "--use_api" is visible instead of silently falling back to template mode.

#######################################
# Set USE_API and NUM_VARIATIONS from command-line options.
# Globals:   USE_API (written), NUM_VARIATIONS (written)
# Arguments: the command-line arguments to interpret
# Outputs:   one warning line per unrecognized argument, on stderr
#######################################
parse_args() {
    local arg

    # Defaults: template mode, 2 variations per problem.
    USE_API=""
    NUM_VARIATIONS=2

    for arg in "$@"; do
        case "$arg" in
            --use-api)
                USE_API="--use_api"
                ;;
            --large)
                NUM_VARIATIONS=5
                ;;
            *)
                # Keep going (previous behavior) but make the problem visible.
                printf 'Warning: ignoring unknown argument: %s\n' "$arg" >&2
                ;;
        esac
    done
}

parse_args "$@"

# Startup banner: shows the selected mode ("template" when USE_API is
# empty, otherwise the flag text itself) and the variation count.
printf '%s\n' \
    "═══════════════════════════════════════════════════════" \
    "  RAE DATASET GENERATOR" \
    "  Mode: ${USE_API:-template}" \
    "  Variations per problem: $NUM_VARIATIONS" \
    "═══════════════════════════════════════════════════════"

# Install dependencies if needed.
# This stays best-effort: a pip failure does not abort the script and pip's
# own stderr is still suppressed. A short warning is printed on failure so
# that a later import error in the generator can be traced back to this step.
if ! pip install -q jsonlines tqdm anthropic 2>/dev/null; then
    echo "Warning: 'pip install jsonlines tqdm anthropic' failed; continuing with the packages already installed." >&2
fi

# Generate dataset
# ${USE_API:+"$USE_API"} expands to nothing when USE_API is empty and to a
# single, quoted word otherwise. The optional flag is therefore passed
# without relying on unquoted word-splitting or globbing.
python src/dataset_generator.py \
    --output data/rae_training_data \
    --num_variations "$NUM_VARIATIONS" \
    --train_split 0.9 \
    ${USE_API:+"$USE_API"}

# Closing summary: a blank line, the output location with the three file
# names listed under it, another blank line, and a pointer to the next step.
printf '%s\n' \
    "" \
    "Dataset ready at: data/rae_training_data/" \
    "  train.jsonl" \
    "  validation.jsonl" \
    "  metadata.json" \
    "" \
    "Next: run training with ./scripts/run_training.sh"