#!/bin/bash
# ═══════════════════════════════════════════════════════════════
# RAE Dataset Generation Script
# ═══════════════════════════════════════════════════════════════
#
# Generates RAE-structured training data by running
# src/dataset_generator.py from the project root (the parent of the
# directory containing this script).
#
# Usage:
#   ./scripts/generate_dataset.sh                    # Template mode (no API)
#   ./scripts/generate_dataset.sh --use-api          # API mode (requires ANTHROPIC_API_KEY)
#   ./scripts/generate_dataset.sh --use-api --large  # Large dataset
#
# Options:
#   --use-api   Pass --use_api through to the generator.
#   --large     Use 5 variations per problem instead of 2.
#
# Unrecognized options are ignored, with a warning on stderr.
# ═══════════════════════════════════════════════════════════════

set -euo pipefail

# Print a non-fatal diagnostic to stderr.
warn() {
  printf 'warning: %s\n' "$*" >&2
}

SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
PROJECT_DIR="$(dirname "$SCRIPT_DIR")"

# The generator and output paths below are relative to the project root.
cd "$PROJECT_DIR" || {
  printf 'error: cannot cd to %s\n' "$PROJECT_DIR" >&2
  exit 1
}

# Parse args
USE_API=""
NUM_VARIATIONS=2

for arg in "$@"; do
  case "$arg" in
    --use-api)
      USE_API="--use_api"
      ;;
    --large)
      NUM_VARIATIONS=5
      ;;
    *)
      # Kept non-fatal so existing invocations keep working, but a typo
      # (e.g. --use_api) no longer silently falls back to template mode.
      warn "ignoring unrecognized argument: $arg"
      ;;
  esac
done

# The usage header states that API mode needs ANTHROPIC_API_KEY. Only warn:
# how the generator actually obtains the key is not visible from this script.
if [[ -n "$USE_API" && -z "${ANTHROPIC_API_KEY:-}" ]]; then
  warn "--use-api given but ANTHROPIC_API_KEY is not set"
fi

echo "═══════════════════════════════════════════════════════"
echo " RAE DATASET GENERATOR"
echo " Mode: ${USE_API:-template}"
echo " Variations per problem: $NUM_VARIATIONS"
echo "═══════════════════════════════════════════════════════"

# Install dependencies if needed. This is intentionally best-effort (pip's
# stderr is suppressed and failure does not abort), but a failure is now
# reported instead of being swallowed silently.
if ! pip install -q jsonlines tqdm anthropic 2>/dev/null; then
  warn "could not install Python dependencies (jsonlines, tqdm, anthropic); continuing"
fi

# Generate dataset. Arguments are assembled in an array so the optional
# API flag is appended as a proper word rather than relying on an
# unquoted, possibly-empty expansion.
generator_args=(
  --output data/rae_training_data
  --num_variations "$NUM_VARIATIONS"
  --train_split 0.9
)
if [[ -n "$USE_API" ]]; then
  generator_args+=("$USE_API")
fi

python src/dataset_generator.py "${generator_args[@]}"

echo ""
echo "Dataset ready at: data/rae_training_data/"
echo " train.jsonl"
echo " validation.jsonl"
echo " metadata.json"
echo ""
echo "Next: run training with ./scripts/run_training.sh"