# ═══════════════════════════════════════════════════════════════
# RAE Dataset Generation Script
# ═══════════════════════════════════════════════════════════════
#
# Generates RAE-structured training data.
#
# Usage:
#   ./scripts/generate_dataset.sh                     # Template mode (no API)
#   ./scripts/generate_dataset.sh --use-api           # API mode (requires ANTHROPIC_API_KEY)
#   ./scripts/generate_dataset.sh --use-api --large   # Large dataset (5 variations per problem instead of 2)
# ═══════════════════════════════════════════════════════════════
# Strict mode: abort on command failure (-e), on use of an unset variable (-u),
# and when any stage of a pipeline fails (pipefail).
set -euo pipefail

# Resolve the directory holding this script, then run everything from its
# parent so the relative paths used below (src/, data/) resolve no matter
# where the script was invoked from. `--` keeps a path beginning with '-'
# from being parsed as an option.
SCRIPT_DIR="$(cd -- "$(dirname -- "${BASH_SOURCE[0]}")" && pwd)"
PROJECT_DIR="$(dirname -- "$SCRIPT_DIR")"
cd -- "$PROJECT_DIR"
# Parse args
#   --use-api  forward --use_api to the generator (API mode)
#   --large    generate 5 variations per problem instead of the default 2
# Anything else is reported on stderr and otherwise ignored, so a mistyped
# flag does not silently fall back to the defaults.
USE_API=""
NUM_VARIATIONS=2
for arg in "$@"; do
    case "$arg" in
        --use-api) USE_API="--use_api" ;;
        --large)   NUM_VARIATIONS=5 ;;
        *)         echo "Warning: ignoring unrecognized argument: $arg" >&2 ;;
    esac
done
# Print a banner showing the effective settings. Mode shows the flag that will
# be forwarded to the generator, or "template" when no API flag was given.
BANNER_RULE="═══════════════════════════════════════════════════════"
echo "$BANNER_RULE"
echo " RAE DATASET GENERATOR"
echo " Mode: ${USE_API:-template}"
echo " Variations per problem: $NUM_VARIATIONS"
echo "$BANNER_RULE"
# Install dependencies if needed (best effort).
# `python -m pip` installs into the same interpreter that runs the generator
# below. A failed install is not fatal, but it is reported so a later import
# error in the generator is easier to diagnose.
if ! python -m pip install -q jsonlines tqdm anthropic 2>/dev/null; then
    echo "Warning: could not install jsonlines/tqdm/anthropic; continuing with the current environment." >&2
fi
# Generate dataset
# ${USE_API:+"$USE_API"} expands to the quoted flag when USE_API is non-empty
# and to nothing at all otherwise, so no empty argument reaches the generator
# and no unquoted word-splitting is involved.
python src/dataset_generator.py \
    --output data/rae_training_data \
    --num_variations "$NUM_VARIATIONS" \
    --train_split 0.9 \
    ${USE_API:+"$USE_API"}
# Final summary: where the output was written and what to run next.
# With `set -e` active, this is reached only if the generator exited successfully.
# The quoted here-doc delimiter keeps the text literal (no expansion).
cat <<'EOF'

Dataset ready at: data/rae_training_data/
 train.jsonl
 validation.jsonl
 metadata.json

Next: run training with ./scripts/run_training.sh
EOF