File size: 2,150 Bytes
444cefa | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 | #!/bin/bash
# ═══════════════════════════════════════════════════════════════
# RAE Dataset Generation Script
# ═══════════════════════════════════════════════════════════════
#
# Generates RAE-structured training data.
#
# Usage:
#   ./scripts/generate_dataset.sh                     # Template mode (no API)
#   ./scripts/generate_dataset.sh --use-api           # API mode (requires ANTHROPIC_API_KEY)
#   ./scripts/generate_dataset.sh --use-api --large   # Large dataset
# ═══════════════════════════════════════════════════════════════
# Abort on command failure, on use of an unset variable, and on a failure in
# any stage of a pipeline.
set -euo pipefail

# Resolve the project root (the parent of this script's directory) and run
# from there, so the relative paths used below do not depend on the caller's
# working directory.
SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
PROJECT_DIR="$(dirname "$SCRIPT_DIR")"
cd "$PROJECT_DIR"

# Parse args.
#   --use-api  forward --use_api to the generator (API mode)
#   --large    use 5 variations per problem instead of the default 2
USE_API=""
NUM_VARIATIONS=2
for arg in "$@"; do
  case "$arg" in
    --use-api) USE_API="--use_api" ;;
    --large) NUM_VARIATIONS=5 ;;
    *)
      # Unknown arguments are still ignored (as before), but no longer
      # silently: a typo such as --use_api would otherwise quietly fall back
      # to template mode.
      printf 'Warning: ignoring unrecognized argument: %s\n' "$arg" >&2
      ;;
  esac
done

# The usage notes say API mode requires ANTHROPIC_API_KEY. Only warn here,
# because how the generator itself obtains the key is not visible from this
# script.
if [[ -n "$USE_API" && -z "${ANTHROPIC_API_KEY:-}" ]]; then
  printf 'Warning: --use-api was given but ANTHROPIC_API_KEY is not set.\n' >&2
fi

echo "═══════════════════════════════════════════════════════"
echo " RAE DATASET GENERATOR"
echo " Mode: ${USE_API:-template}"
echo " Variations per problem: $NUM_VARIATIONS"
echo "═══════════════════════════════════════════════════════"

# Install dependencies if needed. This is deliberately best-effort (e.g. the
# packages may already be present and the machine offline), so a failure does
# not abort the script; it is reported instead of being swallowed. Using
# "python -m pip" targets the same interpreter that runs the generator below.
if ! python -m pip install -q jsonlines tqdm anthropic 2>/dev/null; then
  printf 'Warning: could not install jsonlines/tqdm/anthropic; continuing.\n' >&2
fi

# Generate dataset. Arguments are collected in an array so the optional API
# flag is passed only when set, without relying on unquoted word-splitting.
generator_args=(
  --output data/rae_training_data
  --num_variations "$NUM_VARIATIONS"
  --train_split 0.9
)
if [[ -n "$USE_API" ]]; then
  generator_args+=("$USE_API")
fi
python src/dataset_generator.py "${generator_args[@]}"

echo ""
echo "Dataset ready at: data/rae_training_data/"
echo " train.jsonl"
echo " validation.jsonl"
echo " metadata.json"
echo ""
echo "Next: run training with ./scripts/run_training.sh"
|