rae-training / scripts /generate_dataset.sh
TrueV1sion123's picture
Upload scripts/generate_dataset.sh with huggingface_hub
444cefa verified
#!/bin/bash
# ═══════════════════════════════════════════════════════════════
# RAE Dataset Generation Script
# ═══════════════════════════════════════════════════════════════
#
# Generates RAE-structured training data.
#
# Usage:
# ./scripts/generate_dataset.sh # Template mode (no API)
# ./scripts/generate_dataset.sh --use-api # API mode (requires ANTHROPIC_API_KEY)
# ./scripts/generate_dataset.sh --use-api --large # Large dataset
# ═══════════════════════════════════════════════════════════════
# Strict mode: stop on a failing command, on use of an unset variable, and
# when any stage of a pipeline fails.
set -o errexit
set -o nounset
set -o pipefail

# Resolve the absolute directory that holds this script, then switch to its
# parent directory so the relative paths used later resolve from there.
SCRIPT_DIR="$(
  cd "$(dirname "${BASH_SOURCE[0]}")" && pwd
)"
PROJECT_DIR="$(dirname "$SCRIPT_DIR")"
cd "$PROJECT_DIR"
# Parse args
#######################################
# Translate the script's command-line flags into generator settings.
# Globals:   USE_API (written)        - "--use_api" if --use-api was given,
#                                       otherwise the empty string
#            NUM_VARIATIONS (written) - 5 if --large was given, otherwise 2
# Arguments: the command-line arguments to interpret
# Outputs:   one warning line on stderr per unrecognized argument
#######################################
parse_args() {
  USE_API=""
  NUM_VARIATIONS=2

  local arg
  for arg in "$@"; do
    case "$arg" in
      --use-api)
        USE_API="--use_api"
        ;;
      --large)
        NUM_VARIATIONS=5
        ;;
      *)
        # A mistyped flag (e.g. --use_api) used to be dropped silently, which
        # left the run in its default mode with no hint. Report it, but keep
        # going so existing invocations with extra arguments still work.
        printf 'Warning: ignoring unrecognized argument: %s\n' "$arg" >&2
        ;;
    esac
  done
}

parse_args "$@"
# Print a banner that summarises the run: the mode (the API flag when set,
# otherwise the word "template") and the number of variations per problem.
banner_rule="═══════════════════════════════════════════════════════"
printf '%s\n' \
  "$banner_rule" \
  " RAE DATASET GENERATOR" \
  " Mode: ${USE_API:-template}" \
  " Variations per problem: $NUM_VARIATIONS" \
  "$banner_rule"
# Install dependencies if needed
# Best-effort step: a failed install must not abort the run (the packages may
# already be present), but the failure is reported instead of being hidden.
# `python -m pip` installs into the same interpreter that is used to run the
# generator, which a bare `pip` on PATH does not guarantee.
if ! python -m pip install -q jsonlines tqdm anthropic 2>/dev/null; then
  echo "Warning: could not install jsonlines/tqdm/anthropic; continuing with the packages already installed." >&2
fi
# Generate dataset
# The arguments are collected in an array so the optional API flag is passed
# as its own word only when it is set, instead of relying on an unquoted
# expansion that is subject to word splitting and globbing.
generator_args=(
  --output data/rae_training_data
  --num_variations "$NUM_VARIATIONS"
  --train_split 0.9
)
if [[ -n "${USE_API:-}" ]]; then
  generator_args+=("$USE_API")
fi

python src/dataset_generator.py "${generator_args[@]}"
# Closing summary: where the dataset was written, the files listed for it,
# and the suggested next command. The quoted delimiter keeps the text literal.
cat <<'SUMMARY'

Dataset ready at: data/rae_training_data/
 train.jsonl
 validation.jsonl
 metadata.json

Next: run training with ./scripts/run_training.sh
SUMMARY