TrueV1sion123 committed on
Commit
444cefa
·
verified ·
1 Parent(s): 275f774

Upload scripts/generate_dataset.sh with huggingface_hub

Browse files
Files changed (1) hide show
  1. scripts/generate_dataset.sh +54 -0
scripts/generate_dataset.sh ADDED
@@ -0,0 +1,54 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
#!/bin/bash
# ═══════════════════════════════════════════════════════════════
# RAE Dataset Generation Script
# ═══════════════════════════════════════════════════════════════
#
# Generates RAE-structured training data by running
# src/dataset_generator.py from the project root (the parent of the
# directory that contains this script).
#
# Usage:
#   ./scripts/generate_dataset.sh                    # Template mode (no API)
#   ./scripts/generate_dataset.sh --use-api          # API mode (requires ANTHROPIC_API_KEY)
#   ./scripts/generate_dataset.sh --use-api --large  # Large dataset
#
# Options:
#   --use-api   Forward --use_api to the generator.
#   --large     Use 5 variations per problem instead of the default 2.
#
# Unrecognized arguments are reported on stderr and otherwise ignored.
# ═══════════════════════════════════════════════════════════════

set -euo pipefail

SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
PROJECT_DIR="$(dirname "$SCRIPT_DIR")"

# Every relative path below (src/, data/) is resolved against the project root.
cd "$PROJECT_DIR"

# Parse args
USE_API=""
NUM_VARIATIONS=2

for arg in "$@"; do
  case "$arg" in
    --use-api)
      USE_API="--use_api"
      ;;
    --large)
      NUM_VARIATIONS=5
      ;;
    *)
      # This script takes --use-api (hyphen) but forwards --use_api
      # (underscore), so a mistyped flag is easy to make; say so instead of
      # silently falling back to template mode.
      printf 'warning: ignoring unrecognized argument: %s\n' "$arg" >&2
      ;;
  esac
done

# API mode is documented above as requiring ANTHROPIC_API_KEY. Only warn:
# whether the generator can obtain the key some other way is not visible here.
if [[ -n "$USE_API" && -z "${ANTHROPIC_API_KEY:-}" ]]; then
  printf 'warning: --use-api given but ANTHROPIC_API_KEY is not set\n' >&2
fi

echo "═══════════════════════════════════════════════════════"
echo " RAE DATASET GENERATOR"
echo " Mode: ${USE_API:-template}"
echo " Variations per problem: $NUM_VARIATIONS"
echo "═══════════════════════════════════════════════════════"

# Install dependencies if needed.
# Best-effort: a failed install must not abort the run (the packages may
# already be present), but the failure is reported rather than hidden.
# "python -m pip" targets the same interpreter that runs the generator below.
if ! python -m pip install -q jsonlines tqdm anthropic; then
  printf 'warning: dependency installation failed; continuing anyway\n' >&2
fi

# Generate dataset.
# The command is assembled as an array so the optional API flag is added as
# its own word only when requested, with no reliance on unquoted expansion.
generator_cmd=(
  python src/dataset_generator.py
  --output data/rae_training_data
  --num_variations "$NUM_VARIATIONS"
  --train_split 0.9
)
if [[ -n "$USE_API" ]]; then
  generator_cmd+=("$USE_API")
fi

"${generator_cmd[@]}"

echo ""
echo "Dataset ready at: data/rae_training_data/"
echo " train.jsonl"
echo " validation.jsonl"
echo " metadata.json"
echo ""
echo "Next: run training with ./scripts/run_training.sh"