File size: 2,271 Bytes
3742716
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
#!/bin/bash
# Complete setup and training script for EXP-A (JSON format)
# Run this on a fresh AWS instance
#
# Pipeline (all paths relative to /home/ubuntu/seriguela):
#   [1/3] scripts/data/prepare_experiment_data.py -> ./data/experiments/exp_a_json/
#   [2/3] scripts/train_experiment.py             -> ./output/exp_a_json
#   [3/3] scripts/evaluate_experiments.py         -> ./output/exp_a_json/evaluation_results.json
#
# Any failing step aborts the script, so the completion marker
# /home/ubuntu/.exp_a_complete is only created when all three steps succeed.

set -eo pipefail

readonly PROJECT_DIR="/home/ubuntu/seriguela"
readonly DATA_DIR="./data/experiments/exp_a_json"
readonly OUTPUT_DIR="./output/exp_a_json"
readonly COMPLETE_MARKER="/home/ubuntu/.exp_a_complete"

# Drop a marker left behind by a previous run so that a failure in this run
# cannot be mistaken for success by whatever watches for the marker.
rm -f -- "$COMPLETE_MARKER"

# NOTE(review): the banner announces a <|endofex|> marker, but the training
# step below passes --end_marker '"}' -- confirm which one is intended.
echo "=============================================="
echo "EXP-A: Complete Setup and Training"
echo "JSON Format with <|endofex|> marker"
echo "=============================================="
echo "Started: $(date)"
echo ""

cd "$PROJECT_DIR" || { echo "ERROR: cannot cd to $PROJECT_DIR" >&2; exit 1; }

# Activate environment
source venv/bin/activate

# nounset is switched on only after activation: the activate script is not
# under our control and may read variables that are unset (assumption; harmless
# if it does not).
set -u

# Step 1: Prepare data
echo "[1/3] Preparing training data..."
echo "This will download from HuggingFace Hub and convert to JSON format"
echo ""

mkdir -p data/experiments

python scripts/data/prepare_experiment_data.py \
    --dataset_repo_id augustocsc/sintetico_natural \
    --data_dir 700K \
    --data_column i_prompt_n \
    --output_base_dir ./data/experiments

# Verify data: both files are handed to the training step, so both must exist.
for required_file in "$DATA_DIR/train.csv" "$DATA_DIR/validation.csv"; do
    if [[ ! -f "$required_file" ]]; then
        echo "ERROR: Data preparation failed!" >&2
        echo "Missing file: $required_file" >&2
        exit 1
    fi
done

# NOTE(review): wc -l counts lines, so if train.csv starts with a header row
# the number printed here is one higher than the sample count -- confirm.
TRAIN_COUNT=$(wc -l < "$DATA_DIR/train.csv")
echo "Training samples: $TRAIN_COUNT"

# Step 2: Run training
echo ""
echo "[2/3] Starting training..."
echo "Output: $OUTPUT_DIR"
echo ""

# The W&B run name carries a timestamp so repeated runs get distinct names.
python scripts/train_experiment.py \
    --experiment_name "exp_a_json" \
    --train_file "$DATA_DIR/train.csv" \
    --validation_file "$DATA_DIR/validation.csv" \
    --output_dir "$OUTPUT_DIR" \
    --json_format \
    --end_marker '"}' \
    --num_train_epochs 3 \
    --per_device_train_batch_size 8 \
    --gradient_accumulation_steps 4 \
    --learning_rate 5e-5 \
    --block_size 256 \
    --fp16 \
    --wandb_project seriguela_experiments \
    --wandb_run_name "exp_a_json_$(date +%Y%m%d_%H%M%S)"

# Step 3: Evaluate
echo ""
echo "[3/3] Evaluating model..."
echo ""

python scripts/evaluate_experiments.py \
    --model_path "$OUTPUT_DIR" \
    --experiment_type json \
    --num_samples 200 \
    --output_file "$OUTPUT_DIR/evaluation_results.json"

echo ""
echo "=============================================="
echo "EXP-A Complete!"
echo "=============================================="
echo "Finished: $(date)"
echo "Model: $OUTPUT_DIR"
echo "Results: $OUTPUT_DIR/evaluation_results.json"

# Create completion marker
touch "$COMPLETE_MARKER"