augustocsc
/

gpt2_medium_prefix_682k

Model card Files Files and versions

gpt2_medium_prefix_682k / scripts /aws /train_exp_a.sh

augustocsc's picture

GPT-2 Medium trained on prefix dataset (682K)

3742716 verified 19 days ago

history blame contribute delete

1.61 kB

	#!/bin/bash
	# EXP-A: Training with JSON structured format
	# Uses <|endofex|> as end marker
	#
	# Expects the project checkout at ~/seriguela with a virtualenv in ./venv
	# and the experiment CSVs under ./data/experiments/exp_a_json/.
	# Optional environment: HF_TOKEN, WANDB_API_KEY (exported empty when unset).

	set -euo pipefail

	# Single source of truth for every path/name used below.
	readonly EXPERIMENT_NAME="exp_a_json"
	readonly DATA_DIR="./data/experiments/${EXPERIMENT_NAME}"
	readonly TRAIN_FILE="${DATA_DIR}/train.csv"
	readonly VALIDATION_FILE="${DATA_DIR}/validation.csv"
	readonly OUTPUT_DIR="./output/${EXPERIMENT_NAME}"
	# Single-quoted so the marker reaches the trainer verbatim, with no
	# backslashes: inside double quotes bash keeps a backslash before '|'.
	readonly END_MARKER='<|endofex|>'

	# Abort with a two-line diagnostic on stderr when a required input is missing.
	#   $1 - human-readable label ("Training", "Validation")
	#   $2 - path that must be a regular file
	require_file() {
	  local label=$1
	  local path=$2
	  if [[ ! -f "${path}" ]]; then
	    printf 'ERROR: %s data not found!\n' "${label}" >&2
	    printf 'Expected: %s\n' "${path}" >&2
	    exit 1
	  fi
	}

	echo "=============================================="
	echo "EXP-A: JSON Format Training"
	echo "=============================================="

	cd ~/seriguela

	# Activate virtual environment.
	# Some activation scripts read variables that may be unset, so nounset is
	# relaxed only for the duration of the source.
	set +u
	# shellcheck disable=SC1091 -- venv is created outside this repository
	source venv/bin/activate
	set -u

	# Check data exists (both files are handed to the trainer below).
	require_file "Training" "${TRAIN_FILE}"
	require_file "Validation" "${VALIDATION_FILE}"

	# Count samples.
	# NOTE: this is a raw line count; if the CSV has a header row it is included.
	TRAIN_COUNT=$(wc -l < "${TRAIN_FILE}")
	echo "Training samples: $TRAIN_COUNT"

	# Training configuration
	export WANDB_PROJECT="seriguela_experiments"
	export HF_TOKEN="${HF_TOKEN:-}"
	export WANDB_API_KEY="${WANDB_API_KEY:-}"

	# Run training
	echo ""
	echo "Starting training..."
	echo "Output: ${OUTPUT_DIR}"
	echo ""

	# Under `set -e` a non-zero exit from the trainer stops the script here, so
	# the completion banner below is only printed after a successful run.
	python scripts/train_experiment.py \
	  --experiment_name "${EXPERIMENT_NAME}" \
	  --train_file "${TRAIN_FILE}" \
	  --validation_file "${VALIDATION_FILE}" \
	  --output_dir "${OUTPUT_DIR}" \
	  --end_marker "${END_MARKER}" \
	  --num_train_epochs 3 \
	  --per_device_train_batch_size 8 \
	  --gradient_accumulation_steps 4 \
	  --learning_rate 5e-5 \
	  --block_size 256 \
	  --fp16 \
	  --wandb_project "${WANDB_PROJECT}" \
	  --wandb_run_name "${EXPERIMENT_NAME}_$(date +%Y%m%d_%H%M%S)"

	echo ""
	echo "=============================================="
	echo "EXP-A Training Complete!"
	echo "=============================================="
	echo "Model saved to: ${OUTPUT_DIR}"