LLM-GO / scripts /train.sh
LesterCerioli's picture
Building first version from Golang LLM
a58ece3
Raw
History Blame Contribute Delete
1.58 kB
#!/usr/bin/env bash
# scripts/train.sh — runs GoLLM training.
#
# Every setting below can be overridden from the environment; the value
# shown here is applied only when the variable is unset or empty.
set -euo pipefail

# Directories
: "${DATA_DIR:=data/processed}"
: "${CKPT_DIR:=checkpoints}"
: "${LOG_DIR:=logs}"

# Model and optimisation settings
: "${MODEL_SIZE:=medium}"     # small | medium | large | xl
: "${BATCH_SIZE:=32}"
: "${MAX_STEPS:=100000}"
: "${WARMUP_STEPS:=2000}"
: "${GRAD_ACCUM:=4}"
: "${PRECISION:=bfloat16}"    # float32 | float16 | bfloat16

# Device selection
: "${GPUS:=-1}"               # -1 = all, 0 = GPU 0, 1 = GPU 1...
# Verify that preprocessed data exists before launching the trainer.
#
# Counts the *.tfrecord entries found under $DATA_DIR/train. A missing or
# unreadable directory makes `find` exit non-zero; because the script runs
# with `set -e -o pipefail`, that failure would otherwise abort the script
# right here, silently (find's stderr is discarded), before the guidance
# below is printed. `|| true` turns that case into a count of zero instead.
TRAIN_SHARDS=$({ find "$DATA_DIR/train" -name "*.tfrecord" 2>/dev/null || true; } | wc -l)
# Some `wc` implementations pad the number with blanks; keep digits only so
# the value is clean when compared and when printed later.
TRAIN_SHARDS=${TRAIN_SHARDS//[[:space:]]/}
if [ "$TRAIN_SHARDS" -eq 0 ]; then
  # Diagnostics go to stderr so they are not mixed into captured stdout.
  echo "ERRO: Nenhum TFRecord encontrado em $DATA_DIR/train." >&2
  echo " Execute scripts/preprocess.sh primeiro." >&2
  exit 1
fi
# Startup banner: print the effective configuration, one line per argument,
# followed by a blank separator line.
printf '%s\n' \
  "==> Iniciando treinamento GoLLM" \
  " model_size=$MODEL_SIZE batch=$BATCH_SIZE steps=$MAX_STEPS" \
  " precision=$PRECISION gpus=$GPUS grad_accum=$GRAD_ACCUM" \
  " train_shards=$TRAIN_SHARDS" \
  ""
# XLA JIT compilation (CPU/GPU acceleration).
# NOTE(review): TF_ENABLE_ONEDNN_OPTS=0 presumably switches off TensorFlow's
# oneDNN optimisations — confirm the reason with the trainer's maintainers.
TF_XLA_FLAGS="--tf_xla_auto_jit=2"
TF_ENABLE_ONEDNN_OPTS=0
# Both values are exported so that child processes inherit them.
export TF_XLA_FLAGS TF_ENABLE_ONEDNN_OPTS
# Launch the trainer, forwarding every setting as an explicit flag. The
# argument list is assembled first so each flag/value pair sits on one line.
train_args=(
  --model-size "$MODEL_SIZE"
  --data-dir "$DATA_DIR"
  --ckpt-dir "$CKPT_DIR"
  --log-dir "$LOG_DIR"
  --batch-size "$BATCH_SIZE"
  --max-steps "$MAX_STEPS"
  --warmup-steps "$WARMUP_STEPS"
  --grad-accum "$GRAD_ACCUM"
  --precision "$PRECISION"
  --gpus "$GPUS"
)
llm-go-train "${train_args[@]}"

# With `set -e` active, this point is reached only when the trainer exits
# with status 0. Print a blank line, then the completion notice.
printf '\n%s\n' "✅ Treinamento concluído. Checkpoints em $CKPT_DIR"