Manthan-T1 / scripts /kaggle_train_all.sh
Atah Alam
Manthan-T1 clean code-only
7f7a72e
#!/usr/bin/env bash
set -o errexit -o nounset -o pipefail
# Kaggle one-shot runner: runs Stage 1 (pretrain/alignment) and then
# Stage 2 (instruct finetune) back to back.
# Written with Kaggle 2xT4 in mind, but intended to run on other CUDA machines too.
############################################
# User config (edit these if you want)
#
# Every value can also be supplied through the environment; a variable that
# is unset or empty receives the default shown here.
############################################
# HF repo or local path containing Manthan remote-code
MANTHAN_MODEL="${MANTHAN_MODEL:-zyxcisss/Manthan-T1}"
# Base LLM checkpoint
TEXT_MODEL="${TEXT_MODEL:-Qwen/Qwen3-0.6B-Base}"
# Dataset used for the pretrain/alignment stage
STAGE1_DS="${STAGE1_DS:-liuhaotian/LLaVA-CC3M-Pretrain-595K}"
# Dataset used for the instruction finetune stage
STAGE2_DS="${STAGE2_DS:-liuhaotian/LLaVA-Instruct-150K}"
# Root directory under which all outputs are saved
OUT_BASE="${OUT_BASE:-/kaggle/working/manthan_runs}"
# Per-stage output directories; by default they live under OUT_BASE, so they
# must be resolved after OUT_BASE.
STAGE1_OUT="${STAGE1_OUT:-${OUT_BASE}/stage1}"
STAGE2_OUT="${STAGE2_OUT:-${OUT_BASE}/stage2}"
# Training knobs (safe defaults for 2xT4).
# Each knob falls back to its default when the variable is unset or empty.
# They are forwarded to the trainer as --max_length, --image_size,
# --batch_size, --grad_accum, --lr and --epochs respectively.
: "${MAX_LENGTH:=2048}"
: "${IMAGE_SIZE:=384}"
: "${BATCH_SIZE:=1}"
: "${GRAD_ACCUM:=32}"
: "${LR:=1e-4}"
: "${EPOCHS_STAGE1:=1}"
: "${EPOCHS_STAGE2:=1}"
# Optional dataset limits. Set a variable to the empty string (for example
# `LIMIT_STAGE1= bash <this script>`) to train on the full dataset.
# These deliberately use `=` instead of `:=`: `:=` would treat an explicitly
# empty value as "unset" and silently restore the default, which made it
# impossible to disable a limit from the environment.
: "${LIMIT_STAGE1=20000}"
: "${LIMIT_STAGE2=150000}"
# LoRA switch: the value 1 adds --use_lora to the trainer command line; any
# other value omits it (projector-only training, set USE_LORA=0).
: "${USE_LORA:=1}"
# UPLOAD=1 makes the script install huggingface_hub after training and print a
# reminder; the upload itself is not performed by this script.
: "${UPLOAD:=0}"
############################################
# Environment setup
############################################
# Show the GPU inventory when nvidia-smi is on PATH. A failing nvidia-smi call
# is tolerated (`|| true`) so that it cannot abort the run under `set -e`.
if command -v nvidia-smi >/dev/null 2>&1; then
  echo "GPU found:"
  nvidia-smi || true
else
  echo "WARNING: nvidia-smi not found. This script expects a CUDA runtime (Kaggle)."
fi

# Persist caches on Kaggle.
# HF_HOME defaults to /kaggle/working/hf. The two library-specific cache
# directories default to sub-directories of HF_HOME, so overriding HF_HOME
# alone (for example on a non-Kaggle machine) relocates every cache instead of
# leaving two of them pointing at /kaggle/working/hf. Any of the three
# variables can still be overridden individually.
export HF_HOME="${HF_HOME:-/kaggle/working/hf}"
export TRANSFORMERS_CACHE="${TRANSFORMERS_CACHE:-${HF_HOME}/transformers}"
export HF_DATASETS_CACHE="${HF_DATASETS_CACHE:-${HF_HOME}/datasets}"

# Create the cache directories and the output root up front.
mkdir -p "${HF_HOME}" "${TRANSFORMERS_CACHE}" "${HF_DATASETS_CACHE}" "${OUT_BASE}"
############################################
# Dependencies
############################################
# Record which interpreter the rest of the run will use.
python -c 'import sys
print("python:", sys.version)'

# Keep installs minimal and reproducible enough for Kaggle.
python -m pip install --upgrade pip
python -m pip install --upgrade "transformers>=4.45" accelerate datasets peft

# Unsloth is optional: a failed install is ignored on purpose so the run
# continues. (The original note says the script falls back to PEFT when
# Unsloth is missing; that fallback is not visible from this file.)
if ! python -m pip install --upgrade unsloth; then
  :
fi
############################################
# Helper to add optional args
############################################
#######################################
# Emit the optional dataset-limit flag for the trainer.
# Arguments: $1 - limit value; an empty string means "no limit"
# Outputs:   "--limit <value>" on one line, or nothing when $1 is empty
# Returns:   0
#######################################
maybe_limit_args() {
  local requested_limit="$1"
  if [[ -z "${requested_limit}" ]]; then
    return 0
  fi
  printf '%s %s\n' "--limit" "${requested_limit}"
}
#######################################
# Emit the optional LoRA flag for the trainer.
# Globals:   USE_LORA (read)
# Outputs:   "--use_lora" when USE_LORA is exactly "1", otherwise an empty line
# Returns:   0
#######################################
maybe_lora_args() {
  case "${USE_LORA}" in
    1)
      echo "--use_lora"
      ;;
    *)
      echo ""
      ;;
  esac
}
############################################
# Stage 1
############################################
echo "==== Stage 1: projector alignment/pretrain ===="

# Fixed arguments are grouped in two arrays so the optional flags produced by
# the helpers land in the same argv positions as before: the LoRA flag after
# --output_dir, the limit flag at the very end.
stage1_leading_args=(
  --stage stage1
  --manthan_model "${MANTHAN_MODEL}"
  --text_model "${TEXT_MODEL}"
  --dataset "${STAGE1_DS}"
  --output_dir "${STAGE1_OUT}"
)
stage1_trailing_args=(
  --max_length "${MAX_LENGTH}"
  --image_size "${IMAGE_SIZE}"
  --batch_size "${BATCH_SIZE}"
  --grad_accum "${GRAD_ACCUM}"
  --lr "${LR}"
  --epochs "${EPOCHS_STAGE1}"
)

# The helper substitutions are intentionally unquoted: their output must be
# split into separate words (or vanish entirely when empty).
# shellcheck disable=SC2046
python scripts/train_unsloth_kaggle.py \
  "${stage1_leading_args[@]}" \
  $(maybe_lora_args) \
  "${stage1_trailing_args[@]}" \
  $(maybe_limit_args "${LIMIT_STAGE1}")
############################################
# Stage 2
############################################
echo "==== Stage 2: instruction finetune ===="

# Same layout as the first trainer call: fixed arguments in two arrays, with
# the optional LoRA flag between them and the optional limit flag last, so the
# resulting argv order is unchanged.
stage2_leading_args=(
  --stage stage2
  --manthan_model "${MANTHAN_MODEL}"
  --text_model "${TEXT_MODEL}"
  --dataset "${STAGE2_DS}"
  --output_dir "${STAGE2_OUT}"
)
stage2_trailing_args=(
  --max_length "${MAX_LENGTH}"
  --image_size "${IMAGE_SIZE}"
  --batch_size "${BATCH_SIZE}"
  --grad_accum "${GRAD_ACCUM}"
  --lr "${LR}"
  --epochs "${EPOCHS_STAGE2}"
)

# The helper substitutions are intentionally unquoted: their output must be
# split into separate words (or vanish entirely when empty).
# shellcheck disable=SC2046
python scripts/train_unsloth_kaggle.py \
  "${stage2_leading_args[@]}" \
  $(maybe_lora_args) \
  "${stage2_trailing_args[@]}" \
  $(maybe_limit_args "${LIMIT_STAGE2}")
# Final summary: completion banner followed by the two output directories,
# one item per line.
printf '%s\n' \
  "==== Done ====" \
  "Stage1 outputs: ${STAGE1_OUT}" \
  "Stage2 outputs: ${STAGE2_OUT}"
############################################
# Optional upload (manual control)
############################################
# Runs only when UPLOAD is exactly "1". Note that nothing is uploaded here:
# the branch installs huggingface_hub and prints a reminder that the contents
# of OUT_BASE can then be uploaded by hand.
case "${UPLOAD}" in
  1)
    echo "UPLOAD=1: attempting to upload artifacts (requires HF auth)."
    python -m pip install -U huggingface_hub
    echo "You can now upload ${OUT_BASE} with your preferred workflow."
    ;;
esac