# qwenillustrious / train /start_training.sh
# lsmpp's picture
# Add files using upload-large-folder tool
# d926b4c verified
#!/bin/bash
#
# Quick Start Training Script for QwenIllustrious
#
# Optionally precomputes embeddings, then launches the training script that
# sits in the same directory as this file. Settings are taken from environment
# variables, with defaults applied further down.
#
# NOTE: this header must stay a '#' comment block. A Python-style """ block is
# not a comment in Bash: it is parsed as one quoted word and executed as a
# command, which fails with "command not found" on every run.

# Base paths: the directory containing this script, and its parent directory.
SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
PROJECT_ROOT="$(dirname "$SCRIPT_DIR")"
# Default parameters. Each ':=' expansion assigns the default only when the
# variable is unset or empty, so a value supplied through the environment wins.
: "${QWEN_MODEL_PATH:=models/Qwen3-Embedding-0.6B}"
: "${SDXL_MODEL_PATH:=models/waiNSFWIllustrious_v140.safetensors}"
: "${DATASET_PATH:=${PROJECT_ROOT}/illustrious_generated}"
: "${OUTPUT_DIR:=${PROJECT_ROOT}/output/qwen_illustrious}"
: "${CACHE_DIR:=${PROJECT_ROOT}/cache}"

# Training parameters.
: "${BATCH_SIZE:=4}"
: "${LEARNING_RATE:=1e-4}"
: "${NUM_EPOCHS:=10}"
: "${LORA_RANK:=64}"

# Mixed precision and gradient settings.
: "${MIXED_PRECISION:=fp16}"
: "${GRADIENT_ACCUMULATION_STEPS:=1}"

# Precompute-embeddings option.
: "${PRECOMPUTE_EMBEDDINGS:=true}"
# Show the effective configuration before any work starts.
printf '%s\n' \
  "=== QwenIllustrious Training Setup ===" \
  "Qwen Model: $QWEN_MODEL_PATH" \
  "SDXL Model: $SDXL_MODEL_PATH" \
  "Dataset: $DATASET_PATH" \
  "Output: $OUTPUT_DIR" \
  "Cache: $CACHE_DIR" \
  "Batch Size: $BATCH_SIZE" \
  "Learning Rate: $LEARNING_RATE" \
  "Epochs: $NUM_EPOCHS" \
  "LoRA Rank: $LORA_RANK" \
  "Mixed Precision: $MIXED_PRECISION" \
  "Precompute Embeddings: $PRECOMPUTE_EMBEDDINGS" \
  "=================================="
# Preflight checks: stop with exit status 1 as soon as a required input is
# missing. The two model paths only need to exist (file or directory); the
# dataset path must be a directory.
[ -e "$QWEN_MODEL_PATH" ] || {
  echo "Error: Qwen model not found at $QWEN_MODEL_PATH"
  echo "Please set QWEN_MODEL_PATH environment variable or place model in models/ directory"
  exit 1
}

[ -e "$SDXL_MODEL_PATH" ] || {
  echo "Error: SDXL model not found at $SDXL_MODEL_PATH"
  echo "Please set SDXL_MODEL_PATH environment variable or place model in models/ directory"
  exit 1
}

[ -d "$DATASET_PATH" ] || {
  echo "Error: Dataset not found at $DATASET_PATH"
  echo "Please set DATASET_PATH environment variable"
  exit 1
}
# Create the output and cache directories (including missing parents).
mkdir -p "$OUTPUT_DIR" "$CACHE_DIR"
# Step 1: precompute embeddings. Runs only when PRECOMPUTE_EMBEDDINGS is
# exactly "true"; any failure of the precompute script aborts with status 1.
if [ "$PRECOMPUTE_EMBEDDINGS" = "true" ]; then
  echo ""
  echo "=== Step 1: Precomputing Embeddings ==="

  # The batch size of this pass can be overridden with PRECOMPUTE_BATCH_SIZE;
  # when unset or empty it stays at 8.
  if ! python "$SCRIPT_DIR/precompute_embeddings.py" \
    --qwen_model_path "$QWEN_MODEL_PATH" \
    --sdxl_model_path "$SDXL_MODEL_PATH" \
    --dataset_path "$DATASET_PATH" \
    --cache_dir "$CACHE_DIR" \
    --batch_size "${PRECOMPUTE_BATCH_SIZE:-8}" \
    --mixed_precision "$MIXED_PRECISION"; then
    echo "Error: Precomputation failed!"
    exit 1
  fi

  echo "Precomputation completed successfully!"
fi
# Step 2: start training.
echo ""
echo "=== Step 2: Starting Training ==="

# Build the command as an array so that each value reaches the training script
# as exactly one argument, whatever characters it contains. Assembling a string
# and running it through eval re-parses the values as shell code: a space in
# SCRIPT_DIR splits the script path, and a single quote in any path breaks the
# quoting.
TRAIN_CMD=(
  python "$SCRIPT_DIR/train_qwen_illustrious.py"
  --qwen_model_path "$QWEN_MODEL_PATH"
  --sdxl_model_path "$SDXL_MODEL_PATH"
  --dataset_path "$DATASET_PATH"
  --output_dir "$OUTPUT_DIR"
  --train_batch_size "$BATCH_SIZE"
  --learning_rate "$LEARNING_RATE"
  --num_train_epochs "$NUM_EPOCHS"
  --lora_rank "$LORA_RANK"
  --mixed_precision "$MIXED_PRECISION"
  --gradient_accumulation_steps "$GRADIENT_ACCUMULATION_STEPS"
)

# The cache directory is only passed along when precomputed embeddings are used.
if [ "$PRECOMPUTE_EMBEDDINGS" = "true" ]; then
  TRAIN_CMD+=(--precompute_embeddings --cache_dir "$CACHE_DIR")
fi

# Additional fixed options.
TRAIN_CMD+=(
  --gradient_checkpointing
  --checkpointing_steps 500
  --validation_epochs 2
  --report_to tensorboard
)

# Log the command in shell-quoted form so it can be copied and re-run as-is.
echo "Running command:"
printf '%q ' "${TRAIN_CMD[@]}"
echo ""
echo ""

# Run the training. Keep this as the final statement of the step: the code that
# follows reads $? to decide between the success and failure reports.
"${TRAIN_CMD[@]}"
# Report the outcome of the command that ran immediately before this point.
# $? has to be read here, before any other command overwrites it.
if [ $? -ne 0 ]; then
  echo ""
  echo "=== Training Failed! ==="
  exit 1
fi

# Success: list where the results were written.
printf '%s\n' \
  "" \
  "=== Training Completed Successfully! ===" \
  "Model saved to: $OUTPUT_DIR" \
  "Adapter weights: $OUTPUT_DIR/adapter/" \
  "LoRA weights: $OUTPUT_DIR/lora/" \
  "Logs: $OUTPUT_DIR/logs/"