# content-engine/config/models.yaml
# Training Model Registry
# Defines base models available for LoRA training with their optimal parameters
training_models:
  # FLUX.2 - Best for photorealistic images (recommended for realistic person)
  flux2_dev:
    name: "FLUX.2 Dev (Recommended)"
    description: "Latest FLUX model, 32B params, best quality for realistic person. Uses Mistral text encoder."
    hf_repo: "black-forest-labs/FLUX.2-dev"
    hf_filename: "flux2-dev.safetensors"
    model_type: "flux2"
    training_framework: "musubi-tuner"
    resolution: 1024
    # Prodigy is self-tuning, so the nominal learning rate stays at 1.0.
    learning_rate: 1.0
    network_rank: 64
    network_alpha: 32
    optimizer: "prodigy"
    lr_scheduler: "constant"
    timestep_sampling: "flux2_shift"
    network_module: "networks.lora_flux_2"
    # NOTE(review): 50 steps is far below the 1500-2000 used by the other
    # entries — confirm this is intentional (e.g. a smoke-test default) and
    # not a leftover debug value.
    max_train_steps: 50
    fp8_base: true
    gradient_checkpointing: true
    use_case: "images"
    vram_required_gb: 48
    recommended_gpu: "NVIDIA RTX A6000"
    recommended_images: "15-30 high quality photos with detailed captions"
    training_script: "flux_2_train_network.py"
    # Model paths on network volume:
    #   DiT: /workspace/models/FLUX.2-dev/flux2-dev.safetensors
    #   VAE: /workspace/models/FLUX.2-dev/vae/diffusion_pytorch_model.safetensors
    #   Text encoder: /workspace/models/FLUX.2-dev/text_encoder/model-00001-of-00010.safetensors

  # FLUX.1 - Previous generation, lower VRAM requirement than FLUX.2
  flux1_dev:
    name: "FLUX.1 Dev"
    description: "Previous gen FLUX, still excellent for realistic person LoRAs"
    hf_repo: "black-forest-labs/FLUX.1-dev"
    hf_filename: "flux1-dev.safetensors"
    model_type: "flux"
    resolution: 768
    # Exponent floats carry an explicit decimal point: bare "4e-4" loads as a
    # *string* under YAML 1.1 resolvers such as PyYAML.
    learning_rate: 4.0e-4
    text_encoder_lr: 4.0e-5
    network_rank: 32
    network_alpha: 16
    clip_skip: 1
    optimizer: "AdamW8bit"
    lr_scheduler: "cosine"
    min_snr_gamma: 5
    max_train_steps: 1500
    use_case: "images"
    vram_required_gb: 24
    recommended_images: "15-30 high quality photos"
    training_script: "flux_train_network.py"

  # WAN 2.2 - Text-to-Video LoRA training (14B params, uses musubi-tuner)
  wan22_t2v:
    name: "WAN 2.2 T2V (14B)"
    description: "WAN 2.2 text-to-video model. Trains natural-looking video LoRAs. Requires A100 80GB."
    model_type: "wan22"
    training_framework: "musubi-tuner"
    training_script: "wan_train_network.py"
    network_module: "networks.lora_wan"
    resolution: 512
    # Decimal point required for portable float parsing (see note above on flux1_dev).
    learning_rate: 2.0e-4
    network_rank: 64
    network_alpha: 32
    optimizer: "adamw8bit"
    lr_scheduler: "constant"
    timestep_sampling: "shift"
    discrete_flow_shift: 5.0
    gradient_checkpointing: true
    max_train_steps: 2000
    save_every_n_steps: 500
    use_case: "images+video"
    vram_required_gb: 48
    recommended_gpu: "NVIDIA A100 80GB"
    recommended_images: "20-50 high quality photos with detailed captions"
    # Model paths on network volume:
    #   DiT low-noise: /workspace/models/WAN2.2/wan2.2_t2v_low_noise_14B_fp16.safetensors
    #   DiT high-noise: /workspace/models/WAN2.2/wan2.2_t2v_high_noise_14B_fp16.safetensors
    #   VAE: /workspace/models/WAN2.2/Wan2.1_VAE.pth
    #   T5: /workspace/models/WAN2.2/models_t5_umt5-xxl-enc-bf16.pth

  # SD 1.5 Realistic Vision - Good balance of quality and speed
  sd15_realistic:
    name: "Realistic Vision V5.1"
    description: "SD 1.5 based, great for realistic humans, faster training"
    hf_repo: "SG161222/Realistic_Vision_V5.1_noVAE"
    hf_filename: "Realistic_Vision_V5.1_fp16-no-ema.safetensors"
    model_type: "sd15"
    resolution: 512
    learning_rate: 1.0e-4
    network_rank: 32
    network_alpha: 16
    clip_skip: 1
    optimizer: "AdamW8bit"
    use_case: "images"
    vram_required_gb: 8
    recommended_images: "15-30 photos"

  # SDXL - Higher quality than SD 1.5, but more VRAM
  sdxl_base:
    name: "SDXL Base 1.0"
    description: "Higher resolution and quality than SD 1.5"
    hf_repo: "stabilityai/stable-diffusion-xl-base-1.0"
    hf_filename: "sd_xl_base_1.0.safetensors"
    model_type: "sdxl"
    resolution: 1024
    learning_rate: 1.0e-4
    network_rank: 32
    network_alpha: 16
    clip_skip: 2
    optimizer: "AdamW8bit"
    use_case: "images"
    vram_required_gb: 12
    recommended_images: "20-40 photos"
# Video generation models (used for img2video inference, not LoRA training)
video_models:
  wan22_i2v:
    name: "WAN 2.2 Image-to-Video"
    description: "Converts images to videos, use with your trained LoRA images"
    hf_repo: "Wan-AI/Wan2.2-I2V-A14B"
    model_type: "wan22"
    use_case: "img2video"
    vram_required_gb: 24
    # Quoted string (not a number): denotes the supported output resolutions.
    resolution: "480p/720p"
# Default base model used when a training request does not specify one.
# Presumably must match a key under training_models — verify in the consumer.
default_training_model: "flux2_dev"