# Training Model Registry
# Defines base models available for LoRA training with their optimal parameters

training_models:
  # FLUX - Best for photorealistic images (recommended for realistic person LoRAs)
  flux2_dev:
    name: "FLUX.2 Dev (Recommended)"
    description: "Latest FLUX model, 32B params, best quality for realistic person LoRAs. Uses Mistral text encoder."
    hf_repo: "black-forest-labs/FLUX.2-dev"
    hf_filename: "flux2-dev.safetensors"
    model_type: "flux2"
    training_framework: "musubi-tuner"
    resolution: 1024
    learning_rate: 1.0
    network_rank: 64
    network_alpha: 32
    optimizer: "prodigy"
    lr_scheduler: "constant"
    timestep_sampling: "flux2_shift"
    network_module: "networks.lora_flux_2"
    max_train_steps: 50
    fp8_base: true
    gradient_checkpointing: true
    use_case: "images"
    vram_required_gb: 48
    recommended_gpu: "NVIDIA RTX A6000"
    recommended_images: "15-30 high quality photos with detailed captions"
    training_script: "flux_2_train_network.py"
    # Model paths on network volume:
    # DiT: /workspace/models/FLUX.2-dev/flux2-dev.safetensors
    # VAE: /workspace/models/FLUX.2-dev/vae/diffusion_pytorch_model.safetensors
    # Text encoder: /workspace/models/FLUX.2-dev/text_encoder/model-00001-of-00010.safetensors

  flux1_dev:
    name: "FLUX.1 Dev"
    description: "Previous gen FLUX, still excellent for realistic person LoRAs"
    hf_repo: "black-forest-labs/FLUX.1-dev"
    hf_filename: "flux1-dev.safetensors"
    model_type: "flux"
    resolution: 768
    learning_rate: 4e-4
    text_encoder_lr: 4e-5
    network_rank: 32
    network_alpha: 16
    clip_skip: 1
    optimizer: "AdamW8bit"
    lr_scheduler: "cosine"
    min_snr_gamma: 5
    max_train_steps: 1500
    use_case: "images"
    vram_required_gb: 24
    recommended_images: "15-30 high quality photos"
    training_script: "flux_train_network.py"

  # WAN 2.2 - Text-to-video LoRA training (14B params, uses musubi-tuner)
  wan22_t2v:
    name: "WAN 2.2 T2V (14B)"
    description: "WAN 2.2 text-to-video model. Trains natural-looking video LoRAs. Requires A100 80GB."
model_type: "wan22" training_framework: "musubi-tuner" training_script: "wan_train_network.py" network_module: "networks.lora_wan" resolution: 512 learning_rate: 2e-4 network_rank: 64 network_alpha: 32 optimizer: "adamw8bit" lr_scheduler: "constant" timestep_sampling: "shift" discrete_flow_shift: 5.0 gradient_checkpointing: true max_train_steps: 2000 save_every_n_steps: 500 use_case: "images+video" vram_required_gb: 48 recommended_gpu: "NVIDIA A100 80GB" recommended_images: "20-50 high quality photos with detailed captions" # Model paths on network volume: # DiT low-noise: /workspace/models/WAN2.2/wan2.2_t2v_low_noise_14B_fp16.safetensors # DiT high-noise: /workspace/models/WAN2.2/wan2.2_t2v_high_noise_14B_fp16.safetensors # VAE: /workspace/models/WAN2.2/Wan2.1_VAE.pth # T5: /workspace/models/WAN2.2/models_t5_umt5-xxl-enc-bf16.pth # SD 1.5 Realistic Vision - Good balance of quality and speed sd15_realistic: name: "Realistic Vision V5.1" description: "SD 1.5 based, great for realistic humans, faster training" hf_repo: "SG161222/Realistic_Vision_V5.1_noVAE" hf_filename: "Realistic_Vision_V5.1_fp16-no-ema.safetensors" model_type: "sd15" resolution: 512 learning_rate: 1e-4 network_rank: 32 network_alpha: 16 clip_skip: 1 optimizer: "AdamW8bit" use_case: "images" vram_required_gb: 8 recommended_images: "15-30 photos" # SDXL - Higher quality than SD 1.5, but more VRAM sdxl_base: name: "SDXL Base 1.0" description: "Higher resolution and quality than SD 1.5" hf_repo: "stabilityai/stable-diffusion-xl-base-1.0" hf_filename: "sd_xl_base_1.0.safetensors" model_type: "sdxl" resolution: 1024 learning_rate: 1e-4 network_rank: 32 network_alpha: 16 clip_skip: 2 optimizer: "AdamW8bit" use_case: "images" vram_required_gb: 12 recommended_images: "20-40 photos" # Video generation models (for img2video, not training) video_models: wan22_i2v: name: "WAN 2.2 Image-to-Video" description: "Converts images to videos, use with your trained LoRA images" hf_repo: "Wan-AI/Wan2.2-I2V-A14B" model_type: "wan22" use_case: "img2video" vram_required_gb: 24 resolution: "480p/720p" # Default model for training default_training_model: "flux2_dev"