Spaces:
Running
on
A100
Running
on
A100
support lora trianing & inter
Browse files- .gitignore +3 -1
- acestep/gradio_ui/events/__init__.py +268 -0
- acestep/gradio_ui/events/training_handlers.py +644 -0
- acestep/gradio_ui/interfaces/__init__.py +8 -1
- acestep/gradio_ui/interfaces/generation.py +31 -0
- acestep/gradio_ui/interfaces/training.py +558 -0
- acestep/handler.py +140 -0
- acestep/training/__init__.py +61 -0
- acestep/training/configs.py +107 -0
- acestep/training/data_module.py +465 -0
- acestep/training/dataset_builder.py +755 -0
- acestep/training/lora_utils.py +305 -0
- acestep/training/trainer.py +503 -0
- requirements.txt +4 -0
.gitignore
CHANGED
|
@@ -221,4 +221,6 @@ feishu_bot/
|
|
| 221 |
tmp*
|
| 222 |
torchinductor_root/
|
| 223 |
scripts/
|
| 224 |
-
checkpoints_legacy/
|
|
|
|
|
|
|
|
|
| 221 |
tmp*
|
| 222 |
torchinductor_root/
|
| 223 |
scripts/
|
| 224 |
+
checkpoints_legacy/
|
| 225 |
+
lora_output/
|
| 226 |
+
datasets/
|
acestep/gradio_ui/events/__init__.py
CHANGED
|
@@ -8,6 +8,7 @@ from typing import Optional
|
|
| 8 |
# Import handler modules
|
| 9 |
from . import generation_handlers as gen_h
|
| 10 |
from . import results_handlers as res_h
|
|
|
|
| 11 |
from acestep.gradio_ui.i18n import t
|
| 12 |
|
| 13 |
|
|
@@ -69,6 +70,32 @@ def setup_event_handlers(demo, dit_handler, llm_handler, dataset_handler, datase
|
|
| 69 |
]
|
| 70 |
)
|
| 71 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 72 |
# ========== UI Visibility Updates ==========
|
| 73 |
generation_section["init_llm_checkbox"].change(
|
| 74 |
fn=gen_h.update_negative_prompt_visibility,
|
|
@@ -859,3 +886,244 @@ def setup_event_handlers(demo, dit_handler, llm_handler, dataset_handler, datase
|
|
| 859 |
results_section[f"generated_audio_{lrc_idx}"], # Only updates subtitles, not value
|
| 860 |
]
|
| 861 |
)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 8 |
# Import handler modules
|
| 9 |
from . import generation_handlers as gen_h
|
| 10 |
from . import results_handlers as res_h
|
| 11 |
+
from . import training_handlers as train_h
|
| 12 |
from acestep.gradio_ui.i18n import t
|
| 13 |
|
| 14 |
|
|
|
|
| 70 |
]
|
| 71 |
)
|
| 72 |
|
| 73 |
+
# ========== LoRA Handlers ==========
|
| 74 |
+
generation_section["load_lora_btn"].click(
|
| 75 |
+
fn=dit_handler.load_lora,
|
| 76 |
+
inputs=[generation_section["lora_path"]],
|
| 77 |
+
outputs=[generation_section["lora_status"]]
|
| 78 |
+
).then(
|
| 79 |
+
# Update checkbox to enabled state after loading
|
| 80 |
+
fn=lambda: gr.update(value=True),
|
| 81 |
+
outputs=[generation_section["use_lora_checkbox"]]
|
| 82 |
+
)
|
| 83 |
+
|
| 84 |
+
generation_section["unload_lora_btn"].click(
|
| 85 |
+
fn=dit_handler.unload_lora,
|
| 86 |
+
outputs=[generation_section["lora_status"]]
|
| 87 |
+
).then(
|
| 88 |
+
# Update checkbox to disabled state after unloading
|
| 89 |
+
fn=lambda: gr.update(value=False),
|
| 90 |
+
outputs=[generation_section["use_lora_checkbox"]]
|
| 91 |
+
)
|
| 92 |
+
|
| 93 |
+
generation_section["use_lora_checkbox"].change(
|
| 94 |
+
fn=dit_handler.set_use_lora,
|
| 95 |
+
inputs=[generation_section["use_lora_checkbox"]],
|
| 96 |
+
outputs=[generation_section["lora_status"]]
|
| 97 |
+
)
|
| 98 |
+
|
| 99 |
# ========== UI Visibility Updates ==========
|
| 100 |
generation_section["init_llm_checkbox"].change(
|
| 101 |
fn=gen_h.update_negative_prompt_visibility,
|
|
|
|
| 886 |
results_section[f"generated_audio_{lrc_idx}"], # Only updates subtitles, not value
|
| 887 |
]
|
| 888 |
)
|
| 889 |
+
|
| 890 |
+
|
| 891 |
+
def setup_training_event_handlers(demo, dit_handler, llm_handler, training_section):
|
| 892 |
+
"""Setup event handlers for the training tab (dataset builder and LoRA training)"""
|
| 893 |
+
|
| 894 |
+
# ========== Load Existing Dataset (Top Section) ==========
|
| 895 |
+
|
| 896 |
+
# Load existing dataset JSON at the top of Dataset Builder
|
| 897 |
+
training_section["load_json_btn"].click(
|
| 898 |
+
fn=train_h.load_existing_dataset_for_preprocess,
|
| 899 |
+
inputs=[
|
| 900 |
+
training_section["load_json_path"],
|
| 901 |
+
training_section["dataset_builder_state"],
|
| 902 |
+
],
|
| 903 |
+
outputs=[
|
| 904 |
+
training_section["load_json_status"],
|
| 905 |
+
training_section["audio_files_table"],
|
| 906 |
+
training_section["sample_selector"],
|
| 907 |
+
training_section["dataset_builder_state"],
|
| 908 |
+
# Also update preview fields with first sample
|
| 909 |
+
training_section["preview_audio"],
|
| 910 |
+
training_section["preview_filename"],
|
| 911 |
+
training_section["edit_caption"],
|
| 912 |
+
training_section["edit_lyrics"],
|
| 913 |
+
training_section["edit_bpm"],
|
| 914 |
+
training_section["edit_keyscale"],
|
| 915 |
+
training_section["edit_timesig"],
|
| 916 |
+
training_section["edit_duration"],
|
| 917 |
+
training_section["edit_language"],
|
| 918 |
+
training_section["edit_instrumental"],
|
| 919 |
+
]
|
| 920 |
+
)
|
| 921 |
+
|
| 922 |
+
# ========== Dataset Builder Handlers ==========
|
| 923 |
+
|
| 924 |
+
# Scan directory for audio files
|
| 925 |
+
training_section["scan_btn"].click(
|
| 926 |
+
fn=lambda dir, name, tag, pos, instr, state: train_h.scan_directory(
|
| 927 |
+
dir, name, tag, pos, instr, state
|
| 928 |
+
),
|
| 929 |
+
inputs=[
|
| 930 |
+
training_section["audio_directory"],
|
| 931 |
+
training_section["dataset_name"],
|
| 932 |
+
training_section["custom_tag"],
|
| 933 |
+
training_section["tag_position"],
|
| 934 |
+
training_section["all_instrumental"],
|
| 935 |
+
training_section["dataset_builder_state"],
|
| 936 |
+
],
|
| 937 |
+
outputs=[
|
| 938 |
+
training_section["audio_files_table"],
|
| 939 |
+
training_section["scan_status"],
|
| 940 |
+
training_section["sample_selector"],
|
| 941 |
+
training_section["dataset_builder_state"],
|
| 942 |
+
]
|
| 943 |
+
)
|
| 944 |
+
|
| 945 |
+
# Auto-label all samples
|
| 946 |
+
training_section["auto_label_btn"].click(
|
| 947 |
+
fn=lambda state, skip: train_h.auto_label_all(dit_handler, llm_handler, state, skip),
|
| 948 |
+
inputs=[
|
| 949 |
+
training_section["dataset_builder_state"],
|
| 950 |
+
training_section["skip_metas"],
|
| 951 |
+
],
|
| 952 |
+
outputs=[
|
| 953 |
+
training_section["audio_files_table"],
|
| 954 |
+
training_section["label_progress"],
|
| 955 |
+
training_section["dataset_builder_state"],
|
| 956 |
+
]
|
| 957 |
+
)
|
| 958 |
+
|
| 959 |
+
# Sample selector change - update preview
|
| 960 |
+
training_section["sample_selector"].change(
|
| 961 |
+
fn=train_h.get_sample_preview,
|
| 962 |
+
inputs=[
|
| 963 |
+
training_section["sample_selector"],
|
| 964 |
+
training_section["dataset_builder_state"],
|
| 965 |
+
],
|
| 966 |
+
outputs=[
|
| 967 |
+
training_section["preview_audio"],
|
| 968 |
+
training_section["preview_filename"],
|
| 969 |
+
training_section["edit_caption"],
|
| 970 |
+
training_section["edit_lyrics"],
|
| 971 |
+
training_section["edit_bpm"],
|
| 972 |
+
training_section["edit_keyscale"],
|
| 973 |
+
training_section["edit_timesig"],
|
| 974 |
+
training_section["edit_duration"],
|
| 975 |
+
training_section["edit_language"],
|
| 976 |
+
training_section["edit_instrumental"],
|
| 977 |
+
]
|
| 978 |
+
)
|
| 979 |
+
|
| 980 |
+
# Save sample edit
|
| 981 |
+
training_section["save_edit_btn"].click(
|
| 982 |
+
fn=train_h.save_sample_edit,
|
| 983 |
+
inputs=[
|
| 984 |
+
training_section["sample_selector"],
|
| 985 |
+
training_section["edit_caption"],
|
| 986 |
+
training_section["edit_lyrics"],
|
| 987 |
+
training_section["edit_bpm"],
|
| 988 |
+
training_section["edit_keyscale"],
|
| 989 |
+
training_section["edit_timesig"],
|
| 990 |
+
training_section["edit_language"],
|
| 991 |
+
training_section["edit_instrumental"],
|
| 992 |
+
training_section["dataset_builder_state"],
|
| 993 |
+
],
|
| 994 |
+
outputs=[
|
| 995 |
+
training_section["audio_files_table"],
|
| 996 |
+
training_section["edit_status"],
|
| 997 |
+
training_section["dataset_builder_state"],
|
| 998 |
+
]
|
| 999 |
+
)
|
| 1000 |
+
|
| 1001 |
+
# Update settings when changed
|
| 1002 |
+
for trigger in [training_section["custom_tag"], training_section["tag_position"], training_section["all_instrumental"]]:
|
| 1003 |
+
trigger.change(
|
| 1004 |
+
fn=train_h.update_settings,
|
| 1005 |
+
inputs=[
|
| 1006 |
+
training_section["custom_tag"],
|
| 1007 |
+
training_section["tag_position"],
|
| 1008 |
+
training_section["all_instrumental"],
|
| 1009 |
+
training_section["dataset_builder_state"],
|
| 1010 |
+
],
|
| 1011 |
+
outputs=[training_section["dataset_builder_state"]]
|
| 1012 |
+
)
|
| 1013 |
+
|
| 1014 |
+
# Save dataset
|
| 1015 |
+
training_section["save_dataset_btn"].click(
|
| 1016 |
+
fn=train_h.save_dataset,
|
| 1017 |
+
inputs=[
|
| 1018 |
+
training_section["save_path"],
|
| 1019 |
+
training_section["dataset_name"],
|
| 1020 |
+
training_section["dataset_builder_state"],
|
| 1021 |
+
],
|
| 1022 |
+
outputs=[training_section["save_status"]]
|
| 1023 |
+
)
|
| 1024 |
+
|
| 1025 |
+
# ========== Preprocess Handlers ==========
|
| 1026 |
+
|
| 1027 |
+
# Load existing dataset JSON for preprocessing
|
| 1028 |
+
# This also updates the preview section so users can view/edit samples
|
| 1029 |
+
training_section["load_existing_dataset_btn"].click(
|
| 1030 |
+
fn=train_h.load_existing_dataset_for_preprocess,
|
| 1031 |
+
inputs=[
|
| 1032 |
+
training_section["load_existing_dataset_path"],
|
| 1033 |
+
training_section["dataset_builder_state"],
|
| 1034 |
+
],
|
| 1035 |
+
outputs=[
|
| 1036 |
+
training_section["load_existing_status"],
|
| 1037 |
+
training_section["audio_files_table"],
|
| 1038 |
+
training_section["sample_selector"],
|
| 1039 |
+
training_section["dataset_builder_state"],
|
| 1040 |
+
# Also update preview fields with first sample
|
| 1041 |
+
training_section["preview_audio"],
|
| 1042 |
+
training_section["preview_filename"],
|
| 1043 |
+
training_section["edit_caption"],
|
| 1044 |
+
training_section["edit_lyrics"],
|
| 1045 |
+
training_section["edit_bpm"],
|
| 1046 |
+
training_section["edit_keyscale"],
|
| 1047 |
+
training_section["edit_timesig"],
|
| 1048 |
+
training_section["edit_duration"],
|
| 1049 |
+
training_section["edit_language"],
|
| 1050 |
+
training_section["edit_instrumental"],
|
| 1051 |
+
]
|
| 1052 |
+
)
|
| 1053 |
+
|
| 1054 |
+
# Preprocess dataset to tensor files
|
| 1055 |
+
training_section["preprocess_btn"].click(
|
| 1056 |
+
fn=lambda output_dir, state: train_h.preprocess_dataset(
|
| 1057 |
+
output_dir, dit_handler, state
|
| 1058 |
+
),
|
| 1059 |
+
inputs=[
|
| 1060 |
+
training_section["preprocess_output_dir"],
|
| 1061 |
+
training_section["dataset_builder_state"],
|
| 1062 |
+
],
|
| 1063 |
+
outputs=[training_section["preprocess_progress"]]
|
| 1064 |
+
)
|
| 1065 |
+
|
| 1066 |
+
# ========== Training Tab Handlers ==========
|
| 1067 |
+
|
| 1068 |
+
# Load preprocessed tensor dataset
|
| 1069 |
+
training_section["load_dataset_btn"].click(
|
| 1070 |
+
fn=train_h.load_training_dataset,
|
| 1071 |
+
inputs=[training_section["training_tensor_dir"]],
|
| 1072 |
+
outputs=[training_section["training_dataset_info"]]
|
| 1073 |
+
)
|
| 1074 |
+
|
| 1075 |
+
# Start training from preprocessed tensors
|
| 1076 |
+
def training_wrapper(tensor_dir, r, a, d, lr, ep, bs, ga, se, sh, sd, od, ts):
|
| 1077 |
+
try:
|
| 1078 |
+
for progress, log, plot, state in train_h.start_training(
|
| 1079 |
+
tensor_dir, dit_handler, r, a, d, lr, ep, bs, ga, se, sh, sd, od, ts
|
| 1080 |
+
):
|
| 1081 |
+
yield progress, log, plot, state
|
| 1082 |
+
except Exception as e:
|
| 1083 |
+
logger.exception("Training wrapper error")
|
| 1084 |
+
yield f"❌ Error: {str(e)}", str(e), None, ts
|
| 1085 |
+
|
| 1086 |
+
training_section["start_training_btn"].click(
|
| 1087 |
+
fn=training_wrapper,
|
| 1088 |
+
inputs=[
|
| 1089 |
+
training_section["training_tensor_dir"],
|
| 1090 |
+
training_section["lora_rank"],
|
| 1091 |
+
training_section["lora_alpha"],
|
| 1092 |
+
training_section["lora_dropout"],
|
| 1093 |
+
training_section["learning_rate"],
|
| 1094 |
+
training_section["train_epochs"],
|
| 1095 |
+
training_section["train_batch_size"],
|
| 1096 |
+
training_section["gradient_accumulation"],
|
| 1097 |
+
training_section["save_every_n_epochs"],
|
| 1098 |
+
training_section["training_shift"],
|
| 1099 |
+
training_section["training_seed"],
|
| 1100 |
+
training_section["lora_output_dir"],
|
| 1101 |
+
training_section["training_state"],
|
| 1102 |
+
],
|
| 1103 |
+
outputs=[
|
| 1104 |
+
training_section["training_progress"],
|
| 1105 |
+
training_section["training_log"],
|
| 1106 |
+
training_section["training_loss_plot"],
|
| 1107 |
+
training_section["training_state"],
|
| 1108 |
+
]
|
| 1109 |
+
)
|
| 1110 |
+
|
| 1111 |
+
# Stop training
|
| 1112 |
+
training_section["stop_training_btn"].click(
|
| 1113 |
+
fn=train_h.stop_training,
|
| 1114 |
+
inputs=[training_section["training_state"]],
|
| 1115 |
+
outputs=[
|
| 1116 |
+
training_section["training_progress"],
|
| 1117 |
+
training_section["training_state"],
|
| 1118 |
+
]
|
| 1119 |
+
)
|
| 1120 |
+
|
| 1121 |
+
# Export LoRA
|
| 1122 |
+
training_section["export_lora_btn"].click(
|
| 1123 |
+
fn=train_h.export_lora,
|
| 1124 |
+
inputs=[
|
| 1125 |
+
training_section["export_path"],
|
| 1126 |
+
training_section["lora_output_dir"],
|
| 1127 |
+
],
|
| 1128 |
+
outputs=[training_section["export_status"]]
|
| 1129 |
+
)
|
acestep/gradio_ui/events/training_handlers.py
ADDED
|
@@ -0,0 +1,644 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""
|
| 2 |
+
Event Handlers for Training Tab
|
| 3 |
+
|
| 4 |
+
Contains all event handler functions for the dataset builder and training UI.
|
| 5 |
+
"""
|
| 6 |
+
|
| 7 |
+
import os
|
| 8 |
+
import json
|
| 9 |
+
from typing import Any, Dict, List, Tuple, Optional
|
| 10 |
+
from loguru import logger
|
| 11 |
+
import gradio as gr
|
| 12 |
+
|
| 13 |
+
from acestep.training.dataset_builder import DatasetBuilder, AudioSample
|
| 14 |
+
|
| 15 |
+
|
| 16 |
+
def create_dataset_builder() -> DatasetBuilder:
|
| 17 |
+
"""Create a new DatasetBuilder instance."""
|
| 18 |
+
return DatasetBuilder()
|
| 19 |
+
|
| 20 |
+
|
| 21 |
+
def scan_directory(
|
| 22 |
+
audio_dir: str,
|
| 23 |
+
dataset_name: str,
|
| 24 |
+
custom_tag: str,
|
| 25 |
+
tag_position: str,
|
| 26 |
+
all_instrumental: bool,
|
| 27 |
+
builder_state: Optional[DatasetBuilder],
|
| 28 |
+
) -> Tuple[Any, str, Any, DatasetBuilder]:
|
| 29 |
+
"""Scan a directory for audio files.
|
| 30 |
+
|
| 31 |
+
Returns:
|
| 32 |
+
Tuple of (table_data, status, slider_update, builder_state)
|
| 33 |
+
"""
|
| 34 |
+
if not audio_dir or not audio_dir.strip():
|
| 35 |
+
return [], "❌ Please enter a directory path", gr.Slider(maximum=0, value=0), builder_state
|
| 36 |
+
|
| 37 |
+
# Create or use existing builder
|
| 38 |
+
builder = builder_state if builder_state else DatasetBuilder()
|
| 39 |
+
|
| 40 |
+
# Set metadata before scanning
|
| 41 |
+
builder.metadata.name = dataset_name
|
| 42 |
+
builder.metadata.custom_tag = custom_tag
|
| 43 |
+
builder.metadata.tag_position = tag_position
|
| 44 |
+
builder.metadata.all_instrumental = all_instrumental
|
| 45 |
+
|
| 46 |
+
# Scan directory
|
| 47 |
+
samples, status = builder.scan_directory(audio_dir.strip())
|
| 48 |
+
|
| 49 |
+
if not samples:
|
| 50 |
+
return [], status, gr.Slider(maximum=0, value=0), builder
|
| 51 |
+
|
| 52 |
+
# Set instrumental and tag for all samples
|
| 53 |
+
builder.set_all_instrumental(all_instrumental)
|
| 54 |
+
if custom_tag:
|
| 55 |
+
builder.set_custom_tag(custom_tag, tag_position)
|
| 56 |
+
|
| 57 |
+
# Get table data
|
| 58 |
+
table_data = builder.get_samples_dataframe_data()
|
| 59 |
+
|
| 60 |
+
# Calculate slider max and return as Slider update
|
| 61 |
+
slider_max = max(0, len(samples) - 1)
|
| 62 |
+
|
| 63 |
+
return table_data, status, gr.Slider(maximum=slider_max, value=0), builder
|
| 64 |
+
|
| 65 |
+
|
| 66 |
+
def auto_label_all(
|
| 67 |
+
dit_handler,
|
| 68 |
+
llm_handler,
|
| 69 |
+
builder_state: Optional[DatasetBuilder],
|
| 70 |
+
skip_metas: bool = False,
|
| 71 |
+
progress=None,
|
| 72 |
+
) -> Tuple[List[List[Any]], str, DatasetBuilder]:
|
| 73 |
+
"""Auto-label all samples in the dataset.
|
| 74 |
+
|
| 75 |
+
Args:
|
| 76 |
+
dit_handler: DiT handler for audio processing
|
| 77 |
+
llm_handler: LLM handler for caption generation
|
| 78 |
+
builder_state: Dataset builder state
|
| 79 |
+
skip_metas: If True, skip LLM labeling. BPM/Key/TimeSig = N/A, Language = unknown for instrumental
|
| 80 |
+
progress: Progress callback
|
| 81 |
+
|
| 82 |
+
Returns:
|
| 83 |
+
Tuple of (table_data, status, builder_state)
|
| 84 |
+
"""
|
| 85 |
+
if builder_state is None:
|
| 86 |
+
return [], "❌ Please scan a directory first", builder_state
|
| 87 |
+
|
| 88 |
+
if not builder_state.samples:
|
| 89 |
+
return [], "❌ No samples to label. Please scan a directory first.", builder_state
|
| 90 |
+
|
| 91 |
+
# If skip_metas is True, just set default values without LLM
|
| 92 |
+
if skip_metas:
|
| 93 |
+
for sample in builder_state.samples:
|
| 94 |
+
sample.bpm = None # Will display as N/A
|
| 95 |
+
sample.keyscale = "N/A"
|
| 96 |
+
sample.timesignature = "N/A"
|
| 97 |
+
# For instrumental, language should be "unknown"
|
| 98 |
+
if sample.is_instrumental:
|
| 99 |
+
sample.language = "unknown"
|
| 100 |
+
else:
|
| 101 |
+
sample.language = "unknown"
|
| 102 |
+
# Use custom tag as caption if set, otherwise use filename
|
| 103 |
+
if builder_state.metadata.custom_tag:
|
| 104 |
+
sample.caption = builder_state.metadata.custom_tag
|
| 105 |
+
else:
|
| 106 |
+
sample.caption = sample.filename
|
| 107 |
+
|
| 108 |
+
table_data = builder_state.get_samples_dataframe_data()
|
| 109 |
+
return table_data, f"✅ Skipped AI labeling. {len(builder_state.samples)} samples set with default values.", builder_state
|
| 110 |
+
|
| 111 |
+
# Check if handlers are initialized
|
| 112 |
+
if dit_handler is None or dit_handler.model is None:
|
| 113 |
+
return builder_state.get_samples_dataframe_data(), "❌ Model not initialized. Please initialize the service first.", builder_state
|
| 114 |
+
|
| 115 |
+
if llm_handler is None or not llm_handler.llm_initialized:
|
| 116 |
+
return builder_state.get_samples_dataframe_data(), "❌ LLM not initialized. Please initialize the service with LLM enabled.", builder_state
|
| 117 |
+
|
| 118 |
+
def progress_callback(msg):
|
| 119 |
+
if progress:
|
| 120 |
+
try:
|
| 121 |
+
progress(msg)
|
| 122 |
+
except:
|
| 123 |
+
pass
|
| 124 |
+
|
| 125 |
+
# Label all samples
|
| 126 |
+
samples, status = builder_state.label_all_samples(
|
| 127 |
+
dit_handler=dit_handler,
|
| 128 |
+
llm_handler=llm_handler,
|
| 129 |
+
progress_callback=progress_callback,
|
| 130 |
+
)
|
| 131 |
+
|
| 132 |
+
# Get updated table data
|
| 133 |
+
table_data = builder_state.get_samples_dataframe_data()
|
| 134 |
+
|
| 135 |
+
return table_data, status, builder_state
|
| 136 |
+
|
| 137 |
+
|
| 138 |
+
def get_sample_preview(
|
| 139 |
+
sample_idx: int,
|
| 140 |
+
builder_state: Optional[DatasetBuilder],
|
| 141 |
+
) -> Tuple[str, str, str, str, Optional[int], str, str, float, str, bool]:
|
| 142 |
+
"""Get preview data for a specific sample.
|
| 143 |
+
|
| 144 |
+
Returns:
|
| 145 |
+
Tuple of (audio_path, filename, caption, lyrics, bpm, keyscale, timesig, duration, language, instrumental)
|
| 146 |
+
"""
|
| 147 |
+
if builder_state is None or not builder_state.samples:
|
| 148 |
+
return None, "", "", "", None, "", "", 0.0, "instrumental", True
|
| 149 |
+
|
| 150 |
+
idx = int(sample_idx)
|
| 151 |
+
if idx < 0 or idx >= len(builder_state.samples):
|
| 152 |
+
return None, "", "", "", None, "", "", 0.0, "instrumental", True
|
| 153 |
+
|
| 154 |
+
sample = builder_state.samples[idx]
|
| 155 |
+
|
| 156 |
+
return (
|
| 157 |
+
sample.audio_path,
|
| 158 |
+
sample.filename,
|
| 159 |
+
sample.caption,
|
| 160 |
+
sample.lyrics,
|
| 161 |
+
sample.bpm,
|
| 162 |
+
sample.keyscale,
|
| 163 |
+
sample.timesignature,
|
| 164 |
+
sample.duration,
|
| 165 |
+
sample.language,
|
| 166 |
+
sample.is_instrumental,
|
| 167 |
+
)
|
| 168 |
+
|
| 169 |
+
|
| 170 |
+
def save_sample_edit(
|
| 171 |
+
sample_idx: int,
|
| 172 |
+
caption: str,
|
| 173 |
+
lyrics: str,
|
| 174 |
+
bpm: Optional[int],
|
| 175 |
+
keyscale: str,
|
| 176 |
+
timesig: str,
|
| 177 |
+
language: str,
|
| 178 |
+
is_instrumental: bool,
|
| 179 |
+
builder_state: Optional[DatasetBuilder],
|
| 180 |
+
) -> Tuple[List[List[Any]], str, DatasetBuilder]:
|
| 181 |
+
"""Save edits to a sample.
|
| 182 |
+
|
| 183 |
+
Returns:
|
| 184 |
+
Tuple of (table_data, status, builder_state)
|
| 185 |
+
"""
|
| 186 |
+
if builder_state is None:
|
| 187 |
+
return [], "❌ No dataset loaded", builder_state
|
| 188 |
+
|
| 189 |
+
idx = int(sample_idx)
|
| 190 |
+
|
| 191 |
+
# Update sample
|
| 192 |
+
sample, status = builder_state.update_sample(
|
| 193 |
+
idx,
|
| 194 |
+
caption=caption,
|
| 195 |
+
lyrics=lyrics if not is_instrumental else "[Instrumental]",
|
| 196 |
+
bpm=int(bpm) if bpm else None,
|
| 197 |
+
keyscale=keyscale,
|
| 198 |
+
timesignature=timesig,
|
| 199 |
+
language="instrumental" if is_instrumental else language,
|
| 200 |
+
is_instrumental=is_instrumental,
|
| 201 |
+
labeled=True,
|
| 202 |
+
)
|
| 203 |
+
|
| 204 |
+
# Get updated table data
|
| 205 |
+
table_data = builder_state.get_samples_dataframe_data()
|
| 206 |
+
|
| 207 |
+
return table_data, status, builder_state
|
| 208 |
+
|
| 209 |
+
|
| 210 |
+
def update_settings(
|
| 211 |
+
custom_tag: str,
|
| 212 |
+
tag_position: str,
|
| 213 |
+
all_instrumental: bool,
|
| 214 |
+
builder_state: Optional[DatasetBuilder],
|
| 215 |
+
) -> DatasetBuilder:
|
| 216 |
+
"""Update dataset settings.
|
| 217 |
+
|
| 218 |
+
Returns:
|
| 219 |
+
Updated builder_state
|
| 220 |
+
"""
|
| 221 |
+
if builder_state is None:
|
| 222 |
+
return builder_state
|
| 223 |
+
|
| 224 |
+
if custom_tag:
|
| 225 |
+
builder_state.set_custom_tag(custom_tag, tag_position)
|
| 226 |
+
|
| 227 |
+
builder_state.set_all_instrumental(all_instrumental)
|
| 228 |
+
|
| 229 |
+
return builder_state
|
| 230 |
+
|
| 231 |
+
|
| 232 |
+
def save_dataset(
|
| 233 |
+
save_path: str,
|
| 234 |
+
dataset_name: str,
|
| 235 |
+
builder_state: Optional[DatasetBuilder],
|
| 236 |
+
) -> str:
|
| 237 |
+
"""Save the dataset to a JSON file.
|
| 238 |
+
|
| 239 |
+
Returns:
|
| 240 |
+
Status message
|
| 241 |
+
"""
|
| 242 |
+
if builder_state is None:
|
| 243 |
+
return "❌ No dataset to save. Please scan a directory first."
|
| 244 |
+
|
| 245 |
+
if not builder_state.samples:
|
| 246 |
+
return "❌ No samples in dataset."
|
| 247 |
+
|
| 248 |
+
if not save_path or not save_path.strip():
|
| 249 |
+
return "❌ Please enter a save path."
|
| 250 |
+
|
| 251 |
+
# Check if any samples are labeled
|
| 252 |
+
labeled_count = builder_state.get_labeled_count()
|
| 253 |
+
if labeled_count == 0:
|
| 254 |
+
return "⚠️ Warning: No samples have been labeled. Consider auto-labeling first.\nSaving anyway..."
|
| 255 |
+
|
| 256 |
+
return builder_state.save_dataset(save_path.strip(), dataset_name)
|
| 257 |
+
|
| 258 |
+
|
| 259 |
+
def load_existing_dataset_for_preprocess(
|
| 260 |
+
dataset_path: str,
|
| 261 |
+
builder_state: Optional[DatasetBuilder],
|
| 262 |
+
) -> Tuple[str, Any, Any, DatasetBuilder, str, str, str, str, Optional[int], str, str, float, str, bool]:
|
| 263 |
+
"""Load an existing dataset JSON file for preprocessing.
|
| 264 |
+
|
| 265 |
+
This allows users to load a previously saved dataset and proceed to preprocessing
|
| 266 |
+
without having to re-scan and re-label.
|
| 267 |
+
|
| 268 |
+
Returns:
|
| 269 |
+
Tuple of (status, table_data, slider_update, builder_state,
|
| 270 |
+
audio_path, filename, caption, lyrics, bpm, keyscale, timesig, duration, language, instrumental)
|
| 271 |
+
"""
|
| 272 |
+
empty_preview = (None, "", "", "", None, "", "", 0.0, "instrumental", True)
|
| 273 |
+
|
| 274 |
+
if not dataset_path or not dataset_path.strip():
|
| 275 |
+
return ("❌ Please enter a dataset path", [], gr.Slider(maximum=0, value=0), builder_state) + empty_preview
|
| 276 |
+
|
| 277 |
+
dataset_path = dataset_path.strip()
|
| 278 |
+
|
| 279 |
+
if not os.path.exists(dataset_path):
|
| 280 |
+
return (f"❌ Dataset not found: {dataset_path}", [], gr.Slider(maximum=0, value=0), builder_state) + empty_preview
|
| 281 |
+
|
| 282 |
+
# Create new builder (don't reuse old state when loading a file)
|
| 283 |
+
builder = DatasetBuilder()
|
| 284 |
+
|
| 285 |
+
# Load the dataset
|
| 286 |
+
samples, status = builder.load_dataset(dataset_path)
|
| 287 |
+
|
| 288 |
+
if not samples:
|
| 289 |
+
return (status, [], gr.Slider(maximum=0, value=0), builder) + empty_preview
|
| 290 |
+
|
| 291 |
+
# Get table data
|
| 292 |
+
table_data = builder.get_samples_dataframe_data()
|
| 293 |
+
|
| 294 |
+
# Calculate slider max
|
| 295 |
+
slider_max = max(0, len(samples) - 1)
|
| 296 |
+
|
| 297 |
+
# Create info text
|
| 298 |
+
labeled_count = builder.get_labeled_count()
|
| 299 |
+
info = f"✅ Loaded dataset: {builder.metadata.name}\n"
|
| 300 |
+
info += f"📊 Samples: {len(samples)} ({labeled_count} labeled)\n"
|
| 301 |
+
info += f"🏷️ Custom Tag: {builder.metadata.custom_tag or '(none)'}\n"
|
| 302 |
+
info += "📝 Ready for preprocessing! You can also edit samples below."
|
| 303 |
+
|
| 304 |
+
# Get first sample preview
|
| 305 |
+
first_sample = builder.samples[0]
|
| 306 |
+
preview = (
|
| 307 |
+
first_sample.audio_path,
|
| 308 |
+
first_sample.filename,
|
| 309 |
+
first_sample.caption,
|
| 310 |
+
first_sample.lyrics,
|
| 311 |
+
first_sample.bpm,
|
| 312 |
+
first_sample.keyscale,
|
| 313 |
+
first_sample.timesignature,
|
| 314 |
+
first_sample.duration,
|
| 315 |
+
first_sample.language,
|
| 316 |
+
first_sample.is_instrumental,
|
| 317 |
+
)
|
| 318 |
+
|
| 319 |
+
return (info, table_data, gr.Slider(maximum=slider_max, value=0), builder) + preview
|
| 320 |
+
|
| 321 |
+
|
| 322 |
+
def preprocess_dataset(
    output_dir: str,
    dit_handler,
    builder_state: Optional["DatasetBuilder"],
    progress=None,
) -> str:
    """Preprocess dataset to tensor files for fast training.

    This converts audio files to VAE latents and text to embeddings by
    delegating to ``builder_state.preprocess_to_tensors``.

    Args:
        output_dir: Directory where the preprocessed ``.pt`` files are written.
        dit_handler: Initialized DiT handler; must expose a non-None ``model``.
        builder_state: The DatasetBuilder holding scanned/labeled samples,
            or None if nothing has been loaded yet.
        progress: Optional callable invoked with progress-message strings.

    Returns:
        Status message (error messages are prefixed with ❌).
    """
    # Validation order mirrors the UI workflow: dataset loaded -> has samples
    # -> labeled -> output path -> model ready.
    if builder_state is None:
        return "❌ No dataset loaded. Please scan a directory first."

    if not builder_state.samples:
        return "❌ No samples in dataset."

    labeled_count = builder_state.get_labeled_count()
    if labeled_count == 0:
        return "❌ No labeled samples. Please auto-label or manually label samples first."

    if not output_dir or not output_dir.strip():
        return "❌ Please enter an output directory."

    if dit_handler is None or dit_handler.model is None:
        return "❌ Model not initialized. Please initialize the service first."

    def progress_callback(msg):
        # Progress reporting is best-effort: a broken progress sink must never
        # abort preprocessing. Catch Exception (not bare except) so
        # KeyboardInterrupt/SystemExit still propagate.
        if progress:
            try:
                progress(msg)
            except Exception:
                pass

    # Run preprocessing; the returned output paths are not surfaced in the UI,
    # only the status string is.
    output_paths, status = builder_state.preprocess_to_tensors(
        dit_handler=dit_handler,
        output_dir=output_dir.strip(),
        progress_callback=progress_callback,
    )

    return status
|
| 366 |
+
|
| 367 |
+
|
| 368 |
+
def load_training_dataset(
    tensor_dir: str,
) -> str:
    """Load a preprocessed tensor dataset for training.

    Validates the directory, then reads ``manifest.json`` (if present) to
    report the sample count and dataset metadata. Falls back to counting
    ``.pt`` files directly when the manifest is missing or unreadable.

    Args:
        tensor_dir: Path to the directory of preprocessed tensors.

    Returns:
        Info text about the dataset (error messages are prefixed with ❌).
    """
    if not tensor_dir or not tensor_dir.strip():
        return "❌ Please enter a tensor directory path"

    tensor_dir = tensor_dir.strip()

    if not os.path.exists(tensor_dir):
        return f"❌ Directory not found: {tensor_dir}"

    if not os.path.isdir(tensor_dir):
        return f"❌ Not a directory: {tensor_dir}"

    # Check for manifest
    manifest_path = os.path.join(tensor_dir, "manifest.json")
    if os.path.exists(manifest_path):
        try:
            # Explicit encoding: manifest is written as UTF-8; without it the
            # read depends on the platform locale (breaks on Windows cp1252).
            with open(manifest_path, 'r', encoding='utf-8') as f:
                manifest = json.load(f)

            num_samples = manifest.get("num_samples", 0)
            metadata = manifest.get("metadata", {})
            name = metadata.get("name", "Unknown")
            custom_tag = metadata.get("custom_tag", "")

            info = f"✅ Loaded preprocessed dataset: {name}\n"
            info += f"📊 Samples: {num_samples} preprocessed tensors\n"
            info += f"🏷️ Custom Tag: {custom_tag or '(none)'}"

            return info
        except Exception as e:
            # A corrupt manifest is non-fatal: log and fall through to the
            # raw .pt file count below.
            logger.warning(f"Failed to read manifest: {e}")

    # Fallback: count .pt files
    pt_files = [f for f in os.listdir(tensor_dir) if f.endswith('.pt')]

    if not pt_files:
        return f"❌ No .pt tensor files found in {tensor_dir}"

    info = f"✅ Found {len(pt_files)} tensor files in {tensor_dir}\n"
    info += "⚠️ No manifest.json found - using all .pt files"

    return info
|
| 417 |
+
|
| 418 |
+
|
| 419 |
+
# Training handlers
|
| 420 |
+
|
| 421 |
+
import time
|
| 422 |
+
import re
|
| 423 |
+
|
| 424 |
+
|
| 425 |
+
def _format_duration(seconds):
|
| 426 |
+
"""Format seconds to human readable string."""
|
| 427 |
+
seconds = int(seconds)
|
| 428 |
+
if seconds < 60:
|
| 429 |
+
return f"{seconds}s"
|
| 430 |
+
elif seconds < 3600:
|
| 431 |
+
return f"{seconds // 60}m {seconds % 60}s"
|
| 432 |
+
else:
|
| 433 |
+
return f"{seconds // 3600}h {(seconds % 3600) // 60}m"
|
| 434 |
+
|
| 435 |
+
|
| 436 |
+
def start_training(
    tensor_dir: str,
    dit_handler,
    lora_rank: int,
    lora_alpha: int,
    lora_dropout: float,
    learning_rate: float,
    train_epochs: int,
    train_batch_size: int,
    gradient_accumulation: int,
    save_every_n_epochs: int,
    training_shift: float,
    training_seed: int,
    lora_output_dir: str,
    training_state: Dict,
    progress=None,
):
    """Start LoRA training from preprocessed tensors.

    This is a generator function that yields progress updates.

    Each yield is a 4-tuple consumed by the Gradio UI:
    (status_text, log_text, loss_dataframe, training_state).
    ``training_state`` is mutated in place and shared with the Stop button
    handler: ``is_training`` marks an active run, ``should_stop`` requests
    a cooperative stop that is checked once per trainer iteration.

    Args:
        tensor_dir: Directory of preprocessed ``.pt`` tensor files.
        dit_handler: Initialized DiT handler; must expose a non-None ``model``.
        lora_rank / lora_alpha / lora_dropout: LoRA adapter hyperparameters.
        learning_rate / train_epochs / train_batch_size /
        gradient_accumulation / save_every_n_epochs / training_shift /
        training_seed: Training hyperparameters forwarded to TrainingConfig.
        lora_output_dir: Directory where checkpoints and final weights go.
        training_state: Shared mutable dict used for cross-handler signalling.
        progress: Unused here; kept for Gradio handler-signature compatibility.
    """
    if not tensor_dir or not tensor_dir.strip():
        yield "❌ Please enter a tensor directory path", "", None, training_state
        return

    tensor_dir = tensor_dir.strip()

    if not os.path.exists(tensor_dir):
        yield f"❌ Tensor directory not found: {tensor_dir}", "", None, training_state
        return

    if dit_handler is None or dit_handler.model is None:
        yield "❌ Model not initialized. Please initialize the service first.", "", None, training_state
        return

    # Check for required training dependencies.
    # These imports are a deliberate availability probe; the names themselves
    # are not used in this function.
    try:
        from lightning.fabric import Fabric
        from peft import get_peft_model, LoraConfig
    except ImportError as e:
        yield f"❌ Missing required packages: {e}\nPlease install: pip install peft lightning", "", None, training_state
        return

    training_state["is_training"] = True
    training_state["should_stop"] = False

    try:
        # Imported lazily so the UI can start without the training extras installed.
        from acestep.training.trainer import LoRATrainer
        from acestep.training.configs import LoRAConfig as LoRAConfigClass, TrainingConfig

        # Create configs
        lora_config = LoRAConfigClass(
            r=lora_rank,
            alpha=lora_alpha,
            dropout=lora_dropout,
        )

        training_config = TrainingConfig(
            shift=training_shift,
            learning_rate=learning_rate,
            batch_size=train_batch_size,
            gradient_accumulation_steps=gradient_accumulation,
            max_epochs=train_epochs,
            save_every_n_epochs=save_every_n_epochs,
            seed=training_seed,
            output_dir=lora_output_dir,
        )

        import pandas as pd

        # Initialize training log and loss history.
        # The (0, 0.0) seed row gives gr.LinePlot a non-empty frame to render
        # before the first real loss arrives.
        log_lines = []
        loss_data = pd.DataFrame({"step": [0], "loss": [0.0]})

        # Start timer
        start_time = time.time()

        yield f"🚀 Starting training from {tensor_dir}...", "", loss_data, training_state

        # Create trainer
        trainer = LoRATrainer(
            dit_handler=dit_handler,
            lora_config=lora_config,
            training_config=training_config,
        )

        # Collect loss history
        step_list = []
        loss_list = []

        # Train with progress updates using preprocessed tensors
        for step, loss, status in trainer.train_from_preprocessed(tensor_dir, training_state):
            # Calculate elapsed time and ETA
            elapsed_seconds = time.time() - start_time
            time_info = f"⏱️ Elapsed: {_format_duration(elapsed_seconds)}"

            # Parse "Epoch x/y" from status to calculate ETA
            # (linear extrapolation: assumes roughly constant time per epoch).
            match = re.search(r"Epoch\s+(\d+)/(\d+)", str(status))
            if match:
                current_ep = int(match.group(1))
                total_ep = int(match.group(2))
                if current_ep > 0:
                    eta_seconds = (elapsed_seconds / current_ep) * (total_ep - current_ep)
                    time_info += f" | ETA: ~{_format_duration(eta_seconds)}"

            # Display status with time info
            display_status = f"{status}\n{time_info}"

            # Terminal log
            log_msg = f"[{_format_duration(elapsed_seconds)}] Step {step}: {status}"
            logger.info(log_msg)

            # Add to UI log (rolling window of the last 15 messages)
            log_lines.append(status)
            if len(log_lines) > 15:
                log_lines = log_lines[-15:]
            log_text = "\n".join(log_lines)

            # Track loss for plot (only valid values).
            # `loss == loss` is False only for NaN (IEEE 754), so this skips
            # NaN losses without importing math/numpy.
            if step > 0 and loss is not None and loss == loss:  # Check for NaN
                step_list.append(step)
                loss_list.append(float(loss))
                loss_data = pd.DataFrame({"step": step_list, "loss": loss_list})

            yield display_status, log_text, loss_data, training_state

            # Cooperative stop: the Stop button sets should_stop; we exit the
            # loop after the current iteration's yield.
            if training_state.get("should_stop", False):
                logger.info("⏹️ Training stopped by user")
                log_lines.append("⏹️ Training stopped by user")
                yield f"⏹️ Stopped ({time_info})", "\n".join(log_lines[-15:]), loss_data, training_state
                break

        # NOTE(review): this completion path also runs after a user-requested
        # stop (the loop exits via break), so a stopped run still reports
        # "Training completed" — confirm whether that is intended.
        total_time = time.time() - start_time
        training_state["is_training"] = False
        completion_msg = f"✅ Training completed! Total time: {_format_duration(total_time)}"

        logger.info(completion_msg)
        log_lines.append(completion_msg)

        yield completion_msg, "\n".join(log_lines[-15:]), loss_data, training_state

    except Exception as e:
        logger.exception("Training error")
        training_state["is_training"] = False
        # pandas is re-imported here because the failure may have occurred
        # before the `import pandas as pd` in the try block ran.
        import pandas as pd
        empty_df = pd.DataFrame({"step": [], "loss": []})
        yield f"❌ Error: {str(e)}", str(e), empty_df, training_state
|
| 583 |
+
|
| 584 |
+
|
| 585 |
+
def stop_training(training_state: Dict) -> Tuple[str, Dict]:
    """Request that an in-progress training run stop at its next checkpoint.

    Sets the shared ``should_stop`` flag, which the training generator
    polls once per iteration.

    Returns:
        Tuple of (status message, training_state)
    """
    if training_state.get("is_training", False):
        training_state["should_stop"] = True
        return "⏹️ Stopping training...", training_state
    return "⚠️ No training in progress", training_state
|
| 596 |
+
|
| 597 |
+
|
| 598 |
+
def export_lora(
    export_path: str,
    lora_output_dir: str,
) -> str:
    """Export the trained LoRA weights.

    Copies the ``final`` adapter directory if present, otherwise the most
    recent ``epoch_N`` checkpoint, to ``export_path`` (replacing any
    existing directory there).

    Args:
        export_path: Destination directory for the exported adapter.
        lora_output_dir: Training output root containing ``final`` and/or
            ``checkpoints``.

    Returns:
        Status message (error messages are prefixed with ❌).
    """
    if not export_path or not export_path.strip():
        return "❌ Please enter an export path"

    # Check if there's a trained model to export
    final_dir = os.path.join(lora_output_dir, "final")
    checkpoint_dir = os.path.join(lora_output_dir, "checkpoints")

    # Prefer final, fallback to checkpoints
    if os.path.exists(final_dir):
        source_path = final_dir
    elif os.path.exists(checkpoint_dir):
        # Find the latest checkpoint. Only consider well-formed "epoch_<int>"
        # names: a stray directory like "epoch_final" would otherwise crash
        # the int() sort key with ValueError.
        checkpoints = [
            d for d in os.listdir(checkpoint_dir)
            if d.startswith("epoch_") and d.split("_")[1].isdigit()
        ]
        if not checkpoints:
            return "❌ No checkpoints found"

        checkpoints.sort(key=lambda x: int(x.split("_")[1]))
        latest = checkpoints[-1]
        source_path = os.path.join(checkpoint_dir, latest)
    else:
        return f"❌ No trained model found in {lora_output_dir}"

    try:
        import shutil

        export_path = export_path.strip()
        # Ensure the parent directory exists; a bare filename has no dirname,
        # so fall back to the current directory.
        os.makedirs(os.path.dirname(export_path) or ".", exist_ok=True)

        # Replace any previous export at the destination.
        if os.path.exists(export_path):
            shutil.rmtree(export_path)

        shutil.copytree(source_path, export_path)

        return f"✅ LoRA exported to {export_path}"

    except Exception as e:
        logger.exception("Export error")
        return f"❌ Export failed: {str(e)}"
|
acestep/gradio_ui/interfaces/__init__.py
CHANGED
|
@@ -7,7 +7,8 @@ from acestep.gradio_ui.i18n import get_i18n, t
|
|
| 7 |
from acestep.gradio_ui.interfaces.dataset import create_dataset_section
|
| 8 |
from acestep.gradio_ui.interfaces.generation import create_generation_section
|
| 9 |
from acestep.gradio_ui.interfaces.result import create_results_section
|
| 10 |
-
from acestep.gradio_ui.
|
|
|
|
| 11 |
|
| 12 |
|
| 13 |
def create_gradio_interface(dit_handler, llm_handler, dataset_handler, init_params=None, language='en') -> gr.Blocks:
|
|
@@ -76,7 +77,13 @@ def create_gradio_interface(dit_handler, llm_handler, dataset_handler, init_para
|
|
| 76 |
# Results Section
|
| 77 |
results_section = create_results_section(dit_handler)
|
| 78 |
|
|
|
|
|
|
|
|
|
|
| 79 |
# Connect event handlers
|
| 80 |
setup_event_handlers(demo, dit_handler, llm_handler, dataset_handler, dataset_section, generation_section, results_section)
|
|
|
|
|
|
|
|
|
|
| 81 |
|
| 82 |
return demo
|
|
|
|
| 7 |
from acestep.gradio_ui.interfaces.dataset import create_dataset_section
|
| 8 |
from acestep.gradio_ui.interfaces.generation import create_generation_section
|
| 9 |
from acestep.gradio_ui.interfaces.result import create_results_section
|
| 10 |
+
from acestep.gradio_ui.interfaces.training import create_training_section
|
| 11 |
+
from acestep.gradio_ui.events import setup_event_handlers, setup_training_event_handlers
|
| 12 |
|
| 13 |
|
| 14 |
def create_gradio_interface(dit_handler, llm_handler, dataset_handler, init_params=None, language='en') -> gr.Blocks:
|
|
|
|
| 77 |
# Results Section
|
| 78 |
results_section = create_results_section(dit_handler)
|
| 79 |
|
| 80 |
+
# Training Section (LoRA training and dataset builder)
|
| 81 |
+
training_section = create_training_section(dit_handler, llm_handler)
|
| 82 |
+
|
| 83 |
# Connect event handlers
|
| 84 |
setup_event_handlers(demo, dit_handler, llm_handler, dataset_handler, dataset_section, generation_section, results_section)
|
| 85 |
+
|
| 86 |
+
# Connect training event handlers
|
| 87 |
+
setup_training_event_handlers(demo, dit_handler, llm_handler, training_section)
|
| 88 |
|
| 89 |
return demo
|
acestep/gradio_ui/interfaces/generation.py
CHANGED
|
@@ -144,6 +144,31 @@ def create_generation_section(dit_handler, llm_handler, init_params=None, langua
|
|
| 144 |
# Set init_status value from init_params if pre-initialized
|
| 145 |
init_status_value = init_params.get('init_status', '') if service_pre_initialized else ''
|
| 146 |
init_status = gr.Textbox(label=t("service.status_label"), interactive=False, lines=3, value=init_status_value)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 147 |
|
| 148 |
# Inputs
|
| 149 |
with gr.Row():
|
|
@@ -653,6 +678,12 @@ def create_generation_section(dit_handler, llm_handler, init_params=None, langua
|
|
| 653 |
"use_flash_attention_checkbox": use_flash_attention_checkbox,
|
| 654 |
"offload_to_cpu_checkbox": offload_to_cpu_checkbox,
|
| 655 |
"offload_dit_to_cpu_checkbox": offload_dit_to_cpu_checkbox,
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 656 |
"task_type": task_type,
|
| 657 |
"instruction_display_gen": instruction_display_gen,
|
| 658 |
"track_name": track_name,
|
|
|
|
| 144 |
# Set init_status value from init_params if pre-initialized
|
| 145 |
init_status_value = init_params.get('init_status', '') if service_pre_initialized else ''
|
| 146 |
init_status = gr.Textbox(label=t("service.status_label"), interactive=False, lines=3, value=init_status_value)
|
| 147 |
+
|
| 148 |
+
# LoRA Configuration Section
|
| 149 |
+
gr.HTML("<hr><h4>🔧 LoRA Adapter</h4>")
|
| 150 |
+
with gr.Row():
|
| 151 |
+
lora_path = gr.Textbox(
|
| 152 |
+
label="LoRA Path",
|
| 153 |
+
placeholder="./lora_output/final/adapter",
|
| 154 |
+
info="Path to trained LoRA adapter directory",
|
| 155 |
+
scale=3,
|
| 156 |
+
)
|
| 157 |
+
load_lora_btn = gr.Button("📥 Load LoRA", variant="secondary", scale=1)
|
| 158 |
+
unload_lora_btn = gr.Button("🗑️ Unload", variant="secondary", scale=1)
|
| 159 |
+
with gr.Row():
|
| 160 |
+
use_lora_checkbox = gr.Checkbox(
|
| 161 |
+
label="Use LoRA",
|
| 162 |
+
value=False,
|
| 163 |
+
info="Enable LoRA adapter for inference",
|
| 164 |
+
scale=1,
|
| 165 |
+
)
|
| 166 |
+
lora_status = gr.Textbox(
|
| 167 |
+
label="LoRA Status",
|
| 168 |
+
value="No LoRA loaded",
|
| 169 |
+
interactive=False,
|
| 170 |
+
scale=2,
|
| 171 |
+
)
|
| 172 |
|
| 173 |
# Inputs
|
| 174 |
with gr.Row():
|
|
|
|
| 678 |
"use_flash_attention_checkbox": use_flash_attention_checkbox,
|
| 679 |
"offload_to_cpu_checkbox": offload_to_cpu_checkbox,
|
| 680 |
"offload_dit_to_cpu_checkbox": offload_dit_to_cpu_checkbox,
|
| 681 |
+
# LoRA components
|
| 682 |
+
"lora_path": lora_path,
|
| 683 |
+
"load_lora_btn": load_lora_btn,
|
| 684 |
+
"unload_lora_btn": unload_lora_btn,
|
| 685 |
+
"use_lora_checkbox": use_lora_checkbox,
|
| 686 |
+
"lora_status": lora_status,
|
| 687 |
"task_type": task_type,
|
| 688 |
"instruction_display_gen": instruction_display_gen,
|
| 689 |
"track_name": track_name,
|
acestep/gradio_ui/interfaces/training.py
ADDED
|
@@ -0,0 +1,558 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""
|
| 2 |
+
Gradio UI Training Tab Module
|
| 3 |
+
|
| 4 |
+
Contains the dataset builder and LoRA training interface components.
|
| 5 |
+
"""
|
| 6 |
+
|
| 7 |
+
import os
|
| 8 |
+
import gradio as gr
|
| 9 |
+
from acestep.gradio_ui.i18n import t
|
| 10 |
+
|
| 11 |
+
|
| 12 |
+
def create_training_section(dit_handler, llm_handler) -> dict:
|
| 13 |
+
"""Create the training tab section with dataset builder and training controls.
|
| 14 |
+
|
| 15 |
+
Args:
|
| 16 |
+
dit_handler: DiT handler instance
|
| 17 |
+
llm_handler: LLM handler instance
|
| 18 |
+
|
| 19 |
+
Returns:
|
| 20 |
+
Dictionary of Gradio components for event handling
|
| 21 |
+
"""
|
| 22 |
+
|
| 23 |
+
with gr.Tab("🎓 LoRA Training"):
|
| 24 |
+
gr.HTML("""
|
| 25 |
+
<div style="text-align: center; padding: 10px; margin-bottom: 15px;">
|
| 26 |
+
<h2>🎵 LoRA Training for ACE-Step</h2>
|
| 27 |
+
<p>Build datasets from your audio files and train custom LoRA adapters</p>
|
| 28 |
+
</div>
|
| 29 |
+
""")
|
| 30 |
+
|
| 31 |
+
with gr.Tabs():
|
| 32 |
+
# ==================== Dataset Builder Tab ====================
|
| 33 |
+
with gr.Tab("📁 Dataset Builder"):
|
| 34 |
+
# ========== Load Existing OR Scan New ==========
|
| 35 |
+
gr.HTML("""
|
| 36 |
+
<div style="padding: 10px; margin-bottom: 10px; border: 1px solid #4a4a6a; border-radius: 8px; background: linear-gradient(135deg, #2a2a4a 0%, #1a1a3a 100%);">
|
| 37 |
+
<h3 style="margin: 0 0 5px 0;">🚀 Quick Start</h3>
|
| 38 |
+
<p style="margin: 0; color: #aaa;">Choose one: <b>Load existing dataset</b> OR <b>Scan new directory</b></p>
|
| 39 |
+
</div>
|
| 40 |
+
""")
|
| 41 |
+
|
| 42 |
+
with gr.Row():
|
| 43 |
+
with gr.Column(scale=1):
|
| 44 |
+
gr.HTML("<h4>📂 Load Existing Dataset</h4>")
|
| 45 |
+
with gr.Row():
|
| 46 |
+
load_json_path = gr.Textbox(
|
| 47 |
+
label="Dataset JSON Path",
|
| 48 |
+
placeholder="./datasets/my_lora_dataset.json",
|
| 49 |
+
info="Load a previously saved dataset",
|
| 50 |
+
scale=3,
|
| 51 |
+
)
|
| 52 |
+
load_json_btn = gr.Button("📂 Load", variant="primary", scale=1)
|
| 53 |
+
load_json_status = gr.Textbox(
|
| 54 |
+
label="Load Status",
|
| 55 |
+
interactive=False,
|
| 56 |
+
)
|
| 57 |
+
|
| 58 |
+
with gr.Column(scale=1):
|
| 59 |
+
gr.HTML("<h4>🔍 Scan New Directory</h4>")
|
| 60 |
+
with gr.Row():
|
| 61 |
+
audio_directory = gr.Textbox(
|
| 62 |
+
label="Audio Directory Path",
|
| 63 |
+
placeholder="/path/to/your/audio/folder",
|
| 64 |
+
info="Scan for audio files (wav, mp3, flac, ogg, opus)",
|
| 65 |
+
scale=3,
|
| 66 |
+
)
|
| 67 |
+
scan_btn = gr.Button("🔍 Scan", variant="secondary", scale=1)
|
| 68 |
+
scan_status = gr.Textbox(
|
| 69 |
+
label="Scan Status",
|
| 70 |
+
interactive=False,
|
| 71 |
+
)
|
| 72 |
+
|
| 73 |
+
gr.HTML("<hr>")
|
| 74 |
+
|
| 75 |
+
with gr.Row():
|
| 76 |
+
with gr.Column(scale=2):
|
| 77 |
+
|
| 78 |
+
# Audio files table
|
| 79 |
+
audio_files_table = gr.Dataframe(
|
| 80 |
+
headers=["#", "Filename", "Duration", "Labeled", "BPM", "Key", "Caption"],
|
| 81 |
+
datatype=["number", "str", "str", "str", "str", "str", "str"],
|
| 82 |
+
label="Found Audio Files",
|
| 83 |
+
interactive=False,
|
| 84 |
+
wrap=True,
|
| 85 |
+
)
|
| 86 |
+
|
| 87 |
+
with gr.Column(scale=1):
|
| 88 |
+
gr.HTML("<h3>⚙️ Dataset Settings</h3>")
|
| 89 |
+
|
| 90 |
+
dataset_name = gr.Textbox(
|
| 91 |
+
label="Dataset Name",
|
| 92 |
+
value="my_lora_dataset",
|
| 93 |
+
placeholder="Enter dataset name",
|
| 94 |
+
)
|
| 95 |
+
|
| 96 |
+
all_instrumental = gr.Checkbox(
|
| 97 |
+
label="All Instrumental",
|
| 98 |
+
value=True,
|
| 99 |
+
info="Check if all tracks are instrumental (no vocals)",
|
| 100 |
+
)
|
| 101 |
+
|
| 102 |
+
need_lyrics = gr.Checkbox(
|
| 103 |
+
label="Transcribe Lyrics",
|
| 104 |
+
value=False,
|
| 105 |
+
info="Attempt to transcribe lyrics (slower)",
|
| 106 |
+
interactive=False, # Disabled for now
|
| 107 |
+
)
|
| 108 |
+
|
| 109 |
+
custom_tag = gr.Textbox(
|
| 110 |
+
label="Custom Activation Tag",
|
| 111 |
+
placeholder="e.g., 8bit_retro, my_style",
|
| 112 |
+
info="Unique tag to activate this LoRA's style",
|
| 113 |
+
)
|
| 114 |
+
|
| 115 |
+
tag_position = gr.Radio(
|
| 116 |
+
choices=[
|
| 117 |
+
("Prepend (tag, caption)", "prepend"),
|
| 118 |
+
("Append (caption, tag)", "append"),
|
| 119 |
+
("Replace caption", "replace"),
|
| 120 |
+
],
|
| 121 |
+
value="replace",
|
| 122 |
+
label="Tag Position",
|
| 123 |
+
info="Where to place the custom tag in the caption",
|
| 124 |
+
)
|
| 125 |
+
|
| 126 |
+
gr.HTML("<hr><h3>🤖 Step 2: Auto-Label with AI</h3>")
|
| 127 |
+
|
| 128 |
+
with gr.Row():
|
| 129 |
+
with gr.Column(scale=3):
|
| 130 |
+
gr.Markdown("""
|
| 131 |
+
Click the button below to automatically generate metadata for all audio files using AI:
|
| 132 |
+
- **Caption**: Music style, genre, mood description
|
| 133 |
+
- **BPM**: Beats per minute
|
| 134 |
+
- **Key**: Musical key (e.g., C Major, Am)
|
| 135 |
+
- **Time Signature**: 4/4, 3/4, etc.
|
| 136 |
+
""")
|
| 137 |
+
skip_metas = gr.Checkbox(
|
| 138 |
+
label="Skip Metas (No LLM)",
|
| 139 |
+
value=False,
|
| 140 |
+
info="Skip AI labeling. BPM/Key/Time Signature will be N/A, Language will be 'unknown' for instrumental",
|
| 141 |
+
)
|
| 142 |
+
with gr.Column(scale=1):
|
| 143 |
+
auto_label_btn = gr.Button(
|
| 144 |
+
"🏷️ Auto-Label All",
|
| 145 |
+
variant="primary",
|
| 146 |
+
size="lg",
|
| 147 |
+
)
|
| 148 |
+
|
| 149 |
+
label_progress = gr.Textbox(
|
| 150 |
+
label="Labeling Progress",
|
| 151 |
+
interactive=False,
|
| 152 |
+
lines=2,
|
| 153 |
+
)
|
| 154 |
+
|
| 155 |
+
gr.HTML("<hr><h3>👀 Step 3: Preview & Edit</h3>")
|
| 156 |
+
|
| 157 |
+
with gr.Row():
|
| 158 |
+
with gr.Column(scale=1):
|
| 159 |
+
sample_selector = gr.Slider(
|
| 160 |
+
minimum=0,
|
| 161 |
+
maximum=0,
|
| 162 |
+
step=1,
|
| 163 |
+
value=0,
|
| 164 |
+
label="Select Sample #",
|
| 165 |
+
info="Choose a sample to preview and edit",
|
| 166 |
+
)
|
| 167 |
+
|
| 168 |
+
preview_audio = gr.Audio(
|
| 169 |
+
label="Audio Preview",
|
| 170 |
+
type="filepath",
|
| 171 |
+
interactive=False,
|
| 172 |
+
)
|
| 173 |
+
|
| 174 |
+
preview_filename = gr.Textbox(
|
| 175 |
+
label="Filename",
|
| 176 |
+
interactive=False,
|
| 177 |
+
)
|
| 178 |
+
|
| 179 |
+
with gr.Column(scale=2):
|
| 180 |
+
with gr.Row():
|
| 181 |
+
edit_caption = gr.Textbox(
|
| 182 |
+
label="Caption",
|
| 183 |
+
lines=3,
|
| 184 |
+
placeholder="Music description...",
|
| 185 |
+
)
|
| 186 |
+
|
| 187 |
+
with gr.Row():
|
| 188 |
+
edit_lyrics = gr.Textbox(
|
| 189 |
+
label="Lyrics",
|
| 190 |
+
lines=4,
|
| 191 |
+
placeholder="[Verse 1]\nLyrics here...\n\n[Chorus]\n...",
|
| 192 |
+
)
|
| 193 |
+
|
| 194 |
+
with gr.Row():
|
| 195 |
+
edit_bpm = gr.Number(
|
| 196 |
+
label="BPM",
|
| 197 |
+
precision=0,
|
| 198 |
+
)
|
| 199 |
+
edit_keyscale = gr.Textbox(
|
| 200 |
+
label="Key",
|
| 201 |
+
placeholder="C Major",
|
| 202 |
+
)
|
| 203 |
+
edit_timesig = gr.Dropdown(
|
| 204 |
+
choices=["", "2", "3", "4", "6"],
|
| 205 |
+
label="Time Signature",
|
| 206 |
+
)
|
| 207 |
+
edit_duration = gr.Number(
|
| 208 |
+
label="Duration (s)",
|
| 209 |
+
precision=1,
|
| 210 |
+
interactive=False,
|
| 211 |
+
)
|
| 212 |
+
|
| 213 |
+
with gr.Row():
|
| 214 |
+
edit_language = gr.Dropdown(
|
| 215 |
+
choices=["instrumental", "en", "zh", "ja", "ko", "es", "fr", "de", "pt", "ru", "unknown"],
|
| 216 |
+
value="instrumental",
|
| 217 |
+
label="Language",
|
| 218 |
+
)
|
| 219 |
+
edit_instrumental = gr.Checkbox(
|
| 220 |
+
label="Instrumental",
|
| 221 |
+
value=True,
|
| 222 |
+
)
|
| 223 |
+
save_edit_btn = gr.Button("💾 Save Changes", variant="secondary")
|
| 224 |
+
|
| 225 |
+
edit_status = gr.Textbox(
|
| 226 |
+
label="Edit Status",
|
| 227 |
+
interactive=False,
|
| 228 |
+
)
|
| 229 |
+
|
| 230 |
+
gr.HTML("<hr><h3>💾 Step 4: Save Dataset</h3>")
|
| 231 |
+
|
| 232 |
+
with gr.Row():
|
| 233 |
+
with gr.Column(scale=3):
|
| 234 |
+
save_path = gr.Textbox(
|
| 235 |
+
label="Save Path",
|
| 236 |
+
value="./datasets/my_lora_dataset.json",
|
| 237 |
+
placeholder="./datasets/dataset_name.json",
|
| 238 |
+
info="Path where the dataset JSON will be saved",
|
| 239 |
+
)
|
| 240 |
+
with gr.Column(scale=1):
|
| 241 |
+
save_dataset_btn = gr.Button(
|
| 242 |
+
"💾 Save Dataset",
|
| 243 |
+
variant="primary",
|
| 244 |
+
size="lg",
|
| 245 |
+
)
|
| 246 |
+
|
| 247 |
+
save_status = gr.Textbox(
|
| 248 |
+
label="Save Status",
|
| 249 |
+
interactive=False,
|
| 250 |
+
lines=2,
|
| 251 |
+
)
|
| 252 |
+
|
| 253 |
+
gr.HTML("<hr><h3>⚡ Step 5: Preprocess to Tensors</h3>")
|
| 254 |
+
|
| 255 |
+
gr.Markdown("""
|
| 256 |
+
**Preprocessing converts your dataset to pre-computed tensors for fast training.**
|
| 257 |
+
|
| 258 |
+
You can either:
|
| 259 |
+
- Use the dataset from Steps 1-4 above, **OR**
|
| 260 |
+
- Load an existing dataset JSON file (if you've already saved one)
|
| 261 |
+
""")
|
| 262 |
+
|
| 263 |
+
with gr.Row():
|
| 264 |
+
with gr.Column(scale=3):
|
| 265 |
+
load_existing_dataset_path = gr.Textbox(
|
| 266 |
+
label="Load Existing Dataset (Optional)",
|
| 267 |
+
placeholder="./datasets/my_lora_dataset.json",
|
| 268 |
+
info="Path to a previously saved dataset JSON file",
|
| 269 |
+
)
|
| 270 |
+
with gr.Column(scale=1):
|
| 271 |
+
load_existing_dataset_btn = gr.Button(
|
| 272 |
+
"📂 Load Dataset",
|
| 273 |
+
variant="secondary",
|
| 274 |
+
size="lg",
|
| 275 |
+
)
|
| 276 |
+
|
| 277 |
+
load_existing_status = gr.Textbox(
|
| 278 |
+
label="Load Status",
|
| 279 |
+
interactive=False,
|
| 280 |
+
)
|
| 281 |
+
|
| 282 |
+
gr.Markdown("""
|
| 283 |
+
This step:
|
| 284 |
+
- Encodes audio to VAE latents
|
| 285 |
+
- Encodes captions and lyrics to text embeddings
|
| 286 |
+
- Runs the condition encoder
|
| 287 |
+
- Saves all tensors to `.pt` files
|
| 288 |
+
|
| 289 |
+
⚠️ **This requires the model to be loaded and may take a few minutes.**
|
| 290 |
+
""")
|
| 291 |
+
|
| 292 |
+
with gr.Row():
|
| 293 |
+
with gr.Column(scale=3):
|
| 294 |
+
preprocess_output_dir = gr.Textbox(
|
| 295 |
+
label="Tensor Output Directory",
|
| 296 |
+
value="./datasets/preprocessed_tensors",
|
| 297 |
+
placeholder="./datasets/preprocessed_tensors",
|
| 298 |
+
info="Directory to save preprocessed tensor files",
|
| 299 |
+
)
|
| 300 |
+
with gr.Column(scale=1):
|
| 301 |
+
preprocess_btn = gr.Button(
|
| 302 |
+
"⚡ Preprocess",
|
| 303 |
+
variant="primary",
|
| 304 |
+
size="lg",
|
| 305 |
+
)
|
| 306 |
+
|
| 307 |
+
preprocess_progress = gr.Textbox(
|
| 308 |
+
label="Preprocessing Progress",
|
| 309 |
+
interactive=False,
|
| 310 |
+
lines=3,
|
| 311 |
+
)
|
| 312 |
+
|
| 313 |
+
# ==================== Training Tab ====================
|
| 314 |
+
with gr.Tab("🚀 Train LoRA"):
|
| 315 |
+
with gr.Row():
|
| 316 |
+
with gr.Column(scale=2):
|
| 317 |
+
gr.HTML("<h3>📊 Preprocessed Dataset Selection</h3>")
|
| 318 |
+
|
| 319 |
+
gr.Markdown("""
|
| 320 |
+
Select the directory containing preprocessed tensor files (`.pt` files).
|
| 321 |
+
These are created in the "Dataset Builder" tab using the "Preprocess" button.
|
| 322 |
+
""")
|
| 323 |
+
|
| 324 |
+
training_tensor_dir = gr.Textbox(
|
| 325 |
+
label="Preprocessed Tensors Directory",
|
| 326 |
+
placeholder="./datasets/preprocessed_tensors",
|
| 327 |
+
value="./datasets/preprocessed_tensors",
|
| 328 |
+
info="Directory containing preprocessed .pt tensor files",
|
| 329 |
+
)
|
| 330 |
+
|
| 331 |
+
load_dataset_btn = gr.Button("📂 Load Dataset", variant="secondary")
|
| 332 |
+
|
| 333 |
+
training_dataset_info = gr.Textbox(
|
| 334 |
+
label="Dataset Info",
|
| 335 |
+
interactive=False,
|
| 336 |
+
lines=3,
|
| 337 |
+
)
|
| 338 |
+
|
| 339 |
+
with gr.Column(scale=1):
|
| 340 |
+
gr.HTML("<h3>⚙️ LoRA Settings</h3>")
|
| 341 |
+
|
| 342 |
+
lora_rank = gr.Slider(
|
| 343 |
+
minimum=4,
|
| 344 |
+
maximum=256,
|
| 345 |
+
step=4,
|
| 346 |
+
value=64,
|
| 347 |
+
label="LoRA Rank (r)",
|
| 348 |
+
info="Higher = more capacity, more memory",
|
| 349 |
+
)
|
| 350 |
+
|
| 351 |
+
lora_alpha = gr.Slider(
|
| 352 |
+
minimum=4,
|
| 353 |
+
maximum=512,
|
| 354 |
+
step=4,
|
| 355 |
+
value=128,
|
| 356 |
+
label="LoRA Alpha",
|
| 357 |
+
info="Scaling factor (typically 2x rank)",
|
| 358 |
+
)
|
| 359 |
+
|
| 360 |
+
lora_dropout = gr.Slider(
|
| 361 |
+
minimum=0.0,
|
| 362 |
+
maximum=0.5,
|
| 363 |
+
step=0.05,
|
| 364 |
+
value=0.1,
|
| 365 |
+
label="LoRA Dropout",
|
| 366 |
+
)
|
| 367 |
+
|
| 368 |
+
gr.HTML("<hr><h3>🎛️ Training Parameters</h3>")
|
| 369 |
+
|
| 370 |
+
with gr.Row():
|
| 371 |
+
learning_rate = gr.Number(
|
| 372 |
+
label="Learning Rate",
|
| 373 |
+
value=1e-4,
|
| 374 |
+
info="Start with 1e-4, adjust if needed",
|
| 375 |
+
)
|
| 376 |
+
|
| 377 |
+
train_epochs = gr.Slider(
|
| 378 |
+
minimum=100,
|
| 379 |
+
maximum=4000,
|
| 380 |
+
step=100,
|
| 381 |
+
value=500,
|
| 382 |
+
label="Max Epochs",
|
| 383 |
+
)
|
| 384 |
+
|
| 385 |
+
train_batch_size = gr.Slider(
|
| 386 |
+
minimum=1,
|
| 387 |
+
maximum=8,
|
| 388 |
+
step=1,
|
| 389 |
+
value=1,
|
| 390 |
+
label="Batch Size",
|
| 391 |
+
info="Increase if you have enough VRAM",
|
| 392 |
+
)
|
| 393 |
+
|
| 394 |
+
gradient_accumulation = gr.Slider(
|
| 395 |
+
minimum=1,
|
| 396 |
+
maximum=16,
|
| 397 |
+
step=1,
|
| 398 |
+
value=1,
|
| 399 |
+
label="Gradient Accumulation",
|
| 400 |
+
info="Effective batch = batch_size × accumulation",
|
| 401 |
+
)
|
| 402 |
+
|
| 403 |
+
with gr.Row():
|
| 404 |
+
save_every_n_epochs = gr.Slider(
|
| 405 |
+
minimum=50,
|
| 406 |
+
maximum=1000,
|
| 407 |
+
step=50,
|
| 408 |
+
value=200,
|
| 409 |
+
label="Save Every N Epochs",
|
| 410 |
+
)
|
| 411 |
+
|
| 412 |
+
training_shift = gr.Slider(
|
| 413 |
+
minimum=1.0,
|
| 414 |
+
maximum=5.0,
|
| 415 |
+
step=0.5,
|
| 416 |
+
value=3.0,
|
| 417 |
+
label="Shift",
|
| 418 |
+
info="Timestep shift for turbo model",
|
| 419 |
+
)
|
| 420 |
+
|
| 421 |
+
training_seed = gr.Number(
|
| 422 |
+
label="Seed",
|
| 423 |
+
value=42,
|
| 424 |
+
precision=0,
|
| 425 |
+
)
|
| 426 |
+
|
| 427 |
+
with gr.Row():
|
| 428 |
+
lora_output_dir = gr.Textbox(
|
| 429 |
+
label="Output Directory",
|
| 430 |
+
value="./lora_output",
|
| 431 |
+
placeholder="./lora_output",
|
| 432 |
+
info="Directory to save trained LoRA weights",
|
| 433 |
+
)
|
| 434 |
+
|
| 435 |
+
gr.HTML("<hr>")
|
| 436 |
+
|
| 437 |
+
with gr.Row():
|
| 438 |
+
with gr.Column(scale=1):
|
| 439 |
+
start_training_btn = gr.Button(
|
| 440 |
+
"🚀 Start Training",
|
| 441 |
+
variant="primary",
|
| 442 |
+
size="lg",
|
| 443 |
+
)
|
| 444 |
+
with gr.Column(scale=1):
|
| 445 |
+
stop_training_btn = gr.Button(
|
| 446 |
+
"⏹️ Stop Training",
|
| 447 |
+
variant="stop",
|
| 448 |
+
size="lg",
|
| 449 |
+
)
|
| 450 |
+
|
| 451 |
+
training_progress = gr.Textbox(
|
| 452 |
+
label="Training Progress",
|
| 453 |
+
interactive=False,
|
| 454 |
+
lines=2,
|
| 455 |
+
)
|
| 456 |
+
|
| 457 |
+
with gr.Row():
|
| 458 |
+
training_log = gr.Textbox(
|
| 459 |
+
label="Training Log",
|
| 460 |
+
interactive=False,
|
| 461 |
+
lines=10,
|
| 462 |
+
max_lines=15,
|
| 463 |
+
scale=1,
|
| 464 |
+
)
|
| 465 |
+
training_loss_plot = gr.LinePlot(
|
| 466 |
+
x="step",
|
| 467 |
+
y="loss",
|
| 468 |
+
title="Training Loss",
|
| 469 |
+
x_title="Step",
|
| 470 |
+
y_title="Loss",
|
| 471 |
+
scale=1,
|
| 472 |
+
)
|
| 473 |
+
|
| 474 |
+
gr.HTML("<hr><h3>📦 Export LoRA</h3>")
|
| 475 |
+
|
| 476 |
+
with gr.Row():
|
| 477 |
+
export_path = gr.Textbox(
|
| 478 |
+
label="Export Path",
|
| 479 |
+
value="./lora_output/final_lora",
|
| 480 |
+
placeholder="./lora_output/my_lora",
|
| 481 |
+
)
|
| 482 |
+
export_lora_btn = gr.Button("📦 Export LoRA", variant="secondary")
|
| 483 |
+
|
| 484 |
+
export_status = gr.Textbox(
|
| 485 |
+
label="Export Status",
|
| 486 |
+
interactive=False,
|
| 487 |
+
)
|
| 488 |
+
|
| 489 |
+
# Store dataset builder state
|
| 490 |
+
dataset_builder_state = gr.State(None)
|
| 491 |
+
training_state = gr.State({"is_training": False, "should_stop": False})
|
| 492 |
+
|
| 493 |
+
return {
|
| 494 |
+
# Dataset Builder - Load or Scan
|
| 495 |
+
"load_json_path": load_json_path,
|
| 496 |
+
"load_json_btn": load_json_btn,
|
| 497 |
+
"load_json_status": load_json_status,
|
| 498 |
+
"audio_directory": audio_directory,
|
| 499 |
+
"scan_btn": scan_btn,
|
| 500 |
+
"scan_status": scan_status,
|
| 501 |
+
"audio_files_table": audio_files_table,
|
| 502 |
+
"dataset_name": dataset_name,
|
| 503 |
+
"all_instrumental": all_instrumental,
|
| 504 |
+
"need_lyrics": need_lyrics,
|
| 505 |
+
"custom_tag": custom_tag,
|
| 506 |
+
"tag_position": tag_position,
|
| 507 |
+
"skip_metas": skip_metas,
|
| 508 |
+
"auto_label_btn": auto_label_btn,
|
| 509 |
+
"label_progress": label_progress,
|
| 510 |
+
"sample_selector": sample_selector,
|
| 511 |
+
"preview_audio": preview_audio,
|
| 512 |
+
"preview_filename": preview_filename,
|
| 513 |
+
"edit_caption": edit_caption,
|
| 514 |
+
"edit_lyrics": edit_lyrics,
|
| 515 |
+
"edit_bpm": edit_bpm,
|
| 516 |
+
"edit_keyscale": edit_keyscale,
|
| 517 |
+
"edit_timesig": edit_timesig,
|
| 518 |
+
"edit_duration": edit_duration,
|
| 519 |
+
"edit_language": edit_language,
|
| 520 |
+
"edit_instrumental": edit_instrumental,
|
| 521 |
+
"save_edit_btn": save_edit_btn,
|
| 522 |
+
"edit_status": edit_status,
|
| 523 |
+
"save_path": save_path,
|
| 524 |
+
"save_dataset_btn": save_dataset_btn,
|
| 525 |
+
"save_status": save_status,
|
| 526 |
+
# Preprocessing
|
| 527 |
+
"load_existing_dataset_path": load_existing_dataset_path,
|
| 528 |
+
"load_existing_dataset_btn": load_existing_dataset_btn,
|
| 529 |
+
"load_existing_status": load_existing_status,
|
| 530 |
+
"preprocess_output_dir": preprocess_output_dir,
|
| 531 |
+
"preprocess_btn": preprocess_btn,
|
| 532 |
+
"preprocess_progress": preprocess_progress,
|
| 533 |
+
"dataset_builder_state": dataset_builder_state,
|
| 534 |
+
# Training
|
| 535 |
+
"training_tensor_dir": training_tensor_dir,
|
| 536 |
+
"load_dataset_btn": load_dataset_btn,
|
| 537 |
+
"training_dataset_info": training_dataset_info,
|
| 538 |
+
"lora_rank": lora_rank,
|
| 539 |
+
"lora_alpha": lora_alpha,
|
| 540 |
+
"lora_dropout": lora_dropout,
|
| 541 |
+
"learning_rate": learning_rate,
|
| 542 |
+
"train_epochs": train_epochs,
|
| 543 |
+
"train_batch_size": train_batch_size,
|
| 544 |
+
"gradient_accumulation": gradient_accumulation,
|
| 545 |
+
"save_every_n_epochs": save_every_n_epochs,
|
| 546 |
+
"training_shift": training_shift,
|
| 547 |
+
"training_seed": training_seed,
|
| 548 |
+
"lora_output_dir": lora_output_dir,
|
| 549 |
+
"start_training_btn": start_training_btn,
|
| 550 |
+
"stop_training_btn": stop_training_btn,
|
| 551 |
+
"training_progress": training_progress,
|
| 552 |
+
"training_log": training_log,
|
| 553 |
+
"training_loss_plot": training_loss_plot,
|
| 554 |
+
"export_path": export_path,
|
| 555 |
+
"export_lora_btn": export_lora_btn,
|
| 556 |
+
"export_status": export_status,
|
| 557 |
+
"training_state": training_state,
|
| 558 |
+
}
|
acestep/handler.py
CHANGED
|
@@ -3,6 +3,10 @@ Business Logic Handler
|
|
| 3 |
Encapsulates all data processing and business logic as a bridge between model and UI
|
| 4 |
"""
|
| 5 |
import os
|
|
|
|
|
|
|
|
|
|
|
|
|
| 6 |
import math
|
| 7 |
from copy import deepcopy
|
| 8 |
import tempfile
|
|
@@ -70,6 +74,11 @@ class AceStepHandler:
|
|
| 70 |
self.offload_to_cpu = False
|
| 71 |
self.offload_dit_to_cpu = False
|
| 72 |
self.current_offload_cost = 0.0
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 73 |
|
| 74 |
def get_available_checkpoints(self) -> str:
|
| 75 |
"""Return project root directory path"""
|
|
@@ -114,6 +123,137 @@ class AceStepHandler:
|
|
| 114 |
return False
|
| 115 |
return getattr(self.config, 'is_turbo', False)
|
| 116 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 117 |
def initialize_service(
|
| 118 |
self,
|
| 119 |
project_root: str,
|
|
|
|
| 3 |
Encapsulates all data processing and business logic as a bridge between model and UI
|
| 4 |
"""
|
| 5 |
import os
|
| 6 |
+
|
| 7 |
+
# Disable tokenizers parallelism to avoid fork warning
|
| 8 |
+
os.environ["TOKENIZERS_PARALLELISM"] = "false"
|
| 9 |
+
|
| 10 |
import math
|
| 11 |
from copy import deepcopy
|
| 12 |
import tempfile
|
|
|
|
| 74 |
self.offload_to_cpu = False
|
| 75 |
self.offload_dit_to_cpu = False
|
| 76 |
self.current_offload_cost = 0.0
|
| 77 |
+
|
| 78 |
+
# LoRA state
|
| 79 |
+
self.lora_loaded = False
|
| 80 |
+
self.use_lora = False
|
| 81 |
+
self._base_decoder = None # Backup of original decoder
|
| 82 |
|
| 83 |
def get_available_checkpoints(self) -> str:
|
| 84 |
"""Return project root directory path"""
|
|
|
|
| 123 |
return False
|
| 124 |
return getattr(self.config, 'is_turbo', False)
|
| 125 |
|
| 126 |
+
def load_lora(self, lora_path: str) -> str:
    """Load a PEFT LoRA adapter into the decoder.

    On the first load, the pristine decoder is deep-copied and kept in
    ``self._base_decoder`` so the adapter can later be unloaded. On
    subsequent loads, the backup is restored first so adapters never stack.

    Args:
        lora_path: Path to the LoRA adapter directory (containing adapter_config.json)

    Returns:
        Status message
    """
    if self.model is None:
        return "❌ Model not initialized. Please initialize service first."

    if not lora_path or not lora_path.strip():
        return "❌ Please provide a LoRA path."

    lora_path = lora_path.strip()

    # Check if path exists
    if not os.path.exists(lora_path):
        return f"❌ LoRA path not found: {lora_path}"

    # Check if it's a valid PEFT adapter directory
    config_file = os.path.join(lora_path, "adapter_config.json")
    if not os.path.exists(config_file):
        return f"❌ Invalid LoRA adapter: adapter_config.json not found in {lora_path}"

    try:
        from peft import PeftModel, PeftConfig
    except ImportError:
        return "❌ PEFT library not installed. Please install with: pip install peft"

    try:
        # BUGFIX: `import copy` used to live only inside the first-load branch,
        # so any second call hit `copy.deepcopy` in the else-branch with the
        # name unbound (NameError, swallowed by the except below). Hoist it so
        # both branches can use it.
        import copy

        if self._base_decoder is None:
            # First load: back up the original decoder for later restore.
            self._base_decoder = copy.deepcopy(self.model.decoder)
            logger.info("Base decoder backed up")
        else:
            # Restore base decoder before loading new LoRA
            self.model.decoder = copy.deepcopy(self._base_decoder)
            logger.info("Restored base decoder before loading new LoRA")

        # Load PEFT adapter (inference-only)
        logger.info(f"Loading LoRA adapter from {lora_path}")
        self.model.decoder = PeftModel.from_pretrained(
            self.model.decoder,
            lora_path,
            is_trainable=False,
        )
        self.model.decoder = self.model.decoder.to(self.device).to(self.dtype)
        self.model.decoder.eval()

        self.lora_loaded = True
        self.use_lora = True  # Enable LoRA by default after loading

        logger.info(f"LoRA adapter loaded successfully from {lora_path}")
        return f"✅ LoRA loaded from {lora_path}"

    except Exception as e:
        logger.exception("Failed to load LoRA adapter")
        return f"❌ Failed to load LoRA: {str(e)}"
|
| 187 |
+
|
| 188 |
+
def unload_lora(self) -> str:
    """Detach the LoRA adapter and swap the backed-up base decoder back in.

    Returns:
        Human-readable status message.
    """
    # Guard clauses: nothing to do / nothing to restore from.
    if not self.lora_loaded:
        return "⚠️ No LoRA adapter loaded."
    if self._base_decoder is None:
        return "❌ Base decoder backup not found. Cannot restore."

    try:
        import copy

        # Swap in a fresh deep copy of the pristine decoder, keeping the
        # backup itself untouched for any future load/unload cycle.
        restored = copy.deepcopy(self._base_decoder)
        self.model.decoder = restored.to(self.device).to(self.dtype)
        self.model.decoder.eval()

        self.lora_loaded = False
        self.use_lora = False

        logger.info("LoRA unloaded, base decoder restored")
        return "✅ LoRA unloaded, using base model"
    except Exception as e:
        logger.exception("Failed to unload LoRA")
        return f"❌ Failed to unload LoRA: {str(e)}"
|
| 216 |
+
|
| 217 |
+
def set_use_lora(self, use_lora: bool) -> str:
    """Enable or disable the loaded LoRA adapter for inference.

    Args:
        use_lora: True to route inference through the adapter, False to use
            the base model weights.

    Returns:
        Human-readable status message.
    """
    if use_lora and not self.lora_loaded:
        return "❌ No LoRA adapter loaded. Please load a LoRA first."

    self.use_lora = use_lora

    # When a PEFT-wrapped decoder exposes the adapter toggles, flip them so
    # the change takes effect without reloading weights.
    if self.lora_loaded and hasattr(self.model.decoder, 'disable_adapter_layers'):
        try:
            if use_lora:
                self.model.decoder.enable_adapter_layers()
                logger.info("LoRA adapter enabled")
            else:
                self.model.decoder.disable_adapter_layers()
                logger.info("LoRA adapter disabled")
        except Exception as e:
            logger.warning(f"Could not toggle adapter layers: {e}")

    return f"✅ LoRA {'enabled' if use_lora else 'disabled'}"
|
| 245 |
+
|
| 246 |
+
def get_lora_status(self) -> Dict[str, Any]:
    """Report the current LoRA state.

    Returns:
        Dict with "loaded" (adapter weights present) and "active"
        (adapter currently used for inference).
    """
    return dict(loaded=self.lora_loaded, active=self.use_lora)
|
| 256 |
+
|
| 257 |
def initialize_service(
|
| 258 |
self,
|
| 259 |
project_root: str,
|
acestep/training/__init__.py
ADDED
|
@@ -0,0 +1,61 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""
|
| 2 |
+
ACE-Step Training Module
|
| 3 |
+
|
| 4 |
+
This module provides LoRA training functionality for ACE-Step models,
|
| 5 |
+
including dataset building, audio labeling, and training utilities.
|
| 6 |
+
"""
|
| 7 |
+
|
| 8 |
+
from acestep.training.dataset_builder import DatasetBuilder, AudioSample
|
| 9 |
+
from acestep.training.configs import LoRAConfig, TrainingConfig
|
| 10 |
+
from acestep.training.lora_utils import (
|
| 11 |
+
inject_lora_into_dit,
|
| 12 |
+
save_lora_weights,
|
| 13 |
+
load_lora_weights,
|
| 14 |
+
merge_lora_weights,
|
| 15 |
+
check_peft_available,
|
| 16 |
+
)
|
| 17 |
+
from acestep.training.data_module import (
|
| 18 |
+
# Preprocessed (recommended)
|
| 19 |
+
PreprocessedTensorDataset,
|
| 20 |
+
PreprocessedDataModule,
|
| 21 |
+
collate_preprocessed_batch,
|
| 22 |
+
# Legacy (raw audio)
|
| 23 |
+
AceStepTrainingDataset,
|
| 24 |
+
AceStepDataModule,
|
| 25 |
+
collate_training_batch,
|
| 26 |
+
load_dataset_from_json,
|
| 27 |
+
)
|
| 28 |
+
from acestep.training.trainer import LoRATrainer, PreprocessedLoRAModule, LIGHTNING_AVAILABLE
|
| 29 |
+
|
| 30 |
+
def check_lightning_available():
    """Report whether the Lightning training backend could be imported."""
    return bool(LIGHTNING_AVAILABLE)
|
| 33 |
+
|
| 34 |
+
__all__ = [
|
| 35 |
+
# Dataset Builder
|
| 36 |
+
"DatasetBuilder",
|
| 37 |
+
"AudioSample",
|
| 38 |
+
# Configs
|
| 39 |
+
"LoRAConfig",
|
| 40 |
+
"TrainingConfig",
|
| 41 |
+
# LoRA Utils
|
| 42 |
+
"inject_lora_into_dit",
|
| 43 |
+
"save_lora_weights",
|
| 44 |
+
"load_lora_weights",
|
| 45 |
+
"merge_lora_weights",
|
| 46 |
+
"check_peft_available",
|
| 47 |
+
# Data Module (Preprocessed - Recommended)
|
| 48 |
+
"PreprocessedTensorDataset",
|
| 49 |
+
"PreprocessedDataModule",
|
| 50 |
+
"collate_preprocessed_batch",
|
| 51 |
+
# Data Module (Legacy)
|
| 52 |
+
"AceStepTrainingDataset",
|
| 53 |
+
"AceStepDataModule",
|
| 54 |
+
"collate_training_batch",
|
| 55 |
+
"load_dataset_from_json",
|
| 56 |
+
# Trainer
|
| 57 |
+
"LoRATrainer",
|
| 58 |
+
"PreprocessedLoRAModule",
|
| 59 |
+
"check_lightning_available",
|
| 60 |
+
"LIGHTNING_AVAILABLE",
|
| 61 |
+
]
|
acestep/training/configs.py
ADDED
|
@@ -0,0 +1,107 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""
|
| 2 |
+
Training Configuration Classes
|
| 3 |
+
|
| 4 |
+
Contains dataclasses for LoRA and training configurations.
|
| 5 |
+
"""
|
| 6 |
+
|
| 7 |
+
from dataclasses import dataclass, field
|
| 8 |
+
from typing import List, Optional
|
| 9 |
+
|
| 10 |
+
|
| 11 |
+
@dataclass
class LoRAConfig:
    """Hyper-parameters for LoRA (Low-Rank Adaptation) fine-tuning.

    Attributes:
        r: Rank of the low-rank update matrices.
        alpha: Scaling factor (effective scale is alpha / r).
        dropout: Dropout probability applied inside LoRA layers.
        target_modules: Names of sub-modules that receive LoRA adapters.
        bias: Bias training mode ("none", "all", or "lora_only").
    """
    r: int = 8
    alpha: int = 16
    dropout: float = 0.1
    target_modules: List[str] = field(default_factory=lambda: [
        "q_proj", "k_proj", "v_proj", "o_proj"
    ])
    bias: str = "none"

    def to_dict(self):
        """Render the config as keyword arguments for a PEFT LoraConfig."""
        return dict(
            r=self.r,
            lora_alpha=self.alpha,
            lora_dropout=self.dropout,
            target_modules=self.target_modules,
            bias=self.bias,
        )
|
| 39 |
+
|
| 40 |
+
|
| 41 |
+
@dataclass
class TrainingConfig:
    """Settings that drive the LoRA training loop.

    Training uses:
    - BFloat16 precision (only supported precision)
    - Discrete timesteps from the turbo shift=3.0 schedule (8 steps)
    - Randomly samples one of 8 timesteps per training step:
      [1.0, 0.9545, 0.9, 0.8333, 0.75, 0.6429, 0.5, 0.3]

    Attributes:
        shift: Timestep shift factor (fixed at 3.0 for the turbo model).
        num_inference_steps: Number of inference steps (fixed at 8 for turbo).
        learning_rate: Initial learning rate.
        batch_size: Training batch size.
        gradient_accumulation_steps: Number of gradient accumulation steps.
        max_epochs: Maximum number of training epochs.
        save_every_n_epochs: Save a checkpoint every N epochs.
        warmup_steps: Warmup steps for the learning-rate scheduler.
        weight_decay: Optimizer weight decay.
        max_grad_norm: Gradient-norm clipping threshold.
        mixed_precision: Always "bf16" (only supported precision).
        seed: Random seed for reproducibility.
        output_dir: Directory for checkpoints and logs.
        num_workers: DataLoader worker count.
        pin_memory: Whether DataLoaders pin host memory.
        log_every_n_steps: Logging cadence in optimizer steps.
    """
    # Fixed for the turbo model
    shift: float = 3.0
    num_inference_steps: int = 8
    learning_rate: float = 1e-4
    batch_size: int = 1
    gradient_accumulation_steps: int = 4
    max_epochs: int = 100
    save_every_n_epochs: int = 10
    warmup_steps: int = 100
    weight_decay: float = 0.01
    max_grad_norm: float = 1.0
    mixed_precision: str = "bf16"  # only bf16 supported
    seed: int = 42
    output_dir: str = "./lora_output"

    # Data loading
    num_workers: int = 4
    pin_memory: bool = True

    # Logging
    log_every_n_steps: int = 10

    def to_dict(self):
        """Render the config as a plain dictionary (keys match field names)."""
        field_names = (
            "shift", "num_inference_steps", "learning_rate", "batch_size",
            "gradient_accumulation_steps", "max_epochs", "save_every_n_epochs",
            "warmup_steps", "weight_decay", "max_grad_norm", "mixed_precision",
            "seed", "output_dir", "num_workers", "pin_memory",
            "log_every_n_steps",
        )
        return {name: getattr(self, name) for name in field_names}
|
acestep/training/data_module.py
ADDED
|
@@ -0,0 +1,465 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""
|
| 2 |
+
PyTorch Lightning DataModule for LoRA Training
|
| 3 |
+
|
| 4 |
+
Handles data loading and preprocessing for training ACE-Step LoRA adapters.
|
| 5 |
+
Supports both raw audio loading and preprocessed tensor loading.
|
| 6 |
+
"""
|
| 7 |
+
|
| 8 |
+
import os
|
| 9 |
+
import json
|
| 10 |
+
import random
|
| 11 |
+
from typing import Optional, List, Dict, Any, Tuple
|
| 12 |
+
from loguru import logger
|
| 13 |
+
|
| 14 |
+
import torch
|
| 15 |
+
import torchaudio
|
| 16 |
+
from torch.utils.data import Dataset, DataLoader
|
| 17 |
+
|
| 18 |
+
try:
|
| 19 |
+
from lightning.pytorch import LightningDataModule
|
| 20 |
+
LIGHTNING_AVAILABLE = True
|
| 21 |
+
except ImportError:
|
| 22 |
+
LIGHTNING_AVAILABLE = False
|
| 23 |
+
logger.warning("Lightning not installed. Training module will not be available.")
|
| 24 |
+
# Create a dummy class for type hints
|
| 25 |
+
class LightningDataModule:
|
| 26 |
+
pass
|
| 27 |
+
|
| 28 |
+
|
| 29 |
+
# ============================================================================
|
| 30 |
+
# Preprocessed Tensor Dataset (Recommended for Training)
|
| 31 |
+
# ============================================================================
|
| 32 |
+
|
| 33 |
+
class PreprocessedTensorDataset(Dataset):
    """Dataset over pre-computed training tensors stored as .pt files.

    Each file is expected to hold the fully pre-computed inputs for one
    sample, so no VAE/text encoder is needed at training time:
    - target_latents: VAE-encoded audio [T, 64]
    - encoder_hidden_states: Condition encoder output [L, D]
    - encoder_attention_mask: Condition mask [L]
    - context_latents: Source context [T, 65]
    - attention_mask: Audio latent mask [T]
    """

    def __init__(self, tensor_dir: str):
        """Index the .pt files under ``tensor_dir``.

        Args:
            tensor_dir: Directory containing preprocessed .pt files and,
                optionally, a manifest.json listing sample paths.
        """
        self.tensor_dir = tensor_dir
        self.sample_paths = []

        manifest_path = os.path.join(tensor_dir, "manifest.json")
        if os.path.exists(manifest_path):
            # Prefer the manifest's explicit sample list when available.
            with open(manifest_path, 'r') as f:
                self.sample_paths = json.load(f).get("samples", [])
        else:
            # No manifest: fall back to every .pt file in the directory.
            self.sample_paths = [
                os.path.join(tensor_dir, name)
                for name in os.listdir(tensor_dir)
                if name.endswith('.pt') and name != "manifest.json"
            ]

        # Drop entries whose files have gone missing since the manifest was written.
        self.valid_paths = [p for p in self.sample_paths if os.path.exists(p)]
        missing = len(self.sample_paths) - len(self.valid_paths)
        if missing:
            logger.warning(f"Some tensor files not found: {missing} missing")

        logger.info(f"PreprocessedTensorDataset: {len(self.valid_paths)} samples from {tensor_dir}")

    def __len__(self) -> int:
        return len(self.valid_paths)

    def __getitem__(self, idx: int) -> Dict[str, torch.Tensor]:
        """Load one preprocessed sample from disk (always onto CPU).

        Returns:
            Dict of the pre-computed tensors for training, plus its metadata
            dict (empty when the file carries none).
        """
        data = torch.load(self.valid_paths[idx], map_location='cpu')
        return {
            "target_latents": data["target_latents"],              # [T, 64]
            "attention_mask": data["attention_mask"],              # [T]
            "encoder_hidden_states": data["encoder_hidden_states"],  # [L, D]
            "encoder_attention_mask": data["encoder_attention_mask"],  # [L]
            "context_latents": data["context_latents"],            # [T, 65]
            "metadata": data.get("metadata", {}),
        }
|
| 95 |
+
|
| 96 |
+
|
| 97 |
+
def collate_preprocessed_batch(batch: List[Dict]) -> Dict[str, torch.Tensor]:
    """Collate preprocessed tensor samples into a zero-padded batch.

    Variable-length tensors are padded along dim 0 to the longest sample in
    the batch: audio-side tensors (target_latents, attention_mask,
    context_latents) to the longest latent length, text-side tensors
    (encoder_hidden_states, encoder_attention_mask) to the longest encoder
    length.

    FIX: padding is now created with each tensor's own dtype and device.
    The previous implementation used ``torch.zeros(...)`` with the default
    float32 dtype, which silently promoted reduced-precision (bf16/fp16)
    latents to float32 when concatenated.

    Args:
        batch: List of sample dicts as produced by PreprocessedTensorDataset.

    Returns:
        Dict with stacked tensors:
            target_latents [B, T, 64], attention_mask [B, T],
            encoder_hidden_states [B, L, D], encoder_attention_mask [B, L],
            context_latents [B, T, 65], and the per-sample "metadata" list.
    """
    max_latent_len = max(s["target_latents"].shape[0] for s in batch)
    max_encoder_len = max(s["encoder_hidden_states"].shape[0] for s in batch)

    def _pad_dim0(t: torch.Tensor, length: int) -> torch.Tensor:
        """Zero-pad t along dim 0 up to `length`, preserving dtype/device."""
        if t.shape[0] >= length:
            return t
        pad_shape = (length - t.shape[0],) + tuple(t.shape[1:])
        pad = torch.zeros(pad_shape, dtype=t.dtype, device=t.device)
        return torch.cat([t, pad], dim=0)

    return {
        "target_latents": torch.stack(
            [_pad_dim0(s["target_latents"], max_latent_len) for s in batch]
        ),  # [B, T, 64]
        "attention_mask": torch.stack(
            [_pad_dim0(s["attention_mask"], max_latent_len) for s in batch]
        ),  # [B, T]
        "encoder_hidden_states": torch.stack(
            [_pad_dim0(s["encoder_hidden_states"], max_encoder_len) for s in batch]
        ),  # [B, L, D]
        "encoder_attention_mask": torch.stack(
            [_pad_dim0(s["encoder_attention_mask"], max_encoder_len) for s in batch]
        ),  # [B, L]
        "context_latents": torch.stack(
            [_pad_dim0(s["context_latents"], max_latent_len) for s in batch]
        ),  # [B, T, 65]
        "metadata": [s["metadata"] for s in batch],
    }
|
| 163 |
+
|
| 164 |
+
|
| 165 |
+
class PreprocessedDataModule(LightningDataModule if LIGHTNING_AVAILABLE else object):
    """Lightning DataModule serving preprocessed tensor files.

    This is the recommended DataModule for training: it loads pre-computed
    tensors directly, so no VAE, text encoder, or condition encoder is needed
    at training time.
    """

    def __init__(
        self,
        tensor_dir: str,
        batch_size: int = 1,
        num_workers: int = 4,
        pin_memory: bool = True,
        val_split: float = 0.0,
    ):
        """Initialize the data module.

        Args:
            tensor_dir: Directory containing preprocessed .pt files.
            batch_size: Training batch size.
            num_workers: Number of data loading workers.
            pin_memory: Whether to pin memory for faster GPU transfer.
            val_split: Fraction of data for validation (0 = no validation).
        """
        if LIGHTNING_AVAILABLE:
            super().__init__()

        self.tensor_dir = tensor_dir
        self.batch_size = batch_size
        self.num_workers = num_workers
        self.pin_memory = pin_memory
        self.val_split = val_split

        self.train_dataset = None
        self.val_dataset = None

    def setup(self, stage: Optional[str] = None):
        """Build the train (and optional validation) datasets for 'fit'."""
        if stage not in (None, 'fit'):
            return

        dataset = PreprocessedTensorDataset(self.tensor_dir)
        total = len(dataset)

        # Hold out a validation subset only when requested and possible.
        if self.val_split > 0 and total > 1:
            n_val = max(1, int(total * self.val_split))
            self.train_dataset, self.val_dataset = torch.utils.data.random_split(
                dataset, [total - n_val, n_val]
            )
        else:
            self.train_dataset = dataset
            self.val_dataset = None

    def train_dataloader(self) -> DataLoader:
        """Shuffled training dataloader over the preprocessed tensors."""
        return DataLoader(
            self.train_dataset,
            batch_size=self.batch_size,
            shuffle=True,
            num_workers=self.num_workers,
            pin_memory=self.pin_memory,
            collate_fn=collate_preprocessed_batch,
            drop_last=True,
        )

    def val_dataloader(self) -> Optional[DataLoader]:
        """Validation dataloader, or None when no validation split exists."""
        if self.val_dataset is None:
            return None

        return DataLoader(
            self.val_dataset,
            batch_size=self.batch_size,
            shuffle=False,
            num_workers=self.num_workers,
            pin_memory=self.pin_memory,
            collate_fn=collate_preprocessed_batch,
        )
|
| 244 |
+
|
| 245 |
+
|
| 246 |
+
# ============================================================================
|
| 247 |
+
# Raw Audio Dataset (Legacy - for backward compatibility)
|
| 248 |
+
# ============================================================================
|
| 249 |
+
|
| 250 |
+
class AceStepTrainingDataset(Dataset):
    """Dataset yielding raw audio for ACE-Step LoRA training.

    DEPRECATED: Use PreprocessedTensorDataset instead for better performance.

    Audio is normalized automatically on load:
    - resampled to 48 kHz
    - forced to stereo (mono duplicated, extra channels dropped)
    - truncated to ``max_duration`` seconds, zero-padded up to 5 seconds minimum
    """

    def __init__(
        self,
        samples: List[Dict[str, Any]],
        dit_handler,
        max_duration: float = 240.0,
        target_sample_rate: int = 48000,
    ):
        """Store configuration and filter out invalid samples."""
        self.samples = samples
        self.dit_handler = dit_handler
        self.max_duration = max_duration
        self.target_sample_rate = target_sample_rate

        self.valid_samples = self._validate_samples()
        logger.info(f"Dataset initialized with {len(self.valid_samples)} valid samples")

    def _validate_samples(self) -> List[Dict[str, Any]]:
        """Keep only samples whose audio file exists and that carry a caption."""
        kept = []
        for i, entry in enumerate(self.samples):
            audio_path = entry.get("audio_path", "")
            if not audio_path or not os.path.exists(audio_path):
                logger.warning(f"Sample {i}: Audio file not found: {audio_path}")
                continue
            if not entry.get("caption"):
                logger.warning(f"Sample {i}: Missing caption")
                continue
            kept.append(entry)
        return kept

    def __len__(self) -> int:
        return len(self.valid_samples)

    def __getitem__(self, idx: int) -> Dict[str, torch.Tensor]:
        """Load, normalize, and return one training sample."""
        entry = self.valid_samples[idx]
        audio_path = entry["audio_path"]

        waveform, sr = torchaudio.load(audio_path)

        # Resample to the target rate (48 kHz).
        if sr != self.target_sample_rate:
            waveform = torchaudio.transforms.Resample(sr, self.target_sample_rate)(waveform)

        # Force exactly two channels.
        if waveform.shape[0] == 1:
            waveform = waveform.repeat(2, 1)
        elif waveform.shape[0] > 2:
            waveform = waveform[:2, :]

        # Clamp length: truncate to max_duration, pad up to the 5-second floor.
        limit = int(self.max_duration * self.target_sample_rate)
        if waveform.shape[1] > limit:
            waveform = waveform[:, :limit]

        floor = int(5.0 * self.target_sample_rate)
        if waveform.shape[1] < floor:
            waveform = torch.nn.functional.pad(waveform, (0, floor - waveform.shape[1]))

        meta = {
            "caption": entry.get("caption", ""),
            "lyrics": entry.get("lyrics", "[Instrumental]"),
            "bpm": entry.get("bpm"),
            "keyscale": entry.get("keyscale", ""),
            "timesignature": entry.get("timesignature", ""),
            "duration": entry.get("duration", waveform.shape[1] / self.target_sample_rate),
            "language": entry.get("language", "instrumental"),
            "is_instrumental": entry.get("is_instrumental", True),
        }
        return {
            "audio": waveform,
            "caption": meta["caption"],
            "lyrics": meta["lyrics"],
            "metadata": meta,
            "audio_path": audio_path,
        }
|
| 342 |
+
|
| 343 |
+
|
| 344 |
+
def collate_training_batch(batch: List[Dict]) -> Dict[str, Any]:
    """Collate raw-audio samples into a padded batch (legacy path).

    Audio tensors are right-padded with zeros to the longest clip in the
    batch; the attention mask is 1 over real samples and 0 over padding.
    """
    target_len = max(s["audio"].shape[1] for s in batch)

    audios = []
    masks = []
    for s in batch:
        wav = s["audio"]
        n = wav.shape[1]

        if n < target_len:
            wav = torch.nn.functional.pad(wav, (0, target_len - n))
        audios.append(wav)

        mask = torch.ones(target_len)
        mask[n:] = 0  # no-op when the clip already spans target_len
        masks.append(mask)

    return {
        "audio": torch.stack(audios),
        "attention_mask": torch.stack(masks),
        "captions": [s["caption"] for s in batch],
        "lyrics": [s["lyrics"] for s in batch],
        "metadata": [s["metadata"] for s in batch],
        "audio_paths": [s["audio_path"] for s in batch],
    }
|
| 374 |
+
|
| 375 |
+
|
| 376 |
+
class AceStepDataModule(LightningDataModule if LIGHTNING_AVAILABLE else object):
    """Lightning DataModule that loads raw audio (legacy).

    DEPRECATED: Use PreprocessedDataModule for better training performance.
    """

    def __init__(
        self,
        samples: List[Dict[str, Any]],
        dit_handler,
        batch_size: int = 1,
        num_workers: int = 4,
        pin_memory: bool = True,
        max_duration: float = 240.0,
        val_split: float = 0.0,
    ):
        if LIGHTNING_AVAILABLE:
            super().__init__()

        self.samples = samples
        self.dit_handler = dit_handler
        self.batch_size = batch_size
        self.num_workers = num_workers
        self.pin_memory = pin_memory
        self.max_duration = max_duration
        self.val_split = val_split

        self.train_dataset = None
        self.val_dataset = None

    def setup(self, stage: Optional[str] = None):
        """Build raw-audio datasets, optionally holding out a validation split."""
        if stage not in (None, 'fit'):
            return

        if self.val_split > 0 and len(self.samples) > 1:
            n_val = max(1, int(len(self.samples) * self.val_split))

            # Shuffle indices so the held-out split is random.
            order = list(range(len(self.samples)))
            random.shuffle(order)

            val_part = [self.samples[i] for i in order[:n_val]]
            train_part = [self.samples[i] for i in order[n_val:]]

            self.train_dataset = AceStepTrainingDataset(
                train_part, self.dit_handler, self.max_duration
            )
            self.val_dataset = AceStepTrainingDataset(
                val_part, self.dit_handler, self.max_duration
            )
        else:
            self.train_dataset = AceStepTrainingDataset(
                self.samples, self.dit_handler, self.max_duration
            )
            self.val_dataset = None

    def train_dataloader(self) -> DataLoader:
        """Shuffled training dataloader over raw audio samples."""
        return DataLoader(
            self.train_dataset,
            batch_size=self.batch_size,
            shuffle=True,
            num_workers=self.num_workers,
            pin_memory=self.pin_memory,
            collate_fn=collate_training_batch,
            drop_last=True,
        )

    def val_dataloader(self) -> Optional[DataLoader]:
        """Validation dataloader, or None when no validation split exists."""
        if self.val_dataset is None:
            return None

        return DataLoader(
            self.val_dataset,
            batch_size=self.batch_size,
            shuffle=False,
            num_workers=self.num_workers,
            pin_memory=self.pin_memory,
            collate_fn=collate_training_batch,
        )
|
| 455 |
+
|
| 456 |
+
|
| 457 |
+
def load_dataset_from_json(json_path: str) -> Tuple[List[Dict[str, Any]], Dict[str, Any]]:
    """Read a dataset JSON file and return its (samples, metadata) pair.

    Missing keys default to an empty list / empty dict.
    """
    with open(json_path, 'r', encoding='utf-8') as fh:
        payload = json.load(fh)

    return payload.get("samples", []), payload.get("metadata", {})
|
acestep/training/dataset_builder.py
ADDED
|
@@ -0,0 +1,755 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""
|
| 2 |
+
Dataset Builder for LoRA Training
|
| 3 |
+
|
| 4 |
+
Provides functionality to:
|
| 5 |
+
1. Scan directories for audio files
|
| 6 |
+
2. Auto-label audio using LLM
|
| 7 |
+
3. Preview and edit metadata
|
| 8 |
+
4. Save datasets in JSON format
|
| 9 |
+
"""
|
| 10 |
+
|
| 11 |
+
import os
|
| 12 |
+
import json
|
| 13 |
+
import uuid
|
| 14 |
+
from datetime import datetime
|
| 15 |
+
from dataclasses import dataclass, field, asdict
|
| 16 |
+
from typing import List, Dict, Any, Optional, Tuple
|
| 17 |
+
from pathlib import Path
|
| 18 |
+
|
| 19 |
+
import torch
|
| 20 |
+
import torchaudio
|
| 21 |
+
from loguru import logger
|
| 22 |
+
|
| 23 |
+
|
| 24 |
+
# Supported audio formats
|
| 25 |
+
SUPPORTED_AUDIO_FORMATS = {'.wav', '.mp3', '.flac', '.ogg', '.opus'}
|
| 26 |
+
|
| 27 |
+
|
| 28 |
+
@dataclass
class AudioSample:
    """A single audio file plus the metadata used for LoRA training.

    Attributes:
        id: Unique identifier (8-char UUID prefix, auto-assigned when empty).
        audio_path: Path to the audio file.
        filename: Original filename.
        caption: Generated or user-provided caption describing the music.
        lyrics: Lyrics, or "[Instrumental]" for instrumental tracks.
        bpm: Beats per minute, when known.
        keyscale: Musical key (e.g. "C Major", "Am").
        timesignature: Time signature (e.g. "4" for 4/4).
        duration: Duration in seconds.
        language: Vocal language, or "instrumental".
        is_instrumental: Whether the track has no vocals.
        custom_tag: User-defined activation tag for the LoRA.
        labeled: Whether the sample has been labeled.
    """
    id: str = ""
    audio_path: str = ""
    filename: str = ""
    caption: str = ""
    lyrics: str = "[Instrumental]"
    bpm: Optional[int] = None
    keyscale: str = ""
    timesignature: str = ""
    duration: float = 0.0
    language: str = "instrumental"
    is_instrumental: bool = True
    custom_tag: str = ""
    labeled: bool = False

    def __post_init__(self):
        # Assign a short random identifier when none was provided.
        if not self.id:
            self.id = str(uuid.uuid4())[:8]

    def to_dict(self) -> Dict[str, Any]:
        """Serialize to a plain dictionary."""
        return asdict(self)

    @classmethod
    def from_dict(cls, data: Dict[str, Any]) -> "AudioSample":
        """Rebuild an AudioSample from a dictionary."""
        return cls(**data)

    def get_full_caption(self, tag_position: str = "prepend") -> str:
        """Return the caption with the custom tag applied.

        Args:
            tag_position: Where to place the tag ("prepend", "append",
                "replace"); any other value leaves the caption untouched.

        Returns:
            Caption with custom tag applied.
        """
        if not self.custom_tag:
            return self.caption

        if tag_position == "replace":
            return self.custom_tag
        if tag_position == "prepend":
            return f"{self.custom_tag}, {self.caption}" if self.caption else self.custom_tag
        if tag_position == "append":
            return f"{self.caption}, {self.custom_tag}" if self.caption else self.custom_tag
        return self.caption
|
| 94 |
+
|
| 95 |
+
|
| 96 |
+
@dataclass
class DatasetMetadata:
    """Metadata describing an entire dataset.

    Attributes:
        name: Dataset name.
        custom_tag: Default custom tag applied to all samples.
        tag_position: Where to place the custom tag ("prepend", "append",
            "replace").
        created_at: ISO-format creation timestamp (auto-filled when empty).
        num_samples: Number of samples in the dataset.
        all_instrumental: Whether every track is instrumental.
    """
    name: str = "untitled_dataset"
    custom_tag: str = ""
    tag_position: str = "prepend"
    created_at: str = ""
    num_samples: int = 0
    all_instrumental: bool = True

    def __post_init__(self):
        # Stamp the creation time if the caller did not supply one.
        if not self.created_at:
            self.created_at = datetime.now().isoformat()

    def to_dict(self) -> Dict[str, Any]:
        """Serialize to a plain dictionary."""
        return asdict(self)
|
| 122 |
+
|
| 123 |
+
|
| 124 |
+
class DatasetBuilder:
|
| 125 |
+
"""Builder for creating training datasets from audio files.
|
| 126 |
+
|
| 127 |
+
This class handles:
|
| 128 |
+
- Scanning directories for audio files
|
| 129 |
+
- Auto-labeling using LLM
|
| 130 |
+
- Managing sample metadata
|
| 131 |
+
- Saving/loading datasets
|
| 132 |
+
"""
|
| 133 |
+
|
| 134 |
+
def __init__(self):
|
| 135 |
+
"""Initialize the dataset builder."""
|
| 136 |
+
self.samples: List[AudioSample] = []
|
| 137 |
+
self.metadata = DatasetMetadata()
|
| 138 |
+
self._current_dir: str = ""
|
| 139 |
+
|
| 140 |
+
def scan_directory(self, directory: str) -> Tuple[List[AudioSample], str]:
|
| 141 |
+
"""Scan a directory for audio files.
|
| 142 |
+
|
| 143 |
+
Args:
|
| 144 |
+
directory: Path to directory containing audio files
|
| 145 |
+
|
| 146 |
+
Returns:
|
| 147 |
+
Tuple of (list of AudioSample objects, status message)
|
| 148 |
+
"""
|
| 149 |
+
if not os.path.exists(directory):
|
| 150 |
+
return [], f"❌ Directory not found: {directory}"
|
| 151 |
+
|
| 152 |
+
if not os.path.isdir(directory):
|
| 153 |
+
return [], f"❌ Not a directory: {directory}"
|
| 154 |
+
|
| 155 |
+
self._current_dir = directory
|
| 156 |
+
self.samples = []
|
| 157 |
+
|
| 158 |
+
# Scan for audio files
|
| 159 |
+
audio_files = []
|
| 160 |
+
for root, dirs, files in os.walk(directory):
|
| 161 |
+
for file in files:
|
| 162 |
+
ext = os.path.splitext(file)[1].lower()
|
| 163 |
+
if ext in SUPPORTED_AUDIO_FORMATS:
|
| 164 |
+
audio_files.append(os.path.join(root, file))
|
| 165 |
+
|
| 166 |
+
if not audio_files:
|
| 167 |
+
return [], f"❌ No audio files found in {directory}\nSupported formats: {', '.join(SUPPORTED_AUDIO_FORMATS)}"
|
| 168 |
+
|
| 169 |
+
# Sort files by name
|
| 170 |
+
audio_files.sort()
|
| 171 |
+
|
| 172 |
+
# Create AudioSample objects
|
| 173 |
+
for audio_path in audio_files:
|
| 174 |
+
try:
|
| 175 |
+
# Get duration
|
| 176 |
+
duration = self._get_audio_duration(audio_path)
|
| 177 |
+
|
| 178 |
+
sample = AudioSample(
|
| 179 |
+
audio_path=audio_path,
|
| 180 |
+
filename=os.path.basename(audio_path),
|
| 181 |
+
duration=duration,
|
| 182 |
+
is_instrumental=self.metadata.all_instrumental,
|
| 183 |
+
custom_tag=self.metadata.custom_tag,
|
| 184 |
+
)
|
| 185 |
+
self.samples.append(sample)
|
| 186 |
+
except Exception as e:
|
| 187 |
+
logger.warning(f"Failed to process {audio_path}: {e}")
|
| 188 |
+
|
| 189 |
+
self.metadata.num_samples = len(self.samples)
|
| 190 |
+
|
| 191 |
+
status = f"✅ Found {len(self.samples)} audio files in {directory}"
|
| 192 |
+
return self.samples, status
|
| 193 |
+
|
| 194 |
+
def _get_audio_duration(self, audio_path: str) -> float:
|
| 195 |
+
"""Get the duration of an audio file in seconds.
|
| 196 |
+
|
| 197 |
+
Args:
|
| 198 |
+
audio_path: Path to audio file
|
| 199 |
+
|
| 200 |
+
Returns:
|
| 201 |
+
Duration in seconds
|
| 202 |
+
"""
|
| 203 |
+
try:
|
| 204 |
+
info = torchaudio.info(audio_path)
|
| 205 |
+
return info.num_frames / info.sample_rate
|
| 206 |
+
except Exception as e:
|
| 207 |
+
logger.warning(f"Failed to get duration for {audio_path}: {e}")
|
| 208 |
+
return 0.0
|
| 209 |
+
|
| 210 |
+
def label_sample(
|
| 211 |
+
self,
|
| 212 |
+
sample_idx: int,
|
| 213 |
+
dit_handler,
|
| 214 |
+
llm_handler,
|
| 215 |
+
progress_callback=None,
|
| 216 |
+
) -> Tuple[AudioSample, str]:
|
| 217 |
+
"""Label a single sample using the LLM.
|
| 218 |
+
|
| 219 |
+
Args:
|
| 220 |
+
sample_idx: Index of sample to label
|
| 221 |
+
dit_handler: DiT handler for audio encoding
|
| 222 |
+
llm_handler: LLM handler for caption generation
|
| 223 |
+
progress_callback: Optional callback for progress updates
|
| 224 |
+
|
| 225 |
+
Returns:
|
| 226 |
+
Tuple of (updated AudioSample, status message)
|
| 227 |
+
"""
|
| 228 |
+
if sample_idx < 0 or sample_idx >= len(self.samples):
|
| 229 |
+
return None, f"❌ Invalid sample index: {sample_idx}"
|
| 230 |
+
|
| 231 |
+
sample = self.samples[sample_idx]
|
| 232 |
+
|
| 233 |
+
try:
|
| 234 |
+
if progress_callback:
|
| 235 |
+
progress_callback(f"Processing: {sample.filename}")
|
| 236 |
+
|
| 237 |
+
# Step 1: Load and encode audio to get audio codes
|
| 238 |
+
audio_codes = self._get_audio_codes(sample.audio_path, dit_handler)
|
| 239 |
+
|
| 240 |
+
if not audio_codes:
|
| 241 |
+
return sample, f"❌ Failed to encode audio: {sample.filename}"
|
| 242 |
+
|
| 243 |
+
if progress_callback:
|
| 244 |
+
progress_callback(f"Generating metadata for: {sample.filename}")
|
| 245 |
+
|
| 246 |
+
# Step 2: Use LLM to understand the audio
|
| 247 |
+
metadata, status = llm_handler.understand_audio_from_codes(
|
| 248 |
+
audio_codes=audio_codes,
|
| 249 |
+
temperature=0.7,
|
| 250 |
+
use_constrained_decoding=True,
|
| 251 |
+
)
|
| 252 |
+
|
| 253 |
+
if not metadata:
|
| 254 |
+
return sample, f"❌ LLM labeling failed: {status}"
|
| 255 |
+
|
| 256 |
+
# Step 3: Update sample with generated metadata
|
| 257 |
+
sample.caption = metadata.get('caption', '')
|
| 258 |
+
sample.bpm = self._parse_int(metadata.get('bpm'))
|
| 259 |
+
sample.keyscale = metadata.get('keyscale', '')
|
| 260 |
+
sample.timesignature = metadata.get('timesignature', '')
|
| 261 |
+
sample.language = metadata.get('vocal_language', 'instrumental')
|
| 262 |
+
|
| 263 |
+
# Handle lyrics based on instrumental flag
|
| 264 |
+
if sample.is_instrumental:
|
| 265 |
+
sample.lyrics = "[Instrumental]"
|
| 266 |
+
sample.language = "instrumental"
|
| 267 |
+
else:
|
| 268 |
+
sample.lyrics = metadata.get('lyrics', '')
|
| 269 |
+
|
| 270 |
+
# NOTE: Duration is NOT overwritten from LM metadata.
|
| 271 |
+
# We keep the real audio duration obtained from torchaudio during scan.
|
| 272 |
+
|
| 273 |
+
sample.labeled = True
|
| 274 |
+
self.samples[sample_idx] = sample
|
| 275 |
+
|
| 276 |
+
return sample, f"✅ Labeled: {sample.filename}"
|
| 277 |
+
|
| 278 |
+
except Exception as e:
|
| 279 |
+
logger.exception(f"Error labeling sample {sample.filename}")
|
| 280 |
+
return sample, f"❌ Error: {str(e)}"
|
| 281 |
+
|
| 282 |
+
def label_all_samples(
|
| 283 |
+
self,
|
| 284 |
+
dit_handler,
|
| 285 |
+
llm_handler,
|
| 286 |
+
progress_callback=None,
|
| 287 |
+
) -> Tuple[List[AudioSample], str]:
|
| 288 |
+
"""Label all samples in the dataset.
|
| 289 |
+
|
| 290 |
+
Args:
|
| 291 |
+
dit_handler: DiT handler for audio encoding
|
| 292 |
+
llm_handler: LLM handler for caption generation
|
| 293 |
+
progress_callback: Optional callback for progress updates
|
| 294 |
+
|
| 295 |
+
Returns:
|
| 296 |
+
Tuple of (list of updated samples, status message)
|
| 297 |
+
"""
|
| 298 |
+
if not self.samples:
|
| 299 |
+
return [], "❌ No samples to label. Please scan a directory first."
|
| 300 |
+
|
| 301 |
+
success_count = 0
|
| 302 |
+
fail_count = 0
|
| 303 |
+
|
| 304 |
+
for i, sample in enumerate(self.samples):
|
| 305 |
+
if progress_callback:
|
| 306 |
+
progress_callback(f"Labeling {i+1}/{len(self.samples)}: {sample.filename}")
|
| 307 |
+
|
| 308 |
+
_, status = self.label_sample(i, dit_handler, llm_handler, progress_callback)
|
| 309 |
+
|
| 310 |
+
if "✅" in status:
|
| 311 |
+
success_count += 1
|
| 312 |
+
else:
|
| 313 |
+
fail_count += 1
|
| 314 |
+
|
| 315 |
+
status_msg = f"✅ Labeled {success_count}/{len(self.samples)} samples"
|
| 316 |
+
if fail_count > 0:
|
| 317 |
+
status_msg += f" ({fail_count} failed)"
|
| 318 |
+
|
| 319 |
+
return self.samples, status_msg
|
| 320 |
+
|
| 321 |
+
def _get_audio_codes(self, audio_path: str, dit_handler) -> Optional[str]:
|
| 322 |
+
"""Encode audio to get semantic codes for LLM understanding.
|
| 323 |
+
|
| 324 |
+
Args:
|
| 325 |
+
audio_path: Path to audio file
|
| 326 |
+
dit_handler: DiT handler with VAE and tokenizer
|
| 327 |
+
|
| 328 |
+
Returns:
|
| 329 |
+
Audio codes string or None if failed
|
| 330 |
+
"""
|
| 331 |
+
try:
|
| 332 |
+
# Check if handler has required methods
|
| 333 |
+
if not hasattr(dit_handler, 'convert_src_audio_to_codes'):
|
| 334 |
+
logger.error("DiT handler missing convert_src_audio_to_codes method")
|
| 335 |
+
return None
|
| 336 |
+
|
| 337 |
+
# Use handler's method to convert audio to codes
|
| 338 |
+
codes_string = dit_handler.convert_src_audio_to_codes(audio_path)
|
| 339 |
+
|
| 340 |
+
if codes_string and not codes_string.startswith("❌"):
|
| 341 |
+
return codes_string
|
| 342 |
+
else:
|
| 343 |
+
logger.warning(f"Failed to convert audio to codes: {codes_string}")
|
| 344 |
+
return None
|
| 345 |
+
|
| 346 |
+
except Exception as e:
|
| 347 |
+
logger.exception(f"Error encoding audio {audio_path}")
|
| 348 |
+
return None
|
| 349 |
+
|
| 350 |
+
def _parse_int(self, value: Any) -> Optional[int]:
|
| 351 |
+
"""Safely parse an integer value."""
|
| 352 |
+
if value is None or value == "N/A" or value == "":
|
| 353 |
+
return None
|
| 354 |
+
try:
|
| 355 |
+
return int(value)
|
| 356 |
+
except (ValueError, TypeError):
|
| 357 |
+
return None
|
| 358 |
+
|
| 359 |
+
def update_sample(self, sample_idx: int, **kwargs) -> Tuple[AudioSample, str]:
|
| 360 |
+
"""Update a sample's metadata.
|
| 361 |
+
|
| 362 |
+
Args:
|
| 363 |
+
sample_idx: Index of sample to update
|
| 364 |
+
**kwargs: Fields to update
|
| 365 |
+
|
| 366 |
+
Returns:
|
| 367 |
+
Tuple of (updated sample, status message)
|
| 368 |
+
"""
|
| 369 |
+
if sample_idx < 0 or sample_idx >= len(self.samples):
|
| 370 |
+
return None, f"❌ Invalid sample index: {sample_idx}"
|
| 371 |
+
|
| 372 |
+
sample = self.samples[sample_idx]
|
| 373 |
+
|
| 374 |
+
for key, value in kwargs.items():
|
| 375 |
+
if hasattr(sample, key):
|
| 376 |
+
setattr(sample, key, value)
|
| 377 |
+
|
| 378 |
+
self.samples[sample_idx] = sample
|
| 379 |
+
return sample, f"✅ Updated: {sample.filename}"
|
| 380 |
+
|
| 381 |
+
def set_custom_tag(self, custom_tag: str, tag_position: str = "prepend"):
|
| 382 |
+
"""Set the custom tag for all samples.
|
| 383 |
+
|
| 384 |
+
Args:
|
| 385 |
+
custom_tag: Custom activation tag
|
| 386 |
+
tag_position: Where to place tag ("prepend", "append", "replace")
|
| 387 |
+
"""
|
| 388 |
+
self.metadata.custom_tag = custom_tag
|
| 389 |
+
self.metadata.tag_position = tag_position
|
| 390 |
+
|
| 391 |
+
for sample in self.samples:
|
| 392 |
+
sample.custom_tag = custom_tag
|
| 393 |
+
|
| 394 |
+
def set_all_instrumental(self, is_instrumental: bool):
|
| 395 |
+
"""Set instrumental flag for all samples.
|
| 396 |
+
|
| 397 |
+
Args:
|
| 398 |
+
is_instrumental: Whether all tracks are instrumental
|
| 399 |
+
"""
|
| 400 |
+
self.metadata.all_instrumental = is_instrumental
|
| 401 |
+
|
| 402 |
+
for sample in self.samples:
|
| 403 |
+
sample.is_instrumental = is_instrumental
|
| 404 |
+
if is_instrumental:
|
| 405 |
+
sample.lyrics = "[Instrumental]"
|
| 406 |
+
sample.language = "instrumental"
|
| 407 |
+
|
| 408 |
+
def get_sample_count(self) -> int:
|
| 409 |
+
"""Get the number of samples in the dataset."""
|
| 410 |
+
return len(self.samples)
|
| 411 |
+
|
| 412 |
+
def get_labeled_count(self) -> int:
|
| 413 |
+
"""Get the number of labeled samples."""
|
| 414 |
+
return sum(1 for s in self.samples if s.labeled)
|
| 415 |
+
|
| 416 |
+
def save_dataset(self, output_path: str, dataset_name: str = None) -> str:
    """Serialize the dataset (metadata + samples) to a JSON file.

    Captions are rewritten through ``get_full_caption`` so the custom
    activation tag is baked into the saved captions. Metadata is stamped
    with the sample count and the current timestamp before writing.

    Args:
        output_path: Path to save the dataset JSON.
        dataset_name: Optional name for the dataset.

    Returns:
        Human-readable status message (success or failure).
    """
    if not self.samples:
        return "❌ No samples to save"

    if dataset_name:
        self.metadata.name = dataset_name

    self.metadata.num_samples = len(self.samples)
    self.metadata.created_at = datetime.now().isoformat()

    # Serialize metadata first, then each sample with its tagged caption.
    payload = {"metadata": self.metadata.to_dict(), "samples": []}
    tag_position = self.metadata.tag_position
    for sample in self.samples:
        record = sample.to_dict()
        # Bake the custom tag into the caption according to its position.
        record["caption"] = sample.get_full_caption(tag_position)
        payload["samples"].append(record)

    try:
        parent = os.path.dirname(output_path) or "."
        os.makedirs(parent, exist_ok=True)

        with open(output_path, 'w', encoding='utf-8') as f:
            json.dump(payload, f, indent=2, ensure_ascii=False)

        return f"✅ Dataset saved to {output_path}\n{len(self.samples)} samples, tag: '{self.metadata.custom_tag}'"
    except Exception as e:
        logger.exception("Error saving dataset")
        return f"❌ Failed to save dataset: {str(e)}"
def load_dataset(self, dataset_path: str) -> Tuple[List[AudioSample], str]:
    """Restore dataset metadata and samples from a JSON file.

    Replaces ``self.metadata`` (when a metadata block is present, with
    per-field defaults) and rebuilds ``self.samples`` from the serialized
    sample dicts.

    Args:
        dataset_path: Path to the dataset JSON file.

    Returns:
        Tuple of (list of samples, status message).
    """
    if not os.path.exists(dataset_path):
        return [], f"❌ Dataset not found: {dataset_path}"

    try:
        with open(dataset_path, 'r', encoding='utf-8') as f:
            data = json.load(f)

        # Rebuild metadata when present, defaulting each missing field.
        if "metadata" in data:
            meta_dict = data["metadata"]
            self.metadata = DatasetMetadata(
                name=meta_dict.get("name", "untitled"),
                custom_tag=meta_dict.get("custom_tag", ""),
                tag_position=meta_dict.get("tag_position", "prepend"),
                created_at=meta_dict.get("created_at", ""),
                num_samples=meta_dict.get("num_samples", 0),
                all_instrumental=meta_dict.get("all_instrumental", True),
            )

        # Rebuild the in-memory sample list from serialized dicts.
        self.samples = [
            AudioSample.from_dict(sample_dict)
            for sample_dict in data.get("samples", [])
        ]

        return self.samples, f"✅ Loaded {len(self.samples)} samples from {dataset_path}"

    except Exception as e:
        logger.exception("Error loading dataset")
        return [], f"❌ Failed to load dataset: {str(e)}"
def get_samples_dataframe_data(self) -> List[List[Any]]:
    """Get samples data in a format suitable for Gradio DataFrame.

    Each row is: [index, filename, "<duration>s", labeled flag (✅/❌),
    bpm or "-", keyscale or "-", caption truncated to 50 chars].

    Returns:
        List of rows for DataFrame display.
    """
    rows = []
    for i, sample in enumerate(self.samples):
        # Guard before len(): an unset caption may be None/empty, and the
        # original len(sample.caption) would raise TypeError on None.
        caption = sample.caption or "-"
        if len(caption) > 50:
            caption = caption[:50] + "..."
        rows.append([
            i,
            sample.filename,
            f"{sample.duration:.1f}s",
            "✅" if sample.labeled else "❌",
            sample.bpm or "-",
            sample.keyscale or "-",
            caption,
        ])
    return rows
def to_training_format(self) -> List[Dict[str, Any]]:
    """Convert labeled samples into training-ready dictionaries.

    Unlabeled samples are skipped. Captions are produced through
    ``get_full_caption`` so the custom tag placement is applied.

    Returns:
        List of training sample dictionaries.
    """
    tag_position = self.metadata.tag_position
    return [
        {
            "audio_path": s.audio_path,
            "caption": s.get_full_caption(tag_position),
            "lyrics": s.lyrics,
            "bpm": s.bpm,
            "keyscale": s.keyscale,
            "timesignature": s.timesignature,
            "duration": s.duration,
            "language": s.language,
            "is_instrumental": s.is_instrumental,
        }
        for s in self.samples
        if s.labeled
    ]
def preprocess_to_tensors(
    self,
    dit_handler,
    output_dir: str,
    max_duration: float = 240.0,
    progress_callback=None,
) -> Tuple[List[str], str]:
    """Preprocess all labeled samples to tensor files for efficient training.

    This method pre-computes all tensors needed by the DiT decoder:
    - target_latents: VAE-encoded audio
    - encoder_hidden_states: Condition encoder output
    - context_latents: Source context (silence_latent + zeros for text2music)

    Failures are per-sample: a sample that raises is logged, counted as
    failed, and skipped; the rest continue. A manifest.json listing the
    successful outputs is always written at the end.

    Args:
        dit_handler: Initialized DiT handler with model, VAE, and text encoder
        output_dir: Directory to save preprocessed .pt files
        max_duration: Maximum audio duration in seconds (default 240s = 4 min)
        progress_callback: Optional callback for progress updates

    Returns:
        Tuple of (list of output paths, status message)
    """
    if not self.samples:
        return [], "❌ No samples to preprocess"

    labeled_samples = [s for s in self.samples if s.labeled]
    if not labeled_samples:
        return [], "❌ No labeled samples to preprocess"

    # Validate handler
    if dit_handler is None or dit_handler.model is None:
        return [], "❌ Model not initialized. Please initialize the service first."

    # Create output directory
    os.makedirs(output_dir, exist_ok=True)

    output_paths = []
    success_count = 0
    fail_count = 0

    # Get model and components
    model = dit_handler.model
    vae = dit_handler.vae
    text_encoder = dit_handler.text_encoder
    text_tokenizer = dit_handler.text_tokenizer
    # NOTE(review): silence_latent appears to be a [1, T_max, 64] latent of
    # silence used as the text2music source context — confirm against handler.
    silence_latent = dit_handler.silence_latent
    device = dit_handler.device
    dtype = dit_handler.dtype

    target_sample_rate = 48000

    for i, sample in enumerate(labeled_samples):
        try:
            if progress_callback:
                progress_callback(f"Preprocessing {i+1}/{len(labeled_samples)}: {sample.filename}")

            # Step 1: Load and preprocess audio to stereo @ 48kHz
            audio, sr = torchaudio.load(sample.audio_path)

            # Resample if needed
            if sr != target_sample_rate:
                resampler = torchaudio.transforms.Resample(sr, target_sample_rate)
                audio = resampler(audio)

            # Convert to stereo
            if audio.shape[0] == 1:
                audio = audio.repeat(2, 1)
            elif audio.shape[0] > 2:
                audio = audio[:2, :]

            # Truncate to max duration
            max_samples = int(max_duration * target_sample_rate)
            if audio.shape[1] > max_samples:
                audio = audio[:, :max_samples]

            # Add batch dimension: [2, T] -> [1, 2, T]
            audio = audio.unsqueeze(0).to(device).to(vae.dtype)

            # Step 2: VAE encode audio to get target_latents
            with torch.no_grad():
                latent = vae.encode(audio).latent_dist.sample()
                # [1, 64, T_latent] -> [1, T_latent, 64]
                target_latents = latent.transpose(1, 2).to(dtype)

            latent_length = target_latents.shape[1]

            # Step 3: Create attention mask (all ones for valid audio)
            attention_mask = torch.ones(1, latent_length, device=device, dtype=dtype)

            # Step 4: Encode caption text
            caption = sample.get_full_caption(self.metadata.tag_position)
            text_inputs = text_tokenizer(
                caption,
                padding="max_length",
                max_length=256,
                truncation=True,
                return_tensors="pt",
            )
            text_input_ids = text_inputs.input_ids.to(device)
            text_attention_mask = text_inputs.attention_mask.to(device).to(dtype)

            with torch.no_grad():
                text_outputs = text_encoder(text_input_ids)
                text_hidden_states = text_outputs.last_hidden_state.to(dtype)

            # Step 5: Encode lyrics (tokens are embedded directly, not run
            # through the full encoder — only embed_tokens is used here)
            lyrics = sample.lyrics if sample.lyrics else "[Instrumental]"
            lyric_inputs = text_tokenizer(
                lyrics,
                padding="max_length",
                max_length=512,
                truncation=True,
                return_tensors="pt",
            )
            lyric_input_ids = lyric_inputs.input_ids.to(device)
            lyric_attention_mask = lyric_inputs.attention_mask.to(device).to(dtype)

            with torch.no_grad():
                lyric_hidden_states = text_encoder.embed_tokens(lyric_input_ids).to(dtype)

            # Step 6: Prepare refer_audio (empty for text2music)
            # Create minimal refer_audio placeholder
            refer_audio_hidden = torch.zeros(1, 1, 64, device=device, dtype=dtype)
            refer_audio_order_mask = torch.zeros(1, device=device, dtype=torch.long)

            # Step 7: Run model.encoder to get encoder_hidden_states
            with torch.no_grad():
                encoder_hidden_states, encoder_attention_mask = model.encoder(
                    text_hidden_states=text_hidden_states,
                    text_attention_mask=text_attention_mask,
                    lyric_hidden_states=lyric_hidden_states,
                    lyric_attention_mask=lyric_attention_mask,
                    refer_audio_acoustic_hidden_states_packed=refer_audio_hidden,
                    refer_audio_order_mask=refer_audio_order_mask,
                )

            # Step 8: Build context_latents for text2music
            # For text2music: src_latents = silence_latent, is_covers = 0
            # chunk_masks: 1 = generate, 0 = keep original
            # IMPORTANT: chunk_masks must have same shape as src_latents [B, T, 64]
            # For text2music, we want to generate the entire audio, so chunk_masks = all 1s
            src_latents = silence_latent[:, :latent_length, :].to(dtype)
            if src_latents.shape[0] < 1:
                src_latents = src_latents.expand(1, -1, -1)

            # Pad or truncate silence_latent to match latent_length
            if src_latents.shape[1] < latent_length:
                pad_len = latent_length - src_latents.shape[1]
                src_latents = torch.cat([
                    src_latents,
                    silence_latent[:, :pad_len, :].expand(1, -1, -1).to(dtype)
                ], dim=1)
            elif src_latents.shape[1] > latent_length:
                src_latents = src_latents[:, :latent_length, :]

            # chunk_masks = 1 means "generate this region", 0 = keep original
            # Shape must match src_latents: [B, T, 64] (NOT [B, T, 1])
            # For text2music, generate everything -> all 1s with shape [1, T, 64]
            chunk_masks = torch.ones(1, latent_length, 64, device=device, dtype=dtype)
            # context_latents = [src_latents, chunk_masks] -> [B, T, 128]
            context_latents = torch.cat([src_latents, chunk_masks], dim=-1)

            # Step 9: Save all tensors to .pt file (squeeze batch dimension for storage)
            output_data = {
                "target_latents": target_latents.squeeze(0).cpu(),  # [T, 64]
                "attention_mask": attention_mask.squeeze(0).cpu(),  # [T]
                "encoder_hidden_states": encoder_hidden_states.squeeze(0).cpu(),  # [L, D]
                "encoder_attention_mask": encoder_attention_mask.squeeze(0).cpu(),  # [L]
                "context_latents": context_latents.squeeze(0).cpu(),  # [T, 128]: 64 src + 64 mask dims
                "metadata": {
                    "audio_path": sample.audio_path,
                    "filename": sample.filename,
                    "caption": caption,
                    "lyrics": lyrics,
                    "duration": sample.duration,
                    "bpm": sample.bpm,
                    "keyscale": sample.keyscale,
                    "timesignature": sample.timesignature,
                    "language": sample.language,
                    "is_instrumental": sample.is_instrumental,
                }
            }

            # Save with sample ID as filename
            output_path = os.path.join(output_dir, f"{sample.id}.pt")
            torch.save(output_data, output_path)
            output_paths.append(output_path)
            success_count += 1

        except Exception as e:
            logger.exception(f"Error preprocessing {sample.filename}")
            fail_count += 1
            if progress_callback:
                progress_callback(f"❌ Failed: {sample.filename}: {str(e)}")

    # Save manifest file listing all preprocessed samples
    manifest = {
        "metadata": self.metadata.to_dict(),
        "samples": output_paths,
        "num_samples": len(output_paths),
    }
    manifest_path = os.path.join(output_dir, "manifest.json")
    with open(manifest_path, 'w', encoding='utf-8') as f:
        json.dump(manifest, f, indent=2)

    status = f"✅ Preprocessed {success_count}/{len(labeled_samples)} samples to {output_dir}"
    if fail_count > 0:
        status += f" ({fail_count} failed)"

    return output_paths, status
acestep/training/lora_utils.py
ADDED
|
@@ -0,0 +1,305 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""
|
| 2 |
+
LoRA Utilities for ACE-Step
|
| 3 |
+
|
| 4 |
+
Provides utilities for injecting LoRA adapters into the DiT decoder model.
|
| 5 |
+
Uses PEFT (Parameter-Efficient Fine-Tuning) library for LoRA implementation.
|
| 6 |
+
"""
|
| 7 |
+
|
| 8 |
+
import os
|
| 9 |
+
from typing import Optional, List, Dict, Any, Tuple
|
| 10 |
+
from loguru import logger
|
| 11 |
+
|
| 12 |
+
import torch
|
| 13 |
+
import torch.nn as nn
|
| 14 |
+
|
| 15 |
+
try:
|
| 16 |
+
from peft import (
|
| 17 |
+
get_peft_model,
|
| 18 |
+
LoraConfig,
|
| 19 |
+
TaskType,
|
| 20 |
+
PeftModel,
|
| 21 |
+
PeftConfig,
|
| 22 |
+
)
|
| 23 |
+
PEFT_AVAILABLE = True
|
| 24 |
+
except ImportError:
|
| 25 |
+
PEFT_AVAILABLE = False
|
| 26 |
+
logger.warning("PEFT library not installed. LoRA training will not be available.")
|
| 27 |
+
|
| 28 |
+
from acestep.training.configs import LoRAConfig
|
| 29 |
+
|
| 30 |
+
|
| 31 |
+
def check_peft_available() -> bool:
    """Report whether the optional PEFT dependency was importable."""
    return bool(PEFT_AVAILABLE)
def get_dit_target_modules(model) -> List[str]:
    """List decoder module names eligible for LoRA injection.

    Scans the model's DiT decoder (if present) and collects the names of
    ``nn.Linear`` attention projection layers (q/k/v/o projections).

    Args:
        model: The AceStepConditionGenerationModel

    Returns:
        List of module names suitable for LoRA
    """
    projection_keys = ('q_proj', 'k_proj', 'v_proj', 'o_proj')
    targets: List[str] = []

    if hasattr(model, 'decoder'):
        for name, module in model.decoder.named_modules():
            if isinstance(module, nn.Linear) and any(key in name for key in projection_keys):
                targets.append(name)

    return targets
def freeze_non_lora_parameters(model, freeze_encoder: bool = True) -> None:
    """Disable gradients on every parameter of *model* and log the tally.

    NOTE(review): despite the name, this freezes ALL parameters — LoRA
    adapters must be (re-)enabled elsewhere. The ``freeze_encoder`` flag is
    currently unused; it is kept only for interface compatibility.

    Args:
        model: The model to freeze parameters for
        freeze_encoder: Unused; retained for backward compatibility.
    """
    for param in model.parameters():
        param.requires_grad = False

    # Tally parameter counts for the log output (after the freeze above,
    # the trainable count is expected to be zero).
    total_params = 0
    trainable_params = 0
    for _, param in model.named_parameters():
        count = param.numel()
        total_params += count
        if param.requires_grad:
            trainable_params += count

    logger.info(f"Frozen parameters: {total_params - trainable_params:,}")
    logger.info(f"Trainable parameters: {trainable_params:,}")
def inject_lora_into_dit(
    model,
    lora_config: LoRAConfig,
) -> Tuple[Any, Dict[str, Any]]:
    """Attach LoRA adapters to the model's DiT decoder and freeze the rest.

    Wraps ``model.decoder`` in a PEFT LoRA model, then disables gradients
    on every parameter whose name does not contain ``lora_`` (encoder,
    tokenizer, detokenizer, and the base decoder weights).

    Args:
        model: The AceStepConditionGenerationModel
        lora_config: LoRA configuration

    Returns:
        Tuple of (peft_model, info_dict)

    Raises:
        ImportError: If the PEFT library is not installed.
    """
    if not PEFT_AVAILABLE:
        raise ImportError("PEFT library is required for LoRA training. Install with: pip install peft")

    # Build the PEFT LoRA configuration from our lightweight config object.
    peft_settings = LoraConfig(
        r=lora_config.r,
        lora_alpha=lora_config.alpha,
        lora_dropout=lora_config.dropout,
        target_modules=lora_config.target_modules,
        bias=lora_config.bias,
        task_type=TaskType.FEATURE_EXTRACTION,  # For diffusion models
    )

    # Wrap the DiT decoder in place with the LoRA adapter.
    model.decoder = get_peft_model(model.decoder, peft_settings)

    # Keep only the injected LoRA parameters trainable.
    for name, param in model.named_parameters():
        if 'lora_' not in name:
            param.requires_grad = False

    total_params = sum(p.numel() for p in model.parameters())
    trainable_params = sum(p.numel() for p in model.parameters() if p.requires_grad)

    info = {
        "total_params": total_params,
        "trainable_params": trainable_params,
        "trainable_ratio": trainable_params / total_params if total_params > 0 else 0,
        "lora_r": lora_config.r,
        "lora_alpha": lora_config.alpha,
        "target_modules": lora_config.target_modules,
    }

    logger.info(f"LoRA injected into DiT decoder:")
    logger.info(f"  Total parameters: {total_params:,}")
    logger.info(f"  Trainable parameters: {trainable_params:,} ({info['trainable_ratio']:.2%})")
    logger.info(f"  LoRA rank: {lora_config.r}, alpha: {lora_config.alpha}")

    return model, info
def save_lora_weights(
    model,
    output_dir: str,
    save_full_model: bool = False,
) -> str:
    """Persist LoRA adapter weights to disk.

    Preference order:
      1. PEFT adapter directory (when the decoder supports save_pretrained),
      2. full model state dict (when ``save_full_model`` is True),
      3. a .pt file holding only the LoRA parameters.

    Args:
        model: Model with LoRA adapters
        output_dir: Directory to save weights
        save_full_model: Whether to save the full model state dict

    Returns:
        Path to saved weights ("" if no LoRA parameters were found)
    """
    os.makedirs(output_dir, exist_ok=True)

    decoder = getattr(model, 'decoder', None)
    if decoder is not None and hasattr(decoder, 'save_pretrained'):
        # Preferred path: PEFT adapter directory.
        adapter_path = os.path.join(output_dir, "adapter")
        decoder.save_pretrained(adapter_path)
        logger.info(f"LoRA adapter saved to {adapter_path}")
        return adapter_path

    if save_full_model:
        # Fallback: full state dict (much larger file).
        model_path = os.path.join(output_dir, "model.pt")
        torch.save(model.state_dict(), model_path)
        logger.info(f"Full model state dict saved to {model_path}")
        return model_path

    # Last resort: extract only the LoRA parameters into a flat dict.
    lora_state_dict = {
        name: param.data.clone()
        for name, param in model.named_parameters()
        if 'lora_' in name
    }

    if not lora_state_dict:
        logger.warning("No LoRA parameters found to save!")
        return ""

    lora_path = os.path.join(output_dir, "lora_weights.pt")
    torch.save(lora_state_dict, lora_path)
    logger.info(f"LoRA weights saved to {lora_path}")
    return lora_path
def load_lora_weights(
    model,
    lora_path: str,
    lora_config: Optional[LoRAConfig] = None,
) -> Any:
    """Load LoRA adapter weights into the model.

    Supports two on-disk formats:
      - a PEFT adapter directory (loaded via ``PeftModel.from_pretrained``), or
      - a ``.pt`` state dict containing only LoRA parameters (requires
        ``lora_config`` so the LoRA structure can be injected first).

    Args:
        model: The base model (without LoRA)
        lora_path: Path to saved LoRA weights (adapter or .pt file)
        lora_config: LoRA configuration (required if loading from .pt file)

    Returns:
        Model with LoRA weights loaded

    Raises:
        FileNotFoundError: If ``lora_path`` does not exist.
        ImportError: If a PEFT adapter is given but PEFT is not installed.
        ValueError: If ``lora_config`` is missing for a .pt file, or the
            path has an unsupported format.
    """
    if not os.path.exists(lora_path):
        raise FileNotFoundError(f"LoRA weights not found: {lora_path}")

    # Check if it's a PEFT adapter directory
    if os.path.isdir(lora_path):
        if not PEFT_AVAILABLE:
            raise ImportError("PEFT library is required to load adapter. Install with: pip install peft")

        # Load PEFT adapter directly; PeftModel reads the adapter config
        # itself, so the previous separate PeftConfig.from_pretrained call
        # was a dead assignment and has been removed.
        model.decoder = PeftModel.from_pretrained(model.decoder, lora_path)
        logger.info(f"LoRA adapter loaded from {lora_path}")

    elif lora_path.endswith('.pt'):
        # Load from PyTorch state dict
        if lora_config is None:
            raise ValueError("lora_config is required when loading from .pt file")

        # Inject the LoRA structure first so the state-dict keys exist.
        model, _ = inject_lora_into_dit(model, lora_config)

        # weights_only=True: the file is expected to contain only tensors
        # (as written by save_lora_weights), and this prevents arbitrary
        # code execution from untrusted checkpoints.
        lora_state_dict = torch.load(lora_path, map_location='cpu', weights_only=True)

        # Copy weights in place; state_dict() tensors alias the model's.
        model_state = model.state_dict()
        for name, param in lora_state_dict.items():
            if name in model_state:
                model_state[name].copy_(param)
            else:
                logger.warning(f"Unexpected key in LoRA state dict: {name}")

        logger.info(f"LoRA weights loaded from {lora_path}")

    else:
        raise ValueError(f"Unsupported LoRA weight format: {lora_path}")

    return model
def merge_lora_weights(model) -> Any:
    """Fold LoRA deltas into the base decoder weights.

    After a successful merge the decoder is a plain module again and no
    longer requires PEFT at inference time. If the decoder does not expose
    ``merge_and_unload`` (no PEFT adapter attached), the model is returned
    unchanged with a warning.

    Args:
        model: Model with LoRA adapters

    Returns:
        Model with merged weights
    """
    decoder = getattr(model, 'decoder', None)
    if decoder is not None and hasattr(decoder, 'merge_and_unload'):
        model.decoder = decoder.merge_and_unload()
        logger.info("LoRA weights merged into base model")
    else:
        logger.warning("Model does not support LoRA merging")

    return model
def get_lora_info(model) -> Dict[str, Any]:
    """Summarize LoRA adapter usage in *model*.

    Args:
        model: Model to inspect

    Returns:
        Dict with keys ``has_lora``, ``lora_params``, ``total_params``,
        ``modules_with_lora`` and, when the model has any parameters,
        ``lora_ratio``.
    """
    total = 0
    lora_total = 0
    owners: List[str] = []

    for name, param in model.named_parameters():
        count = param.numel()
        total += count
        if 'lora_' in name:
            lora_total += count
            # Strip the trailing ".lora_*" suffix to recover the owning module.
            owner = name.rsplit('.lora_', 1)[0]
            if owner not in owners:
                owners.append(owner)

    info: Dict[str, Any] = {
        "has_lora": lora_total > 0,
        "lora_params": lora_total,
        "total_params": total,
        "modules_with_lora": owners,
    }
    if total > 0:
        info["lora_ratio"] = lora_total / total

    return info
acestep/training/trainer.py
ADDED
|
@@ -0,0 +1,503 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""
|
| 2 |
+
LoRA Trainer for ACE-Step
|
| 3 |
+
|
| 4 |
+
Lightning Fabric-based trainer for LoRA fine-tuning of ACE-Step DiT decoder.
|
| 5 |
+
Supports training from preprocessed tensor files for optimal performance.
|
| 6 |
+
"""
|
| 7 |
+
|
| 8 |
+
import os
|
| 9 |
+
import time
|
| 10 |
+
from typing import Optional, List, Dict, Any, Tuple, Generator
|
| 11 |
+
from loguru import logger
|
| 12 |
+
|
| 13 |
+
import torch
|
| 14 |
+
import torch.nn as nn
|
| 15 |
+
import torch.nn.functional as F
|
| 16 |
+
from torch.optim import AdamW
|
| 17 |
+
from torch.optim.lr_scheduler import CosineAnnealingWarmRestarts, LinearLR, SequentialLR
|
| 18 |
+
|
| 19 |
+
try:
|
| 20 |
+
from lightning.fabric import Fabric
|
| 21 |
+
from lightning.fabric.loggers import TensorBoardLogger
|
| 22 |
+
LIGHTNING_AVAILABLE = True
|
| 23 |
+
except ImportError:
|
| 24 |
+
LIGHTNING_AVAILABLE = False
|
| 25 |
+
logger.warning("Lightning Fabric not installed. Training will use basic training loop.")
|
| 26 |
+
|
| 27 |
+
from acestep.training.configs import LoRAConfig, TrainingConfig
|
| 28 |
+
from acestep.training.lora_utils import inject_lora_into_dit, save_lora_weights, check_peft_available
|
| 29 |
+
from acestep.training.data_module import PreprocessedDataModule
|
| 30 |
+
|
| 31 |
+
|
| 32 |
+
# Turbo model shift=3.0 discrete timesteps (8 steps, same as inference)
|
| 33 |
+
TURBO_SHIFT3_TIMESTEPS = [1.0, 0.9545454545454546, 0.9, 0.8333333333333334, 0.75, 0.6428571428571429, 0.5, 0.3]
|
| 34 |
+
|
| 35 |
+
|
| 36 |
+
def sample_discrete_timestep(bsz, device, dtype):
    """Sample timesteps from the discrete turbo shift=3 schedule.

    Each sample in the batch independently picks one of the 8 discrete
    timesteps used by the turbo model with shift=3.0.

    Args:
        bsz: Batch size.
        device: Device on which to allocate the sampled timesteps.
        dtype: Data type (should be bfloat16).

    Returns:
        Tuple ``(t, r)`` where both entries are the same sampled timestep
        tensor of shape ``[bsz]``.
    """
    # Draw one random schedule index per batch element.
    picks = torch.randint(0, len(TURBO_SHIFT3_TIMESTEPS), (bsz,), device=device)

    # Materialize the schedule on the target device/dtype and gather.
    schedule = torch.tensor(TURBO_SHIFT3_TIMESTEPS, device=device, dtype=dtype)
    sampled = schedule[picks]

    # In this training setup r mirrors t exactly.
    return sampled, sampled
|
| 61 |
+
|
| 62 |
+
|
| 63 |
+
class PreprocessedLoRAModule(nn.Module):
    """LoRA Training Module using preprocessed tensors.

    This module trains only the DiT decoder with LoRA adapters.
    All inputs are pre-computed tensors - no VAE or text encoder needed!

    Training flow:
        1. Load pre-computed tensors (target_latents, encoder_hidden_states, context_latents)
        2. Sample noise and timestep
        3. Forward through decoder (with LoRA)
        4. Compute flow matching loss
    """

    def __init__(
        self,
        model: nn.Module,
        lora_config: LoRAConfig,
        training_config: TrainingConfig,
        device: torch.device,
        dtype: torch.dtype,
    ):
        """Initialize the training module.

        Args:
            model: The AceStepConditionGenerationModel
            lora_config: LoRA configuration
            training_config: Training configuration
            device: Device to use
            dtype: Data type to use
        """
        super().__init__()

        self.lora_config = lora_config
        self.training_config = training_config
        self.device = device
        self.dtype = dtype

        # Inject LoRA into the decoder only; without PEFT we fall back to
        # training the model as-is (whatever params already require grad).
        if check_peft_available():
            self.model, self.lora_info = inject_lora_into_dit(model, lora_config)
            logger.info(f"LoRA injected: {self.lora_info['trainable_params']:,} trainable params")
        else:
            self.model = model
            self.lora_info = {}
            logger.warning("PEFT not available, training without LoRA adapters")

        # Model config for flow matching
        self.config = model.config

        # Store training losses.
        # NOTE: grows unbounded over a run; acceptable for typical epoch
        # counts, but a long-running job may want to cap or window this.
        self.training_losses = []

    def training_step(self, batch: Dict[str, torch.Tensor]) -> torch.Tensor:
        """Single training step using preprocessed tensors.

        Note: This is a distilled turbo model, NO CFG is used.

        Args:
            batch: Dictionary containing pre-computed tensors:
                - target_latents: [B, T, 64] - VAE encoded audio
                - attention_mask: [B, T] - Valid audio mask
                - encoder_hidden_states: [B, L, D] - Condition encoder output
                - encoder_attention_mask: [B, L] - Condition mask
                - context_latents: [B, T, 128] - Source context

        Returns:
            Loss tensor (float32 for stable backward)
        """
        # Fix: derive the autocast device type from the configured device
        # instead of hard-coding 'cuda', so CPU (and other accelerator)
        # training does not crash. Handles both torch.device and str devices.
        if isinstance(self.device, torch.device):
            device_type = self.device.type
        else:
            device_type = str(self.device).split(":")[0]

        # Use autocast for bf16 mixed precision training
        with torch.autocast(device_type=device_type, dtype=torch.bfloat16):
            # Get tensors from batch (already on device from Fabric dataloader;
            # .to() is a no-op in that case, but keeps the basic loop correct)
            target_latents = batch["target_latents"].to(self.device)  # x0
            attention_mask = batch["attention_mask"].to(self.device)
            encoder_hidden_states = batch["encoder_hidden_states"].to(self.device)
            encoder_attention_mask = batch["encoder_attention_mask"].to(self.device)
            context_latents = batch["context_latents"].to(self.device)

            bsz = target_latents.shape[0]

            # Flow matching: sample noise x1 and interpolate with data x0
            x1 = torch.randn_like(target_latents)  # Noise
            x0 = target_latents  # Data

            # Sample timesteps from discrete turbo shift=3 schedule (8 steps)
            t, r = sample_discrete_timestep(bsz, self.device, torch.bfloat16)
            t_ = t.unsqueeze(-1).unsqueeze(-1)

            # Interpolate: x_t = t * x1 + (1 - t) * x0
            xt = t_ * x1 + (1.0 - t_) * x0

            # Forward through decoder (distilled turbo model, no CFG).
            # r == t here, but pass r explicitly since that is the value
            # sample_discrete_timestep produced for timestep_r.
            decoder_outputs = self.model.decoder(
                hidden_states=xt,
                timestep=t,
                timestep_r=r,
                attention_mask=attention_mask,
                encoder_hidden_states=encoder_hidden_states,
                encoder_attention_mask=encoder_attention_mask,
                context_latents=context_latents,
            )

            # Flow matching loss: predict the flow field v = x1 - x0
            flow = x1 - x0
            diffusion_loss = F.mse_loss(decoder_outputs[0], flow)

            # Convert loss to float32 for stable backward pass
            diffusion_loss = diffusion_loss.float()

        self.training_losses.append(diffusion_loss.item())

        return diffusion_loss
|
| 174 |
+
|
| 175 |
+
|
| 176 |
+
class LoRATrainer:
    """High-level trainer for ACE-Step LoRA fine-tuning.

    Uses Lightning Fabric for distributed training and mixed precision.
    Supports training from preprocessed tensor directories.

    All training entry points are generators that yield
    ``(global_step, loss, status_message)`` tuples so a UI (e.g. Gradio)
    can stream progress while training runs.
    """

    def __init__(
        self,
        dit_handler,
        lora_config: LoRAConfig,
        training_config: TrainingConfig,
    ):
        """Initialize the trainer.

        Args:
            dit_handler: Initialized DiT handler (for model access)
            lora_config: LoRA configuration
            training_config: Training configuration
        """
        self.dit_handler = dit_handler
        self.lora_config = lora_config
        self.training_config = training_config

        # Populated lazily by train_from_preprocessed().
        self.module = None          # PreprocessedLoRAModule wrapping the model
        self.fabric = None          # Lightning Fabric instance (Fabric path only)
        self.is_training = False    # True while a training generator is active

    def train_from_preprocessed(
        self,
        tensor_dir: str,
        training_state: Optional[Dict] = None,
    ) -> Generator[Tuple[int, float, str], None, None]:
        """Train LoRA adapters from preprocessed tensor files.

        This is the recommended training method for best performance.

        Args:
            tensor_dir: Directory containing preprocessed .pt files
            training_state: Optional state dict for stopping control; setting
                ``training_state["should_stop"] = True`` from another thread
                stops training at the next batch boundary.

        Yields:
            Tuples of (step, loss, status_message)
        """
        self.is_training = True

        try:
            # Validate tensor directory
            if not os.path.exists(tensor_dir):
                yield 0, 0.0, f"❌ Tensor directory not found: {tensor_dir}"
                return

            # Create training module (injects LoRA into the handler's model)
            self.module = PreprocessedLoRAModule(
                model=self.dit_handler.model,
                lora_config=self.lora_config,
                training_config=self.training_config,
                device=self.dit_handler.device,
                dtype=self.dit_handler.dtype,
            )

            # Create data module over the preprocessed .pt files
            data_module = PreprocessedDataModule(
                tensor_dir=tensor_dir,
                batch_size=self.training_config.batch_size,
                num_workers=self.training_config.num_workers,
                pin_memory=self.training_config.pin_memory,
            )

            # Setup data
            data_module.setup('fit')

            if len(data_module.train_dataset) == 0:
                yield 0, 0.0, "❌ No valid samples found in tensor directory"
                return

            yield 0, 0.0, f"📂 Loaded {len(data_module.train_dataset)} preprocessed samples"

            # Prefer the Fabric loop (mixed precision, TensorBoard logging);
            # fall back to a plain PyTorch loop if lightning is not installed.
            if LIGHTNING_AVAILABLE:
                yield from self._train_with_fabric(data_module, training_state)
            else:
                yield from self._train_basic(data_module, training_state)

        except Exception as e:
            logger.exception("Training failed")
            yield 0, 0.0, f"❌ Training failed: {str(e)}"
        finally:
            self.is_training = False

    def _train_with_fabric(
        self,
        data_module: PreprocessedDataModule,
        training_state: Optional[Dict],
    ) -> Generator[Tuple[int, float, str], None, None]:
        """Train using Lightning Fabric.

        Sets up Fabric with bf16-mixed precision and TensorBoard logging,
        builds an AdamW optimizer with linear-warmup + cosine-restart LR
        schedule over the LoRA parameters only, then runs the epoch loop
        with gradient accumulation. Yields (step, loss, status) tuples and
        saves LoRA checkpoints per save_every_n_epochs plus a final save.
        """
        # Create output directory
        os.makedirs(self.training_config.output_dir, exist_ok=True)

        # Force BFloat16 precision (only supported precision for this model)
        precision = "bf16-mixed"

        # Create TensorBoard logger
        tb_logger = TensorBoardLogger(
            root_dir=self.training_config.output_dir,
            name="logs"
        )

        # Initialize Fabric (single device; accelerator auto-detected)
        self.fabric = Fabric(
            accelerator="auto",
            devices=1,
            precision=precision,
            loggers=[tb_logger],
        )
        self.fabric.launch()

        yield 0, 0.0, f"🚀 Starting training (precision: {precision})..."

        # Get dataloader
        train_loader = data_module.train_dataloader()

        # Setup optimizer - only LoRA parameters (everything else is frozen)
        trainable_params = [p for p in self.module.model.parameters() if p.requires_grad]

        if not trainable_params:
            yield 0, 0.0, "❌ No trainable parameters found!"
            return

        yield 0, 0.0, f"🎯 Training {sum(p.numel() for p in trainable_params):,} parameters"

        optimizer = AdamW(
            trainable_params,
            lr=self.training_config.learning_rate,
            weight_decay=self.training_config.weight_decay,
        )

        # Calculate total optimizer steps (batches / accumulation steps)
        total_steps = len(train_loader) * self.training_config.max_epochs // self.training_config.gradient_accumulation_steps
        # Cap warmup at 10% of the schedule so short runs still get a main phase
        warmup_steps = min(self.training_config.warmup_steps, max(1, total_steps // 10))

        # Scheduler: linear warmup followed by cosine annealing with restarts
        warmup_scheduler = LinearLR(
            optimizer,
            start_factor=0.1,
            end_factor=1.0,
            total_iters=warmup_steps,
        )

        main_scheduler = CosineAnnealingWarmRestarts(
            optimizer,
            T_0=max(1, total_steps - warmup_steps),
            T_mult=1,
            eta_min=self.training_config.learning_rate * 0.01,
        )

        scheduler = SequentialLR(
            optimizer,
            schedulers=[warmup_scheduler, main_scheduler],
            milestones=[warmup_steps],
        )

        # Convert model to bfloat16 (entire model for consistent dtype)
        self.module.model = self.module.model.to(torch.bfloat16)

        # Setup with Fabric - only the decoder (which has LoRA)
        self.module.model.decoder, optimizer = self.fabric.setup(self.module.model.decoder, optimizer)
        train_loader = self.fabric.setup_dataloaders(train_loader)

        # Training loop state
        global_step = 0          # optimizer steps taken
        accumulation_step = 0    # micro-batches since last optimizer step
        accumulated_loss = 0.0   # sum of scaled losses in current window

        self.module.model.decoder.train()

        for epoch in range(self.training_config.max_epochs):
            epoch_loss = 0.0
            num_batches = 0
            epoch_start_time = time.time()

            for batch_idx, batch in enumerate(train_loader):
                # Check for stop signal
                if training_state and training_state.get("should_stop", False):
                    yield global_step, accumulated_loss / max(accumulation_step, 1), "⏹️ Training stopped by user"
                    return

                # Forward pass (loss scaled down for gradient accumulation)
                loss = self.module.training_step(batch)
                loss = loss / self.training_config.gradient_accumulation_steps

                # Backward pass
                self.fabric.backward(loss)
                accumulated_loss += loss.item()
                accumulation_step += 1

                # Optimizer step once a full accumulation window is complete.
                # NOTE(review): a partial window left over at epoch end is
                # neither stepped nor counted into epoch_loss; its gradients
                # carry into the next epoch — confirm this is intended.
                if accumulation_step >= self.training_config.gradient_accumulation_steps:
                    self.fabric.clip_gradients(
                        self.module.model.decoder,
                        optimizer,
                        max_norm=self.training_config.max_grad_norm,
                    )

                    optimizer.step()
                    scheduler.step()
                    optimizer.zero_grad()

                    global_step += 1

                    # Log to TensorBoard
                    avg_loss = accumulated_loss / accumulation_step
                    self.fabric.log("train/loss", avg_loss, step=global_step)
                    self.fabric.log("train/lr", scheduler.get_last_lr()[0], step=global_step)

                    if global_step % self.training_config.log_every_n_steps == 0:
                        yield global_step, avg_loss, f"Epoch {epoch+1}/{self.training_config.max_epochs}, Step {global_step}, Loss: {avg_loss:.4f}"

                    epoch_loss += accumulated_loss
                    num_batches += 1
                    accumulated_loss = 0.0
                    accumulation_step = 0

            # End of epoch
            epoch_time = time.time() - epoch_start_time
            avg_epoch_loss = epoch_loss / max(num_batches, 1)

            self.fabric.log("train/epoch_loss", avg_epoch_loss, step=epoch + 1)
            yield global_step, avg_epoch_loss, f"✅ Epoch {epoch+1}/{self.training_config.max_epochs} in {epoch_time:.1f}s, Loss: {avg_epoch_loss:.4f}"

            # Save checkpoint
            if (epoch + 1) % self.training_config.save_every_n_epochs == 0:
                checkpoint_dir = os.path.join(self.training_config.output_dir, "checkpoints", f"epoch_{epoch+1}")
                save_lora_weights(self.module.model, checkpoint_dir)
                yield global_step, avg_epoch_loss, f"💾 Checkpoint saved at epoch {epoch+1}"

        # Save final model
        final_path = os.path.join(self.training_config.output_dir, "final")
        save_lora_weights(self.module.model, final_path)

        final_loss = self.module.training_losses[-1] if self.module.training_losses else 0.0
        yield global_step, final_loss, f"✅ Training complete! LoRA saved to {final_path}"

    def _train_basic(
        self,
        data_module: PreprocessedDataModule,
        training_state: Optional[Dict],
    ) -> Generator[Tuple[int, float, str], None, None]:
        """Basic training loop without Fabric.

        Mirrors _train_with_fabric (same optimizer, LR schedule, gradient
        accumulation and checkpointing) using plain PyTorch, for when
        lightning is not installed. No mixed-precision wrapper and no
        TensorBoard logging here.
        """
        yield 0, 0.0, "🚀 Starting basic training loop..."

        os.makedirs(self.training_config.output_dir, exist_ok=True)

        train_loader = data_module.train_dataloader()

        # Only LoRA parameters are trainable
        trainable_params = [p for p in self.module.model.parameters() if p.requires_grad]

        if not trainable_params:
            yield 0, 0.0, "❌ No trainable parameters found!"
            return

        optimizer = AdamW(
            trainable_params,
            lr=self.training_config.learning_rate,
            weight_decay=self.training_config.weight_decay,
        )

        # Same schedule as the Fabric path: linear warmup + cosine restarts
        total_steps = len(train_loader) * self.training_config.max_epochs // self.training_config.gradient_accumulation_steps
        warmup_steps = min(self.training_config.warmup_steps, max(1, total_steps // 10))

        warmup_scheduler = LinearLR(optimizer, start_factor=0.1, end_factor=1.0, total_iters=warmup_steps)
        main_scheduler = CosineAnnealingWarmRestarts(optimizer, T_0=max(1, total_steps - warmup_steps), T_mult=1, eta_min=self.training_config.learning_rate * 0.01)
        scheduler = SequentialLR(optimizer, schedulers=[warmup_scheduler, main_scheduler], milestones=[warmup_steps])

        global_step = 0
        accumulation_step = 0
        accumulated_loss = 0.0

        self.module.model.decoder.train()

        for epoch in range(self.training_config.max_epochs):
            epoch_loss = 0.0
            num_batches = 0
            epoch_start_time = time.time()

            for batch in train_loader:
                # Cooperative stop check (see train_from_preprocessed)
                if training_state and training_state.get("should_stop", False):
                    yield global_step, accumulated_loss / max(accumulation_step, 1), "⏹️ Training stopped"
                    return

                loss = self.module.training_step(batch)
                loss = loss / self.training_config.gradient_accumulation_steps
                loss.backward()
                accumulated_loss += loss.item()
                accumulation_step += 1

                # NOTE(review): as in the Fabric loop, a partial accumulation
                # window at epoch end is not stepped — confirm intended.
                if accumulation_step >= self.training_config.gradient_accumulation_steps:
                    torch.nn.utils.clip_grad_norm_(trainable_params, self.training_config.max_grad_norm)
                    optimizer.step()
                    scheduler.step()
                    optimizer.zero_grad()
                    global_step += 1

                    if global_step % self.training_config.log_every_n_steps == 0:
                        avg_loss = accumulated_loss / accumulation_step
                        yield global_step, avg_loss, f"Epoch {epoch+1}, Step {global_step}, Loss: {avg_loss:.4f}"

                    epoch_loss += accumulated_loss
                    num_batches += 1
                    accumulated_loss = 0.0
                    accumulation_step = 0

            epoch_time = time.time() - epoch_start_time
            avg_epoch_loss = epoch_loss / max(num_batches, 1)
            yield global_step, avg_epoch_loss, f"✅ Epoch {epoch+1}/{self.training_config.max_epochs} in {epoch_time:.1f}s"

            if (epoch + 1) % self.training_config.save_every_n_epochs == 0:
                checkpoint_dir = os.path.join(self.training_config.output_dir, "checkpoints", f"epoch_{epoch+1}")
                save_lora_weights(self.module.model, checkpoint_dir)
                yield global_step, avg_epoch_loss, f"💾 Checkpoint saved"

        final_path = os.path.join(self.training_config.output_dir, "final")
        save_lora_weights(self.module.model, final_path)
        final_loss = self.module.training_losses[-1] if self.module.training_losses else 0.0
        yield global_step, final_loss, f"✅ Training complete! LoRA saved to {final_path}"

    def stop(self):
        """Stop training.

        NOTE(review): this only flips the local flag; the training loops
        check ``training_state["should_stop"]``, not ``self.is_training`` —
        confirm callers use the training_state dict to actually stop.
        """
        self.is_training = False
|
requirements.txt
CHANGED
|
@@ -24,6 +24,10 @@ numba>=0.63.1
|
|
| 24 |
vector-quantize-pytorch>=1.27.15
|
| 25 |
torchcodec>=0.9.1
|
| 26 |
|
|
|
|
|
|
|
|
|
|
|
|
|
| 27 |
# nano-vllm dependencies
|
| 28 |
triton-windows>=3.0.0,<3.4; sys_platform == 'win32'
|
| 29 |
triton>=3.0.0; sys_platform != 'win32'
|
|
|
|
| 24 |
vector-quantize-pytorch>=1.27.15
|
| 25 |
torchcodec>=0.9.1
|
| 26 |
|
| 27 |
+
# LoRA Training dependencies (optional)
|
| 28 |
+
peft>=0.7.0
|
| 29 |
+
lightning>=2.0.0
|
| 30 |
+
|
| 31 |
# nano-vllm dependencies
|
| 32 |
triton-windows>=3.0.0,<3.4; sys_platform == 'win32'
|
| 33 |
triton>=3.0.0; sys_platform != 'win32'
|