ChuxiJ committed on
Commit
bf5e1fd
·
1 Parent(s): 8ff7c0c

refactor handler

Browse files
acestep/acestep_v15_pipeline.py CHANGED
@@ -10,6 +10,8 @@ for proxy_var in ['http_proxy', 'https_proxy', 'HTTP_PROXY', 'HTTPS_PROXY', 'ALL
10
  os.environ.pop(proxy_var, None)
11
 
12
  from .handler import AceStepHandler
 
 
13
  from .gradio_ui import create_gradio_interface
14
 
15
 
@@ -20,11 +22,13 @@ def create_demo():
20
  Returns:
21
  Gradio Blocks instance
22
  """
23
- # Create handler instance (business logic processor)
24
- handler = AceStepHandler()
 
 
25
 
26
- # Create Gradio interface
27
- demo = create_gradio_interface(handler)
28
 
29
  return demo
30
 
 
10
  os.environ.pop(proxy_var, None)
11
 
12
  from .handler import AceStepHandler
13
+ from .llm_inference import LLMHandler
14
+ from .dataset_handler import DatasetHandler
15
  from .gradio_ui import create_gradio_interface
16
 
17
 
 
22
  Returns:
23
  Gradio Blocks instance
24
  """
25
+ # Create independent handler instances
26
+ dit_handler = AceStepHandler() # DiT handler
27
+ llm_handler = LLMHandler() # LM handler
28
+ dataset_handler = DatasetHandler() # Dataset handler
29
 
30
+ # Create Gradio interface with all handlers
31
+ demo = create_gradio_interface(dit_handler, llm_handler, dataset_handler)
32
 
33
  return demo
34
 
acestep/dataset_handler.py ADDED
@@ -0,0 +1,37 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
"""
Dataset Handler
Handles dataset import and exploration functionality
"""
from typing import Optional, Tuple, Any, Dict


class DatasetHandler:
    """Dataset Handler for Dataset Explorer functionality.

    Both operations are currently stubs: the Text2MusicDataset dependency
    is unavailable, so importing is disabled and item lookup returns
    placeholder values matching the UI's expected output shape.
    """

    def __init__(self):
        """Initialize dataset handler with no dataset loaded."""
        # Holds the loaded dataset object once import is re-enabled.
        self.dataset = None
        # Tracks whether a dataset was successfully imported.
        self.dataset_imported = False

    def import_dataset(self, dataset_type: str) -> str:
        """
        Import dataset (temporarily disabled).

        Args:
            dataset_type: Type of dataset to import (e.g., "train", "test")

        Returns:
            Status message string
        """
        self.dataset_imported = False
        # Plain string: there are no interpolated values, so no f-prefix (was F541).
        return "⚠️ Dataset import is currently disabled. Text2MusicDataset dependency not available."

    def get_item_data(self, *args, **kwargs) -> Tuple:
        """
        Get dataset item (temporarily disabled).

        Returns:
            Tuple of 17 placeholder values matching the expected return
            format consumed by the Gradio UI (index 8 is the status
            message, last element is the task type).
        """
        return "", "", "", "", "", None, None, None, "❌ Dataset not available", "", 0, "", None, None, None, {}, "text2music"
+
acestep/gradio_ui.py CHANGED
@@ -2,16 +2,19 @@
2
  Gradio UI Components Module
3
  Contains all Gradio interface component definitions and layouts
4
  """
 
5
  import gradio as gr
6
  from typing import Callable, Optional
7
 
8
 
9
- def create_gradio_interface(handler) -> gr.Blocks:
10
  """
11
  Create Gradio interface
12
 
13
  Args:
14
- handler: Business logic handler instance
 
 
15
 
16
  Returns:
17
  Gradio Blocks instance
@@ -42,21 +45,21 @@ def create_gradio_interface(handler) -> gr.Blocks:
42
  """)
43
 
44
  # Dataset Explorer Section
45
- dataset_section = create_dataset_section(handler)
46
 
47
  # Generation Section
48
- generation_section = create_generation_section(handler)
49
 
50
  # Results Section
51
- results_section = create_results_section(handler)
52
 
53
  # Connect event handlers
54
- setup_event_handlers(demo, handler, dataset_section, generation_section, results_section)
55
 
56
  return demo
57
 
58
 
59
- def create_dataset_section(handler) -> dict:
60
  """Create dataset explorer section"""
61
  with gr.Group():
62
  gr.HTML('<div class="section-header"><h3>📊 Dataset Explorer</h3></div>')
@@ -153,7 +156,7 @@ def create_dataset_section(handler) -> dict:
153
  }
154
 
155
 
156
- def create_generation_section(handler) -> dict:
157
  """Create generation section"""
158
  with gr.Group():
159
  gr.HTML('<div class="section-header"><h3>🎼 ACE-Step V1.5 Demo </h3></div>')
@@ -165,7 +168,7 @@ def create_generation_section(handler) -> dict:
165
  with gr.Column(scale=4):
166
  checkpoint_dropdown = gr.Dropdown(
167
  label="Checkpoint File",
168
- choices=handler.get_available_checkpoints(),
169
  value=None,
170
  info="Select a trained model checkpoint file (full path or filename)"
171
  )
@@ -174,7 +177,7 @@ def create_generation_section(handler) -> dict:
174
 
175
  with gr.Row():
176
  # Get available acestep-v15- model list
177
- available_models = handler.get_available_acestep_v15_models()
178
  default_model = "acestep-v15-turbo" if "acestep-v15-turbo" in available_models else (available_models[0] if available_models else None)
179
 
180
  config_path = gr.Dropdown(
@@ -192,7 +195,7 @@ def create_generation_section(handler) -> dict:
192
 
193
  with gr.Row():
194
  # Get available 5Hz LM model list
195
- available_lm_models = handler.get_available_5hz_lm_models()
196
  default_lm_model = "acestep-5Hz-lm-0.6B" if "acestep-5Hz-lm-0.6B" in available_lm_models else (available_lm_models[0] if available_lm_models else None)
197
 
198
  lm_model_path = gr.Dropdown(
@@ -216,7 +219,7 @@ def create_generation_section(handler) -> dict:
216
  info="Check to initialize 5Hz LM during service initialization",
217
  )
218
  # Auto-detect flash attention availability
219
- flash_attn_available = handler.is_flash_attention_available()
220
  use_flash_attention_checkbox = gr.Checkbox(
221
  label="Use Flash Attention",
222
  value=flash_attn_available,
@@ -565,7 +568,7 @@ def create_generation_section(handler) -> dict:
565
  }
566
 
567
 
568
- def create_results_section(handler) -> dict:
569
  """Create results display section"""
570
  with gr.Group():
571
  gr.HTML('<div class="section-header"><h3>🎧 Generated Results</h3></div>')
@@ -620,7 +623,7 @@ def create_results_section(handler) -> dict:
620
  }
621
 
622
 
623
- def setup_event_handlers(demo, handler, dataset_section, generation_section, results_section):
624
  """Setup event handlers connecting UI components and business logic"""
625
 
626
  def update_init_status(status_msg, enable_btn):
@@ -629,14 +632,14 @@ def setup_event_handlers(demo, handler, dataset_section, generation_section, res
629
 
630
  # Dataset handlers
631
  dataset_section["import_dataset_btn"].click(
632
- fn=handler.import_dataset,
633
  inputs=[dataset_section["dataset_type"]],
634
  outputs=[dataset_section["data_status"]]
635
  )
636
 
637
  # Service initialization - refresh checkpoints
638
  def refresh_checkpoints():
639
- choices = handler.get_available_checkpoints()
640
  return gr.update(choices=choices)
641
 
642
  generation_section["refresh_btn"].click(
@@ -698,12 +701,36 @@ def setup_event_handlers(demo, handler, dataset_section, generation_section, res
698
  # Service initialization
699
  def init_service_wrapper(checkpoint, config_path, device, init_llm, lm_model_path, backend, use_flash_attention, offload_to_cpu, offload_dit_to_cpu):
700
  """Wrapper for service initialization, returns status and button state"""
701
- status, enable = handler.initialize_service(
702
- checkpoint, config_path, device, init_llm, lm_model_path,
703
- backend=backend,
704
  use_flash_attention=use_flash_attention, compile_model=False,
705
  offload_to_cpu=offload_to_cpu, offload_dit_to_cpu=offload_dit_to_cpu
706
  )
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
707
  return status, gr.update(interactive=enable)
708
 
709
  generation_section["init_btn"].click(
@@ -756,7 +783,7 @@ def setup_event_handlers(demo, handler, dataset_section, generation_section, res
756
  use_adg, cfg_interval_start, cfg_interval_end, audio_format, lm_temperature,
757
  progress=gr.Progress(track_tqdm=True)
758
  ):
759
- return handler.generate_music(
760
  captions=captions, lyrics=lyrics, bpm=bpm, key_scale=key_scale,
761
  time_signature=time_signature, vocal_language=vocal_language,
762
  inference_steps=inference_steps, guidance_scale=guidance_scale,
@@ -820,7 +847,7 @@ def setup_event_handlers(demo, handler, dataset_section, generation_section, res
820
  # 5Hz LM generation (simplified version, can be extended as needed)
821
  def generate_lm_hints_wrapper(caption, lyrics, temperature, cfg_scale, negative_prompt):
822
  """Wrapper for 5Hz LM generation"""
823
- metadata, audio_codes, status = handler.generate_with_5hz_lm(caption, lyrics, temperature, cfg_scale, negative_prompt)
824
 
825
  # Extract metadata values and map to UI fields
826
  # Handle bpm
@@ -878,7 +905,7 @@ def setup_event_handlers(demo, handler, dataset_section, generation_section, res
878
  audio_codes_content: str = ""
879
  ) -> tuple:
880
  """Update instruction and UI visibility based on task type."""
881
- instruction = handler.generate_instruction(
882
  task_type=task_type_value,
883
  track_name=track_name_value,
884
  complete_track_classes=complete_track_classes_value
 
2
  Gradio UI Components Module
3
  Contains all Gradio interface component definitions and layouts
4
  """
5
+ import os
6
  import gradio as gr
7
  from typing import Callable, Optional
8
 
9
 
10
+ def create_gradio_interface(dit_handler, llm_handler, dataset_handler) -> gr.Blocks:
11
  """
12
  Create Gradio interface
13
 
14
  Args:
15
+ dit_handler: DiT handler instance
16
+ llm_handler: LM handler instance
17
+ dataset_handler: Dataset handler instance
18
 
19
  Returns:
20
  Gradio Blocks instance
 
45
  """)
46
 
47
  # Dataset Explorer Section
48
+ dataset_section = create_dataset_section(dataset_handler)
49
 
50
  # Generation Section
51
+ generation_section = create_generation_section(dit_handler, llm_handler)
52
 
53
  # Results Section
54
+ results_section = create_results_section(dit_handler)
55
 
56
  # Connect event handlers
57
+ setup_event_handlers(demo, dit_handler, llm_handler, dataset_handler, dataset_section, generation_section, results_section)
58
 
59
  return demo
60
 
61
 
62
+ def create_dataset_section(dataset_handler) -> dict:
63
  """Create dataset explorer section"""
64
  with gr.Group():
65
  gr.HTML('<div class="section-header"><h3>📊 Dataset Explorer</h3></div>')
 
156
  }
157
 
158
 
159
+ def create_generation_section(dit_handler, llm_handler) -> dict:
160
  """Create generation section"""
161
  with gr.Group():
162
  gr.HTML('<div class="section-header"><h3>🎼 ACE-Step V1.5 Demo </h3></div>')
 
168
  with gr.Column(scale=4):
169
  checkpoint_dropdown = gr.Dropdown(
170
  label="Checkpoint File",
171
+ choices=dit_handler.get_available_checkpoints(),
172
  value=None,
173
  info="Select a trained model checkpoint file (full path or filename)"
174
  )
 
177
 
178
  with gr.Row():
179
  # Get available acestep-v15- model list
180
+ available_models = dit_handler.get_available_acestep_v15_models()
181
  default_model = "acestep-v15-turbo" if "acestep-v15-turbo" in available_models else (available_models[0] if available_models else None)
182
 
183
  config_path = gr.Dropdown(
 
195
 
196
  with gr.Row():
197
  # Get available 5Hz LM model list
198
+ available_lm_models = llm_handler.get_available_5hz_lm_models()
199
  default_lm_model = "acestep-5Hz-lm-0.6B" if "acestep-5Hz-lm-0.6B" in available_lm_models else (available_lm_models[0] if available_lm_models else None)
200
 
201
  lm_model_path = gr.Dropdown(
 
219
  info="Check to initialize 5Hz LM during service initialization",
220
  )
221
  # Auto-detect flash attention availability
222
+ flash_attn_available = dit_handler.is_flash_attention_available()
223
  use_flash_attention_checkbox = gr.Checkbox(
224
  label="Use Flash Attention",
225
  value=flash_attn_available,
 
568
  }
569
 
570
 
571
+ def create_results_section(dit_handler) -> dict:
572
  """Create results display section"""
573
  with gr.Group():
574
  gr.HTML('<div class="section-header"><h3>🎧 Generated Results</h3></div>')
 
623
  }
624
 
625
 
626
+ def setup_event_handlers(demo, dit_handler, llm_handler, dataset_handler, dataset_section, generation_section, results_section):
627
  """Setup event handlers connecting UI components and business logic"""
628
 
629
  def update_init_status(status_msg, enable_btn):
 
632
 
633
  # Dataset handlers
634
  dataset_section["import_dataset_btn"].click(
635
+ fn=dataset_handler.import_dataset,
636
  inputs=[dataset_section["dataset_type"]],
637
  outputs=[dataset_section["data_status"]]
638
  )
639
 
640
  # Service initialization - refresh checkpoints
641
  def refresh_checkpoints():
642
+ choices = dit_handler.get_available_checkpoints()
643
  return gr.update(choices=choices)
644
 
645
  generation_section["refresh_btn"].click(
 
701
  # Service initialization
702
  def init_service_wrapper(checkpoint, config_path, device, init_llm, lm_model_path, backend, use_flash_attention, offload_to_cpu, offload_dit_to_cpu):
703
  """Wrapper for service initialization, returns status and button state"""
704
+ # Initialize DiT handler
705
+ status, enable = dit_handler.initialize_service(
706
+ checkpoint, config_path, device,
707
  use_flash_attention=use_flash_attention, compile_model=False,
708
  offload_to_cpu=offload_to_cpu, offload_dit_to_cpu=offload_dit_to_cpu
709
  )
710
+
711
+ # Initialize LM handler if requested
712
+ if init_llm:
713
+ # Get checkpoint directory
714
+ current_file = os.path.abspath(__file__)
715
+ project_root = os.path.dirname(os.path.dirname(current_file))
716
+ checkpoint_dir = os.path.join(project_root, "checkpoints")
717
+
718
+ lm_status, lm_success = llm_handler.initialize(
719
+ checkpoint_dir=checkpoint_dir,
720
+ lm_model_path=lm_model_path,
721
+ backend=backend,
722
+ device=device,
723
+ offload_to_cpu=offload_to_cpu,
724
+ dtype=dit_handler.dtype
725
+ )
726
+
727
+ if lm_success:
728
+ status += f"\n{lm_status}"
729
+ else:
730
+ status += f"\n{lm_status}"
731
+ # Don't fail the entire initialization if LM fails, but log it
732
+ # Keep enable as is (DiT initialization result) even if LM fails
733
+
734
  return status, gr.update(interactive=enable)
735
 
736
  generation_section["init_btn"].click(
 
783
  use_adg, cfg_interval_start, cfg_interval_end, audio_format, lm_temperature,
784
  progress=gr.Progress(track_tqdm=True)
785
  ):
786
+ return dit_handler.generate_music(
787
  captions=captions, lyrics=lyrics, bpm=bpm, key_scale=key_scale,
788
  time_signature=time_signature, vocal_language=vocal_language,
789
  inference_steps=inference_steps, guidance_scale=guidance_scale,
 
847
  # 5Hz LM generation (simplified version, can be extended as needed)
848
  def generate_lm_hints_wrapper(caption, lyrics, temperature, cfg_scale, negative_prompt):
849
  """Wrapper for 5Hz LM generation"""
850
+ metadata, audio_codes, status = llm_handler.generate_with_5hz_lm(caption, lyrics, temperature, cfg_scale, negative_prompt)
851
 
852
  # Extract metadata values and map to UI fields
853
  # Handle bpm
 
905
  audio_codes_content: str = ""
906
  ) -> tuple:
907
  """Update instruction and UI visibility based on task type."""
908
+ instruction = dit_handler.generate_instruction(
909
  task_type=task_type_value,
910
  track_name=track_name_value,
911
  complete_track_classes=complete_track_classes_value
acestep/handler.py CHANGED
@@ -61,19 +61,9 @@ class AceStepHandler:
61
  # Sample rate
62
  self.sample_rate = 48000
63
 
64
- # 5Hz LM related
65
- self.llm = None
66
- self.llm_tokenizer = None
67
- self.llm_initialized = False
68
- self.llm_backend = None
69
-
70
  # Reward model (temporarily disabled)
71
  self.reward_model = None
72
 
73
- # Dataset related (temporarily disabled)
74
- self.dataset = None
75
- self.dataset_imported = False
76
-
77
  # Batch size
78
  self.batch_size = 2
79
 
@@ -120,22 +110,6 @@ class AceStepHandler:
120
  models.sort()
121
  return models
122
 
123
- def get_available_5hz_lm_models(self) -> List[str]:
124
- """Scan and return all model directory names starting with 'acestep-5Hz-lm-'"""
125
- current_file = os.path.abspath(__file__)
126
- project_root = os.path.dirname(os.path.dirname(current_file))
127
- checkpoint_dir = os.path.join(project_root, "checkpoints")
128
-
129
- models = []
130
- if os.path.exists(checkpoint_dir):
131
- for item in os.listdir(checkpoint_dir):
132
- item_path = os.path.join(checkpoint_dir, item)
133
- if os.path.isdir(item_path) and item.startswith("acestep-5Hz-lm-"):
134
- models.append(item)
135
-
136
- models.sort()
137
- return models
138
-
139
  def is_flash_attention_available(self) -> bool:
140
  """Check if flash attention is available on the system"""
141
  try:
@@ -149,9 +123,6 @@ class AceStepHandler:
149
  project_root: str,
150
  config_path: str,
151
  device: str = "auto",
152
- init_llm: bool = False,
153
- lm_model_path: str = "acestep-5Hz-lm-0.6B",
154
- backend: str = "vllm",
155
  use_flash_attention: bool = False,
156
  compile_model: bool = False,
157
  offload_to_cpu: bool = False,
@@ -159,15 +130,12 @@ class AceStepHandler:
159
  quantization: Optional[str] = None,
160
  ) -> Tuple[str, bool]:
161
  """
162
- Initialize model service
163
 
164
  Args:
165
  project_root: Project root path (may be checkpoints directory, will be handled automatically)
166
  config_path: Model config directory name (e.g., "acestep-v15-turbo")
167
  device: Device type
168
- init_llm: Whether to initialize 5Hz LM model
169
- lm_model_path: 5Hz LM model path
170
- backend: Backend for 5Hz LM model ("vllm" or "pt")
171
  use_flash_attention: Whether to use flash attention (requires flash_attn package)
172
  compile_model: Whether to use torch.compile to optimize the model
173
  offload_to_cpu: Whether to offload models to CPU when not in use
@@ -309,72 +277,14 @@ class AceStepHandler:
309
  self.text_encoder.eval()
310
  else:
311
  raise FileNotFoundError(f"Text encoder not found at {text_encoder_path}")
312
-
313
- # 4. Load 5Hz LM model (optional, only if init_llm is True)
314
- if init_llm:
315
- full_lm_model_path = os.path.join(checkpoint_dir, lm_model_path)
316
- if os.path.exists(full_lm_model_path):
317
- logger.info("loading 5Hz LM tokenizer...")
318
- start_time = time.time()
319
- llm_tokenizer = AutoTokenizer.from_pretrained(full_lm_model_path, use_fast=True)
320
- logger.info(f"5Hz LM tokenizer loaded successfully in {time.time() - start_time:.2f} seconds")
321
- self.llm_tokenizer = llm_tokenizer
322
-
323
- # Initialize based on user-selected backend
324
- if backend == "vllm":
325
- # Try to initialize with vllm
326
- status_msg = self._initialize_5hz_lm_vllm(full_lm_model_path)
327
- logger.info(f"5Hz LM status message: {status_msg}")
328
- # Check if initialization failed (status_msg starts with ❌)
329
- if status_msg.startswith("❌"):
330
- # vllm initialization failed, fallback to PyTorch
331
- if not self.llm_initialized:
332
- logger.warning("vllm initialization failed, falling back to PyTorch backend")
333
- try:
334
- self.llm = AutoModelForCausalLM.from_pretrained(full_lm_model_path, trust_remote_code=True)
335
- if not self.offload_to_cpu:
336
- self.llm = self.llm.to(device).to(self.dtype)
337
- else:
338
- self.llm = self.llm.to("cpu").to(self.dtype)
339
- self.llm.eval()
340
- self.llm_backend = "pt"
341
- self.llm_initialized = True
342
- logger.info("5Hz LM initialized successfully using PyTorch backend (fallback)")
343
- except Exception as e:
344
- return f"❌ Error initializing 5Hz LM: {str(e)}\n\nTraceback:\n{traceback.format_exc()}", False
345
- # If vllm initialization succeeded, self.llm_initialized should already be True
346
- else:
347
- # Use PyTorch backend (pt)
348
- try:
349
- self.llm = AutoModelForCausalLM.from_pretrained(full_lm_model_path, trust_remote_code=True)
350
- if not self.offload_to_cpu:
351
- self.llm = self.llm.to(device).to(self.dtype)
352
- else:
353
- self.llm = self.llm.to("cpu").to(self.dtype)
354
- self.llm.eval()
355
- self.llm_backend = "pt"
356
- self.llm_initialized = True
357
- logger.info(f"5Hz LM initialized successfully using PyTorch backend on {device}")
358
- except Exception as e:
359
- return f"❌ Error initializing 5Hz LM: {str(e)}\n\nTraceback:\n{traceback.format_exc()}", False
360
-
361
- else:
362
- # 5Hz LM path not found
363
- return f"❌ 5Hz LM model not found at {full_lm_model_path}", False
364
 
365
  # Determine actual attention implementation used
366
  actual_attn = getattr(self.config, "_attn_implementation", "eager")
367
 
368
- status_msg = f"✅ Model initialized successfully on {device}\n" + status_msg
369
  status_msg += f"Main model: {acestep_v15_checkpoint_path}\n"
370
  status_msg += f"VAE: {vae_checkpoint_path}\n"
371
  status_msg += f"Text encoder: {text_encoder_path}\n"
372
- if init_llm and hasattr(self, 'llm') and self.llm is not None:
373
- backend_info = getattr(self, 'llm_backend', 'unknown')
374
- status_msg += f"5Hz LM model: {os.path.join(checkpoint_dir, lm_model_path)}\n"
375
- status_msg += f"5Hz LM backend: {backend_info}\n"
376
- else:
377
- status_msg += f"5Hz LM model: Not loaded (checkbox not selected)\n"
378
  status_msg += f"Dtype: {self.dtype}\n"
379
  status_msg += f"Attention: {actual_attn}\n"
380
  status_msg += f"Compiled: {compile_model}\n"
@@ -393,7 +303,7 @@ class AceStepHandler:
393
  Context manager to load a model to GPU and offload it back to CPU after use.
394
 
395
  Args:
396
- model_name: Name of the model to load ("text_encoder", "vae", "model", "llm")
397
  """
398
  if not self.offload_to_cpu:
399
  yield
@@ -418,11 +328,6 @@ class AceStepHandler:
418
  yield
419
  return
420
 
421
- # If model is LLM and using nanovllm, do not offload (it stays on GPU)
422
- if model_name == "llm" and getattr(self, "llm_type", None) == "nanovllm":
423
- yield
424
- return
425
-
426
  model = getattr(self, model_name, None)
427
  if model is None:
428
  yield
@@ -434,10 +339,6 @@ class AceStepHandler:
434
  if model_name == "vae":
435
  vae_dtype = torch.bfloat16 if self.device in ["cuda", "xpu"] else self.dtype
436
  model.to(self.device).to(vae_dtype)
437
- elif model_name == "llm" and hasattr(model, "to"):
438
- # Special handling for nanovllm LLM which might have custom to() method or structure
439
- # Assuming it has a .to() method based on our previous edits to nanovllm
440
- model.to(self.device)
441
  else:
442
  model.to(self.device).to(self.dtype)
443
 
@@ -454,10 +355,7 @@ class AceStepHandler:
454
  # Offload to CPU
455
  logger.info(f"Offloading {model_name} to CPU")
456
  start_time = time.time()
457
- if model_name == "llm" and hasattr(model, "to"):
458
- model.to("cpu")
459
- else:
460
- model.to("cpu")
461
 
462
  if model_name == "model" and hasattr(self, "silence_latent"):
463
  self.silence_latent = self.silence_latent.to("cpu")
@@ -467,318 +365,6 @@ class AceStepHandler:
467
  self.current_offload_cost += offload_time
468
  logger.info(f"Offloaded {model_name} to CPU in {offload_time:.4f}s")
469
 
470
- def import_dataset(self, dataset_type: str) -> str:
471
- """Import dataset (temporarily disabled)"""
472
- self.dataset_imported = False
473
- return f"⚠️ Dataset import is currently disabled. Text2MusicDataset dependency not available."
474
-
475
- def get_item_data(self, *args, **kwargs):
476
- """Get dataset item (temporarily disabled)"""
477
- return "", "", "", "", "", None, None, None, "❌ Dataset not available", "", 0, "", None, None, None, {}, "text2music"
478
-
479
- def get_gpu_memory_utilization(self, minimal_gpu: float = 8, min_ratio: float = 0.2, max_ratio: float = 0.9) -> float:
480
- """Get GPU memory utilization ratio"""
481
- try:
482
- device = torch.device("cuda:0")
483
- total_gpu_mem_bytes = torch.cuda.get_device_properties(device).total_memory
484
- allocated_mem_bytes = torch.cuda.memory_allocated(device)
485
- reserved_mem_bytes = torch.cuda.memory_reserved(device)
486
-
487
- total_gpu = total_gpu_mem_bytes / 1024**3
488
- low_gpu_memory_mode = False
489
- if total_gpu < minimal_gpu:
490
- minimal_gpu = 0.5 * total_gpu
491
- low_gpu_memory_mode = True
492
- allocated_gpu = allocated_mem_bytes / 1024**3
493
- reserved_gpu = reserved_mem_bytes / 1024**3
494
- available_gpu = total_gpu - reserved_gpu
495
-
496
- if available_gpu >= minimal_gpu:
497
- ratio = min(max_ratio, max(min_ratio, minimal_gpu / total_gpu))
498
- else:
499
- ratio = min(max_ratio, max(min_ratio, (available_gpu * 0.8) / total_gpu))
500
-
501
- return ratio, low_gpu_memory_mode
502
- except Exception as e:
503
- return 0.9, low_gpu_memory_mode
504
-
505
- def _initialize_5hz_lm_vllm(self, model_path: str) -> str:
506
- """Initialize 5Hz LM model"""
507
- if not torch.cuda.is_available():
508
- self.llm_initialized = False
509
- logger.error("CUDA is not available. Please check your GPU setup.")
510
- return "❌ CUDA is not available. Please check your GPU setup."
511
- try:
512
- from nanovllm import LLM, SamplingParams
513
- except ImportError:
514
- self.llm_initialized = False
515
- logger.error("nano-vllm is not installed. Please install it using 'cd acestep/third_parts/nano-vllm && pip install .")
516
- return "❌ nano-vllm is not installed. Please install it using 'cd acestep/third_parts/nano-vllm && pip install ."
517
-
518
- try:
519
- current_device = torch.cuda.current_device()
520
- device_name = torch.cuda.get_device_name(current_device)
521
-
522
- torch.cuda.empty_cache()
523
- gpu_memory_utilization, low_gpu_memory_mode = self.get_gpu_memory_utilization(
524
- minimal_gpu=8,
525
- min_ratio=0.2,
526
- max_ratio=0.9
527
- )
528
- if low_gpu_memory_mode:
529
- self.max_model_len = 2048
530
- else:
531
- self.max_model_len = 4096
532
-
533
- logger.info(f"Initializing 5Hz LM with model: {model_path}, enforce_eager: False, tensor_parallel_size: 1, max_model_len: {self.max_model_len}, gpu_memory_utilization: {gpu_memory_utilization}")
534
- start_time = time.time()
535
- self.llm = LLM(
536
- model=model_path,
537
- enforce_eager=False,
538
- tensor_parallel_size=1,
539
- max_model_len=self.max_model_len,
540
- gpu_memory_utilization=gpu_memory_utilization,
541
- tokenizer=self.llm_tokenizer,
542
- )
543
- logger.info(f"5Hz LM initialized successfully in {time.time() - start_time:.2f} seconds")
544
- self.llm_initialized = True
545
- self.llm_backend = "vllm"
546
- return f"✅ 5Hz LM initialized successfully\nModel: {model_path}\nDevice: {device_name}\nGPU Memory Utilization: {gpu_memory_utilization:.2f}"
547
- except Exception as e:
548
- self.llm_initialized = False
549
- self.llm_type = None
550
- error_msg = f"❌ Error initializing 5Hz LM: {str(e)}\n\nTraceback:\n{traceback.format_exc()}"
551
- return error_msg
552
-
553
- def generate_with_5hz_lm_vllm(self, caption: str, lyrics: str, temperature: float = 0.6, cfg_scale: float = 1.0, negative_prompt: str = "NO USER INPUT") -> Tuple[Dict[str, Any], str, str]:
554
- try:
555
- from nanovllm import SamplingParams
556
-
557
- prompt = f"# Caption\n{caption}\n\n# Lyric\n{lyrics}\n"
558
-
559
- formatted_prompt = self.llm_tokenizer.apply_chat_template(
560
- [
561
- {"role": "system", "content": "# Instruction\nGenerate audio semantic tokens based on the given conditions:\n\n"},
562
- {"role": "user", "content": prompt}
563
- ],
564
- tokenize=False,
565
- add_generation_prompt=True,
566
- )
567
- logger.debug(f"[debug] formatted_prompt: {formatted_prompt}")
568
-
569
- sampling_params = SamplingParams(max_tokens=self.max_model_len-64, temperature=temperature, cfg_scale=cfg_scale)
570
- # Use CFG if cfg_scale > 1.0
571
- if cfg_scale > 1.0:
572
- # Build unconditional prompt (user input replaced with "NO USER INPUT")
573
- formatted_unconditional_prompt = self.lm_tokenizer.apply_chat_template(
574
- [
575
- {"role": "system", "content": "# Instruction\nGenerate audio semantic tokens based on the given conditions:\n\n"},
576
- {"role": "user", "content": negative_prompt}
577
- ],
578
- tokenize=False,
579
- add_generation_prompt=True,
580
- )
581
- outputs = self.llm.generate(
582
- [formatted_prompt],
583
- sampling_params,
584
- unconditional_prompts=[formatted_unconditional_prompt]
585
- )
586
- else:
587
- outputs = self.lm_model.generate([formatted_prompt], sampling_params)
588
- # Extract text from output - handle different output formats
589
- if isinstance(outputs, list) and len(outputs) > 0:
590
- if hasattr(outputs[0], 'outputs') and len(outputs[0].outputs) > 0:
591
- output_text = outputs[0].outputs[0].text
592
- elif hasattr(outputs[0], 'text'):
593
- output_text = outputs[0].text
594
- elif isinstance(outputs[0], dict) and 'text' in outputs[0]:
595
- output_text = outputs[0]['text']
596
- else:
597
- output_text = str(outputs[0])
598
- else:
599
- output_text = str(outputs)
600
- metadata, audio_codes = self.parse_lm_output(output_text)
601
- print(f"[debug]output_text: {output_text}")
602
- codes_count = len(audio_codes.split('<|audio_code_')) - 1 if audio_codes else 0
603
- return metadata, audio_codes, f"✅ Generated successfully\nOutput length: {len(output_text)} chars\nCodes count: {codes_count}"
604
-
605
- except Exception as e:
606
- error_msg = f"❌ Error generating with 5Hz LM: {str(e)}\n\nTraceback:\n{traceback.format_exc()}"
607
- return {}, "", error_msg
608
-
609
- def generate_with_5hz_lm_pt(self, caption: str, lyrics: str, temperature: float = 0.6) -> Tuple[Dict[str, Any], str, str]:
610
- try:
611
- prompt = f"# Caption\n{caption}\n\n# Lyric\n{lyrics}\n"
612
-
613
- formatted_prompt = self.llm_tokenizer.apply_chat_template(
614
- [
615
- {"role": "system", "content": "# Instruction\nGenerate audio semantic tokens based on the given conditions:\n\n"},
616
- {"role": "user", "content": prompt}
617
- ],
618
- tokenize=False,
619
- add_generation_prompt=True,
620
- )
621
-
622
- # Tokenize the prompt
623
- inputs = self.llm_tokenizer(
624
- formatted_prompt,
625
- return_tensors="pt",
626
- padding=False,
627
- truncation=True,
628
- )
629
-
630
- # Generate with the model
631
- with self._load_model_context("llm"):
632
- inputs = {k: v.to(self.device) for k, v in inputs.items()}
633
-
634
- # Get max_new_tokens from model config or use a default
635
- max_new_tokens = getattr(self.llm.config, 'max_new_tokens', 4096)
636
- if hasattr(self, 'max_model_len'):
637
- max_new_tokens = min(max_new_tokens, self.max_model_len)
638
-
639
- # Define custom streamer for tqdm
640
- class TqdmTokenStreamer(BaseStreamer):
641
- def __init__(self, total):
642
- self.pbar = tqdm(total=total, desc="Generating 5Hz tokens", unit="token", maxinterval=1)
643
-
644
- def put(self, value):
645
- # value is tensor of token ids
646
- if value.dim() > 1:
647
- num_tokens = value.numel()
648
- else:
649
- num_tokens = len(value)
650
- self.pbar.update(num_tokens)
651
-
652
- def end(self):
653
- self.pbar.close()
654
-
655
- streamer = TqdmTokenStreamer(total=max_new_tokens)
656
-
657
- with torch.no_grad():
658
- outputs = self.llm.generate(
659
- **inputs,
660
- max_new_tokens=max_new_tokens,
661
- temperature=temperature,
662
- do_sample=True if temperature > 0 else False,
663
- pad_token_id=self.llm_tokenizer.pad_token_id or self.llm_tokenizer.eos_token_id,
664
- streamer=streamer,
665
- )
666
-
667
- # Decode the generated tokens
668
- # Only decode the newly generated tokens (skip the input prompt)
669
- generated_ids = outputs[0][inputs['input_ids'].shape[1]:]
670
- output_text = self.llm_tokenizer.decode(generated_ids, skip_special_tokens=False)
671
-
672
- metadata, audio_codes = self.parse_lm_output(output_text)
673
- codes_count = len(audio_codes.split('<|audio_code_')) - 1 if audio_codes else 0
674
- return metadata, audio_codes, f"✅ Generated successfully\nOutput length: {len(output_text)} chars\nCodes count: {codes_count}"
675
-
676
- except Exception as e:
677
- error_msg = f"❌ Error generating with 5Hz LM: {str(e)}\n\nTraceback:\n{traceback.format_exc()}"
678
- return {}, "", error_msg
679
-
680
- def generate_with_5hz_lm(self, caption: str, lyrics: str, temperature: float = 0.6, cfg_scale: float = 1.0, negative_prompt: str = "NO USER INPUT") -> Tuple[Dict[str, Any], str, str]:
681
- """Generate metadata and audio codes using 5Hz LM"""
682
- # Check if 5Hz LM is initialized
683
- if not hasattr(self, 'llm_initialized') or not self.llm_initialized:
684
- debug_info = f"llm_initialized={getattr(self, 'llm_initialized', 'not set')}, "
685
- debug_info += f"has_llm={hasattr(self, 'llm')}, "
686
- debug_info += f"llm_is_none={getattr(self, 'llm', None) is None}, "
687
- debug_info += f"llm_backend={getattr(self, 'llm_backend', 'not set')}"
688
- return {}, "", f"❌ 5Hz LM not initialized. Please initialize it first. Debug: {debug_info}"
689
-
690
- if not hasattr(self, 'llm') or self.llm is None:
691
- return {}, "", "❌ 5Hz LM model not loaded. Please initialize it first."
692
-
693
- if not hasattr(self, 'llm_backend'):
694
- return {}, "", "❌ 5Hz LM backend not set. Please initialize it first."
695
-
696
- if self.llm_backend == "vllm":
697
- return self.generate_with_5hz_lm_vllm(caption, lyrics, temperature, cfg_scale, negative_prompt)
698
- else:
699
- return self.generate_with_5hz_lm_pt(caption, lyrics, temperature)
700
-
701
- def parse_lm_output(self, output_text: str) -> Tuple[Dict[str, Any], str]:
702
- """
703
- Parse LM output to extract metadata and audio codes.
704
-
705
- Expected format:
706
- <think>
707
- bpm: 73
708
- duration: 273
709
- genres: Chinese folk
710
- keyscale: G major
711
- timesignature: 4
712
- </think>
713
-
714
- <|audio_code_56535|><|audio_code_62918|>...
715
-
716
- Returns:
717
- Tuple of (metadata_dict, audio_codes_string)
718
- """
719
- debug_output_text = output_text.split("</think>")[0]
720
- logger.debug(f"Debug output text: {debug_output_text}")
721
- metadata = {}
722
- audio_codes = ""
723
-
724
- import re
725
-
726
- # Extract audio codes - find all <|audio_code_XXX|> patterns
727
- code_pattern = r'<\|audio_code_\d+\|>'
728
- code_matches = re.findall(code_pattern, output_text)
729
- if code_matches:
730
- audio_codes = "".join(code_matches)
731
-
732
- # Extract metadata from reasoning section
733
- # Try different reasoning tag patterns
734
- reasoning_patterns = [
735
- r'<think>(.*?)</think>',
736
- r'<think>(.*?)</think>',
737
- r'<reasoning>(.*?)</reasoning>',
738
- ]
739
-
740
- reasoning_text = None
741
- for pattern in reasoning_patterns:
742
- match = re.search(pattern, output_text, re.DOTALL)
743
- if match:
744
- reasoning_text = match.group(1).strip()
745
- break
746
-
747
- # If no reasoning tags found, try to parse metadata from the beginning of output
748
- if not reasoning_text:
749
- # Look for metadata lines before audio codes
750
- lines_before_codes = output_text.split('<|audio_code_')[0] if '<|audio_code_' in output_text else output_text
751
- reasoning_text = lines_before_codes.strip()
752
-
753
- # Parse metadata fields
754
- if reasoning_text:
755
- for line in reasoning_text.split('\n'):
756
- line = line.strip()
757
- if ':' in line and not line.startswith('<'):
758
- parts = line.split(':', 1)
759
- if len(parts) == 2:
760
- key = parts[0].strip().lower()
761
- value = parts[1].strip()
762
-
763
- if key == 'bpm':
764
- try:
765
- metadata['bpm'] = int(value)
766
- except:
767
- metadata['bpm'] = value
768
- elif key == 'duration':
769
- try:
770
- metadata['duration'] = int(value)
771
- except:
772
- metadata['duration'] = value
773
- elif key == 'genres':
774
- metadata['genres'] = value
775
- elif key == 'keyscale':
776
- metadata['keyscale'] = value
777
- elif key == 'timesignature':
778
- metadata['timesignature'] = value
779
-
780
- return metadata, audio_codes
781
-
782
  def process_target_audio(self, audio_file) -> Optional[torch.Tensor]:
783
  """Process target audio"""
784
  if audio_file is None:
@@ -837,13 +423,13 @@ class AceStepHandler:
837
  detokenizer = self.model.detokenizer
838
 
839
  num_quantizers = getattr(quantizer, "num_quantizers", 1)
840
- indices = torch.tensor(code_ids, device=self.device, dtype=torch.long).unsqueeze(0) # [1, T_5Hz]
 
 
 
841
 
842
- # Expand to include quantizer dimension: [1, T_5Hz, num_quantizers]
843
- if indices.dim() == 2:
844
- indices = indices.unsqueeze(-1).expand(-1, -1, num_quantizers)
845
- print(indices.shape)
846
- # Get quantized representation from indices: [1, T_5Hz, dim]
847
  quantized = quantizer.get_output_from_indices(indices)
848
  if quantized.dtype != self.dtype:
849
  quantized = quantized.to(self.dtype)
 
61
  # Sample rate
62
  self.sample_rate = 48000
63
 
 
 
 
 
 
 
64
  # Reward model (temporarily disabled)
65
  self.reward_model = None
66
 
 
 
 
 
67
  # Batch size
68
  self.batch_size = 2
69
 
 
110
  models.sort()
111
  return models
112
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
113
  def is_flash_attention_available(self) -> bool:
114
  """Check if flash attention is available on the system"""
115
  try:
 
123
  project_root: str,
124
  config_path: str,
125
  device: str = "auto",
 
 
 
126
  use_flash_attention: bool = False,
127
  compile_model: bool = False,
128
  offload_to_cpu: bool = False,
 
130
  quantization: Optional[str] = None,
131
  ) -> Tuple[str, bool]:
132
  """
133
+ Initialize DiT model service
134
 
135
  Args:
136
  project_root: Project root path (may be checkpoints directory, will be handled automatically)
137
  config_path: Model config directory name (e.g., "acestep-v15-turbo")
138
  device: Device type
 
 
 
139
  use_flash_attention: Whether to use flash attention (requires flash_attn package)
140
  compile_model: Whether to use torch.compile to optimize the model
141
  offload_to_cpu: Whether to offload models to CPU when not in use
 
277
  self.text_encoder.eval()
278
  else:
279
  raise FileNotFoundError(f"Text encoder not found at {text_encoder_path}")
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
280
 
281
  # Determine actual attention implementation used
282
  actual_attn = getattr(self.config, "_attn_implementation", "eager")
283
 
284
+ status_msg = f"✅ Model initialized successfully on {device}\n"
285
  status_msg += f"Main model: {acestep_v15_checkpoint_path}\n"
286
  status_msg += f"VAE: {vae_checkpoint_path}\n"
287
  status_msg += f"Text encoder: {text_encoder_path}\n"
 
 
 
 
 
 
288
  status_msg += f"Dtype: {self.dtype}\n"
289
  status_msg += f"Attention: {actual_attn}\n"
290
  status_msg += f"Compiled: {compile_model}\n"
 
303
  Context manager to load a model to GPU and offload it back to CPU after use.
304
 
305
  Args:
306
+ model_name: Name of the model to load ("text_encoder", "vae", "model")
307
  """
308
  if not self.offload_to_cpu:
309
  yield
 
328
  yield
329
  return
330
 
 
 
 
 
 
331
  model = getattr(self, model_name, None)
332
  if model is None:
333
  yield
 
339
  if model_name == "vae":
340
  vae_dtype = torch.bfloat16 if self.device in ["cuda", "xpu"] else self.dtype
341
  model.to(self.device).to(vae_dtype)
 
 
 
 
342
  else:
343
  model.to(self.device).to(self.dtype)
344
 
 
355
  # Offload to CPU
356
  logger.info(f"Offloading {model_name} to CPU")
357
  start_time = time.time()
358
+ model.to("cpu")
 
 
 
359
 
360
  if model_name == "model" and hasattr(self, "silence_latent"):
361
  self.silence_latent = self.silence_latent.to("cpu")
 
365
  self.current_offload_cost += offload_time
366
  logger.info(f"Offloaded {model_name} to CPU in {offload_time:.4f}s")
367
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
368
  def process_target_audio(self, audio_file) -> Optional[torch.Tensor]:
369
  """Process target audio"""
370
  if audio_file is None:
 
423
  detokenizer = self.model.detokenizer
424
 
425
  num_quantizers = getattr(quantizer, "num_quantizers", 1)
426
+ # Create indices tensor: [T_5Hz]
427
+ indices = torch.tensor(code_ids, device=self.device, dtype=torch.long) # [T_5Hz]
428
+
429
+ indices = indices.unsqueeze(0).unsqueeze(-1) # [1, T_5Hz, 1]
430
 
431
+ # Get quantized representation from indices
432
+ # The quantizer expects [batch, T_5Hz] format and handles quantizer dimension internally
 
 
 
433
  quantized = quantizer.get_output_from_indices(indices)
434
  if quantized.dtype != self.dtype:
435
  quantized = quantized.to(self.dtype)
acestep/llm_inference.py ADDED
@@ -0,0 +1,482 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ 5Hz LM (Language Model) Handler
3
+ Handles all LM-related operations including initialization and generation
4
+ """
5
+ import os
6
+ import traceback
7
+ import time
8
+ from typing import Optional, Dict, Any, Tuple, List
9
+ from contextlib import contextmanager
10
+
11
+ import torch
12
+ from tqdm import tqdm
13
+ from loguru import logger
14
+ from transformers import AutoTokenizer, AutoModelForCausalLM
15
+ from transformers.generation.streamers import BaseStreamer
16
+
17
+
18
+ class LLMHandler:
19
+ """5Hz LM Handler for audio code generation"""
20
+
21
+ def __init__(self):
22
+ """Initialize LLMHandler with default values"""
23
+ self.llm = None
24
+ self.llm_tokenizer = None
25
+ self.llm_initialized = False
26
+ self.llm_backend = None
27
+ self.max_model_len = 4096
28
+ self.device = "cpu"
29
+ self.dtype = torch.float32
30
+ self.offload_to_cpu = False
31
+
32
+ def get_available_5hz_lm_models(self) -> List[str]:
33
+ """Scan and return all model directory names starting with 'acestep-5Hz-lm-'"""
34
+ current_file = os.path.abspath(__file__)
35
+ project_root = os.path.dirname(os.path.dirname(current_file))
36
+ checkpoint_dir = os.path.join(project_root, "checkpoints")
37
+
38
+ models = []
39
+ if os.path.exists(checkpoint_dir):
40
+ for item in os.listdir(checkpoint_dir):
41
+ item_path = os.path.join(checkpoint_dir, item)
42
+ if os.path.isdir(item_path) and item.startswith("acestep-5Hz-lm-"):
43
+ models.append(item)
44
+
45
+ models.sort()
46
+ return models
47
+
48
+ def get_gpu_memory_utilization(self, minimal_gpu: float = 8, min_ratio: float = 0.2, max_ratio: float = 0.9) -> Tuple[float, bool]:
49
+ """Get GPU memory utilization ratio"""
50
+ try:
51
+ device = torch.device("cuda:0")
52
+ total_gpu_mem_bytes = torch.cuda.get_device_properties(device).total_memory
53
+ allocated_mem_bytes = torch.cuda.memory_allocated(device)
54
+ reserved_mem_bytes = torch.cuda.memory_reserved(device)
55
+
56
+ total_gpu = total_gpu_mem_bytes / 1024**3
57
+ low_gpu_memory_mode = False
58
+ if total_gpu < minimal_gpu:
59
+ minimal_gpu = 0.5 * total_gpu
60
+ low_gpu_memory_mode = True
61
+ allocated_gpu = allocated_mem_bytes / 1024**3
62
+ reserved_gpu = reserved_mem_bytes / 1024**3
63
+ available_gpu = total_gpu - reserved_gpu
64
+
65
+ if available_gpu >= minimal_gpu:
66
+ ratio = min(max_ratio, max(min_ratio, minimal_gpu / total_gpu))
67
+ else:
68
+ ratio = min(max_ratio, max(min_ratio, (available_gpu * 0.8) / total_gpu))
69
+
70
+ return ratio, low_gpu_memory_mode
71
+ except Exception as e:
72
+ return 0.9, False
73
+
74
+ def initialize(
75
+ self,
76
+ checkpoint_dir: str,
77
+ lm_model_path: str,
78
+ backend: str = "vllm",
79
+ device: str = "auto",
80
+ offload_to_cpu: bool = False,
81
+ dtype: Optional[torch.dtype] = None,
82
+ ) -> Tuple[str, bool]:
83
+ """
84
+ Initialize 5Hz LM model
85
+
86
+ Args:
87
+ checkpoint_dir: Checkpoint directory path
88
+ lm_model_path: LM model path (relative to checkpoint_dir)
89
+ backend: Backend type ("vllm" or "pt")
90
+ device: Device type ("auto", "cuda", or "cpu")
91
+ offload_to_cpu: Whether to offload to CPU
92
+ dtype: Data type (if None, auto-detect based on device)
93
+
94
+ Returns:
95
+ (status_message, success)
96
+ """
97
+ try:
98
+ if device == "auto":
99
+ device = "cuda" if torch.cuda.is_available() else "cpu"
100
+
101
+ self.device = device
102
+ self.offload_to_cpu = offload_to_cpu
103
+ # Set dtype based on device: bfloat16 for cuda, float32 for cpu
104
+ if dtype is None:
105
+ self.dtype = torch.bfloat16 if device in ["cuda", "xpu"] else torch.float32
106
+ else:
107
+ self.dtype = dtype
108
+
109
+ full_lm_model_path = os.path.join(checkpoint_dir, lm_model_path)
110
+ if not os.path.exists(full_lm_model_path):
111
+ return f"❌ 5Hz LM model not found at {full_lm_model_path}", False
112
+
113
+ logger.info("loading 5Hz LM tokenizer...")
114
+ start_time = time.time()
115
+ llm_tokenizer = AutoTokenizer.from_pretrained(full_lm_model_path, use_fast=True)
116
+ logger.info(f"5Hz LM tokenizer loaded successfully in {time.time() - start_time:.2f} seconds")
117
+ self.llm_tokenizer = llm_tokenizer
118
+
119
+ # Initialize based on user-selected backend
120
+ if backend == "vllm":
121
+ # Try to initialize with vllm
122
+ status_msg = self._initialize_5hz_lm_vllm(full_lm_model_path)
123
+ logger.info(f"5Hz LM status message: {status_msg}")
124
+ # Check if initialization failed (status_msg starts with ❌)
125
+ if status_msg.startswith("❌"):
126
+ # vllm initialization failed, fallback to PyTorch
127
+ if not self.llm_initialized:
128
+ logger.warning("vllm initialization failed, falling back to PyTorch backend")
129
+ try:
130
+ self.llm = AutoModelForCausalLM.from_pretrained(full_lm_model_path, trust_remote_code=True)
131
+ if not self.offload_to_cpu:
132
+ self.llm = self.llm.to(device).to(self.dtype)
133
+ else:
134
+ self.llm = self.llm.to("cpu").to(self.dtype)
135
+ self.llm.eval()
136
+ self.llm_backend = "pt"
137
+ self.llm_initialized = True
138
+ logger.info("5Hz LM initialized successfully using PyTorch backend (fallback)")
139
+ status_msg = f"✅ 5Hz LM initialized successfully (PyTorch fallback)\nModel: {full_lm_model_path}\nBackend: PyTorch"
140
+ except Exception as e:
141
+ return f"❌ Error initializing 5Hz LM: {str(e)}\n\nTraceback:\n{traceback.format_exc()}", False
142
+ # If vllm initialization succeeded, self.llm_initialized should already be True
143
+ else:
144
+ # Use PyTorch backend (pt)
145
+ try:
146
+ self.llm = AutoModelForCausalLM.from_pretrained(full_lm_model_path, trust_remote_code=True)
147
+ if not self.offload_to_cpu:
148
+ self.llm = self.llm.to(device).to(self.dtype)
149
+ else:
150
+ self.llm = self.llm.to("cpu").to(self.dtype)
151
+ self.llm.eval()
152
+ self.llm_backend = "pt"
153
+ self.llm_initialized = True
154
+ logger.info(f"5Hz LM initialized successfully using PyTorch backend on {device}")
155
+ status_msg = f"✅ 5Hz LM initialized successfully\nModel: {full_lm_model_path}\nBackend: PyTorch\nDevice: {device}"
156
+ except Exception as e:
157
+ return f"❌ Error initializing 5Hz LM: {str(e)}\n\nTraceback:\n{traceback.format_exc()}", False
158
+
159
+ return status_msg, True
160
+
161
+ except Exception as e:
162
+ error_msg = f"❌ Error initializing 5Hz LM: {str(e)}\n\nTraceback:\n{traceback.format_exc()}"
163
+ return error_msg, False
164
+
165
+ def _initialize_5hz_lm_vllm(self, model_path: str) -> str:
166
+ """Initialize 5Hz LM model using vllm backend"""
167
+ if not torch.cuda.is_available():
168
+ self.llm_initialized = False
169
+ logger.error("CUDA is not available. Please check your GPU setup.")
170
+ return "❌ CUDA is not available. Please check your GPU setup."
171
+ try:
172
+ from nanovllm import LLM, SamplingParams
173
+ except ImportError:
174
+ self.llm_initialized = False
175
+ logger.error("nano-vllm is not installed. Please install it using 'cd acestep/third_parts/nano-vllm && pip install .")
176
+ return "❌ nano-vllm is not installed. Please install it using 'cd acestep/third_parts/nano-vllm && pip install ."
177
+
178
+ try:
179
+ current_device = torch.cuda.current_device()
180
+ device_name = torch.cuda.get_device_name(current_device)
181
+
182
+ torch.cuda.empty_cache()
183
+ gpu_memory_utilization, low_gpu_memory_mode = self.get_gpu_memory_utilization(
184
+ minimal_gpu=8,
185
+ min_ratio=0.2,
186
+ max_ratio=0.9
187
+ )
188
+ if low_gpu_memory_mode:
189
+ self.max_model_len = 2048
190
+ else:
191
+ self.max_model_len = 4096
192
+
193
+ logger.info(f"Initializing 5Hz LM with model: {model_path}, enforce_eager: False, tensor_parallel_size: 1, max_model_len: {self.max_model_len}, gpu_memory_utilization: {gpu_memory_utilization}")
194
+ start_time = time.time()
195
+ self.llm = LLM(
196
+ model=model_path,
197
+ enforce_eager=False,
198
+ tensor_parallel_size=1,
199
+ max_model_len=self.max_model_len,
200
+ gpu_memory_utilization=gpu_memory_utilization,
201
+ tokenizer=self.llm_tokenizer,
202
+ )
203
+ logger.info(f"5Hz LM initialized successfully in {time.time() - start_time:.2f} seconds")
204
+ self.llm_initialized = True
205
+ self.llm_backend = "vllm"
206
+ return f"✅ 5Hz LM initialized successfully\nModel: {model_path}\nDevice: {device_name}\nGPU Memory Utilization: {gpu_memory_utilization:.2f}"
207
+ except Exception as e:
208
+ self.llm_initialized = False
209
+ error_msg = f"❌ Error initializing 5Hz LM: {str(e)}\n\nTraceback:\n{traceback.format_exc()}"
210
+ return error_msg
211
+
212
+ def generate_with_5hz_lm_vllm(self, caption: str, lyrics: str, temperature: float = 0.6, cfg_scale: float = 1.0, negative_prompt: str = "NO USER INPUT") -> Tuple[Dict[str, Any], str, str]:
213
+ """Generate metadata and audio codes using 5Hz LM with vllm backend"""
214
+ try:
215
+ from nanovllm import SamplingParams
216
+
217
+ prompt = f"# Caption\n{caption}\n\n# Lyric\n{lyrics}\n"
218
+
219
+ formatted_prompt = self.llm_tokenizer.apply_chat_template(
220
+ [
221
+ {"role": "system", "content": "# Instruction\nGenerate audio semantic tokens based on the given conditions:\n\n"},
222
+ {"role": "user", "content": prompt}
223
+ ],
224
+ tokenize=False,
225
+ add_generation_prompt=True,
226
+ )
227
+ logger.debug(f"[debug] formatted_prompt: {formatted_prompt}")
228
+
229
+ sampling_params = SamplingParams(max_tokens=self.max_model_len-64, temperature=temperature, cfg_scale=cfg_scale)
230
+ # Use CFG if cfg_scale > 1.0
231
+ if cfg_scale > 1.0:
232
+ # Build unconditional prompt (user input replaced with "NO USER INPUT")
233
+ formatted_unconditional_prompt = self.llm_tokenizer.apply_chat_template(
234
+ [
235
+ {"role": "system", "content": "# Instruction\nGenerate audio semantic tokens based on the given conditions:\n\n"},
236
+ {"role": "user", "content": negative_prompt}
237
+ ],
238
+ tokenize=False,
239
+ add_generation_prompt=True,
240
+ )
241
+ outputs = self.llm.generate(
242
+ [formatted_prompt],
243
+ sampling_params,
244
+ unconditional_prompts=[formatted_unconditional_prompt]
245
+ )
246
+ else:
247
+ outputs = self.llm.generate([formatted_prompt], sampling_params)
248
+ # Extract text from output - handle different output formats
249
+ if isinstance(outputs, list) and len(outputs) > 0:
250
+ if hasattr(outputs[0], 'outputs') and len(outputs[0].outputs) > 0:
251
+ output_text = outputs[0].outputs[0].text
252
+ elif hasattr(outputs[0], 'text'):
253
+ output_text = outputs[0].text
254
+ elif isinstance(outputs[0], dict) and 'text' in outputs[0]:
255
+ output_text = outputs[0]['text']
256
+ else:
257
+ output_text = str(outputs[0])
258
+ else:
259
+ output_text = str(outputs)
260
+ metadata, audio_codes = self.parse_lm_output(output_text)
261
+ print(f"[debug]output_text: {output_text}")
262
+ codes_count = len(audio_codes.split('<|audio_code_')) - 1 if audio_codes else 0
263
+ return metadata, audio_codes, f"✅ Generated successfully\nOutput length: {len(output_text)} chars\nCodes count: {codes_count}"
264
+
265
+ except Exception as e:
266
+ error_msg = f"❌ Error generating with 5Hz LM: {str(e)}\n\nTraceback:\n{traceback.format_exc()}"
267
+ return {}, "", error_msg
268
+
269
    def generate_with_5hz_lm_pt(self, caption: str, lyrics: str, temperature: float = 0.6) -> Tuple[Dict[str, Any], str, str]:
        """Generate metadata and audio codes using the 5Hz LM (PyTorch backend).

        Builds a chat-formatted prompt from caption and lyrics, runs
        `self.llm.generate` with a tqdm progress streamer, then parses the
        decoded continuation via `parse_lm_output`.

        Args:
            caption: Song caption used as the conditioning prompt.
            lyrics: Lyrics text appended to the prompt.
            temperature: Sampling temperature; 0 disables sampling (greedy).

        Returns:
            (metadata, audio_codes, status_message); on any exception returns
            ({}, "", "❌ ..." error message with traceback).
        """
        try:
            prompt = f"# Caption\n{caption}\n\n# Lyric\n{lyrics}\n"

            formatted_prompt = self.llm_tokenizer.apply_chat_template(
                [
                    {"role": "system", "content": "# Instruction\nGenerate audio semantic tokens based on the given conditions:\n\n"},
                    {"role": "user", "content": prompt}
                ],
                tokenize=False,
                add_generation_prompt=True,
            )

            # Tokenize the prompt
            inputs = self.llm_tokenizer(
                formatted_prompt,
                return_tensors="pt",
                padding=False,
                truncation=True,
            )

            # Generate with the model (context manager handles CPU<->GPU offload)
            with self._load_model_context():
                inputs = {k: v.to(self.device) for k, v in inputs.items()}

                # Get max_new_tokens from model config or use a default
                # NOTE(review): capping by max_model_len ignores the prompt
                # length, so prompt + continuation may exceed the context
                # window — confirm this is intended.
                max_new_tokens = getattr(self.llm.config, 'max_new_tokens', 4096)
                if hasattr(self, 'max_model_len'):
                    max_new_tokens = min(max_new_tokens, self.max_model_len)

                # Define custom streamer for tqdm progress reporting
                class TqdmTokenStreamer(BaseStreamer):
                    def __init__(self, total):
                        self.pbar = tqdm(total=total, desc="Generating 5Hz tokens", unit="token", maxinterval=1)

                    def put(self, value):
                        # value is tensor of token ids; count all elements
                        if value.dim() > 1:
                            num_tokens = value.numel()
                        else:
                            num_tokens = len(value)
                        self.pbar.update(num_tokens)

                    def end(self):
                        self.pbar.close()

                streamer = TqdmTokenStreamer(total=max_new_tokens)

                with torch.no_grad():
                    outputs = self.llm.generate(
                        **inputs,
                        max_new_tokens=max_new_tokens,
                        temperature=temperature,
                        do_sample=True if temperature > 0 else False,
                        pad_token_id=self.llm_tokenizer.pad_token_id or self.llm_tokenizer.eos_token_id,
                        streamer=streamer,
                    )

            # Decode the generated tokens
            # Only decode the newly generated tokens (skip the input prompt);
            # special tokens are kept because audio codes are special tokens.
            generated_ids = outputs[0][inputs['input_ids'].shape[1]:]
            output_text = self.llm_tokenizer.decode(generated_ids, skip_special_tokens=False)

            metadata, audio_codes = self.parse_lm_output(output_text)
            codes_count = len(audio_codes.split('<|audio_code_')) - 1 if audio_codes else 0
            return metadata, audio_codes, f"✅ Generated successfully\nOutput length: {len(output_text)} chars\nCodes count: {codes_count}"

        except Exception as e:
            error_msg = f"❌ Error generating with 5Hz LM: {str(e)}\n\nTraceback:\n{traceback.format_exc()}"
            return {}, "", error_msg
340
+
341
+ def generate_with_5hz_lm(self, caption: str, lyrics: str, temperature: float = 0.6, cfg_scale: float = 1.0, negative_prompt: str = "NO USER INPUT") -> Tuple[Dict[str, Any], str, str]:
342
+ """Generate metadata and audio codes using 5Hz LM"""
343
+ # Check if 5Hz LM is initialized
344
+ if not hasattr(self, 'llm_initialized') or not self.llm_initialized:
345
+ debug_info = f"llm_initialized={getattr(self, 'llm_initialized', 'not set')}, "
346
+ debug_info += f"has_llm={hasattr(self, 'llm')}, "
347
+ debug_info += f"llm_is_none={getattr(self, 'llm', None) is None}, "
348
+ debug_info += f"llm_backend={getattr(self, 'llm_backend', 'not set')}"
349
+ return {}, "", f"❌ 5Hz LM not initialized. Please initialize it first. Debug: {debug_info}"
350
+
351
+ if not hasattr(self, 'llm') or self.llm is None:
352
+ return {}, "", "❌ 5Hz LM model not loaded. Please initialize it first."
353
+
354
+ if not hasattr(self, 'llm_backend'):
355
+ return {}, "", "❌ 5Hz LM backend not set. Please initialize it first."
356
+
357
+ if self.llm_backend == "vllm":
358
+ return self.generate_with_5hz_lm_vllm(caption, lyrics, temperature, cfg_scale, negative_prompt)
359
+ else:
360
+ return self.generate_with_5hz_lm_pt(caption, lyrics, temperature)
361
+
362
+ def parse_lm_output(self, output_text: str) -> Tuple[Dict[str, Any], str]:
363
+ """
364
+ Parse LM output to extract metadata and audio codes.
365
+
366
+ Expected format:
367
+ <think>
368
+ bpm: 73
369
+ duration: 273
370
+ genres: Chinese folk
371
+ keyscale: G major
372
+ timesignature: 4
373
+ </think>
374
+
375
+ <|audio_code_56535|><|audio_code_62918|>...
376
+
377
+ Returns:
378
+ Tuple of (metadata_dict, audio_codes_string)
379
+ """
380
+ debug_output_text = output_text.split("</think>")[0]
381
+ logger.debug(f"Debug output text: {debug_output_text}")
382
+ metadata = {}
383
+ audio_codes = ""
384
+
385
+ import re
386
+
387
+ # Extract audio codes - find all <|audio_code_XXX|> patterns
388
+ code_pattern = r'<\|audio_code_\d+\|>'
389
+ code_matches = re.findall(code_pattern, output_text)
390
+ if code_matches:
391
+ audio_codes = "".join(code_matches)
392
+
393
+ # Extract metadata from reasoning section
394
+ # Try different reasoning tag patterns
395
+ reasoning_patterns = [
396
+ r'<think>(.*?)</think>',
397
+ r'<think>(.*?)</think>',
398
+ r'<reasoning>(.*?)</reasoning>',
399
+ ]
400
+
401
+ reasoning_text = None
402
+ for pattern in reasoning_patterns:
403
+ match = re.search(pattern, output_text, re.DOTALL)
404
+ if match:
405
+ reasoning_text = match.group(1).strip()
406
+ break
407
+
408
+ # If no reasoning tags found, try to parse metadata from the beginning of output
409
+ if not reasoning_text:
410
+ # Look for metadata lines before audio codes
411
+ lines_before_codes = output_text.split('<|audio_code_')[0] if '<|audio_code_' in output_text else output_text
412
+ reasoning_text = lines_before_codes.strip()
413
+
414
+ # Parse metadata fields
415
+ if reasoning_text:
416
+ for line in reasoning_text.split('\n'):
417
+ line = line.strip()
418
+ if ':' in line and not line.startswith('<'):
419
+ parts = line.split(':', 1)
420
+ if len(parts) == 2:
421
+ key = parts[0].strip().lower()
422
+ value = parts[1].strip()
423
+
424
+ if key == 'bpm':
425
+ try:
426
+ metadata['bpm'] = int(value)
427
+ except:
428
+ metadata['bpm'] = value
429
+ elif key == 'duration':
430
+ try:
431
+ metadata['duration'] = int(value)
432
+ except:
433
+ metadata['duration'] = value
434
+ elif key == 'genres':
435
+ metadata['genres'] = value
436
+ elif key == 'keyscale':
437
+ metadata['keyscale'] = value
438
+ elif key == 'timesignature':
439
+ metadata['timesignature'] = value
440
+
441
+ return metadata, audio_codes
442
+
443
+ @contextmanager
444
+ def _load_model_context(self):
445
+ """
446
+ Context manager to load a model to GPU and offload it back to CPU after use.
447
+ Only used for PyTorch backend when offload_to_cpu is True.
448
+ """
449
+ if not self.offload_to_cpu:
450
+ yield
451
+ return
452
+
453
+ # If using nanovllm, do not offload (it stays on GPU)
454
+ if self.llm_backend == "vllm":
455
+ yield
456
+ return
457
+
458
+ model = self.llm
459
+ if model is None:
460
+ yield
461
+ return
462
+
463
+ # Load to GPU
464
+ logger.info(f"Loading LLM to {self.device}")
465
+ start_time = time.time()
466
+ if hasattr(model, "to"):
467
+ model.to(self.device).to(self.dtype)
468
+ load_time = time.time() - start_time
469
+ logger.info(f"Loaded LLM to {self.device} in {load_time:.4f}s")
470
+
471
+ try:
472
+ yield
473
+ finally:
474
+ # Offload to CPU
475
+ logger.info(f"Offloading LLM to CPU")
476
+ start_time = time.time()
477
+ if hasattr(model, "to"):
478
+ model.to("cpu")
479
+ torch.cuda.empty_cache()
480
+ offload_time = time.time() - start_time
481
+ logger.info(f"Offloaded LLM to CPU in {offload_time:.4f}s")
482
+
requirements.txt CHANGED
@@ -5,4 +5,5 @@ gradio
5
  soundfile
6
  loguru
7
  einops
8
- accelerator
 
 
5
  soundfile
6
  loguru
7
  einops
8
+ accelerator
9
+ vector-quantize-pytorch