ChuxiJ commited on
Commit
4670365
·
1 Parent(s): fcbd6fb

fix llm gen for pt

Browse files
Files changed (4) hide show
  1. .gitignore +2 -1
  2. acestep/gradio_ui.py +143 -61
  3. acestep/handler.py +115 -2
  4. acestep/llm_inference.py +441 -175
.gitignore CHANGED
@@ -214,4 +214,5 @@ checkpoints/
214
  playground.ipynb
215
  .history/
216
  upload_checkpoints.sh
217
- checkpoints.7z
 
 
214
  playground.ipynb
215
  .history/
216
  upload_checkpoints.sh
217
+ checkpoints.7z
218
+ README_old.md
acestep/gradio_ui.py CHANGED
@@ -36,6 +36,20 @@ def create_gradio_interface(dit_handler, llm_handler, dataset_handler, init_para
36
  border-radius: 5px;
37
  margin: 10px 0;
38
  }
 
 
 
 
 
 
 
 
 
 
 
 
 
 
39
  """
40
  ) as demo:
41
 
@@ -320,43 +334,46 @@ def create_generation_section(dit_handler, llm_handler, init_params=None) -> dic
320
  )
321
 
322
  # Audio uploads
323
- with gr.Accordion("🎵 Audio Uploads", open=False):
324
- with gr.Row():
 
325
  with gr.Column(scale=2):
326
  reference_audio = gr.Audio(
327
  label="Reference Audio (optional)",
328
  type="filepath",
329
  )
330
- with gr.Column(scale=8):
331
  src_audio = gr.Audio(
332
  label="Source Audio (optional)",
333
  type="filepath",
334
  )
 
 
 
 
 
 
335
 
336
- audio_code_string = gr.Textbox(
337
- label="Audio Codes (optional)",
338
- placeholder="<|audio_code_10695|><|audio_code_54246|>...",
339
- lines=4,
340
- visible=False,
341
- info="Paste precomputed audio code tokens"
342
- )
343
-
344
  # Audio Codes for text2music
345
  with gr.Accordion("🎼 Audio Codes (for text2music)", open=True, visible=True) as text2music_audio_codes_group:
346
- text2music_audio_code_string = gr.Textbox(
347
- label="Audio Codes",
348
- placeholder="<|audio_code_10695|><|audio_code_54246|>...",
349
- lines=6,
350
- info="Paste precomputed audio code tokens for text2music generation"
351
- )
 
 
 
 
 
 
 
 
 
 
352
 
353
- # 5Hz LM
354
- with gr.Row(visible=True) as use_5hz_lm_row:
355
- use_5hz_lm_btn = gr.Button(
356
- "Generate LM Hints",
357
- variant="secondary",
358
- size="lg",
359
- )
360
  lm_temperature = gr.Slider(
361
  label="Temperature",
362
  minimum=0.0,
@@ -364,7 +381,7 @@ def create_generation_section(dit_handler, llm_handler, init_params=None) -> dic
364
  value=0.85,
365
  step=0.1,
366
  scale=1,
367
- info="Temperature for 5Hz LM sampling (higher = more random, lower = more deterministic)"
368
  )
369
  lm_cfg_scale = gr.Slider(
370
  label="CFG Scale",
@@ -373,10 +390,8 @@ def create_generation_section(dit_handler, llm_handler, init_params=None) -> dic
373
  value=2.0,
374
  step=0.1,
375
  scale=1,
376
- info="Classifier-Free Guidance scale for 5Hz LM (1.0 = no CFG, higher = stronger guidance)"
377
  )
378
-
379
- with gr.Row():
380
  lm_top_k = gr.Slider(
381
  label="Top-K",
382
  minimum=0,
@@ -384,7 +399,7 @@ def create_generation_section(dit_handler, llm_handler, init_params=None) -> dic
384
  value=0,
385
  step=1,
386
  scale=1,
387
- info="Top-K sampling: consider only top K tokens (0 = disabled)"
388
  )
389
  lm_top_p = gr.Slider(
390
  label="Top-P",
@@ -393,7 +408,7 @@ def create_generation_section(dit_handler, llm_handler, init_params=None) -> dic
393
  value=0.9,
394
  step=0.01,
395
  scale=1,
396
- info="Top-P (nucleus) sampling: cumulative probability threshold (1.0 = disabled)"
397
  )
398
  lm_repetition_penalty = gr.Slider(
399
  label="Repetition Penalty",
@@ -402,20 +417,10 @@ def create_generation_section(dit_handler, llm_handler, init_params=None) -> dic
402
  value=1.0,
403
  step=0.01,
404
  scale=1,
405
- info="Repetition penalty: >1.0 reduces repetition, <1.0 increases it (1.0 = no penalty). For audio generation, use 1.0 or very small values (1.01-1.05) as audio tokens naturally repeat.",
406
  visible=False,
407
  )
408
 
409
- # Negative prompt for CFG (only visible when LM initialized and cfg_scale > 1)
410
- lm_negative_prompt = gr.Textbox(
411
- label="Negative Prompt",
412
- value="NO USER INPUT",
413
- placeholder="Enter negative prompt for CFG (default: NO USER INPUT)",
414
- visible=True,
415
- info="Negative prompt used for Classifier-Free Guidance when CFG Scale > 1.0",
416
- lines=2
417
- )
418
-
419
  # Repainting controls
420
  with gr.Group(visible=False) as repainting_group:
421
  gr.HTML("<h5>🎨 Repainting Controls (seconds) </h5>")
@@ -445,12 +450,24 @@ def create_generation_section(dit_handler, llm_handler, init_params=None) -> dic
445
 
446
  # Music Caption
447
  with gr.Accordion("📝 Music Caption", open=True):
448
- captions = gr.Textbox(
449
- label="Music Caption (optional)",
450
- placeholder="A peaceful acoustic guitar melody with soft vocals...",
451
- lines=3,
452
- info="Describe the style, genre, instruments, and mood"
453
- )
 
 
 
 
 
 
 
 
 
 
 
 
454
 
455
  # Lyrics
456
  with gr.Accordion("📝 Lyrics", open=True):
@@ -468,7 +485,8 @@ def create_generation_section(dit_handler, llm_handler, init_params=None) -> dic
468
  choices=["en", "zh", "ja", "ko", "es", "fr", "de"],
469
  value="en",
470
  label="Vocal Language (optional)",
471
- allow_custom_value=True
 
472
  )
473
  bpm = gr.Number(
474
  label="BPM (optional)",
@@ -477,15 +495,17 @@ def create_generation_section(dit_handler, llm_handler, init_params=None) -> dic
477
  info="leave empty for N/A"
478
  )
479
  key_scale = gr.Textbox(
480
- label="Key/Scale (optional)",
481
  placeholder="Leave empty for N/A",
482
  value="",
 
483
  )
484
  time_signature = gr.Dropdown(
485
  choices=["2", "3", "4", "N/A", ""],
486
  value="4",
487
  label="Time Signature (optional)",
488
- allow_custom_value=True
 
489
  )
490
  audio_duration = gr.Number(
491
  label="Audio Duration (seconds)",
@@ -497,7 +517,7 @@ def create_generation_section(dit_handler, llm_handler, init_params=None) -> dic
497
  )
498
  batch_size_input = gr.Number(
499
  label="Batch Size",
500
- value=1,
501
  minimum=1,
502
  maximum=8,
503
  step=1,
@@ -582,6 +602,7 @@ def create_generation_section(dit_handler, llm_handler, init_params=None) -> dic
582
  generate_btn = gr.Button("🎵 Generate Music", variant="primary", size="lg", interactive=generate_btn_interactive)
583
 
584
  return {
 
585
  "checkpoint_dropdown": checkpoint_dropdown,
586
  "refresh_btn": refresh_btn,
587
  "config_path": config_path,
@@ -598,9 +619,10 @@ def create_generation_section(dit_handler, llm_handler, init_params=None) -> dic
598
  "instruction_display_gen": instruction_display_gen,
599
  "track_name": track_name,
600
  "complete_track_classes": complete_track_classes,
 
601
  "reference_audio": reference_audio,
602
  "src_audio": src_audio,
603
- "audio_code_string": audio_code_string,
604
  "text2music_audio_code_string": text2music_audio_code_string,
605
  "text2music_audio_codes_group": text2music_audio_codes_group,
606
  "use_5hz_lm_row": use_5hz_lm_row,
@@ -650,12 +672,22 @@ def create_results_section(dit_handler) -> dict:
650
  type="filepath",
651
  interactive=False
652
  )
 
 
 
 
 
653
  with gr.Column():
654
  generated_audio_2 = gr.Audio(
655
  label="🎵 Generated Music (Sample 2)",
656
  type="filepath",
657
  interactive=False
658
  )
 
 
 
 
 
659
 
660
  with gr.Accordion("📁 Batch Results & Generation Details", open=False):
661
  generated_audio_batch = gr.File(
@@ -680,6 +712,8 @@ def create_results_section(dit_handler) -> dict:
680
  "status_output": status_output,
681
  "generated_audio_1": generated_audio_1,
682
  "generated_audio_2": generated_audio_2,
 
 
683
  "generated_audio_batch": generated_audio_batch,
684
  "generation_info": generation_info,
685
  "align_score_1": align_score_1,
@@ -768,7 +802,7 @@ def setup_event_handlers(demo, dit_handler, llm_handler, dataset_handler, datase
768
 
769
  # Service initialization
770
  def init_service_wrapper(checkpoint, config_path, device, init_llm, lm_model_path, backend, use_flash_attention, offload_to_cpu, offload_dit_to_cpu):
771
- """Wrapper for service initialization, returns status and button state"""
772
  # Initialize DiT handler
773
  status, enable = dit_handler.initialize_service(
774
  checkpoint, config_path, device,
@@ -799,7 +833,11 @@ def setup_event_handlers(demo, dit_handler, llm_handler, dataset_handler, datase
799
  # Don't fail the entire initialization if LM fails, but log it
800
  # Keep enable as is (DiT initialization result) even if LM fails
801
 
802
- return status, gr.update(interactive=enable)
 
 
 
 
803
 
804
  # Update negative prompt visibility based on "Initialize 5Hz LM" checkbox
805
  def update_negative_prompt_visibility(init_llm_checked):
@@ -855,7 +893,7 @@ def setup_event_handlers(demo, dit_handler, llm_handler, dataset_handler, datase
855
  generation_section["offload_to_cpu_checkbox"],
856
  generation_section["offload_dit_to_cpu_checkbox"],
857
  ],
858
- outputs=[generation_section["init_status"], generation_section["generate_btn"]]
859
  )
860
 
861
  # Generation with progress bar
@@ -992,6 +1030,18 @@ def setup_event_handlers(demo, dit_handler, llm_handler, dataset_handler, datase
992
  ]
993
  )
994
 
 
 
 
 
 
 
 
 
 
 
 
 
995
  # Update instruction and UI visibility based on task type
996
  def update_instruction_ui(
997
  task_type_value: str,
@@ -1020,8 +1070,6 @@ def setup_event_handlers(demo, dit_handler, llm_handler, dataset_handler, datase
1020
  else:
1021
  audio_cover_strength_label = "Audio Cover Strength"
1022
  audio_cover_strength_info = "Control how many denoising steps use cover mode"
1023
- # Show audio_code_string for cover
1024
- audio_code_visible = task_type_value == "cover"
1025
  # Show repainting controls for repaint and lego
1026
  repainting_visible = task_type_value in ["repaint", "lego"]
1027
  # Show use_5hz_lm, lm_temperature for text2music
@@ -1037,7 +1085,6 @@ def setup_event_handlers(demo, dit_handler, llm_handler, dataset_handler, datase
1037
  gr.update(visible=complete_visible), # complete_track_classes
1038
  gr.update(visible=audio_cover_strength_visible, label=audio_cover_strength_label, info=audio_cover_strength_info), # audio_cover_strength
1039
  gr.update(visible=repainting_visible), # repainting_group
1040
- gr.update(visible=audio_code_visible), # audio_code_string
1041
  gr.update(visible=use_5hz_lm_visible), # use_5hz_lm_row
1042
  gr.update(visible=text2music_audio_codes_visible), # text2music_audio_codes_group
1043
  )
@@ -1058,7 +1105,6 @@ def setup_event_handlers(demo, dit_handler, llm_handler, dataset_handler, datase
1058
  generation_section["complete_track_classes"],
1059
  generation_section["audio_cover_strength"],
1060
  generation_section["repainting_group"],
1061
- generation_section["audio_code_string"],
1062
  generation_section["use_5hz_lm_row"],
1063
  generation_section["text2music_audio_codes_group"],
1064
  ]
@@ -1080,7 +1126,6 @@ def setup_event_handlers(demo, dit_handler, llm_handler, dataset_handler, datase
1080
  generation_section["complete_track_classes"],
1081
  generation_section["audio_cover_strength"],
1082
  generation_section["repainting_group"],
1083
- generation_section["audio_code_string"],
1084
  generation_section["use_5hz_lm_row"],
1085
  generation_section["text2music_audio_codes_group"],
1086
  ]
@@ -1102,9 +1147,46 @@ def setup_event_handlers(demo, dit_handler, llm_handler, dataset_handler, datase
1102
  generation_section["complete_track_classes"],
1103
  generation_section["audio_cover_strength"],
1104
  generation_section["repainting_group"],
1105
- generation_section["audio_code_string"],
1106
  generation_section["use_5hz_lm_row"],
1107
  generation_section["text2music_audio_codes_group"],
1108
  ]
1109
  )
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1110
 
 
36
  border-radius: 5px;
37
  margin: 10px 0;
38
  }
39
+ .lm-hints-row {
40
+ align-items: stretch;
41
+ }
42
+ .lm-hints-col {
43
+ display: flex;
44
+ }
45
+ .lm-hints-col > div {
46
+ flex: 1;
47
+ display: flex;
48
+ }
49
+ .lm-hints-btn button {
50
+ height: 100%;
51
+ width: 100%;
52
+ }
53
  """
54
  ) as demo:
55
 
 
334
  )
335
 
336
  # Audio uploads
337
+ audio_uploads_accordion = gr.Accordion("🎵 Audio Uploads", open=False)
338
+ with audio_uploads_accordion:
339
+ with gr.Row(equal_height=True):
340
  with gr.Column(scale=2):
341
  reference_audio = gr.Audio(
342
  label="Reference Audio (optional)",
343
  type="filepath",
344
  )
345
+ with gr.Column(scale=7):
346
  src_audio = gr.Audio(
347
  label="Source Audio (optional)",
348
  type="filepath",
349
  )
350
+ with gr.Column(scale=1, min_width=80):
351
+ convert_src_to_codes_btn = gr.Button(
352
+ "Convert to Codes",
353
+ variant="secondary",
354
+ size="sm"
355
+ )
356
 
 
 
 
 
 
 
 
 
357
  # Audio Codes for text2music
358
  with gr.Accordion("🎼 Audio Codes (for text2music)", open=True, visible=True) as text2music_audio_codes_group:
359
+ with gr.Row(equal_height=True, elem_classes=["lm-hints-row"]):
360
+ with gr.Column(scale=9):
361
+ text2music_audio_code_string = gr.Textbox(
362
+ label="Audio Codes",
363
+ placeholder="<|audio_code_10695|><|audio_code_54246|>...",
364
+ lines=6,
365
+ info="Paste precomputed audio code tokens for text2music generation"
366
+ )
367
+ with gr.Column(scale=3, elem_classes=["lm-hints-col"]):
368
+ with gr.Row(equal_height=True, visible=True) as use_5hz_lm_row:
369
+ use_5hz_lm_btn = gr.Button(
370
+ "Generate LM Hints",
371
+ variant="secondary",
372
+ # size="lg",
373
+ elem_classes=["lm-hints-btn"],
374
+ )
375
 
376
+ with gr.Row(equal_height=True):
 
 
 
 
 
 
377
  lm_temperature = gr.Slider(
378
  label="Temperature",
379
  minimum=0.0,
 
381
  value=0.85,
382
  step=0.1,
383
  scale=1,
384
+ info="5Hz LM temperature (higher = random)"
385
  )
386
  lm_cfg_scale = gr.Slider(
387
  label="CFG Scale",
 
390
  value=2.0,
391
  step=0.1,
392
  scale=1,
393
+ info="5Hz LM CFG (1.0 = no CFG)"
394
  )
 
 
395
  lm_top_k = gr.Slider(
396
  label="Top-K",
397
  minimum=0,
 
399
  value=0,
400
  step=1,
401
  scale=1,
402
+ info="Top-K (0 = disabled)"
403
  )
404
  lm_top_p = gr.Slider(
405
  label="Top-P",
 
408
  value=0.9,
409
  step=0.01,
410
  scale=1,
411
+ info="Top-P (1.0 = disabled)"
412
  )
413
  lm_repetition_penalty = gr.Slider(
414
  label="Repetition Penalty",
 
417
  value=1.0,
418
  step=0.01,
419
  scale=1,
420
+ info="Repetition penalty: >1.0 reduces repetition, <1.0 increases it. Use 1.0 or very small values for audio tokens.",
421
  visible=False,
422
  )
423
 
 
 
 
 
 
 
 
 
 
 
424
  # Repainting controls
425
  with gr.Group(visible=False) as repainting_group:
426
  gr.HTML("<h5>🎨 Repainting Controls (seconds) </h5>")
 
450
 
451
  # Music Caption
452
  with gr.Accordion("📝 Music Caption", open=True):
453
+ with gr.Row(equal_height=True):
454
+ captions = gr.Textbox(
455
+ label="Music Caption (optional)",
456
+ placeholder="A peaceful acoustic guitar melody with soft vocals...",
457
+ lines=3,
458
+ info="Describe the style, genre, instruments, and mood",
459
+ scale=7,
460
+ )
461
+ # Negative prompt for CFG (only visible when LM initialized and cfg_scale > 1)
462
+ lm_negative_prompt = gr.Textbox(
463
+ label="Negative Prompt",
464
+ value="NO USER INPUT",
465
+ placeholder="Enter negative prompt for CFG (default: NO USER INPUT)",
466
+ visible=True,
467
+ info="Negative prompt (use when CFG Scale > 1.0)",
468
+ lines=3,
469
+ scale=5,
470
+ )
471
 
472
  # Lyrics
473
  with gr.Accordion("📝 Lyrics", open=True):
 
485
  choices=["en", "zh", "ja", "ko", "es", "fr", "de"],
486
  value="en",
487
  label="Vocal Language (optional)",
488
+ allow_custom_value=True,
489
+ info="use `unknown` for inst"
490
  )
491
  bpm = gr.Number(
492
  label="BPM (optional)",
 
495
  info="leave empty for N/A"
496
  )
497
  key_scale = gr.Textbox(
498
+ label="KeyScale (optional)",
499
  placeholder="Leave empty for N/A",
500
  value="",
501
+ info="A-G, #/♭, major/minor"
502
  )
503
  time_signature = gr.Dropdown(
504
  choices=["2", "3", "4", "N/A", ""],
505
  value="4",
506
  label="Time Signature (optional)",
507
+ allow_custom_value=True,
508
+ info="2/4, 3/4, 4/4..."
509
  )
510
  audio_duration = gr.Number(
511
  label="Audio Duration (seconds)",
 
517
  )
518
  batch_size_input = gr.Number(
519
  label="Batch Size",
520
+ value=2,
521
  minimum=1,
522
  maximum=8,
523
  step=1,
 
602
  generate_btn = gr.Button("🎵 Generate Music", variant="primary", size="lg", interactive=generate_btn_interactive)
603
 
604
  return {
605
+ "service_config_accordion": service_config_accordion,
606
  "checkpoint_dropdown": checkpoint_dropdown,
607
  "refresh_btn": refresh_btn,
608
  "config_path": config_path,
 
619
  "instruction_display_gen": instruction_display_gen,
620
  "track_name": track_name,
621
  "complete_track_classes": complete_track_classes,
622
+ "audio_uploads_accordion": audio_uploads_accordion,
623
  "reference_audio": reference_audio,
624
  "src_audio": src_audio,
625
+ "convert_src_to_codes_btn": convert_src_to_codes_btn,
626
  "text2music_audio_code_string": text2music_audio_code_string,
627
  "text2music_audio_codes_group": text2music_audio_codes_group,
628
  "use_5hz_lm_row": use_5hz_lm_row,
 
672
  type="filepath",
673
  interactive=False
674
  )
675
+ send_to_src_btn_1 = gr.Button(
676
+ "Send To Src Audio",
677
+ variant="secondary",
678
+ size="sm"
679
+ )
680
  with gr.Column():
681
  generated_audio_2 = gr.Audio(
682
  label="🎵 Generated Music (Sample 2)",
683
  type="filepath",
684
  interactive=False
685
  )
686
+ send_to_src_btn_2 = gr.Button(
687
+ "Send To Src Audio",
688
+ variant="secondary",
689
+ size="sm"
690
+ )
691
 
692
  with gr.Accordion("📁 Batch Results & Generation Details", open=False):
693
  generated_audio_batch = gr.File(
 
712
  "status_output": status_output,
713
  "generated_audio_1": generated_audio_1,
714
  "generated_audio_2": generated_audio_2,
715
+ "send_to_src_btn_1": send_to_src_btn_1,
716
+ "send_to_src_btn_2": send_to_src_btn_2,
717
  "generated_audio_batch": generated_audio_batch,
718
  "generation_info": generation_info,
719
  "align_score_1": align_score_1,
 
802
 
803
  # Service initialization
804
  def init_service_wrapper(checkpoint, config_path, device, init_llm, lm_model_path, backend, use_flash_attention, offload_to_cpu, offload_dit_to_cpu):
805
+ """Wrapper for service initialization, returns status, button state, and accordion state"""
806
  # Initialize DiT handler
807
  status, enable = dit_handler.initialize_service(
808
  checkpoint, config_path, device,
 
833
  # Don't fail the entire initialization if LM fails, but log it
834
  # Keep enable as is (DiT initialization result) even if LM fails
835
 
836
+ # Check if model is initialized - if so, collapse the accordion
837
+ is_model_initialized = dit_handler.model is not None
838
+ accordion_state = gr.update(open=not is_model_initialized)
839
+
840
+ return status, gr.update(interactive=enable), accordion_state
841
 
842
  # Update negative prompt visibility based on "Initialize 5Hz LM" checkbox
843
  def update_negative_prompt_visibility(init_llm_checked):
 
893
  generation_section["offload_to_cpu_checkbox"],
894
  generation_section["offload_dit_to_cpu_checkbox"],
895
  ],
896
+ outputs=[generation_section["init_status"], generation_section["generate_btn"], generation_section["service_config_accordion"]]
897
  )
898
 
899
  # Generation with progress bar
 
1030
  ]
1031
  )
1032
 
1033
+ # Convert src audio to codes
1034
+ def convert_src_audio_to_codes_wrapper(src_audio):
1035
+ """Wrapper for converting src audio to codes"""
1036
+ codes_string = dit_handler.convert_src_audio_to_codes(src_audio)
1037
+ return codes_string
1038
+
1039
+ generation_section["convert_src_to_codes_btn"].click(
1040
+ fn=convert_src_audio_to_codes_wrapper,
1041
+ inputs=[generation_section["src_audio"]],
1042
+ outputs=[generation_section["text2music_audio_code_string"]]
1043
+ )
1044
+
1045
  # Update instruction and UI visibility based on task type
1046
  def update_instruction_ui(
1047
  task_type_value: str,
 
1070
  else:
1071
  audio_cover_strength_label = "Audio Cover Strength"
1072
  audio_cover_strength_info = "Control how many denoising steps use cover mode"
 
 
1073
  # Show repainting controls for repaint and lego
1074
  repainting_visible = task_type_value in ["repaint", "lego"]
1075
  # Show use_5hz_lm, lm_temperature for text2music
 
1085
  gr.update(visible=complete_visible), # complete_track_classes
1086
  gr.update(visible=audio_cover_strength_visible, label=audio_cover_strength_label, info=audio_cover_strength_info), # audio_cover_strength
1087
  gr.update(visible=repainting_visible), # repainting_group
 
1088
  gr.update(visible=use_5hz_lm_visible), # use_5hz_lm_row
1089
  gr.update(visible=text2music_audio_codes_visible), # text2music_audio_codes_group
1090
  )
 
1105
  generation_section["complete_track_classes"],
1106
  generation_section["audio_cover_strength"],
1107
  generation_section["repainting_group"],
 
1108
  generation_section["use_5hz_lm_row"],
1109
  generation_section["text2music_audio_codes_group"],
1110
  ]
 
1126
  generation_section["complete_track_classes"],
1127
  generation_section["audio_cover_strength"],
1128
  generation_section["repainting_group"],
 
1129
  generation_section["use_5hz_lm_row"],
1130
  generation_section["text2music_audio_codes_group"],
1131
  ]
 
1147
  generation_section["complete_track_classes"],
1148
  generation_section["audio_cover_strength"],
1149
  generation_section["repainting_group"],
 
1150
  generation_section["use_5hz_lm_row"],
1151
  generation_section["text2music_audio_codes_group"],
1152
  ]
1153
  )
1154
+
1155
+ # Send generated audio to src_audio
1156
+ def send_audio_to_src(audio_file):
1157
+ """Send generated audio file to src_audio input"""
1158
+ if audio_file is None:
1159
+ return None
1160
+ return audio_file
1161
+
1162
+ results_section["send_to_src_btn_1"].click(
1163
+ fn=send_audio_to_src,
1164
+ inputs=[results_section["generated_audio_1"]],
1165
+ outputs=[generation_section["src_audio"]]
1166
+ )
1167
+
1168
+ results_section["send_to_src_btn_2"].click(
1169
+ fn=send_audio_to_src,
1170
+ inputs=[results_section["generated_audio_2"]],
1171
+ outputs=[generation_section["src_audio"]]
1172
+ )
1173
+
1174
+ # Auto-expand Audio Uploads accordion when audio is uploaded
1175
+ def update_audio_uploads_accordion(reference_audio, src_audio):
1176
+ """Update Audio Uploads accordion open state based on whether audio files are present"""
1177
+ has_audio = (reference_audio is not None) or (src_audio is not None)
1178
+ return gr.update(open=has_audio)
1179
+
1180
+ # Bind to both audio components' change events
1181
+ generation_section["reference_audio"].change(
1182
+ fn=update_audio_uploads_accordion,
1183
+ inputs=[generation_section["reference_audio"], generation_section["src_audio"]],
1184
+ outputs=[generation_section["audio_uploads_accordion"]]
1185
+ )
1186
+
1187
+ generation_section["src_audio"].change(
1188
+ fn=update_audio_uploads_accordion,
1189
+ inputs=[generation_section["reference_audio"], generation_section["src_audio"]],
1190
+ outputs=[generation_section["audio_uploads_accordion"]]
1191
+ )
1192
 
acestep/handler.py CHANGED
@@ -504,6 +504,49 @@ class AceStepHandler:
504
 
505
  return parsed_metas
506
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
507
  def _get_text_hidden_states(self, text_prompt: str) -> Tuple[torch.Tensor, torch.Tensor]:
508
  """Get text hidden states from text encoder."""
509
  if self.text_tokenizer is None or self.text_encoder is None:
@@ -765,6 +808,71 @@ class AceStepHandler:
765
  except Exception as e:
766
  logger.error(f"Error processing target audio: {e}")
767
  return None
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
768
 
769
  def prepare_batch_data(
770
  self,
@@ -1919,10 +2027,15 @@ class AceStepHandler:
1919
  refer_audios = [[torch.zeros(2, 30*self.sample_rate)] for _ in range(actual_batch_size)]
1920
 
1921
  # 2. Process source audio
 
1922
  processed_src_audio = None
1923
  if src_audio is not None:
1924
- logger.info("[generate_music] Processing source audio...")
1925
- processed_src_audio = self.process_src_audio(src_audio)
 
 
 
 
1926
 
1927
  # 3. Prepare batch data
1928
  captions_batch, instructions_batch, lyrics_batch, vocal_languages_batch, metas_batch = self.prepare_batch_data(
 
504
 
505
  return parsed_metas
506
 
507
+ def build_dit_inputs(
508
+ self,
509
+ task: str,
510
+ instruction: Optional[str],
511
+ caption: str,
512
+ lyrics: str,
513
+ metas: Optional[Union[str, Dict[str, Any]]] = None,
514
+ vocal_language: str = "en",
515
+ ) -> Tuple[str, str]:
516
+ """
517
+ Build text inputs for the caption and lyric branches used by DiT.
518
+
519
+ Args:
520
+ task: Task name (e.g., text2music, cover, repaint); kept for logging/future branching.
521
+ instruction: Instruction text; default fallback matches service_generate behavior.
522
+ caption: Caption string.
523
+ lyrics: Lyrics string.
524
+ metas: Metadata (str or dict); follows _parse_metas formatting.
525
+ vocal_language: Language code for lyrics section.
526
+
527
+ Returns:
528
+ (caption_input_text, lyrics_input_text)
529
+
530
+ Example:
531
+ caption_input, lyrics_input = handler.build_dit_inputs(
532
+ task="text2music",
533
+ instruction=None,
534
+ caption="A calm piano melody",
535
+ lyrics="la la la",
536
+ metas={"bpm": 90, "duration": 45},
537
+ vocal_language="en",
538
+ )
539
+ """
540
+ # Align instruction formatting with _prepare_batch
541
+ final_instruction = instruction or "Fill the audio semantic mask based on the given conditions:"
542
+ if not final_instruction.endswith(":"):
543
+ final_instruction = final_instruction + ":"
544
+
545
+ parsed_meta = self._parse_metas([metas])[0]
546
+ caption_input = SFT_GEN_PROMPT.format(final_instruction, caption, parsed_meta)
547
+ lyrics_input = f"# Languages\n{vocal_language}\n\n# Lyric\n{lyrics}<|endoftext|>"
548
+ return caption_input, lyrics_input
549
+
550
  def _get_text_hidden_states(self, text_prompt: str) -> Tuple[torch.Tensor, torch.Tensor]:
551
  """Get text hidden states from text encoder."""
552
  if self.text_tokenizer is None or self.text_encoder is None:
 
808
  except Exception as e:
809
  logger.error(f"Error processing target audio: {e}")
810
  return None
811
+
812
+ def convert_src_audio_to_codes(self, audio_file) -> str:
813
+ """
814
+ Convert uploaded source audio to audio codes string.
815
+
816
+ Args:
817
+ audio_file: Path to audio file or None
818
+
819
+ Returns:
820
+ Formatted codes string like '<|audio_code_123|><|audio_code_456|>...' or error message
821
+ """
822
+ if audio_file is None:
823
+ return "❌ Please upload source audio first"
824
+
825
+ if self.model is None or self.vae is None:
826
+ return "❌ Model not initialized. Please initialize the service first."
827
+
828
+ try:
829
+ # Process audio file
830
+ processed_audio = self.process_src_audio(audio_file)
831
+ if processed_audio is None:
832
+ return "❌ Failed to process audio file"
833
+
834
+ # Encode audio to latents using VAE
835
+ with torch.no_grad():
836
+ with self._load_model_context("vae"):
837
+ # Prepare audio for VAE: [channels, samples] -> [1, channels, samples]
838
+ vae_input = processed_audio.unsqueeze(0).to(self.device).to(self.vae.dtype)
839
+
840
+ # Check if audio is silence
841
+ if self.is_silence(vae_input):
842
+ return "❌ Audio file appears to be silent"
843
+
844
+ # Encode to latents
845
+ latents = self.vae.encode(vae_input).latent_dist.sample()
846
+ # Cast back to model dtype
847
+ latents = latents.to(self.dtype)
848
+ # Transpose: [1, d, T] -> [1, T, d] -> [T, d]
849
+ latents = latents.squeeze(0).transpose(0, 1) # [T, d]
850
+
851
+ # Create attention mask for latents
852
+ attention_mask = torch.ones(latents.shape[0], dtype=torch.bool, device=self.device)
853
+
854
+ # Tokenize latents to get code indices
855
+ with self._load_model_context("model"):
856
+ # Prepare latents for tokenize: [T, d] -> [1, T, d]
857
+ hidden_states = latents.unsqueeze(0) # [1, T, d]
858
+
859
+ # Call tokenize method
860
+ # tokenize returns: (quantized, indices, attention_mask)
861
+ _, indices, _ = self.model.tokenize(hidden_states, self.silence_latent, attention_mask.unsqueeze(0))
862
+
863
+ # Format indices as code string
864
+ # indices shape: [1, T_5Hz] or [1, T_5Hz, num_quantizers]
865
+ # Flatten and convert to list
866
+ indices_flat = indices.flatten().cpu().tolist()
867
+ codes_string = "".join([f"<|audio_code_{idx}|>" for idx in indices_flat])
868
+
869
+ logger.info(f"[convert_src_audio_to_codes] Generated {len(indices_flat)} audio codes")
870
+ return codes_string
871
+
872
+ except Exception as e:
873
+ error_msg = f"❌ Error converting audio to codes: {str(e)}\n{traceback.format_exc()}"
874
+ logger.error(error_msg)
875
+ return error_msg
876
 
877
  def prepare_batch_data(
878
  self,
 
2027
  refer_audios = [[torch.zeros(2, 30*self.sample_rate)] for _ in range(actual_batch_size)]
2028
 
2029
  # 2. Process source audio
2030
+ # If audio_code_string is provided, ignore src_audio and use codes instead
2031
  processed_src_audio = None
2032
  if src_audio is not None:
2033
+ # Check if audio codes are provided - if so, ignore src_audio
2034
+ if audio_code_string and str(audio_code_string).strip():
2035
+ logger.info("[generate_music] Audio codes provided, ignoring src_audio and using codes instead")
2036
+ else:
2037
+ logger.info("[generate_music] Processing source audio...")
2038
+ processed_src_audio = self.process_src_audio(src_audio)
2039
 
2040
  # 3. Prepare batch data
2041
  captions_batch, instructions_batch, lyrics_batch, vocal_languages_batch, metas_batch = self.prepare_batch_data(
acestep/llm_inference.py CHANGED
@@ -11,20 +11,15 @@ from contextlib import contextmanager
11
  import torch
12
  from tqdm import tqdm
13
  from loguru import logger
14
- from transformers import AutoTokenizer, AutoModelForCausalLM, ClassifierFreeGuidanceLogitsProcessor
15
  from transformers.generation.streamers import BaseStreamer
16
  from transformers.generation.logits_process import (
17
  LogitsProcessorList,
18
- LogitsProcessor,
19
- TopKLogitsWarper,
20
- TopPLogitsWarper,
21
  RepetitionPenaltyLogitsProcessor,
22
- TemperatureLogitsWarper,
23
  )
24
 
25
 
26
-
27
-
28
  class LLMHandler:
29
  """5Hz LM Handler for audio code generation"""
30
 
@@ -234,16 +229,7 @@ class LLMHandler:
234
  try:
235
  from nanovllm import SamplingParams
236
 
237
- prompt = f"# Caption\n{caption}\n\n# Lyric\n{lyrics}\n"
238
-
239
- formatted_prompt = self.llm_tokenizer.apply_chat_template(
240
- [
241
- {"role": "system", "content": "# Instruction\nGenerate audio semantic tokens based on the given conditions:\n\n"},
242
- {"role": "user", "content": prompt}
243
- ],
244
- tokenize=False,
245
- add_generation_prompt=True,
246
- )
247
  logger.debug(f"[debug] formatted_prompt: {formatted_prompt}")
248
 
249
  sampling_params = SamplingParams(
@@ -257,14 +243,7 @@ class LLMHandler:
257
  # Use CFG if cfg_scale > 1.0
258
  if cfg_scale > 1.0:
259
  # Build unconditional prompt (user input replaced with "NO USER INPUT")
260
- formatted_unconditional_prompt = self.llm_tokenizer.apply_chat_template(
261
- [
262
- {"role": "system", "content": "# Instruction\nGenerate audio semantic tokens based on the given conditions:\n\n"},
263
- {"role": "user", "content": negative_prompt}
264
- ],
265
- tokenize=False,
266
- add_generation_prompt=True,
267
- )
268
  outputs = self.llm.generate(
269
  [formatted_prompt],
270
  sampling_params,
@@ -293,6 +272,53 @@ class LLMHandler:
293
  error_msg = f"❌ Error generating with 5Hz LM: {str(e)}\n\nTraceback:\n{traceback.format_exc()}"
294
  return {}, "", error_msg
295
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
296
  def generate_with_5hz_lm_pt(
297
  self,
298
  caption: str,
@@ -306,23 +332,13 @@ class LLMHandler:
306
  ) -> Tuple[Dict[str, Any], str, str]:
307
  """Generate metadata and audio codes using 5Hz LM with PyTorch backend"""
308
  try:
309
- prompt = f"# Caption\n{caption}\n\n# Lyric\n{lyrics}\n"
310
-
311
- formatted_prompt = self.llm_tokenizer.apply_chat_template(
312
- [
313
- {"role": "system", "content": "# Instruction\nGenerate audio semantic tokens based on the given conditions:\n\n"},
314
- {"role": "user", "content": prompt}
315
- ],
316
- tokenize=False,
317
- add_generation_prompt=True,
318
- )
319
 
320
  # Tokenize the prompt
321
  inputs = self.llm_tokenizer(
322
  formatted_prompt,
323
  return_tensors="pt",
324
  padding=False,
325
- truncation=True,
326
  )
327
 
328
  # Generate with the model
@@ -352,82 +368,90 @@ class LLMHandler:
352
 
353
  streamer = TqdmTokenStreamer(total=max_new_tokens)
354
 
355
- # Build logits processor list
356
  logits_processor = LogitsProcessorList()
357
 
358
- # Add repetition penalty if needed
359
  if repetition_penalty != 1.0:
360
  logits_processor.append(RepetitionPenaltyLogitsProcessor(penalty=repetition_penalty))
361
 
362
- # Add temperature warper if needed (temperature is handled separately in generate, but we can also use warper)
363
- # Note: temperature is passed directly to generate(), but we can use TemperatureLogitsWarper for consistency
364
- if temperature != 1.0:
365
- logits_processor.append(TemperatureLogitsWarper(temperature=temperature))
366
-
367
- # Add top-k warper if specified
368
- if top_k is not None and top_k > 0:
369
- logits_processor.append(TopKLogitsWarper(top_k=top_k))
370
-
371
- # Add top-p warper if specified
372
- if top_p is not None and top_p > 0.0 and top_p < 1.0:
373
- logits_processor.append(TopPLogitsWarper(top_p=top_p))
374
-
375
  # Handle CFG if cfg_scale > 1.0
376
  if cfg_scale > 1.0:
377
  # Build unconditional prompt
378
- formatted_unconditional_prompt = self.llm_tokenizer.apply_chat_template(
379
- [
380
- {"role": "system", "content": "# Instruction\nGenerate audio semantic tokens based on the given conditions:\n\n"},
381
- {"role": "user", "content": negative_prompt}
382
- ],
383
- tokenize=False,
384
- add_generation_prompt=True,
385
- )
386
 
387
- # Tokenize unconditional prompt
388
- uncond_inputs = self.llm_tokenizer(
389
- formatted_unconditional_prompt,
 
 
 
 
390
  return_tensors="pt",
391
- padding=False,
392
  truncation=True,
393
  )
394
- uncond_inputs = {k: v.to(self.device) for k, v in uncond_inputs.items()}
 
395
 
396
- # Use custom CFG generation with batch processing
397
- # Combine conditional and unconditional inputs into a batch
398
- # Format: [cond_input, uncond_input]
399
- batch_input_ids = torch.cat([inputs['input_ids'], uncond_inputs['input_ids']], dim=0)
400
- batch_attention_mask = None
401
- if 'attention_mask' in inputs:
402
- batch_attention_mask = torch.cat([inputs['attention_mask'], uncond_inputs.get('attention_mask', torch.ones_like(uncond_inputs['input_ids']))], dim=0)
403
 
404
- # Custom CFG generation loop
405
- outputs = self._generate_with_cfg(
406
  batch_input_ids=batch_input_ids,
407
  batch_attention_mask=batch_attention_mask,
408
  max_new_tokens=max_new_tokens,
409
  temperature=temperature,
410
  cfg_scale=cfg_scale,
411
- logits_processor=logits_processor,
 
 
412
  pad_token_id=self.llm_tokenizer.pad_token_id or self.llm_tokenizer.eos_token_id,
413
  streamer=streamer,
414
  )
 
 
 
415
  else:
416
- # Generate without CFG
417
  with torch.no_grad():
418
  outputs = self.llm.generate(
419
  **inputs,
420
  max_new_tokens=max_new_tokens,
421
  temperature=temperature if temperature > 0 else 1.0,
422
  do_sample=True if temperature > 0 else False,
 
 
423
  logits_processor=logits_processor if len(logits_processor) > 0 else None,
424
  pad_token_id=self.llm_tokenizer.pad_token_id or self.llm_tokenizer.eos_token_id,
425
  streamer=streamer,
426
  )
427
 
428
  # Decode the generated tokens
 
 
 
 
 
 
 
 
 
429
  # Only decode the newly generated tokens (skip the input prompt)
430
- generated_ids = outputs[0][inputs['input_ids'].shape[1]:]
 
 
 
 
 
 
 
 
 
 
 
431
  output_text = self.llm_tokenizer.decode(generated_ids, skip_special_tokens=False)
432
 
433
  metadata, audio_codes = self.parse_lm_output(output_text)
@@ -436,8 +460,120 @@ class LLMHandler:
436
 
437
  except Exception as e:
438
  error_msg = f"❌ Error generating with 5Hz LM: {str(e)}\n\nTraceback:\n{traceback.format_exc()}"
 
439
  return {}, "", error_msg
440
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
441
  def generate_with_5hz_lm(
442
  self,
443
  caption: str,
@@ -474,103 +610,115 @@ class LLMHandler:
474
  caption, lyrics, temperature, cfg_scale, negative_prompt,
475
  top_k, top_p, repetition_penalty
476
  )
477
-
478
- def parse_lm_output(self, output_text: str) -> Tuple[Dict[str, Any], str]:
479
  """
480
- Parse LM output to extract metadata and audio codes.
481
-
482
- Expected format:
483
- <think>
484
- bpm: 73
485
- duration: 273
486
- genres: Chinese folk
487
- keyscale: G major
488
- timesignature: 4
489
- </think>
490
-
491
- <|audio_code_56535|><|audio_code_62918|>...
492
-
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
493
  Returns:
494
- Tuple of (metadata_dict, audio_codes_string)
 
 
 
 
495
  """
496
- debug_output_text = output_text.split("</think>")[0]
497
- logger.debug(f"Debug output text: {debug_output_text}")
498
- metadata = {}
499
- audio_codes = ""
500
-
501
- import re
502
-
503
- # Extract audio codes - find all <|audio_code_XXX|> patterns
504
- code_pattern = r'<\|audio_code_\d+\|>'
505
- code_matches = re.findall(code_pattern, output_text)
506
- if code_matches:
507
- audio_codes = "".join(code_matches)
508
-
509
- # Extract metadata from reasoning section
510
- # Try different reasoning tag patterns
511
- reasoning_patterns = [
512
- r'<think>(.*?)</think>',
513
- r'<think>(.*?)</think>',
514
- r'<reasoning>(.*?)</reasoning>',
515
- ]
516
-
517
- reasoning_text = None
518
- for pattern in reasoning_patterns:
519
- match = re.search(pattern, output_text, re.DOTALL)
520
- if match:
521
- reasoning_text = match.group(1).strip()
522
- break
523
-
524
- # If no reasoning tags found, try to parse metadata from the beginning of output
525
- if not reasoning_text:
526
- # Look for metadata lines before audio codes
527
- lines_before_codes = output_text.split('<|audio_code_')[0] if '<|audio_code_' in output_text else output_text
528
- reasoning_text = lines_before_codes.strip()
529
-
530
- # Parse metadata fields
531
- if reasoning_text:
532
- for line in reasoning_text.split('\n'):
533
- line = line.strip()
534
- if ':' in line and not line.startswith('<'):
535
- parts = line.split(':', 1)
536
- if len(parts) == 2:
537
- key = parts[0].strip().lower()
538
- value = parts[1].strip()
539
-
540
- if key == 'bpm':
541
- try:
542
- metadata['bpm'] = int(value)
543
- except:
544
- metadata['bpm'] = value
545
- elif key == 'duration':
546
- try:
547
- metadata['duration'] = int(value)
548
- except:
549
- metadata['duration'] = value
550
- elif key == 'genres':
551
- metadata['genres'] = value
552
- elif key == 'keyscale':
553
- metadata['keyscale'] = value
554
- elif key == 'timesignature':
555
- metadata['timesignature'] = value
556
-
557
- return metadata, audio_codes
558
 
559
- def _generate_with_cfg(
560
  self,
561
  batch_input_ids: torch.Tensor,
562
  batch_attention_mask: Optional[torch.Tensor],
563
  max_new_tokens: int,
564
  temperature: float,
565
  cfg_scale: float,
566
- logits_processor: Optional[LogitsProcessorList],
 
 
567
  pad_token_id: int,
568
  streamer: Optional[BaseStreamer],
569
  ) -> torch.Tensor:
570
  """
571
- Custom generation loop with CFG support using batch processing.
572
- Batch format: [conditional_input, unconditional_input]
573
- This properly utilizes KV cache by processing both sequences in parallel.
 
 
 
 
574
  """
575
  model = self.llm
576
  device = self.device
@@ -594,6 +742,16 @@ class LLMHandler:
594
  past_key_values = None
595
  use_cache = hasattr(model, 'generation_config') and getattr(model.generation_config, 'use_cache', True)
596
 
 
 
 
 
 
 
 
 
 
 
597
  with torch.no_grad():
598
  for step in range(max_new_tokens):
599
  # Forward pass for the entire batch (conditional + unconditional)
@@ -613,22 +771,38 @@ class LLMHandler:
613
  use_cache=use_cache,
614
  )
615
 
616
- # Get logits
617
  next_token_logits = outputs.logits[:, -1, :] # [batch_size*2, vocab_size]
618
 
619
  # Split conditional and unconditional logits
620
  cond_logits = next_token_logits[cond_start_idx:cond_start_idx+batch_size]
621
  uncond_logits = next_token_logits[uncond_start_idx:uncond_start_idx+batch_size]
622
 
623
- # Apply CFG formula: logits_cfg = logits_uncond + cfg_scale * (logits_cond - logits_uncond)
624
  cfg_logits = uncond_logits + cfg_scale * (cond_logits - uncond_logits)
625
 
626
- # Apply logits processors (temperature, top-k, top-p, repetition penalty)
627
- if logits_processor is not None:
628
- # Get current input_ids for repetition penalty (only conditional part)
629
- current_input_ids = generated_ids[cond_start_idx:cond_start_idx+batch_size]
630
- for processor in logits_processor:
631
- cfg_logits = processor(current_input_ids, cfg_logits)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
632
 
633
  # Apply temperature and sample
634
  if temperature > 0:
@@ -638,9 +812,19 @@ class LLMHandler:
638
  else:
639
  next_tokens = torch.argmax(cfg_logits, dim=-1)
640
 
641
- # Update generated sequences (apply same token to both conditional and unconditional)
642
- next_tokens = next_tokens.unsqueeze(1)
643
- generated_ids = torch.cat([generated_ids, next_tokens.repeat(2, 1)], dim=1)
 
 
 
 
 
 
 
 
 
 
644
  attention_mask = torch.cat([attention_mask, torch.ones((batch_size*2, 1), device=device, dtype=attention_mask.dtype)], dim=1)
645
  model_kwargs['attention_mask'] = attention_mask
646
 
@@ -650,17 +834,99 @@ class LLMHandler:
650
 
651
  # Update streamer
652
  if streamer is not None:
653
- streamer.put(next_tokens[0]) # Only stream conditional tokens
654
 
655
- # Check for EOS (simplified - you may want to check model's eos_token_id)
656
- if (next_tokens[0] == pad_token_id).all():
657
  break
658
 
659
  if streamer is not None:
660
  streamer.end()
661
 
662
- # Return only conditional output
663
- return generated_ids[cond_start_idx:cond_start_idx+batch_size]
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
664
 
665
  @contextmanager
666
  def _load_model_context(self):
 
11
  import torch
12
  from tqdm import tqdm
13
  from loguru import logger
14
+ from transformers import AutoTokenizer, AutoModelForCausalLM
15
  from transformers.generation.streamers import BaseStreamer
16
  from transformers.generation.logits_process import (
17
  LogitsProcessorList,
 
 
 
18
  RepetitionPenaltyLogitsProcessor,
19
+ LogitsProcessor,
20
  )
21
 
22
 
 
 
23
  class LLMHandler:
24
  """5Hz LM Handler for audio code generation"""
25
 
 
229
  try:
230
  from nanovllm import SamplingParams
231
 
232
+ formatted_prompt = self.build_formatted_prompt(caption, lyrics)
 
 
 
 
 
 
 
 
 
233
  logger.debug(f"[debug] formatted_prompt: {formatted_prompt}")
234
 
235
  sampling_params = SamplingParams(
 
243
  # Use CFG if cfg_scale > 1.0
244
  if cfg_scale > 1.0:
245
  # Build unconditional prompt (user input replaced with "NO USER INPUT")
246
+ formatted_unconditional_prompt = self.build_formatted_prompt(negative_prompt, is_negative_prompt=True)
 
 
 
 
 
 
 
247
  outputs = self.llm.generate(
248
  [formatted_prompt],
249
  sampling_params,
 
272
  error_msg = f"❌ Error generating with 5Hz LM: {str(e)}\n\nTraceback:\n{traceback.format_exc()}"
273
  return {}, "", error_msg
274
 
275
def _run_vllm_from_formatted(
    self,
    formatted_prompt: str,
    temperature: float,
    cfg_scale: float,
    negative_prompt: str,
    top_k: Optional[int],
    top_p: Optional[float],
    repetition_penalty: float,
) -> str:
    """
    Run the nanovllm backend on an already-formatted prompt and return the raw text.

    When cfg_scale > 1.0 a second, unconditional prompt (built from
    negative_prompt) is passed so the backend can apply classifier-free guidance.
    """
    from nanovllm import SamplingParams

    params = SamplingParams(
        max_tokens=self.max_model_len - 64,
        temperature=temperature,
        cfg_scale=cfg_scale,
        top_k=top_k,
        top_p=top_p,
        repetition_penalty=repetition_penalty,
    )

    if cfg_scale > 1.0:
        # CFG path: pair the conditional prompt with an unconditional one.
        uncond = self.build_formatted_prompt(negative_prompt, is_negative_prompt=True)
        raw = self.llm.generate(
            [formatted_prompt],
            params,
            unconditional_prompts=[uncond],
        )
    else:
        raw = self.llm.generate([formatted_prompt], params)

    # Probe the possible result shapes in the same priority order as before.
    if not (isinstance(raw, list) and len(raw) > 0):
        return str(raw)

    first = raw[0]
    if hasattr(first, "outputs") and len(first.outputs) > 0:
        return first.outputs[0].text
    if hasattr(first, "text"):
        return first.text
    if isinstance(first, dict) and "text" in first:
        return first["text"]
    return str(first)
320
+ return output_text
321
+
322
  def generate_with_5hz_lm_pt(
323
  self,
324
  caption: str,
 
332
  ) -> Tuple[Dict[str, Any], str, str]:
333
  """Generate metadata and audio codes using 5Hz LM with PyTorch backend"""
334
  try:
335
+ formatted_prompt = self.build_formatted_prompt(caption, lyrics)
 
 
 
 
 
 
 
 
 
336
 
337
  # Tokenize the prompt
338
  inputs = self.llm_tokenizer(
339
  formatted_prompt,
340
  return_tensors="pt",
341
  padding=False,
 
342
  )
343
 
344
  # Generate with the model
 
368
 
369
  streamer = TqdmTokenStreamer(total=max_new_tokens)
370
 
371
+ # Build logits processor list (only for CFG and repetition penalty)
372
  logits_processor = LogitsProcessorList()
373
 
374
+ # Add repetition penalty if needed (generate() doesn't support it natively in all versions)
375
  if repetition_penalty != 1.0:
376
  logits_processor.append(RepetitionPenaltyLogitsProcessor(penalty=repetition_penalty))
377
 
 
 
 
 
 
 
 
 
 
 
 
 
 
378
  # Handle CFG if cfg_scale > 1.0
379
  if cfg_scale > 1.0:
380
  # Build unconditional prompt
381
+ formatted_unconditional_prompt = self.build_formatted_prompt(negative_prompt, is_negative_prompt=True)
 
 
 
 
 
 
 
382
 
383
+ # Tokenize both prompts together to ensure same length (with left padding)
384
+ # Left padding is important for generation tasks
385
+ batch_texts = [formatted_prompt, formatted_unconditional_prompt]
386
+ original_padding_side = self.llm_tokenizer.padding_side
387
+ self.llm_tokenizer.padding_side = 'left'
388
+ batch_inputs = self.llm_tokenizer(
389
+ batch_texts,
390
  return_tensors="pt",
391
+ padding=True,
392
  truncation=True,
393
  )
394
+ self.llm_tokenizer.padding_side = original_padding_side
395
+ batch_inputs = {k: v.to(self.device) for k, v in batch_inputs.items()}
396
 
397
+ # Extract conditional and unconditional inputs
398
+ batch_input_ids = batch_inputs['input_ids'] # [2, seq_len]
399
+ batch_attention_mask = batch_inputs.get('attention_mask', None)
 
 
 
 
400
 
401
+ # Use custom CFG generation loop
402
+ outputs = self._generate_with_cfg_custom(
403
  batch_input_ids=batch_input_ids,
404
  batch_attention_mask=batch_attention_mask,
405
  max_new_tokens=max_new_tokens,
406
  temperature=temperature,
407
  cfg_scale=cfg_scale,
408
+ top_k=top_k,
409
+ top_p=top_p,
410
+ repetition_penalty=repetition_penalty,
411
  pad_token_id=self.llm_tokenizer.pad_token_id or self.llm_tokenizer.eos_token_id,
412
  streamer=streamer,
413
  )
414
+
415
+ # Extract only the conditional output (first in batch)
416
+ outputs = outputs[0:1] # Keep only conditional output
417
  else:
418
+ # Generate without CFG using native generate() parameters
419
  with torch.no_grad():
420
  outputs = self.llm.generate(
421
  **inputs,
422
  max_new_tokens=max_new_tokens,
423
  temperature=temperature if temperature > 0 else 1.0,
424
  do_sample=True if temperature > 0 else False,
425
+ top_k=top_k if top_k is not None and top_k > 0 else None,
426
+ top_p=top_p if top_p is not None and 0.0 < top_p < 1.0 else None,
427
  logits_processor=logits_processor if len(logits_processor) > 0 else None,
428
  pad_token_id=self.llm_tokenizer.pad_token_id or self.llm_tokenizer.eos_token_id,
429
  streamer=streamer,
430
  )
431
 
432
  # Decode the generated tokens
433
+ # outputs is a tensor with shape [batch_size, seq_len], extract first sequence
434
+ if isinstance(outputs, torch.Tensor):
435
+ if outputs.dim() == 2:
436
+ generated_ids = outputs[0]
437
+ else:
438
+ generated_ids = outputs
439
+ else:
440
+ generated_ids = outputs[0]
441
+
442
  # Only decode the newly generated tokens (skip the input prompt)
443
+ # Use the correct input length based on whether CFG was used
444
+ if cfg_scale > 1.0:
445
+ # In CFG case, use batch_inputs length (both sequences have same length due to padding)
446
+ input_length = batch_inputs['input_ids'].shape[1]
447
+ else:
448
+ input_length = inputs['input_ids'].shape[1]
449
+ generated_ids = generated_ids[input_length:]
450
+
451
+ # Move to CPU for decoding
452
+ if generated_ids.is_cuda:
453
+ generated_ids = generated_ids.cpu()
454
+
455
  output_text = self.llm_tokenizer.decode(generated_ids, skip_special_tokens=False)
456
 
457
  metadata, audio_codes = self.parse_lm_output(output_text)
 
460
 
461
  except Exception as e:
462
  error_msg = f"❌ Error generating with 5Hz LM: {str(e)}\n\nTraceback:\n{traceback.format_exc()}"
463
+ logger.error(error_msg)
464
  return {}, "", error_msg
465
 
466
def _run_pt_from_formatted(
    self,
    formatted_prompt: str,
    temperature: float,
    cfg_scale: float,
    negative_prompt: str,
    top_k: Optional[int],
    top_p: Optional[float],
    repetition_penalty: float,
) -> str:
    """
    Shared PyTorch path: accept prebuilt formatted prompt and return text.

    When cfg_scale > 1.0 the conditional and unconditional prompts are batched
    together (left-padded to equal length) and decoded through the custom CFG
    loop; otherwise the model's native ``generate()`` is used.

    Returns:
        The decoded text of the newly generated tokens only (prompt stripped),
        with special tokens preserved so audio-code tokens survive decoding.
    """
    # Tokenize the conditional prompt alone; only used by the non-CFG branch.
    inputs = self.llm_tokenizer(
        formatted_prompt,
        return_tensors="pt",
        padding=False,
        truncation=True,
    )

    with self._load_model_context():
        inputs = {k: v.to(self.device) for k, v in inputs.items()}
        # Token budget: model config default, capped by the context window
        # minus 64 tokens of headroom when max_model_len is known.
        max_new_tokens = getattr(self.llm.config, "max_new_tokens", 4096)
        if hasattr(self, "max_model_len"):
            max_new_tokens = min(max_new_tokens, self.max_model_len - 64)

        # Build logits processor list (only for CFG and repetition penalty)
        logits_processor = LogitsProcessorList()

        # Add repetition penalty if needed
        if repetition_penalty != 1.0:
            logits_processor.append(RepetitionPenaltyLogitsProcessor(penalty=repetition_penalty))

        if cfg_scale > 1.0:
            formatted_unconditional_prompt = self.build_formatted_prompt(negative_prompt, is_negative_prompt=True)

            # Tokenize both prompts together to ensure same length (with left padding)
            # Left padding is important for generation tasks
            batch_texts = [formatted_prompt, formatted_unconditional_prompt]
            # Temporarily force left padding, then restore the tokenizer's setting.
            original_padding_side = self.llm_tokenizer.padding_side
            self.llm_tokenizer.padding_side = 'left'
            batch_inputs_tokenized = self.llm_tokenizer(
                batch_texts,
                return_tensors="pt",
                padding=True,
                truncation=True,
            )
            self.llm_tokenizer.padding_side = original_padding_side
            batch_inputs_tokenized = {k: v.to(self.device) for k, v in batch_inputs_tokenized.items()}

            # Extract batch inputs: row 0 = conditional, row 1 = unconditional.
            batch_input_ids = batch_inputs_tokenized['input_ids']
            batch_attention_mask = batch_inputs_tokenized.get('attention_mask', None)

            # Use custom CFG generation loop (streamer disabled on this path).
            outputs = self._generate_with_cfg_custom(
                batch_input_ids=batch_input_ids,
                batch_attention_mask=batch_attention_mask,
                max_new_tokens=max_new_tokens,
                temperature=temperature,
                cfg_scale=cfg_scale,
                top_k=top_k,
                top_p=top_p,
                repetition_penalty=repetition_penalty,
                pad_token_id=self.llm_tokenizer.pad_token_id or self.llm_tokenizer.eos_token_id,
                streamer=None,
            )

            # Extract only the conditional output (first in batch)
            outputs = outputs[0:1]  # Keep only conditional output
        else:
            # Generate without CFG using native generate() parameters
            with torch.no_grad():
                outputs = self.llm.generate(
                    **inputs,
                    max_new_tokens=max_new_tokens,
                    temperature=temperature if temperature > 0 else 1.0,
                    do_sample=True if temperature > 0 else False,
                    top_k=top_k if top_k is not None and top_k > 0 else None,
                    top_p=top_p if top_p is not None and 0.0 < top_p < 1.0 else None,
                    logits_processor=logits_processor if len(logits_processor) > 0 else None,
                    pad_token_id=self.llm_tokenizer.pad_token_id or self.llm_tokenizer.eos_token_id,
                    streamer=None,
                )

        # Decode the generated tokens
        # outputs is a tensor with shape [batch_size, seq_len], extract first sequence
        if isinstance(outputs, torch.Tensor):
            if outputs.dim() == 2:
                generated_ids = outputs[0]
            else:
                generated_ids = outputs
        else:
            generated_ids = outputs[0]

        # Only decode the newly generated tokens (skip the input prompt)
        # Use the original input length (before batch processing for CFG)
        if cfg_scale > 1.0:
            # In CFG case, we need to use the conditional input length from batch_inputs_tokenized
            # Both sequences have the same length due to padding
            input_length = batch_inputs_tokenized['input_ids'].shape[1]
        else:
            input_length = inputs["input_ids"].shape[1]

        generated_ids = generated_ids[input_length:]

        # Move to CPU for decoding
        if generated_ids.is_cuda:
            generated_ids = generated_ids.cpu()

    # Tokenizer-only work from here on; no model/device access needed.
    # skip_special_tokens=False keeps the <|audio_code_*|> tokens in the text.
    output_text = self.llm_tokenizer.decode(generated_ids, skip_special_tokens=False)
    return output_text
576
+
577
  def generate_with_5hz_lm(
578
  self,
579
  caption: str,
 
610
  caption, lyrics, temperature, cfg_scale, negative_prompt,
611
  top_k, top_p, repetition_penalty
612
  )
613
+
614
def build_formatted_prompt(self, caption: str, lyrics: str = "", is_negative_prompt: bool = False) -> str:
    """
    Assemble the chat-template prompt the 5Hz LM expects.

    For a normal prompt the caption and lyrics are wrapped in the
    "# Caption"/"# Lyric" layout; for a negative (unconditional) prompt the
    caption text is passed through verbatim as the user message.

    Raises:
        ValueError: if the tokenizer has not been initialized yet.

    Example:
        prompt = handler.build_formatted_prompt("calm piano", "hello world")
    """
    if self.llm_tokenizer is None:
        raise ValueError("LLM tokenizer is not initialized. Call initialize() first.")

    user_content = caption if is_negative_prompt else f"# Caption\n{caption}\n\n# Lyric\n{lyrics}\n"
    messages = [
        {"role": "system", "content": "# Instruction\nGenerate audio semantic tokens based on the given conditions:\n\n"},
        {"role": "user", "content": user_content},
    ]
    return self.llm_tokenizer.apply_chat_template(
        messages,
        tokenize=False,
        add_generation_prompt=True,
    )
636
+
637
def generate_from_formatted_prompt(
    self,
    formatted_prompt: str,
    cfg: Optional[Dict[str, Any]] = None,
) -> Tuple[str, str]:
    """
    Generate raw LM text output from a pre-built formatted prompt.

    Args:
        formatted_prompt: Prompt already formatted by `build_formatted_prompt`.
        cfg: Optional sampling overrides: temperature, cfg_scale,
            negative_prompt (used when cfg_scale > 1), top_k, top_p,
            repetition_penalty.

    Returns:
        (output_text, status_message)

    Example:
        prompt = handler.build_formatted_prompt(caption, lyric)
        text, status = handler.generate_from_formatted_prompt(prompt, {"temperature": 0.7})
    """
    if not getattr(self, "llm_initialized", False):
        return "", "❌ 5Hz LM not initialized. Please initialize it first."
    if self.llm is None or self.llm_tokenizer is None:
        return "", "❌ 5Hz LM is missing model or tokenizer."

    options = cfg or {}
    sampling_kwargs = dict(
        temperature=options.get("temperature", 0.6),
        cfg_scale=options.get("cfg_scale", 1.0),
        negative_prompt=options.get("negative_prompt", "NO USER INPUT"),
        top_k=options.get("top_k"),
        top_p=options.get("top_p"),
        repetition_penalty=options.get("repetition_penalty", 1.0),
    )

    try:
        if self.llm_backend == "vllm":
            text = self._run_vllm_from_formatted(
                formatted_prompt=formatted_prompt,
                **sampling_kwargs,
            )
            return text, f"✅ Generated successfully (vllm) | length={len(text)}"

        # PyTorch backend
        text = self._run_pt_from_formatted(
            formatted_prompt=formatted_prompt,
            **sampling_kwargs,
        )
        return text, f"✅ Generated successfully (pt) | length={len(text)}"

    except Exception as e:
        return "", f"❌ Error generating from formatted prompt: {e}"
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
700
 
701
+ def _generate_with_cfg_custom(
702
  self,
703
  batch_input_ids: torch.Tensor,
704
  batch_attention_mask: Optional[torch.Tensor],
705
  max_new_tokens: int,
706
  temperature: float,
707
  cfg_scale: float,
708
+ top_k: Optional[int],
709
+ top_p: Optional[float],
710
+ repetition_penalty: float,
711
  pad_token_id: int,
712
  streamer: Optional[BaseStreamer],
713
  ) -> torch.Tensor:
714
  """
715
+ Custom CFG generation loop that:
716
+ 1. Processes both conditional and unconditional sequences in parallel
717
+ 2. Applies CFG formula to logits
718
+ 3. Samples tokens only for conditional sequences
719
+ 4. Applies the same sampled tokens to both conditional and unconditional sequences
720
+
721
+ Batch format: [cond_input, uncond_input]
722
  """
723
  model = self.llm
724
  device = self.device
 
742
  past_key_values = None
743
  use_cache = hasattr(model, 'generation_config') and getattr(model.generation_config, 'use_cache', True)
744
 
745
+ # Get EOS token ID for stopping condition
746
+ eos_token_id = self.llm_tokenizer.eos_token_id
747
+ if eos_token_id is None:
748
+ eos_token_id = pad_token_id
749
+
750
+ # Build logits processor for non-CFG operations (repetition penalty, top_k, top_p)
751
+ logits_processor = LogitsProcessorList()
752
+ if repetition_penalty != 1.0:
753
+ logits_processor.append(RepetitionPenaltyLogitsProcessor(penalty=repetition_penalty))
754
+
755
  with torch.no_grad():
756
  for step in range(max_new_tokens):
757
  # Forward pass for the entire batch (conditional + unconditional)
 
771
  use_cache=use_cache,
772
  )
773
 
774
+ # Get logits for the last position
775
  next_token_logits = outputs.logits[:, -1, :] # [batch_size*2, vocab_size]
776
 
777
  # Split conditional and unconditional logits
778
  cond_logits = next_token_logits[cond_start_idx:cond_start_idx+batch_size]
779
  uncond_logits = next_token_logits[uncond_start_idx:uncond_start_idx+batch_size]
780
 
781
+ # Apply CFG formula: cfg_logits = uncond_logits + cfg_scale * (cond_logits - uncond_logits)
782
  cfg_logits = uncond_logits + cfg_scale * (cond_logits - uncond_logits)
783
 
784
+ # Apply logits processors (repetition penalty, top-k, top-p)
785
+ # Get current input_ids for repetition penalty (only conditional part)
786
+ current_input_ids = generated_ids[cond_start_idx:cond_start_idx+batch_size]
787
+ for processor in logits_processor:
788
+ cfg_logits = processor(current_input_ids, cfg_logits)
789
+
790
+ # Apply top-k filtering
791
+ if top_k is not None and top_k > 0:
792
+ indices_to_remove = cfg_logits < torch.topk(cfg_logits, top_k)[0][..., -1, None]
793
+ cfg_logits[indices_to_remove] = float('-inf')
794
+
795
+ # Apply top-p (nucleus) filtering
796
+ if top_p is not None and 0.0 < top_p < 1.0:
797
+ sorted_logits, sorted_indices = torch.sort(cfg_logits, descending=True)
798
+ cumulative_probs = torch.cumsum(torch.softmax(sorted_logits, dim=-1), dim=-1)
799
+ # Remove tokens with cumulative probability above the threshold
800
+ sorted_indices_to_remove = cumulative_probs > top_p
801
+ # Shift the indices to the right to keep also the first token above the threshold
802
+ sorted_indices_to_remove[..., 1:] = sorted_indices_to_remove[..., :-1].clone()
803
+ sorted_indices_to_remove[..., 0] = 0
804
+ indices_to_remove = sorted_indices_to_remove.scatter(1, sorted_indices, sorted_indices_to_remove)
805
+ cfg_logits[indices_to_remove] = float('-inf')
806
 
807
  # Apply temperature and sample
808
  if temperature > 0:
 
812
  else:
813
  next_tokens = torch.argmax(cfg_logits, dim=-1)
814
 
815
+ # Check for EOS token in conditional sequences BEFORE unsqueezing
816
+ # Stop if any conditional sequence generates EOS token
817
+ # next_tokens shape: [batch_size] (only conditional tokens)
818
+ should_stop = False
819
+ if torch.any(next_tokens == eos_token_id):
820
+ should_stop = True
821
+ elif pad_token_id is not None and pad_token_id != eos_token_id:
822
+ if torch.any(next_tokens == pad_token_id):
823
+ should_stop = True
824
+
825
+ # Apply the same sampled tokens to both conditional and unconditional sequences
826
+ next_tokens_unsqueezed = next_tokens.unsqueeze(1)
827
+ generated_ids = torch.cat([generated_ids, next_tokens_unsqueezed.repeat(2, 1)], dim=1)
828
  attention_mask = torch.cat([attention_mask, torch.ones((batch_size*2, 1), device=device, dtype=attention_mask.dtype)], dim=1)
829
  model_kwargs['attention_mask'] = attention_mask
830
 
 
834
 
835
  # Update streamer
836
  if streamer is not None:
837
+ streamer.put(next_tokens_unsqueezed) # Stream conditional tokens
838
 
839
+ # Stop generation if EOS token detected
840
+ if should_stop:
841
  break
842
 
843
  if streamer is not None:
844
  streamer.end()
845
 
846
+ # Return the full batch (both conditional and unconditional)
847
+ # The caller will extract only the conditional output
848
+ return generated_ids
849
+
850
def parse_lm_output(self, output_text: str) -> Tuple[Dict[str, Any], str]:
    """
    Parse LM output to extract metadata and audio codes.

    Expected format:
        <think>
        bpm: 73
        duration: 273
        genres: Chinese folk
        keyscale: G major
        timesignature: 4
        </think>

        <|audio_code_56535|><|audio_code_62918|>...

    Returns:
        Tuple of (metadata_dict, audio_codes_string). Fields the model did
        not emit are absent from the dict; audio_codes is "" when no
        <|audio_code_*|> tokens are found.
    """
    import re

    debug_output_text = output_text.split("</think>")[0]
    logger.debug(f"Debug output text: {debug_output_text}")
    metadata = {}
    audio_codes = ""

    # Extract audio codes - concatenate every <|audio_code_XXX|> token.
    code_matches = re.findall(r'<\|audio_code_\d+\|>', output_text)
    if code_matches:
        audio_codes = "".join(code_matches)

    # Extract metadata from the reasoning section.
    # (Fixed: the pattern list previously contained the <think> pattern twice.)
    reasoning_patterns = [
        r'<think>(.*?)</think>',
        r'<reasoning>(.*?)</reasoning>',
    ]

    reasoning_text = None
    for pattern in reasoning_patterns:
        match = re.search(pattern, output_text, re.DOTALL)
        if match:
            reasoning_text = match.group(1).strip()
            break

    # If no reasoning tags found, try to parse metadata from the beginning of output
    if not reasoning_text:
        # Look for metadata lines before audio codes
        lines_before_codes = output_text.split('<|audio_code_')[0] if '<|audio_code_' in output_text else output_text
        reasoning_text = lines_before_codes.strip()

    # Parse "key: value" metadata lines; bpm/duration are coerced to int when
    # possible, otherwise kept as the raw string (fixed: bare `except:` narrowed
    # to ValueError, the only error int() can raise on a str here).
    if reasoning_text:
        for line in reasoning_text.split('\n'):
            line = line.strip()
            if ':' in line and not line.startswith('<'):
                raw_key, _, raw_value = line.partition(':')
                key = raw_key.strip().lower()
                value = raw_value.strip()

                if key in ('bpm', 'duration'):
                    try:
                        metadata[key] = int(value)
                    except ValueError:
                        metadata[key] = value
                elif key in ('genres', 'keyscale', 'timesignature'):
                    metadata[key] = value

    return metadata, audio_codes
930
 
931
  @contextmanager
932
  def _load_model_context(self):