ChuxiJ committed on
Commit
def62ce
·
1 Parent(s): 7d44db3

fix api server no duration

Browse files
acestep/api_server.py CHANGED
@@ -125,7 +125,7 @@ class GenerateMusicRequest(BaseModel):
125
  is_format_caption: bool = False
126
 
127
  lm_temperature: float = 0.85
128
- lm_cfg_scale: float = 2.0
129
  lm_top_k: Optional[int] = None
130
  lm_top_p: Optional[float] = 0.9
131
  lm_repetition_penalty: float = 1.0
@@ -137,7 +137,7 @@ class GenerateMusicRequest(BaseModel):
137
 
138
 
139
  _LM_DEFAULT_TEMPERATURE = 0.85
140
- _LM_DEFAULT_CFG_SCALE = 2.0
141
  _LM_DEFAULT_TOP_P = 0.9
142
  _DEFAULT_DIT_INSTRUCTION = DEFAULT_DIT_INSTRUCTION
143
  _DEFAULT_LM_INSTRUCTION = DEFAULT_LM_INSTRUCTION
@@ -728,16 +728,33 @@ def create_app() -> FastAPI:
728
  print(f"[api_server] Sample generated: caption_len={len(caption)}, lyrics_len={len(lyrics)}, bpm={bpm}, duration={audio_duration}")
729
 
730
  # Apply format_sample() if use_format is True and caption/lyrics are provided
 
 
 
731
  if req.use_format and (caption or lyrics):
732
  print(f"[api_server] Applying format_sample to enhance input...")
733
  _ensure_llm_ready()
734
  if getattr(app.state, "_llm_init_error", None):
735
  raise RuntimeError(f"5Hz LM init failed (needed for format): {app.state._llm_init_error}")
736
 
 
 
 
 
 
 
 
 
 
 
 
 
 
737
  format_result = format_sample(
738
  llm_handler=llm,
739
  caption=caption,
740
  lyrics=lyrics,
 
741
  temperature=req.lm_temperature,
742
  top_k=lm_top_k if lm_top_k > 0 else None,
743
  top_p=lm_top_p if lm_top_p < 1.0 else None,
@@ -745,9 +762,20 @@ def create_app() -> FastAPI:
745
  )
746
 
747
  if format_result.success:
748
- caption = format_result.caption
749
- lyrics = format_result.lyrics
750
- print(f"[api_server] Format applied: new caption_len={len(caption)}, lyrics_len={len(lyrics)}")
 
 
 
 
 
 
 
 
 
 
 
751
  else:
752
  print(f"[api_server] Warning: format_sample failed: {format_result.error}, using original input")
753
 
@@ -811,7 +839,12 @@ def create_app() -> FastAPI:
811
  lm_top_k=lm_top_k,
812
  lm_top_p=lm_top_p,
813
  lm_negative_prompt=req.lm_negative_prompt,
814
- use_cot_metas=not sample_mode, # Sample mode already generated metas, don't regenerate
 
 
 
 
 
815
  use_cot_caption=req.use_cot_caption,
816
  use_cot_language=req.use_cot_language,
817
  use_constrained_decoding=req.constrained_decoding,
 
125
  is_format_caption: bool = False
126
 
127
  lm_temperature: float = 0.85
128
+ lm_cfg_scale: float = 2.5
129
  lm_top_k: Optional[int] = None
130
  lm_top_p: Optional[float] = 0.9
131
  lm_repetition_penalty: float = 1.0
 
137
 
138
 
139
  _LM_DEFAULT_TEMPERATURE = 0.85
140
+ _LM_DEFAULT_CFG_SCALE = 2.5
141
  _LM_DEFAULT_TOP_P = 0.9
142
  _DEFAULT_DIT_INSTRUCTION = DEFAULT_DIT_INSTRUCTION
143
  _DEFAULT_LM_INSTRUCTION = DEFAULT_LM_INSTRUCTION
 
728
  print(f"[api_server] Sample generated: caption_len={len(caption)}, lyrics_len={len(lyrics)}, bpm={bpm}, duration={audio_duration}")
729
 
730
  # Apply format_sample() if use_format is True and caption/lyrics are provided
731
+ # Track whether format_sample generated duration (to decide if Phase 1 is needed)
732
+ format_has_duration = False
733
+
734
  if req.use_format and (caption or lyrics):
735
  print(f"[api_server] Applying format_sample to enhance input...")
736
  _ensure_llm_ready()
737
  if getattr(app.state, "_llm_init_error", None):
738
  raise RuntimeError(f"5Hz LM init failed (needed for format): {app.state._llm_init_error}")
739
 
740
+ # Build user_metadata from request params (matching bot.py behavior)
741
+ user_metadata_for_format = {}
742
+ if bpm is not None:
743
+ user_metadata_for_format['bpm'] = bpm
744
+ if audio_duration is not None and audio_duration > 0:
745
+ user_metadata_for_format['duration'] = int(audio_duration)
746
+ if key_scale:
747
+ user_metadata_for_format['keyscale'] = key_scale
748
+ if time_signature:
749
+ user_metadata_for_format['timesignature'] = time_signature
750
+ if req.vocal_language and req.vocal_language != "unknown":
751
+ user_metadata_for_format['language'] = req.vocal_language
752
+
753
  format_result = format_sample(
754
  llm_handler=llm,
755
  caption=caption,
756
  lyrics=lyrics,
757
+ user_metadata=user_metadata_for_format if user_metadata_for_format else None,
758
  temperature=req.lm_temperature,
759
  top_k=lm_top_k if lm_top_k > 0 else None,
760
  top_p=lm_top_p if lm_top_p < 1.0 else None,
 
762
  )
763
 
764
  if format_result.success:
765
+ # Extract all formatted data (matching bot.py behavior)
766
+ caption = format_result.caption or caption
767
+ lyrics = format_result.lyrics or lyrics
768
+ if format_result.duration:
769
+ audio_duration = format_result.duration
770
+ format_has_duration = True
771
+ if format_result.bpm:
772
+ bpm = format_result.bpm
773
+ if format_result.keyscale:
774
+ key_scale = format_result.keyscale
775
+ if format_result.timesignature:
776
+ time_signature = format_result.timesignature
777
+
778
+ print(f"[api_server] Format applied: new caption_len={len(caption)}, lyrics_len={len(lyrics)}, bpm={bpm}, duration={audio_duration}, has_duration={format_has_duration}")
779
  else:
780
  print(f"[api_server] Warning: format_sample failed: {format_result.error}, using original input")
781
 
 
839
  lm_top_k=lm_top_k,
840
  lm_top_p=lm_top_p,
841
  lm_negative_prompt=req.lm_negative_prompt,
842
+ # use_cot_metas logic:
843
+ # - sample_mode: metas already generated, skip Phase 1
844
+ # - format with duration: metas already generated, skip Phase 1
845
+ # - format without duration: need Phase 1 to generate duration
846
+ # - no format: need Phase 1 to generate all metas
847
+ use_cot_metas=not sample_mode and not format_has_duration,
848
  use_cot_caption=req.use_cot_caption,
849
  use_cot_language=req.use_cot_language,
850
  use_constrained_decoding=req.constrained_decoding,
acestep/gradio_ui/events/generation_handlers.py CHANGED
@@ -70,7 +70,7 @@ def load_metadata(file_obj):
70
  """Load generation parameters from a JSON file"""
71
  if file_obj is None:
72
  gr.Warning(t("messages.no_file_selected"))
73
- return [None] * 34 + [False] # Return None for all fields, False for is_format_caption
74
 
75
  try:
76
  # Read the uploaded file
@@ -115,7 +115,7 @@ def load_metadata(file_obj):
115
  inference_steps = metadata.get('inference_steps', 8)
116
  guidance_scale = metadata.get('guidance_scale', 7.0)
117
  seed = metadata.get('seed', '-1')
118
- random_seed = metadata.get('random_seed', True)
119
  use_adg = metadata.get('use_adg', False)
120
  cfg_interval_start = metadata.get('cfg_interval_start', 0.0)
121
  cfg_interval_end = metadata.get('cfg_interval_end', 1.0)
@@ -137,6 +137,9 @@ def load_metadata(file_obj):
137
  complete_track_classes = metadata.get('complete_track_classes', [])
138
  shift = metadata.get('shift', 3.0) # Default 3.0 for base models
139
  infer_method = metadata.get('infer_method', 'ode') # Default 'ode' for diffusion inference
 
 
 
140
  instrumental = metadata.get('instrumental', False) # Added: read instrumental
141
 
142
  gr.Info(t("messages.params_loaded", filename=os.path.basename(filepath)))
@@ -144,8 +147,9 @@ def load_metadata(file_obj):
144
  return (
145
  task_type, captions, lyrics, vocal_language, bpm, key_scale, time_signature,
146
  audio_duration, batch_size, inference_steps, guidance_scale, seed, random_seed,
147
- use_adg, cfg_interval_start, cfg_interval_end, shift, infer_method, audio_format,
148
- lm_temperature, lm_cfg_scale, lm_top_k, lm_top_p, lm_negative_prompt,
 
149
  use_cot_metas, use_cot_caption, use_cot_language, audio_cover_strength,
150
  think, audio_codes, repainting_start, repainting_end,
151
  track_name, complete_track_classes, instrumental,
@@ -154,10 +158,10 @@ def load_metadata(file_obj):
154
 
155
  except json.JSONDecodeError as e:
156
  gr.Warning(t("messages.invalid_json", error=str(e)))
157
- return [None] * 35 + [False]
158
  except Exception as e:
159
  gr.Warning(t("messages.load_error", error=str(e)))
160
- return [None] * 35 + [False]
161
 
162
 
163
  def load_random_example(task_type: str):
@@ -429,7 +433,7 @@ def init_service_wrapper(dit_handler, llm_handler, checkpoint, config_path, devi
429
 
430
  # Check if model is initialized - if so, collapse the accordion
431
  is_model_initialized = dit_handler.model is not None
432
- accordion_state = gr.update(open=not is_model_initialized)
433
 
434
  # Get model type settings based on actual loaded model
435
  is_turbo = dit_handler.is_turbo_model()
@@ -446,12 +450,12 @@ def init_service_wrapper(dit_handler, llm_handler, checkpoint, config_path, devi
446
  def get_model_type_ui_settings(is_turbo: bool):
447
  """Get UI settings based on whether the model is turbo or base"""
448
  if is_turbo:
449
- # Turbo model: max 8 steps, hide CFG/ADG/shift, only show text2music/repaint/cover
450
  return (
451
- gr.update(value=8, maximum=8, minimum=1), # inference_steps
452
  gr.update(visible=False), # guidance_scale
453
  gr.update(visible=False), # use_adg
454
- gr.update(value=1.0, visible=False), # shift (not effective for turbo)
455
  gr.update(visible=False), # cfg_interval_start
456
  gr.update(visible=False), # cfg_interval_end
457
  gr.update(choices=TASK_TYPES_TURBO), # task_type
@@ -603,7 +607,7 @@ def reset_format_caption_flag():
603
  def update_audio_uploads_accordion(reference_audio, src_audio):
604
  """Update Audio Uploads accordion open state based on whether audio files are present"""
605
  has_audio = (reference_audio is not None) or (src_audio is not None)
606
- return gr.update(open=has_audio)
607
 
608
 
609
  def handle_instrumental_checkbox(instrumental_checked, current_lyrics):
@@ -708,11 +712,11 @@ def handle_generation_mode_change(mode: str):
708
 
709
  return (
710
  gr.update(visible=is_simple), # simple_mode_group
711
- gr.update(open=not is_simple), # caption_accordion - collapsed in simple, open in custom
712
- gr.update(open=not is_simple), # lyrics_accordion - collapsed in simple, open in custom
713
  gr.update(interactive=not is_simple), # generate_btn - disabled in simple until sample created
714
  False, # simple_sample_created - reset to False on mode change
715
- gr.update(open=not is_simple), # optional_params_accordion - hidden in simple mode
716
  )
717
 
718
 
@@ -836,8 +840,8 @@ def handle_create_sample(
836
  result.language, # simple vocal_language
837
  result.timesignature, # time_signature
838
  result.instrumental, # instrumental_checkbox
839
- gr.update(open=True), # caption_accordion - expand
840
- gr.update(open=True), # lyrics_accordion - expand
841
  gr.update(interactive=True), # generate_btn - enable
842
  True, # simple_sample_created - True
843
  True, # think_checkbox - enable thinking
 
70
  """Load generation parameters from a JSON file"""
71
  if file_obj is None:
72
  gr.Warning(t("messages.no_file_selected"))
73
+ return [None] * 36 + [False] # Return None for all fields, False for is_format_caption
74
 
75
  try:
76
  # Read the uploaded file
 
115
  inference_steps = metadata.get('inference_steps', 8)
116
  guidance_scale = metadata.get('guidance_scale', 7.0)
117
  seed = metadata.get('seed', '-1')
118
+ random_seed = False # Always set to False when loading to enable reproducibility with saved seed
119
  use_adg = metadata.get('use_adg', False)
120
  cfg_interval_start = metadata.get('cfg_interval_start', 0.0)
121
  cfg_interval_end = metadata.get('cfg_interval_end', 1.0)
 
137
  complete_track_classes = metadata.get('complete_track_classes', [])
138
  shift = metadata.get('shift', 3.0) # Default 3.0 for base models
139
  infer_method = metadata.get('infer_method', 'ode') # Default 'ode' for diffusion inference
140
+ custom_timesteps = metadata.get('timesteps', '') # Custom timesteps (stored as 'timesteps' in JSON)
141
+ if custom_timesteps is None:
142
+ custom_timesteps = ''
143
  instrumental = metadata.get('instrumental', False) # Added: read instrumental
144
 
145
  gr.Info(t("messages.params_loaded", filename=os.path.basename(filepath)))
 
147
  return (
148
  task_type, captions, lyrics, vocal_language, bpm, key_scale, time_signature,
149
  audio_duration, batch_size, inference_steps, guidance_scale, seed, random_seed,
150
+ use_adg, cfg_interval_start, cfg_interval_end, shift, infer_method,
151
+ custom_timesteps, # Added: custom_timesteps (between infer_method and audio_format)
152
+ audio_format, lm_temperature, lm_cfg_scale, lm_top_k, lm_top_p, lm_negative_prompt,
153
  use_cot_metas, use_cot_caption, use_cot_language, audio_cover_strength,
154
  think, audio_codes, repainting_start, repainting_end,
155
  track_name, complete_track_classes, instrumental,
 
158
 
159
  except json.JSONDecodeError as e:
160
  gr.Warning(t("messages.invalid_json", error=str(e)))
161
+ return [None] * 36 + [False]
162
  except Exception as e:
163
  gr.Warning(t("messages.load_error", error=str(e)))
164
+ return [None] * 36 + [False]
165
 
166
 
167
  def load_random_example(task_type: str):
 
433
 
434
  # Check if model is initialized - if so, collapse the accordion
435
  is_model_initialized = dit_handler.model is not None
436
+ accordion_state = gr.Accordion(open=not is_model_initialized)
437
 
438
  # Get model type settings based on actual loaded model
439
  is_turbo = dit_handler.is_turbo_model()
 
450
  def get_model_type_ui_settings(is_turbo: bool):
451
  """Get UI settings based on whether the model is turbo or base"""
452
  if is_turbo:
453
+ # Turbo model: max 20 steps, default 8, show shift with default 3.0, only show text2music/repaint/cover
454
  return (
455
+ gr.update(value=8, maximum=20, minimum=1), # inference_steps
456
  gr.update(visible=False), # guidance_scale
457
  gr.update(visible=False), # use_adg
458
+ gr.update(value=3.0, visible=True), # shift (show with default 3.0)
459
  gr.update(visible=False), # cfg_interval_start
460
  gr.update(visible=False), # cfg_interval_end
461
  gr.update(choices=TASK_TYPES_TURBO), # task_type
 
607
  def update_audio_uploads_accordion(reference_audio, src_audio):
608
  """Update Audio Uploads accordion open state based on whether audio files are present"""
609
  has_audio = (reference_audio is not None) or (src_audio is not None)
610
+ return gr.Accordion(open=has_audio)
611
 
612
 
613
  def handle_instrumental_checkbox(instrumental_checked, current_lyrics):
 
712
 
713
  return (
714
  gr.update(visible=is_simple), # simple_mode_group
715
+ gr.Accordion(open=not is_simple), # caption_accordion - collapsed in simple, open in custom
716
+ gr.Accordion(open=not is_simple), # lyrics_accordion - collapsed in simple, open in custom
717
  gr.update(interactive=not is_simple), # generate_btn - disabled in simple until sample created
718
  False, # simple_sample_created - reset to False on mode change
719
+ gr.Accordion(open=not is_simple), # optional_params_accordion - hidden in simple mode
720
  )
721
 
722
 
 
840
  result.language, # simple vocal_language
841
  result.timesignature, # time_signature
842
  result.instrumental, # instrumental_checkbox
843
+ gr.Accordion(open=True), # caption_accordion - expand
844
+ gr.Accordion(open=True), # lyrics_accordion - expand
845
  gr.update(interactive=True), # generate_btn - enable
846
  True, # simple_sample_created - True
847
  True, # think_checkbox - enable thinking
acestep/gradio_ui/interfaces/generation.py CHANGED
@@ -402,13 +402,13 @@ def create_generation_section(dit_handler, llm_handler, init_params=None, langua
402
  )
403
 
404
  # Advanced Settings
405
- # Default UI settings use turbo mode (max 8 steps, hide CFG/ADG/shift)
406
  # These will be updated after model initialization based on handler.is_turbo_model()
407
  with gr.Accordion(t("generation.advanced_settings"), open=False):
408
  with gr.Row():
409
  inference_steps = gr.Slider(
410
  minimum=1,
411
- maximum=8,
412
  value=8,
413
  step=1,
414
  label=t("generation.inference_steps_label"),
@@ -455,7 +455,7 @@ def create_generation_section(dit_handler, llm_handler, init_params=None, langua
455
  step=0.1,
456
  label=t("generation.shift_label"),
457
  info=t("generation.shift_info"),
458
- visible=False
459
  )
460
  infer_method = gr.Dropdown(
461
  choices=["ode", "sde"],
 
402
  )
403
 
404
  # Advanced Settings
405
+ # Default UI settings use turbo mode (max 20 steps, default 8, show shift with default 3)
406
  # These will be updated after model initialization based on handler.is_turbo_model()
407
  with gr.Accordion(t("generation.advanced_settings"), open=False):
408
  with gr.Row():
409
  inference_steps = gr.Slider(
410
  minimum=1,
411
+ maximum=20,
412
  value=8,
413
  step=1,
414
  label=t("generation.inference_steps_label"),
 
455
  step=0.1,
456
  label=t("generation.shift_label"),
457
  info=t("generation.shift_info"),
458
+ visible=True
459
  )
460
  infer_method = gr.Dropdown(
461
  choices=["ode", "sde"],
acestep/llm_inference.py CHANGED
@@ -375,9 +375,9 @@ class LLMHandler:
375
  max_ratio=0.9
376
  )
377
  if low_gpu_memory_mode:
378
- self.max_model_len = 2048
379
- else:
380
  self.max_model_len = 4096
 
 
381
 
382
  logger.info(f"Initializing 5Hz LM with model: {model_path}, enforce_eager: False, tensor_parallel_size: 1, max_model_len: {self.max_model_len}, gpu_memory_utilization: {gpu_memory_utilization}")
383
  start_time = time.time()
@@ -1796,7 +1796,7 @@ class LLMHandler:
1796
  # If no lyrics generated, keep original input
1797
  metadata['lyrics'] = lyrics
1798
 
1799
- logger.info(f"Format completed successfully. Generated {len(metadata)} fields")
1800
  if constrained_decoding_debug:
1801
  logger.debug(f"Generated metadata: {list(metadata.keys())}")
1802
  logger.debug(f"Output text preview: {output_text[:300]}...")
 
375
  max_ratio=0.9
376
  )
377
  if low_gpu_memory_mode:
 
 
378
  self.max_model_len = 4096
379
+ else:
380
+ self.max_model_len = 8192
381
 
382
  logger.info(f"Initializing 5Hz LM with model: {model_path}, enforce_eager: False, tensor_parallel_size: 1, max_model_len: {self.max_model_len}, gpu_memory_utilization: {gpu_memory_utilization}")
383
  start_time = time.time()
 
1796
  # If no lyrics generated, keep original input
1797
  metadata['lyrics'] = lyrics
1798
 
1799
+ logger.info(f"Format completed successfully. Generated {metadata} fields")
1800
  if constrained_decoding_debug:
1801
  logger.debug(f"Generated metadata: {list(metadata.keys())}")
1802
  logger.debug(f"Output text preview: {output_text[:300]}...")
acestep/third_parts/nano-vllm/nanovllm/config.py CHANGED
@@ -8,7 +8,7 @@ class Config:
8
  model: str
9
  max_num_batched_tokens: int = 16384
10
  max_num_seqs: int = 512
11
- max_model_len: int = 4096
12
  gpu_memory_utilization: float = 0.9
13
  tensor_parallel_size: int = 1
14
  enforce_eager: bool = False
 
8
  model: str
9
  max_num_batched_tokens: int = 16384
10
  max_num_seqs: int = 512
11
+ max_model_len: int = 8192
12
  gpu_memory_utilization: float = 0.9
13
  tensor_parallel_size: int = 1
14
  enforce_eager: bool = False