ChuxiJ committed on
Commit
e4e90cd
·
1 Parent(s): 13537d2

fix lrc bugs

Browse files
acestep/constants.py CHANGED
@@ -70,6 +70,8 @@ TASK_TYPES_BASE = ["text2music", "repaint", "cover", "extract", "lego", "complet
70
  DEFAULT_DIT_INSTRUCTION = "Fill the audio semantic mask based on the given conditions:"
71
  DEFAULT_LM_INSTRUCTION = "Generate audio semantic tokens based on the given conditions:"
72
  DEFAULT_LM_UNDERSTAND_INSTRUCTION = "Understand the given musical conditions and describe the audio semantics accordingly:"
 
 
73
 
74
  # Instruction templates for each task type
75
  # Note: Some instructions use placeholders like {TRACK_NAME} or {TRACK_CLASSES}
 
70
  DEFAULT_DIT_INSTRUCTION = "Fill the audio semantic mask based on the given conditions:"
71
  DEFAULT_LM_INSTRUCTION = "Generate audio semantic tokens based on the given conditions:"
72
  DEFAULT_LM_UNDERSTAND_INSTRUCTION = "Understand the given musical conditions and describe the audio semantics accordingly:"
73
+ DEFAULT_LM_INSPIRED_INSTRUCTION = "Expand the user's input into a more detailed and specific musical description:"
74
+ DEFAULT_LM_REWRITE_INSTRUCTION = "Format the user's input into a more detailed and specific musical description:"
75
 
76
  # Instruction templates for each task type
77
  # Note: Some instructions use placeholders like {TRACK_NAME} or {TRACK_CLASSES}
acestep/gradio_ui/events/__init__.py CHANGED
@@ -373,7 +373,7 @@ def setup_event_handlers(demo, dit_handler, llm_handler, dataset_handler, datase
373
  outputs=[
374
  results_section[f"lrc_display_{btn_idx}"],
375
  results_section[f"details_accordion_{btn_idx}"],
376
- # Audio subtitles now auto-updated via lrc_display.change()
377
  results_section["batch_queue"]
378
  ]
379
  )
@@ -723,14 +723,11 @@ def setup_event_handlers(demo, dit_handler, llm_handler, dataset_handler, datase
723
  )
724
 
725
  # ========== LRC Display Change Handlers ==========
726
- # When lrc_display textbox changes, update the corresponding audio component's subtitles
727
- for i in range(1, 9):
728
- results_section[f"lrc_display_{i}"].change(
729
- fn=res_h.update_audio_subtitles_from_lrc,
730
- inputs=[
731
- results_section[f"lrc_display_{i}"],
732
- results_section[f"generated_audio_{i}"],
733
- generation_section["audio_duration"],
734
- ],
735
- outputs=[results_section[f"generated_audio_{i}"]]
736
- )
 
373
  outputs=[
374
  results_section[f"lrc_display_{btn_idx}"],
375
  results_section[f"details_accordion_{btn_idx}"],
376
+ results_section[f"generated_audio_{btn_idx}"], # Direct audio update (subtitles only)
377
  results_section["batch_queue"]
378
  ]
379
  )
 
723
  )
724
 
725
  # ========== LRC Display Change Handlers ==========
726
+ # REMOVED: lrc_display.change() event was causing audio flickering issues
727
+ #
728
+ # Subtitles are now updated through:
729
+ # 1. Manual LRC button click -> generate_lrc_handler directly updates subtitles
730
+ # 2. Auto LRC generation -> subtitles stored in batch_queue, applied during batch navigation
731
+ # 3. Batch navigation (prev/next) -> subtitles applied from batch_queue
732
+ #
733
+ # If user manually edits LRC text, they can click the LRC button to apply changes.
 
 
 
acestep/gradio_ui/events/results_handlers.py CHANGED
@@ -625,7 +625,9 @@ def generate_with_progress(
625
 
626
  status_message = f"Encoding & Ready: {i+1}/{len(audios)}"
627
  current_audio_updates = [gr.skip() for _ in range(8)]
628
- # Always set audio path first, subtitles will be applied via Audio component's subtitles parameter
 
 
629
  current_audio_updates[i] = audio_path
630
 
631
  # Codes display updates (for results section)
@@ -697,14 +699,16 @@ def generate_with_progress(
697
  num_audios=len(result.audios),
698
  )
699
 
700
- # Build final codes display, LRC display, and accordion visibility updates
701
  final_codes_display_updates = []
702
  final_lrc_display_updates = []
703
  final_accordion_updates = []
 
704
  for i in range(8):
705
  code_str = final_codes_list[i]
706
  lrc_text = final_lrcs_list[i]
707
  score_str = final_scores_list[i]
 
708
  has_code = bool(code_str)
709
  has_lrc = bool(lrc_text)
710
  has_score = bool(score_str) and score_str != "Done!"
@@ -713,10 +717,16 @@ def generate_with_progress(
713
  final_codes_display_updates.append(gr.update(value=code_str, visible=has_code))
714
  final_lrc_display_updates.append(gr.update(value=lrc_text, visible=has_lrc))
715
  final_accordion_updates.append(gr.update(visible=has_content))
 
 
 
 
 
 
716
 
717
  yield (
718
- gr.skip(), gr.skip(), gr.skip(), gr.skip(), # Audio 1-4: SKIP
719
- gr.skip(), gr.skip(), gr.skip(), gr.skip(), # Audio 5-8: SKIP
720
  all_audio_paths,
721
  generation_info,
722
  "Generation Complete",
@@ -1020,7 +1030,7 @@ def generate_lrc_handler(dit_handler, sample_idx, current_batch_index, batch_que
1020
 
1021
  This function retrieves cached generation data from batch_queue and calls
1022
  the handler's get_lyric_timestamp method to generate LRC format lyrics.
1023
- Audio subtitles are automatically updated via lrc_display.change() event.
1024
 
1025
  Args:
1026
  dit_handler: DiT handler instance with get_lyric_timestamp method
@@ -1031,19 +1041,19 @@ def generate_lrc_handler(dit_handler, sample_idx, current_batch_index, batch_que
1031
  inference_steps: Number of inference steps used in generation
1032
 
1033
  Returns:
1034
- Tuple of (lrc_display_update, details_accordion_update, batch_queue)
1035
  """
1036
  import torch
1037
 
1038
  if current_batch_index not in batch_queue:
1039
- return gr.skip(), gr.skip(), batch_queue
1040
 
1041
  batch_data = batch_queue[current_batch_index]
1042
  extra_outputs = batch_data.get("extra_outputs", {})
1043
 
1044
  # Check if required data is available
1045
  if not extra_outputs:
1046
- return gr.update(value=t("messages.lrc_no_extra_outputs"), visible=True), gr.update(visible=True), batch_queue
1047
 
1048
  pred_latents = extra_outputs.get("pred_latents")
1049
  encoder_hidden_states = extra_outputs.get("encoder_hidden_states")
@@ -1052,7 +1062,7 @@ def generate_lrc_handler(dit_handler, sample_idx, current_batch_index, batch_que
1052
  lyric_token_idss = extra_outputs.get("lyric_token_idss")
1053
 
1054
  if any(x is None for x in [pred_latents, encoder_hidden_states, encoder_attention_mask, context_latents, lyric_token_idss]):
1055
- return gr.update(value=t("messages.lrc_missing_tensors"), visible=True), gr.update(visible=True), batch_queue
1056
 
1057
  # Adjust sample_idx to 0-based
1058
  sample_idx_0based = sample_idx - 1
@@ -1060,7 +1070,7 @@ def generate_lrc_handler(dit_handler, sample_idx, current_batch_index, batch_que
1060
  # Check if sample exists in batch
1061
  batch_size = pred_latents.shape[0]
1062
  if sample_idx_0based >= batch_size:
1063
- return gr.update(value=t("messages.lrc_sample_not_exist"), visible=True), gr.update(visible=True), batch_queue
1064
 
1065
  # Extract the specific sample's data
1066
  try:
@@ -1098,14 +1108,14 @@ def generate_lrc_handler(dit_handler, sample_idx, current_batch_index, batch_que
1098
  if result.get("success"):
1099
  lrc_text = result.get("lrc_text", "")
1100
  if not lrc_text:
1101
- return gr.update(value=t("messages.lrc_empty_result"), visible=True), gr.update(visible=True), batch_queue
1102
 
1103
  # Store LRC in batch_queue for later retrieval when switching batches
1104
  if "lrcs" not in batch_queue[current_batch_index]:
1105
  batch_queue[current_batch_index]["lrcs"] = [""] * 8
1106
  batch_queue[current_batch_index]["lrcs"][sample_idx_0based] = lrc_text
1107
 
1108
- # Parse LRC to subtitles format for storage (audio subtitles will be updated via lrc_display.change())
1109
  subtitles_data = parse_lrc_to_subtitles(lrc_text, total_duration=float(audio_duration))
1110
 
1111
  # Store subtitles in batch_queue for batch navigation
@@ -1113,29 +1123,33 @@ def generate_lrc_handler(dit_handler, sample_idx, current_batch_index, batch_que
1113
  batch_queue[current_batch_index]["subtitles"] = [None] * 8
1114
  batch_queue[current_batch_index]["subtitles"][sample_idx_0based] = subtitles_data
1115
 
1116
- # Return: lrc_display, details_accordion, batch_queue
1117
- # Audio subtitles are automatically updated via lrc_display.change() event
1118
  return (
1119
  gr.update(value=lrc_text, visible=True),
1120
  gr.update(visible=True),
 
1121
  batch_queue
1122
  )
1123
  else:
1124
  error_msg = result.get("error", "Unknown error")
1125
- return gr.update(value=f"❌ {error_msg}", visible=True), gr.update(visible=True), batch_queue
1126
 
1127
  except Exception as e:
1128
  logger.exception("[generate_lrc_handler] Error generating LRC")
1129
- return gr.update(value=f"❌ Error: {str(e)}", visible=True), gr.update(visible=True), batch_queue
1130
 
1131
 
1132
  def update_audio_subtitles_from_lrc(lrc_text: str, audio_component_value, audio_duration: float = None):
1133
  """
1134
  Update Audio component's subtitles based on LRC text content.
1135
 
1136
- This function is triggered when lrc_display textbox changes.
1137
  It parses the LRC text and updates the corresponding Audio component's subtitles.
1138
 
 
 
 
1139
  Args:
1140
  lrc_text: LRC format lyrics string from lrc_display textbox
1141
  audio_component_value: Current value of the audio component (path or dict)
@@ -1144,10 +1158,6 @@ def update_audio_subtitles_from_lrc(lrc_text: str, audio_component_value, audio_
1144
  Returns:
1145
  gr.update for the Audio component with subtitles
1146
  """
1147
- # If no LRC text, skip update (don't clear subtitles to avoid flickering)
1148
- if not lrc_text or not lrc_text.strip():
1149
- return gr.skip()
1150
-
1151
  # Get audio path from component value
1152
  audio_path = None
1153
  if audio_component_value:
@@ -1156,14 +1166,21 @@ def update_audio_subtitles_from_lrc(lrc_text: str, audio_component_value, audio_
1156
  else:
1157
  audio_path = audio_component_value
1158
 
 
1159
  if not audio_path:
1160
  return gr.skip()
1161
 
 
 
 
 
 
1162
  # Parse LRC to subtitles format
1163
  subtitles_data = parse_lrc_to_subtitles(lrc_text, total_duration=audio_duration)
1164
 
1165
- # Return updated audio with subtitles
1166
- return gr.update(value=audio_path, subtitles=subtitles_data if subtitles_data else None)
 
1167
 
1168
 
1169
  def capture_current_params(
@@ -1374,7 +1391,8 @@ def generate_with_batch_management(
1374
 
1375
  # Extract extra_outputs from result tuple (index 46 after adding lrc_display)
1376
  # Note: index 47 is raw_codes_list which we already extracted above
1377
- extra_outputs_from_result = result[46] if len(result) > 46 else {}
 
1378
 
1379
  # Store current batch in queue
1380
  batch_queue = store_batch_in_queue(
@@ -1610,7 +1628,8 @@ def generate_next_batch_background(
1610
  generated_codes_single = generated_codes_batch[0] if generated_codes_batch else ""
1611
 
1612
  # Extract extra_outputs for LRC generation (index 46)
1613
- extra_outputs_from_bg = final_result[46] if len(final_result) > 46 else None
 
1614
 
1615
  # Determine which codes to store
1616
  batch_size = params.get("batch_size_input", 2)
 
625
 
626
  status_message = f"Encoding & Ready: {i+1}/{len(audios)}"
627
  current_audio_updates = [gr.skip() for _ in range(8)]
628
+ # Set audio path - subtitles will be handled separately
629
+ # Note: gr.update() in yield doesn't work reliably, so we pass the path directly
630
+ # and rely on the final yield (or the LRC button handler) to apply subtitles
631
  current_audio_updates[i] = audio_path
632
 
633
  # Codes display updates (for results section)
 
699
  num_audios=len(result.audios),
700
  )
701
 
702
+ # Build final codes display, LRC display, accordion visibility updates, and audio subtitles
703
  final_codes_display_updates = []
704
  final_lrc_display_updates = []
705
  final_accordion_updates = []
706
+ final_audio_updates = []
707
  for i in range(8):
708
  code_str = final_codes_list[i]
709
  lrc_text = final_lrcs_list[i]
710
  score_str = final_scores_list[i]
711
+ subtitles = final_subtitles_list[i]
712
  has_code = bool(code_str)
713
  has_lrc = bool(lrc_text)
714
  has_score = bool(score_str) and score_str != "Done!"
 
717
  final_codes_display_updates.append(gr.update(value=code_str, visible=has_code))
718
  final_lrc_display_updates.append(gr.update(value=lrc_text, visible=has_lrc))
719
  final_accordion_updates.append(gr.update(visible=has_content))
720
+ # Set subtitles in final yield (only subtitles, not value - to avoid reload)
721
+ # This applies auto_lrc subtitles after all audio paths are set
722
+ if subtitles:
723
+ final_audio_updates.append(gr.update(subtitles=subtitles))
724
+ else:
725
+ final_audio_updates.append(gr.skip())
726
 
727
  yield (
728
+ final_audio_updates[0], final_audio_updates[1], final_audio_updates[2], final_audio_updates[3],
729
+ final_audio_updates[4], final_audio_updates[5], final_audio_updates[6], final_audio_updates[7],
730
  all_audio_paths,
731
  generation_info,
732
  "Generation Complete",
 
1030
 
1031
  This function retrieves cached generation data from batch_queue and calls
1032
  the handler's get_lyric_timestamp method to generate LRC format lyrics.
1033
+ Audio subtitles are directly updated by this handler (not via lrc_display.change()).
1034
 
1035
  Args:
1036
  dit_handler: DiT handler instance with get_lyric_timestamp method
 
1041
  inference_steps: Number of inference steps used in generation
1042
 
1043
  Returns:
1044
+ Tuple of (lrc_display_update, details_accordion_update, audio_update, batch_queue)
1045
  """
1046
  import torch
1047
 
1048
  if current_batch_index not in batch_queue:
1049
+ return gr.skip(), gr.skip(), gr.skip(), batch_queue
1050
 
1051
  batch_data = batch_queue[current_batch_index]
1052
  extra_outputs = batch_data.get("extra_outputs", {})
1053
 
1054
  # Check if required data is available
1055
  if not extra_outputs:
1056
+ return gr.update(value=t("messages.lrc_no_extra_outputs"), visible=True), gr.update(visible=True), gr.skip(), batch_queue
1057
 
1058
  pred_latents = extra_outputs.get("pred_latents")
1059
  encoder_hidden_states = extra_outputs.get("encoder_hidden_states")
 
1062
  lyric_token_idss = extra_outputs.get("lyric_token_idss")
1063
 
1064
  if any(x is None for x in [pred_latents, encoder_hidden_states, encoder_attention_mask, context_latents, lyric_token_idss]):
1065
+ return gr.update(value=t("messages.lrc_missing_tensors"), visible=True), gr.update(visible=True), gr.skip(), batch_queue
1066
 
1067
  # Adjust sample_idx to 0-based
1068
  sample_idx_0based = sample_idx - 1
 
1070
  # Check if sample exists in batch
1071
  batch_size = pred_latents.shape[0]
1072
  if sample_idx_0based >= batch_size:
1073
+ return gr.update(value=t("messages.lrc_sample_not_exist"), visible=True), gr.update(visible=True), gr.skip(), batch_queue
1074
 
1075
  # Extract the specific sample's data
1076
  try:
 
1108
  if result.get("success"):
1109
  lrc_text = result.get("lrc_text", "")
1110
  if not lrc_text:
1111
+ return gr.update(value=t("messages.lrc_empty_result"), visible=True), gr.update(visible=True), gr.skip(), batch_queue
1112
 
1113
  # Store LRC in batch_queue for later retrieval when switching batches
1114
  if "lrcs" not in batch_queue[current_batch_index]:
1115
  batch_queue[current_batch_index]["lrcs"] = [""] * 8
1116
  batch_queue[current_batch_index]["lrcs"][sample_idx_0based] = lrc_text
1117
 
1118
+ # Parse LRC to subtitles format
1119
  subtitles_data = parse_lrc_to_subtitles(lrc_text, total_duration=float(audio_duration))
1120
 
1121
  # Store subtitles in batch_queue for batch navigation
 
1123
  batch_queue[current_batch_index]["subtitles"] = [None] * 8
1124
  batch_queue[current_batch_index]["subtitles"][sample_idx_0based] = subtitles_data
1125
 
1126
+ # Return: lrc_display, details_accordion, audio (subtitles only!), batch_queue
1127
+ # IMPORTANT: Only update subtitles, NOT value - this avoids audio reload/flickering
1128
  return (
1129
  gr.update(value=lrc_text, visible=True),
1130
  gr.update(visible=True),
1131
+ gr.update(subtitles=subtitles_data), # Only update subtitles, not value!
1132
  batch_queue
1133
  )
1134
  else:
1135
  error_msg = result.get("error", "Unknown error")
1136
+ return gr.update(value=f"❌ {error_msg}", visible=True), gr.update(visible=True), gr.skip(), batch_queue
1137
 
1138
  except Exception as e:
1139
  logger.exception("[generate_lrc_handler] Error generating LRC")
1140
+ return gr.update(value=f"❌ Error: {str(e)}", visible=True), gr.update(visible=True), gr.skip(), batch_queue
1141
 
1142
 
1143
  def update_audio_subtitles_from_lrc(lrc_text: str, audio_component_value, audio_duration: float = None):
1144
  """
1145
  Update Audio component's subtitles based on LRC text content.
1146
 
1147
+ This function is triggered when lrc_display textbox changes (user manual edit).
1148
  It parses the LRC text and updates the corresponding Audio component's subtitles.
1149
 
1150
+ Note: When LRC button is clicked, subtitles are updated directly by generate_lrc_handler,
1151
+ not through this function. This function handles manual LRC text edits only.
1152
+
1153
  Args:
1154
  lrc_text: LRC format lyrics string from lrc_display textbox
1155
  audio_component_value: Current value of the audio component (path or dict)
 
1158
  Returns:
1159
  gr.update for the Audio component with subtitles
1160
  """
 
 
 
 
1161
  # Get audio path from component value
1162
  audio_path = None
1163
  if audio_component_value:
 
1166
  else:
1167
  audio_path = audio_component_value
1168
 
1169
+ # If no audio, skip update
1170
  if not audio_path:
1171
  return gr.skip()
1172
 
1173
+ # If LRC text is empty, clear subtitles
1174
+ # Must set value together with subtitles=None to ensure Gradio properly clears the subtitles
1175
+ if not lrc_text or not lrc_text.strip():
1176
+ return gr.update(value=audio_path, subtitles=None)
1177
+
1178
  # Parse LRC to subtitles format
1179
  subtitles_data = parse_lrc_to_subtitles(lrc_text, total_duration=audio_duration)
1180
 
1181
+ # For non-empty LRC updates, only set subtitles (avoid audio reload/flickering)
1182
+ # This is for manual LRC text edits - the audio value should remain unchanged
1183
+ return gr.update(subtitles=subtitles_data if subtitles_data else None)
1184
 
1185
 
1186
  def capture_current_params(
 
1391
 
1392
  # Extract extra_outputs from result tuple (index 46 after adding lrc_display)
1393
  # Note: index 47 is raw_codes_list which we already extracted above
1394
+ # Must check both length AND that the value is not None (intermediate yields use None as placeholder)
1395
+ extra_outputs_from_result = result[46] if len(result) > 46 and result[46] is not None else {}
1396
 
1397
  # Store current batch in queue
1398
  batch_queue = store_batch_in_queue(
 
1628
  generated_codes_single = generated_codes_batch[0] if generated_codes_batch else ""
1629
 
1630
  # Extract extra_outputs for LRC generation (index 46)
1631
+ # Must check both length AND that the value is not None (intermediate yields use None as placeholder)
1632
+ extra_outputs_from_bg = final_result[46] if len(final_result) > 46 and final_result[46] is not None else {}
1633
 
1634
  # Determine which codes to store
1635
  batch_size = params.get("batch_size_input", 2)