feat(ui, core): Implement advanced grouped image backgrounds
This major refactoring introduces a powerful new feature allowing users to assign specific sets of images to designated groups of tracks. This transforms the application from a linear visualizer into a tool capable of creating dynamic, context-aware videos with thematic sections.
The implementation required a complete overhaul of the UI for image uploads and a fundamental rewrite of the backend logic for track processing and image distribution.
Dynamic Group Management:
The single image uploader has been replaced with a dynamic interface for defining up to 10 distinct image groups.
Users can now click "+ Add Image Group" and "- Remove Last Group" buttons to manage the number of visible group definitions.
This is simulated by managing the visibility of a pre-defined maximum number of groups, providing a seamless user experience.
Group Definition:
Each group consists of a Textbox for defining track ranges (e.g., "1-4, 7, 10-13") and a dedicated Files uploader for that group's specific images.
Fallback Images:
A separate "Fallback / Default Images" uploader is provided for any tracks that are not explicitly assigned to a group.
|
@@ -9,7 +9,7 @@ import subprocess
|
|
| 9 |
import soundfile as sf
|
| 10 |
import matplotlib.font_manager as fm
|
| 11 |
from PIL import ImageFont
|
| 12 |
-
from typing import Tuple, List, Dict
|
| 13 |
from mutagen.flac import FLAC
|
| 14 |
from moviepy import CompositeVideoClip, TextClip, VideoClip, AudioFileClip, ImageClip
|
| 15 |
|
|
@@ -28,7 +28,7 @@ def get_font_display_name(font_path: str) -> Tuple[str, str]:
|
|
| 28 |
elif platform_id == 1 and encoding_id == 0: # Macintosh, Roman
|
| 29 |
return name_bytes.decode('mac_roman').strip('\x00')
|
| 30 |
elif platform_id == 0: # Unicode
|
| 31 |
-
|
| 32 |
else: # Fallback
|
| 33 |
return name_bytes.decode('utf_8', errors='ignore').strip('\x00')
|
| 34 |
except Exception:
|
|
@@ -36,9 +36,10 @@ def get_font_display_name(font_path: str) -> Tuple[str, str]:
|
|
| 36 |
|
| 37 |
try:
|
| 38 |
with open(font_path, 'rb') as f: data = f.read()
|
| 39 |
-
def read_ushort(offset):
|
| 40 |
-
|
| 41 |
-
|
|
|
|
| 42 |
font_offsets = [0]
|
| 43 |
# Check for TTC (TrueType Collection) header
|
| 44 |
if data[:4] == b'ttcf':
|
|
@@ -47,7 +48,7 @@ def get_font_display_name(font_path: str) -> Tuple[str, str]:
|
|
| 47 |
|
| 48 |
# For simplicity, we only parse the first font in a TTC
|
| 49 |
font_offset = font_offsets[0]
|
| 50 |
-
|
| 51 |
num_tables = read_ushort(font_offset + 4)
|
| 52 |
name_table_offset = -1
|
| 53 |
# Locate the 'name' table
|
|
@@ -55,38 +56,50 @@ def get_font_display_name(font_path: str) -> Tuple[str, str]:
|
|
| 55 |
entry_offset = font_offset + 12 + i * 16
|
| 56 |
tag = data[entry_offset:entry_offset+4]
|
| 57 |
if tag == b'name':
|
| 58 |
-
name_table_offset = read_ulong(entry_offset + 8)
|
| 59 |
-
|
| 60 |
-
|
| 61 |
-
|
|
|
|
|
|
|
| 62 |
count, string_offset = read_ushort(name_table_offset + 2), read_ushort(name_table_offset + 4)
|
| 63 |
name_candidates = {}
|
| 64 |
# Iterate through all name records
|
| 65 |
for i in range(count):
|
| 66 |
rec_offset = name_table_offset + 6 + i * 12
|
| 67 |
platform_id, encoding_id, language_id, name_id, length, offset = struct.unpack('>HHHHHH', data[rec_offset:rec_offset+12])
|
| 68 |
-
|
| 69 |
if name_id == 4: # We only care about the "Full Font Name"
|
| 70 |
string_pos = name_table_offset + string_offset + offset
|
| 71 |
value = decode_name_string(data[string_pos : string_pos + length], platform_id, encoding_id)
|
| 72 |
-
|
| 73 |
if value:
|
| 74 |
# Store candidates based on language ID
|
| 75 |
-
if language_id in [1028, 2052, 3076, 4100, 5124]:
|
| 76 |
-
|
| 77 |
-
elif language_id ==
|
| 78 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
| 79 |
else:
|
| 80 |
-
if "other" not in name_candidates:
|
| 81 |
-
|
|
|
|
| 82 |
# Return the best candidate based on language priority
|
| 83 |
-
if name_candidates.get("zh"):
|
| 84 |
-
|
| 85 |
-
if name_candidates.get("
|
| 86 |
-
|
| 87 |
-
if name_candidates.get("
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 88 |
return None, None
|
| 89 |
-
|
| 90 |
except Exception:
|
| 91 |
return None, None
|
| 92 |
|
|
@@ -106,22 +119,22 @@ def get_font_data() -> Tuple[Dict[str, str], List[str]]:
|
|
| 106 |
for path in all_font_files:
|
| 107 |
display_name, lang_tag = get_font_display_name(path)
|
| 108 |
is_fallback = display_name is None
|
| 109 |
-
|
| 110 |
if is_fallback:
|
| 111 |
# Create a fallback name from the filename
|
| 112 |
display_name = os.path.splitext(os.path.basename(path))[0].replace('-', ' ').replace('_', ' ').title()
|
| 113 |
lang_tag = 'fallback'
|
| 114 |
-
|
| 115 |
if display_name and display_name not in font_map:
|
| 116 |
font_map[display_name] = path
|
| 117 |
found_names.append((display_name, is_fallback, lang_tag))
|
| 118 |
-
|
| 119 |
# Define sort priority for languages
|
| 120 |
sort_order = {'zh': 0, 'ja': 1, 'ko': 2, 'en': 3, 'other': 4, 'fallback': 5}
|
| 121 |
|
| 122 |
# Sort by priority, then alphabetically
|
| 123 |
found_names.sort(key=lambda x: (sort_order.get(x[2], 99), x[0]))
|
| 124 |
-
|
| 125 |
sorted_display_names = [name for name, _, _ in found_names]
|
| 126 |
return font_map, sorted_display_names
|
| 127 |
|
|
@@ -188,7 +201,7 @@ def increase_video_framerate(input_path: str, output_path: str, target_fps: int
|
|
| 188 |
'-c:a', 'copy', # Copy audio without re-encoding
|
| 189 |
output_path
|
| 190 |
]
|
| 191 |
-
|
| 192 |
try:
|
| 193 |
# Execute the command
|
| 194 |
# Using capture_output to hide ffmpeg logs from the main console unless an error occurs
|
|
@@ -203,24 +216,75 @@ def increase_video_framerate(input_path: str, output_path: str, target_fps: int
|
|
| 203 |
raise gr.Error(f"FFmpeg failed to increase the framerate. See console for details. Error: {e.stderr}")
|
| 204 |
|
| 205 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 206 |
# --- Main Processing Function ---
|
| 207 |
-
def process_audio_to_video(
|
| 208 |
-
|
| 209 |
-
|
| 210 |
-
|
| 211 |
-
|
| 212 |
-
|
| 213 |
-
|
| 214 |
-
|
| 215 |
-
|
| 216 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 217 |
if not audio_files:
|
| 218 |
raise gr.Error("Please upload at least one audio file.")
|
| 219 |
if not font_name:
|
| 220 |
raise gr.Error("Please select a font from the list.")
|
| 221 |
|
| 222 |
progress(0, desc="Initializing...")
|
| 223 |
-
|
| 224 |
# Define paths for temporary and final files
|
| 225 |
timestamp = int(time.time())
|
| 226 |
temp_fps1_path = f"temp_{timestamp}_fps1.mp4"
|
|
@@ -254,7 +318,7 @@ def process_audio_to_video(
|
|
| 254 |
raise ValueError(f"Could not parse rgb color string: {color_str}")
|
| 255 |
else:
|
| 256 |
raise ValueError(f"Unknown color format: {color_str}")
|
| 257 |
-
|
| 258 |
# Use the new robust parser for all color inputs
|
| 259 |
fg_rgb, bg_rgb = parse_color_to_rgb(spec_fg_color), parse_color_to_rgb(spec_bg_color)
|
| 260 |
grid_rgb = tuple(min(c + 40, 255) for c in bg_rgb)
|
|
@@ -264,11 +328,9 @@ def process_audio_to_video(
|
|
| 264 |
# --- Define total steps for the progress bar ---
|
| 265 |
TOTAL_STEPS = 5
|
| 266 |
|
| 267 |
-
# --- 1
|
| 268 |
-
|
| 269 |
-
total_duration = 0.0
|
| 270 |
-
y_accumulator = []
|
| 271 |
-
current_sr = None
|
| 272 |
|
| 273 |
# --- Use `progress.tqdm` to create a progress bar for this loop ---
|
| 274 |
for file_idx, audio_path in enumerate(progress.tqdm(audio_files, desc=f"Stage 1/{TOTAL_STEPS}: Analyzing Audio Files")):
|
|
@@ -301,48 +363,20 @@ def process_audio_to_video(
|
|
| 301 |
|
| 302 |
print(f"Successfully parsed {len(cue_tracks)} tracks from CUE sheet.")
|
| 303 |
except Exception as e:
|
| 304 |
-
print(f"Warning: Could not
|
| 305 |
-
|
| 306 |
-
|
| 307 |
-
|
| 308 |
-
|
| 309 |
-
|
| 310 |
-
|
| 311 |
-
|
| 312 |
-
|
| 313 |
-
|
| 314 |
-
|
| 315 |
-
|
| 316 |
-
"end_time": total_duration + (cue_tracks[track_idx+1].get('start_time', file_duration) if track_idx + 1 < len(cue_tracks) else file_duration),
|
| 317 |
-
"number_str": number_str
|
| 318 |
-
})
|
| 319 |
-
else: # Scenario 2: Multiple files, this one has NO CUE
|
| 320 |
-
number_str = f"{file_num:02d}" if format_double_digits else str(file_num)
|
| 321 |
-
all_tracks_info.append({
|
| 322 |
-
"title": os.path.splitext(os.path.basename(audio_path))[0],
|
| 323 |
-
"start_time": total_duration, "end_time": total_duration + file_duration,
|
| 324 |
-
"number_str": number_str
|
| 325 |
-
})
|
| 326 |
-
else: # Scenario 1: Single file upload
|
| 327 |
-
if cue_tracks: # With CUE
|
| 328 |
-
for track_idx, track in enumerate(cue_tracks):
|
| 329 |
-
track_num = track_idx + 1
|
| 330 |
-
number_str = f"{track_num:02d}" if format_double_digits else str(track_num)
|
| 331 |
-
all_tracks_info.append({
|
| 332 |
-
"title": track.get('title', 'Unknown Track'),
|
| 333 |
-
"start_time": total_duration + track.get('start_time', 0),
|
| 334 |
-
"end_time": total_duration + (cue_tracks[track_idx+1].get('start_time', file_duration) if track_idx + 1 < len(cue_tracks) else file_duration),
|
| 335 |
-
"number_str": f"{number_str}." # Add a dot for single file CUE tracks
|
| 336 |
-
})
|
| 337 |
-
else: # No CUE
|
| 338 |
-
all_tracks_info.append({
|
| 339 |
-
"title": os.path.splitext(os.path.basename(audio_path))[0],
|
| 340 |
-
"start_time": total_duration, "end_time": total_duration + file_duration,
|
| 341 |
-
"number_str": None # Signal to not show any number
|
| 342 |
-
})
|
| 343 |
-
|
| 344 |
total_duration += file_duration
|
| 345 |
-
|
| 346 |
# --- Concatenate along the time axis (axis=1) for stereo arrays ---
|
| 347 |
y_combined = np.concatenate(y_accumulator, axis=1)
|
| 348 |
duration = total_duration
|
|
@@ -350,116 +384,128 @@ def process_audio_to_video(
|
|
| 350 |
# --- Transpose the array for soundfile to write stereo correctly ---
|
| 351 |
sf.write(temp_audio_path, y_combined.T, current_sr)
|
| 352 |
print(f"Combined all audio files into one. Total duration: {duration:.2f}s")
|
| 353 |
-
|
| 354 |
# --- Update progress to the next stage, use fractional progress (current/total) ---
|
| 355 |
-
progress(1 / TOTAL_STEPS, desc=f"Stage 2/{TOTAL_STEPS}:
|
| 356 |
|
| 357 |
-
# --- 2
|
| 358 |
-
|
| 359 |
-
|
| 360 |
-
|
| 361 |
-
|
| 362 |
-
|
| 363 |
-
|
| 364 |
-
|
| 365 |
-
|
| 366 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 367 |
|
| 368 |
-
|
|
|
|
|
|
|
| 369 |
|
| 370 |
-
|
| 371 |
-
|
| 372 |
-
|
|
|
|
|
|
|
|
|
|
| 373 |
|
| 374 |
-
|
| 375 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 376 |
|
| 377 |
-
|
| 378 |
-
|
| 379 |
-
|
| 380 |
-
|
| 381 |
-
_, descent = pil_font.getmetrics()
|
| 382 |
-
# Calculate a bottom margin to compensate for the font's descent.
|
| 383 |
-
# A small constant is added as a safety buffer.
|
| 384 |
-
# This prevents clipping on fonts with large descenders (like 'g', 'p').
|
| 385 |
-
bottom_margin = int(descent * 0.5) + 2
|
| 386 |
-
print(f"Font '{font_name}' descent: {descent}. Applying dynamic bottom margin of {bottom_margin}px.")
|
| 387 |
-
except Exception as e:
|
| 388 |
-
# Fallback in case of any font loading error
|
| 389 |
-
print(f"Warning: Could not get font metrics for '{font_name}'. Using fixed margin. Error: {e}")
|
| 390 |
-
bottom_margin = int(WIDTH * 0.01) # A small fixed fallback
|
| 391 |
|
| 392 |
-
|
| 393 |
-
|
| 394 |
-
|
| 395 |
-
continue
|
| 396 |
-
|
| 397 |
-
# Construct display text based on pre-formatted number string
|
| 398 |
-
display_text = f"{track['number_str']} {track['title']}" if track['number_str'] else track['title']
|
| 399 |
-
|
| 400 |
-
|
| 401 |
-
# 1. Create the TextClip first without positioning to get its size
|
| 402 |
-
txt_clip = TextClip(
|
| 403 |
-
text=display_text.strip(),
|
| 404 |
-
font_size=font_size,
|
| 405 |
-
color=font_color,
|
| 406 |
-
font=font_path,
|
| 407 |
-
bg_color=bg_color_tuple,
|
| 408 |
-
method='caption', # <-- Set method to caption
|
| 409 |
-
size=(caption_width, None), # <-- Provide size for wrapping
|
| 410 |
-
margin=(0, 0, 0, bottom_margin)
|
| 411 |
-
).with_position(position).with_duration(text_duration).with_start(track['start_time'])
|
| 412 |
-
|
| 413 |
-
text_clips.append(txt_clip)
|
| 414 |
-
|
| 415 |
-
# --- Update progress to the next stage, use fractional progress (current/total) ---
|
| 416 |
-
progress(2 / TOTAL_STEPS, desc=f"Stage 3/{TOTAL_STEPS}: Generating Visual Layers")
|
| 417 |
|
| 418 |
-
# --- 3. Image and Spectrogram Logic ---
|
| 419 |
-
image_clips = []
|
| 420 |
-
if image_paths and len(image_paths) > 0:
|
| 421 |
-
print(f"Found {len(image_paths)} images to process.")
|
| 422 |
|
| 423 |
-
#
|
| 424 |
-
|
| 425 |
-
|
| 426 |
-
|
| 427 |
-
|
| 428 |
-
|
| 429 |
-
|
| 430 |
-
|
| 431 |
-
|
| 432 |
-
|
| 433 |
-
|
| 434 |
-
|
| 435 |
-
|
| 436 |
-
|
| 437 |
-
|
| 438 |
-
# 1. Calculate scaling factor to "contain" the image (fit inside).
|
| 439 |
-
# We use min() to find the ratio that requires the most shrinkage,
|
| 440 |
-
# ensuring the whole image fits without being cropped.
|
| 441 |
-
scale_factor = min(WIDTH / img_clip_raw.w, HEIGHT / img_clip_raw.h)
|
| 442 |
-
|
| 443 |
-
# 2. Resize the image so it fits perfectly within the video dimensions.
|
| 444 |
-
resized_clip = img_clip_raw.resized(scale_factor)
|
| 445 |
-
|
| 446 |
-
# 3. Create a composite clip to position the resized image on a
|
| 447 |
-
# correctly-sized transparent canvas. This is the key to preventing overflow.
|
| 448 |
-
final_layer = CompositeVideoClip(
|
| 449 |
-
[resized_clip.with_position("center")],
|
| 450 |
-
size=(WIDTH, HEIGHT)
|
| 451 |
-
)
|
| 452 |
-
|
| 453 |
-
# 4. Set the timing on the final composite layer.
|
| 454 |
-
return final_layer.with_duration(dur).with_start(start)
|
| 455 |
-
except Exception as e:
|
| 456 |
-
print(f"Warning: Failed to process image '{img_path}'. Skipping. Error: {e}")
|
| 457 |
-
return None
|
| 458 |
-
|
| 459 |
-
# Create an ImageClip for the duration of the track.
|
| 460 |
-
clip = create_image_layer(img_path, i * img_duration, img_duration)
|
| 461 |
-
if clip:
|
| 462 |
-
image_clips.append(clip)
|
| 463 |
|
| 464 |
N_FFT, HOP_LENGTH, N_BANDS = 2048, 512, 32
|
| 465 |
MIN_DB, MAX_DB = -80.0, 0.0
|
|
@@ -506,16 +552,16 @@ def process_audio_to_video(
|
|
| 506 |
|
| 507 |
video_clip = VideoClip(frame_function=frame_generator, duration=duration)
|
| 508 |
|
| 509 |
-
# ---
|
| 510 |
# If image clips were created, make the spectrogram layer 50% transparent.
|
| 511 |
if image_clips:
|
| 512 |
print("Applying 50% opacity to spectrogram layer.")
|
| 513 |
video_clip = video_clip.with_opacity(0.5)
|
| 514 |
-
|
| 515 |
# --- Use fractional progress (current/total) ---
|
| 516 |
-
progress(3 / TOTAL_STEPS, desc=f"Stage 4/{TOTAL_STEPS}: Rendering Base Video
|
| 517 |
|
| 518 |
-
# ---
|
| 519 |
audio_clip = AudioFileClip(temp_audio_path)
|
| 520 |
|
| 521 |
# --- Clip Composition ---
|
|
@@ -542,7 +588,7 @@ def process_audio_to_video(
|
|
| 542 |
audio_bitrate="320k", fps=RENDER_FPS,
|
| 543 |
logger='bar', threads=os.cpu_count(), preset='ultrafast')
|
| 544 |
print("High-quality AAC audio encoding complete.")
|
| 545 |
-
|
| 546 |
final_clip.close()
|
| 547 |
|
| 548 |
# Step 2: Use FFmpeg to quickly increase the framerate to 24 FPS
|
|
@@ -550,8 +596,8 @@ def process_audio_to_video(
|
|
| 550 |
|
| 551 |
# --- Use fractional progress (current/total) ---
|
| 552 |
progress(4 / TOTAL_STEPS, desc=f"Stage 5/{TOTAL_STEPS}: Finalizing Video")
|
| 553 |
-
|
| 554 |
-
# ---
|
| 555 |
increase_video_framerate(temp_fps1_path, final_output_path, target_fps=PLAYBACK_FPS)
|
| 556 |
|
| 557 |
return final_output_path
|
|
@@ -573,29 +619,63 @@ with gr.Blocks(title="Spectrogram Video Generator") as iface:
|
|
| 573 |
with gr.Column(scale=1):
|
| 574 |
# --- Changed to gr.Files for multi-upload ---
|
| 575 |
audio_inputs = gr.Files(
|
| 576 |
-
label="Upload Audio File(s)",
|
| 577 |
file_count="multiple",
|
| 578 |
file_types=["audio"]
|
| 579 |
)
|
| 580 |
|
| 581 |
-
# --- Image
|
| 582 |
-
gr.
|
| 583 |
-
|
| 584 |
-
|
| 585 |
-
|
| 586 |
-
|
| 587 |
-
|
| 588 |
-
|
| 589 |
-
|
| 590 |
-
|
| 591 |
-
|
| 592 |
-
|
| 593 |
-
|
| 594 |
-
|
| 595 |
-
|
| 596 |
-
|
| 597 |
-
|
| 598 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 599 |
with gr.Accordion("Visualizer Options", open=True):
|
| 600 |
with gr.Row():
|
| 601 |
width_input = gr.Number(value=1920, label="Video Width (px)", precision=0)
|
|
@@ -611,7 +691,7 @@ with gr.Blocks(title="Spectrogram Video Generator") as iface:
|
|
| 611 |
# --- Checkbox for number formatting ---
|
| 612 |
format_double_digits_checkbox = gr.Checkbox(label="Format track numbers as double digits (e.g., 01, 05-09)", value=True)
|
| 613 |
gr.Markdown("If the CUE sheet or filenames contain non-English characters, please select a compatible font.")
|
| 614 |
-
|
| 615 |
# Define a priority list for default fonts, starting with common Japanese ones.
|
| 616 |
# This list can include multiple names for the same font to improve matching.
|
| 617 |
preferred_fonts = [
|
|
@@ -634,15 +714,15 @@ with gr.Blocks(title="Spectrogram Video Generator") as iface:
|
|
| 634 |
default_font = FONT_DISPLAY_NAMES[0]
|
| 635 |
|
| 636 |
font_name_dd = gr.Dropdown(choices=FONT_DISPLAY_NAMES, value=default_font, label="Font Family")
|
| 637 |
-
|
| 638 |
with gr.Row():
|
| 639 |
font_size_slider = gr.Slider(minimum=12, maximum=256, value=80, step=1, label="Font Size")
|
| 640 |
font_color_picker = gr.ColorPicker(value="#FFFFFF", label="Font Color")
|
| 641 |
-
|
| 642 |
with gr.Row():
|
| 643 |
font_bg_color_picker = gr.ColorPicker(value="#000000", label="Text BG Color")
|
| 644 |
font_bg_alpha_slider = gr.Slider(minimum=0.0, maximum=1.0, value=0.6, step=0.05, label="Text BG Opacity")
|
| 645 |
-
|
| 646 |
gr.Markdown("Text Position")
|
| 647 |
with gr.Row():
|
| 648 |
pos_h_radio = gr.Radio(["left", "center", "right"], value="center", label="Horizontal Align")
|
|
@@ -652,20 +732,39 @@ with gr.Blocks(title="Spectrogram Video Generator") as iface:
|
|
| 652 |
|
| 653 |
with gr.Column(scale=2):
|
| 654 |
video_output = gr.Video(label="Generated Video")
|
| 655 |
-
|
| 656 |
-
# ---
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 657 |
submit_btn.click(
|
| 658 |
fn=process_audio_to_video,
|
| 659 |
-
inputs=
|
| 660 |
-
|
| 661 |
-
|
| 662 |
-
width_input, height_input,
|
| 663 |
-
fg_color, bg_color,
|
| 664 |
-
font_name_dd, font_size_slider, font_color_picker,
|
| 665 |
-
font_bg_color_picker, font_bg_alpha_slider,
|
| 666 |
-
pos_h_radio, pos_v_radio
|
| 667 |
-
],
|
| 668 |
-
outputs=video_output
|
| 669 |
)
|
| 670 |
|
| 671 |
if __name__ == "__main__":
|
|
|
|
| 9 |
import soundfile as sf
|
| 10 |
import matplotlib.font_manager as fm
|
| 11 |
from PIL import ImageFont
|
| 12 |
+
from typing import Tuple, List, Dict, Set
|
| 13 |
from mutagen.flac import FLAC
|
| 14 |
from moviepy import CompositeVideoClip, TextClip, VideoClip, AudioFileClip, ImageClip
|
| 15 |
|
|
|
|
| 28 |
elif platform_id == 1 and encoding_id == 0: # Macintosh, Roman
|
| 29 |
return name_bytes.decode('mac_roman').strip('\x00')
|
| 30 |
elif platform_id == 0: # Unicode
|
| 31 |
+
return name_bytes.decode('utf_16_be').strip('\x00')
|
| 32 |
else: # Fallback
|
| 33 |
return name_bytes.decode('utf_8', errors='ignore').strip('\x00')
|
| 34 |
except Exception:
|
|
|
|
| 36 |
|
| 37 |
try:
|
| 38 |
with open(font_path, 'rb') as f: data = f.read()
|
| 39 |
+
def read_ushort(offset):
|
| 40 |
+
return struct.unpack('>H', data[offset:offset+2])[0]
|
| 41 |
+
def read_ulong(offset):
|
| 42 |
+
return struct.unpack('>I', data[offset:offset+4])[0]
|
| 43 |
font_offsets = [0]
|
| 44 |
# Check for TTC (TrueType Collection) header
|
| 45 |
if data[:4] == b'ttcf':
|
|
|
|
| 48 |
|
| 49 |
# For simplicity, we only parse the first font in a TTC
|
| 50 |
font_offset = font_offsets[0]
|
| 51 |
+
|
| 52 |
num_tables = read_ushort(font_offset + 4)
|
| 53 |
name_table_offset = -1
|
| 54 |
# Locate the 'name' table
|
|
|
|
| 56 |
entry_offset = font_offset + 12 + i * 16
|
| 57 |
tag = data[entry_offset:entry_offset+4]
|
| 58 |
if tag == b'name':
|
| 59 |
+
name_table_offset = read_ulong(entry_offset + 8)
|
| 60 |
+
break
|
| 61 |
+
|
| 62 |
+
if name_table_offset == -1:
|
| 63 |
+
return None, None
|
| 64 |
+
|
| 65 |
count, string_offset = read_ushort(name_table_offset + 2), read_ushort(name_table_offset + 4)
|
| 66 |
name_candidates = {}
|
| 67 |
# Iterate through all name records
|
| 68 |
for i in range(count):
|
| 69 |
rec_offset = name_table_offset + 6 + i * 12
|
| 70 |
platform_id, encoding_id, language_id, name_id, length, offset = struct.unpack('>HHHHHH', data[rec_offset:rec_offset+12])
|
| 71 |
+
|
| 72 |
if name_id == 4: # We only care about the "Full Font Name"
|
| 73 |
string_pos = name_table_offset + string_offset + offset
|
| 74 |
value = decode_name_string(data[string_pos : string_pos + length], platform_id, encoding_id)
|
| 75 |
+
|
| 76 |
if value:
|
| 77 |
# Store candidates based on language ID
|
| 78 |
+
if language_id in [1028, 2052, 3076, 4100, 5124]:
|
| 79 |
+
name_candidates["zh"] = value
|
| 80 |
+
elif language_id == 1041:
|
| 81 |
+
name_candidates["ja"] = value
|
| 82 |
+
elif language_id == 1042:
|
| 83 |
+
name_candidates["ko"] = value
|
| 84 |
+
elif language_id in [1033, 0]:
|
| 85 |
+
name_candidates["en"] = value
|
| 86 |
else:
|
| 87 |
+
if "other" not in name_candidates:
|
| 88 |
+
name_candidates["other"] = value
|
| 89 |
+
|
| 90 |
# Return the best candidate based on language priority
|
| 91 |
+
if name_candidates.get("zh"):
|
| 92 |
+
return name_candidates.get("zh"), "zh"
|
| 93 |
+
if name_candidates.get("ja"):
|
| 94 |
+
return name_candidates.get("ja"), "ja"
|
| 95 |
+
if name_candidates.get("ko"):
|
| 96 |
+
return name_candidates.get("ko"), "ko"
|
| 97 |
+
if name_candidates.get("other"):
|
| 98 |
+
return name_candidates.get("other"), "other"
|
| 99 |
+
if name_candidates.get("en"):
|
| 100 |
+
return name_candidates.get("en"), "en"
|
| 101 |
return None, None
|
| 102 |
+
|
| 103 |
except Exception:
|
| 104 |
return None, None
|
| 105 |
|
|
|
|
| 119 |
for path in all_font_files:
|
| 120 |
display_name, lang_tag = get_font_display_name(path)
|
| 121 |
is_fallback = display_name is None
|
| 122 |
+
|
| 123 |
if is_fallback:
|
| 124 |
# Create a fallback name from the filename
|
| 125 |
display_name = os.path.splitext(os.path.basename(path))[0].replace('-', ' ').replace('_', ' ').title()
|
| 126 |
lang_tag = 'fallback'
|
| 127 |
+
|
| 128 |
if display_name and display_name not in font_map:
|
| 129 |
font_map[display_name] = path
|
| 130 |
found_names.append((display_name, is_fallback, lang_tag))
|
| 131 |
+
|
| 132 |
# Define sort priority for languages
|
| 133 |
sort_order = {'zh': 0, 'ja': 1, 'ko': 2, 'en': 3, 'other': 4, 'fallback': 5}
|
| 134 |
|
| 135 |
# Sort by priority, then alphabetically
|
| 136 |
found_names.sort(key=lambda x: (sort_order.get(x[2], 99), x[0]))
|
| 137 |
+
|
| 138 |
sorted_display_names = [name for name, _, _ in found_names]
|
| 139 |
return font_map, sorted_display_names
|
| 140 |
|
|
|
|
| 201 |
'-c:a', 'copy', # Copy audio without re-encoding
|
| 202 |
output_path
|
| 203 |
]
|
| 204 |
+
|
| 205 |
try:
|
| 206 |
# Execute the command
|
| 207 |
# Using capture_output to hide ffmpeg logs from the main console unless an error occurs
|
|
|
|
| 216 |
raise gr.Error(f"FFmpeg failed to increase the framerate. See console for details. Error: {e.stderr}")
|
| 217 |
|
| 218 |
|
| 219 |
+
# --- HELPER FUNCTION for parsing track ranges ---
|
| 220 |
+
def parse_track_ranges(range_str: str) -> Set[int]:
|
| 221 |
+
"""Parses a string like '1-4, 7, 10-13' into a set of integers."""
|
| 222 |
+
if not range_str:
|
| 223 |
+
return set()
|
| 224 |
+
|
| 225 |
+
indices = set()
|
| 226 |
+
parts = range_str.split(',')
|
| 227 |
+
for part in parts:
|
| 228 |
+
part = part.strip()
|
| 229 |
+
if not part:
|
| 230 |
+
continue
|
| 231 |
+
if '-' in part:
|
| 232 |
+
try:
|
| 233 |
+
start, end = map(int, part.split('-'))
|
| 234 |
+
indices.update(range(start, end + 1))
|
| 235 |
+
except ValueError:
|
| 236 |
+
print(f"Warning: Could not parse range '{part}'. Skipping.")
|
| 237 |
+
else:
|
| 238 |
+
try:
|
| 239 |
+
indices.add(int(part))
|
| 240 |
+
except ValueError:
|
| 241 |
+
print(f"Warning: Could not parse track number '{part}'. Skipping.")
|
| 242 |
+
return indices
|
| 243 |
+
|
| 244 |
+
|
| 245 |
# --- Main Processing Function ---
|
| 246 |
+
def process_audio_to_video(*args, progress=gr.Progress(track_tqdm=True)):
|
| 247 |
+
# --- Correctly unpack all arguments from *args using slicing ---
|
| 248 |
+
MAX_GROUPS = 10 # This MUST match the UI definition
|
| 249 |
+
|
| 250 |
+
# Define the structure of the *args tuple based on the `all_inputs` list
|
| 251 |
+
audio_files = args[0]
|
| 252 |
+
|
| 253 |
+
# Slice the args tuple to get the continuous blocks of inputs
|
| 254 |
+
all_track_strs = args[1 : 1 + MAX_GROUPS]
|
| 255 |
+
all_image_lists = args[1 + MAX_GROUPS : 1 + MAX_GROUPS * 2]
|
| 256 |
+
|
| 257 |
+
# Group inputs are packed in pairs (track_str, image_list)
|
| 258 |
+
group_definitions = []
|
| 259 |
+
for i in range(MAX_GROUPS):
|
| 260 |
+
group_definitions.append({
|
| 261 |
+
"tracks_str": all_track_strs[i],
|
| 262 |
+
"images": all_image_lists[i]
|
| 263 |
+
})
|
| 264 |
+
|
| 265 |
+
# Unpack the remaining arguments with correct indexing
|
| 266 |
+
arg_offset = 1 + MAX_GROUPS * 2
|
| 267 |
+
fallback_images = args[arg_offset]
|
| 268 |
+
format_double_digits = args[arg_offset + 1]
|
| 269 |
+
video_width = args[arg_offset + 2]
|
| 270 |
+
video_height = args[arg_offset + 3]
|
| 271 |
+
spec_fg_color = args[arg_offset + 4]
|
| 272 |
+
spec_bg_color = args[arg_offset + 5]
|
| 273 |
+
font_name = args[arg_offset + 6]
|
| 274 |
+
font_size = args[arg_offset + 7]
|
| 275 |
+
font_color = args[arg_offset + 8]
|
| 276 |
+
font_bg_color = args[arg_offset + 9]
|
| 277 |
+
font_bg_alpha = args[arg_offset + 10]
|
| 278 |
+
pos_h = args[arg_offset + 11]
|
| 279 |
+
pos_v = args[arg_offset + 12]
|
| 280 |
+
|
| 281 |
if not audio_files:
|
| 282 |
raise gr.Error("Please upload at least one audio file.")
|
| 283 |
if not font_name:
|
| 284 |
raise gr.Error("Please select a font from the list.")
|
| 285 |
|
| 286 |
progress(0, desc="Initializing...")
|
| 287 |
+
|
| 288 |
# Define paths for temporary and final files
|
| 289 |
timestamp = int(time.time())
|
| 290 |
temp_fps1_path = f"temp_{timestamp}_fps1.mp4"
|
|
|
|
| 318 |
raise ValueError(f"Could not parse rgb color string: {color_str}")
|
| 319 |
else:
|
| 320 |
raise ValueError(f"Unknown color format: {color_str}")
|
| 321 |
+
|
| 322 |
# Use the new robust parser for all color inputs
|
| 323 |
fg_rgb, bg_rgb = parse_color_to_rgb(spec_fg_color), parse_color_to_rgb(spec_bg_color)
|
| 324 |
grid_rgb = tuple(min(c + 40, 255) for c in bg_rgb)
|
|
|
|
| 328 |
# --- Define total steps for the progress bar ---
|
| 329 |
TOTAL_STEPS = 5
|
| 330 |
|
| 331 |
+
# --- Stage 1: Audio Processing & Master Track List Creation ---
|
| 332 |
+
master_track_list, y_accumulator, current_sr = [], [], None
|
| 333 |
+
total_duration, global_track_counter = 0.0, 0
|
|
|
|
|
|
|
| 334 |
|
| 335 |
# --- Use `progress.tqdm` to create a progress bar for this loop ---
|
| 336 |
for file_idx, audio_path in enumerate(progress.tqdm(audio_files, desc=f"Stage 1/{TOTAL_STEPS}: Analyzing Audio Files")):
|
|
|
|
| 363 |
|
| 364 |
print(f"Successfully parsed {len(cue_tracks)} tracks from CUE sheet.")
|
| 365 |
except Exception as e:
|
| 366 |
+
print(f"Warning: Could not parse CUE sheet for {os.path.basename(audio_path)}: {e}")
|
| 367 |
+
|
| 368 |
+
if cue_tracks:
|
| 369 |
+
for track_idx, track in enumerate(cue_tracks):
|
| 370 |
+
global_track_counter += 1
|
| 371 |
+
start_time = track.get('start_time', 0)
|
| 372 |
+
end_time = cue_tracks[track_idx+1].get('start_time', file_duration) if track_idx + 1 < len(cue_tracks) else file_duration
|
| 373 |
+
master_track_list.append({"global_index": global_track_counter, "title": track.get('title', 'Unknown'), "start_time": total_duration + start_time, "end_time": total_duration + end_time})
|
| 374 |
+
else:
|
| 375 |
+
global_track_counter += 1
|
| 376 |
+
master_track_list.append({"global_index": global_track_counter, "title": os.path.splitext(os.path.basename(audio_path))[0], "start_time": total_duration, "end_time": total_duration + file_duration})
|
| 377 |
+
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 378 |
total_duration += file_duration
|
| 379 |
+
|
| 380 |
# --- Concatenate along the time axis (axis=1) for stereo arrays ---
|
| 381 |
y_combined = np.concatenate(y_accumulator, axis=1)
|
| 382 |
duration = total_duration
|
|
|
|
| 384 |
# --- Transpose the array for soundfile to write stereo correctly ---
|
| 385 |
sf.write(temp_audio_path, y_combined.T, current_sr)
|
| 386 |
print(f"Combined all audio files into one. Total duration: {duration:.2f}s")
|
| 387 |
+
|
| 388 |
# --- Update progress to the next stage, use fractional progress (current/total) ---
|
| 389 |
+
progress(1 / TOTAL_STEPS, desc=f"Stage 2/{TOTAL_STEPS}: Mapping Images to Tracks")
|
| 390 |
|
| 391 |
+
# --- Stage 2: Map Tracks to Image Groups ---
|
| 392 |
+
parsed_groups = [parse_track_ranges(g['tracks_str']) for g in group_definitions]
|
| 393 |
+
track_to_images_map = {}
|
| 394 |
+
for track_info in master_track_list:
|
| 395 |
+
track_idx = track_info['global_index']
|
| 396 |
+
assigned = False
|
| 397 |
+
for i, group_indices in enumerate(parsed_groups):
|
| 398 |
+
if track_idx in group_indices:
|
| 399 |
+
track_to_images_map[track_idx] = group_definitions[i]['images']
|
| 400 |
+
assigned = True
|
| 401 |
+
break
|
| 402 |
+
if not assigned:
|
| 403 |
+
track_to_images_map[track_idx] = fallback_images
|
| 404 |
+
|
| 405 |
+
# --- Stage 3: Generate ImageClips based on contiguous blocks ---
|
| 406 |
+
image_clips = []
|
| 407 |
+
if any(track_to_images_map.values()):
|
| 408 |
+
current_track_cursor = 0
|
| 409 |
+
while current_track_cursor < len(master_track_list):
|
| 410 |
+
start_track_info = master_track_list[current_track_cursor]
|
| 411 |
+
image_set_for_block = track_to_images_map.get(start_track_info['global_index'])
|
| 412 |
+
|
| 413 |
+
# Find the end of the contiguous block of tracks that use the same image set
|
| 414 |
+
end_track_cursor = current_track_cursor
|
| 415 |
+
while (end_track_cursor + 1 < len(master_track_list) and
|
| 416 |
+
track_to_images_map.get(master_track_list[end_track_cursor + 1]['global_index']) == image_set_for_block):
|
| 417 |
+
end_track_cursor += 1
|
| 418 |
+
|
| 419 |
+
end_track_info = master_track_list[end_track_cursor]
|
| 420 |
+
|
| 421 |
+
block_start_time = start_track_info['start_time']
|
| 422 |
+
block_end_time = end_track_info['end_time']
|
| 423 |
+
block_duration = block_end_time - block_start_time
|
| 424 |
+
|
| 425 |
+
if image_set_for_block and block_duration > 0:
|
| 426 |
+
print(f"Creating image block for tracks {start_track_info['global_index']}-{end_track_info['global_index']} (Time: {block_start_time:.2f}s - {block_end_time:.2f}s)")
|
| 427 |
+
time_per_image = block_duration / len(image_set_for_block)
|
| 428 |
+
for i, img_path in enumerate(image_set_for_block):
|
| 429 |
+
def create_image_layer(path, start, dur):
    """Build a full-frame, centered background layer from a single image file.

    The image is uniformly scaled to fit entirely inside the video frame
    (letterboxed), centered on a (WIDTH, HEIGHT) canvas, and scheduled to
    appear at `start` for `dur` seconds. Returns None if the image cannot
    be loaded or processed, so the caller can simply skip it.
    """
    try:
        source = ImageClip(path)
        # Fit-inside scale factor: the smaller ratio guarantees the
        # whole image remains visible within the frame.
        fit_factor = min(WIDTH / source.w, HEIGHT / source.h)
        centered = source.resized(fit_factor).with_position("center")
        layer = CompositeVideoClip([centered], size=(WIDTH, HEIGHT))
        return layer.with_duration(dur).with_start(start)
    except Exception as e:
        print(f"Warning: Failed to process image '{path}'. Skipping. Error: {e}")
        return None
|
| 438 |
+
|
| 439 |
+
clip = create_image_layer(img_path, block_start_time + i * time_per_image, time_per_image)
|
| 440 |
+
if clip:
|
| 441 |
+
image_clips.append(clip)
|
| 442 |
+
|
| 443 |
+
current_track_cursor = end_track_cursor + 1
|
| 444 |
+
|
| 445 |
+
progress(2 / TOTAL_STEPS, desc=f"Stage 3/{TOTAL_STEPS}: Generating Text & Spectrogram")
|
| 446 |
+
|
| 447 |
+
# --- Stage 4: Generate Text and Spectrogram ---
|
| 448 |
+
# --- Text Overlay Logic using the aggregated track info
|
| 449 |
+
text_clips = [] # Text clips are now simpler as they don't depend on complex file logic anymore
|
| 450 |
+
|
| 451 |
+
font_path = SYSTEM_FONTS_MAP.get(font_name)
|
| 452 |
+
if not font_path:
|
| 453 |
+
raise gr.Error(f"Font path for '{font_name}' not found!")
|
| 454 |
+
|
| 455 |
+
# Use the robust parser for text colors as well
|
| 456 |
+
font_bg_rgb = parse_color_to_rgb(font_bg_color)
|
| 457 |
|
| 458 |
+
position = (pos_h.lower(), pos_v.lower())
|
| 459 |
+
|
| 460 |
+
print(f"Using font: {font_name}, Size: {font_size}, Position: {position}")
|
| 461 |
|
| 462 |
+
# Create the RGBA tuple for the background color.
|
| 463 |
+
# The alpha value is converted from a 0.0-1.0 float to a 0-255 integer.
|
| 464 |
+
bg_color_tuple = (font_bg_rgb[0], font_bg_rgb[1], font_bg_rgb[2], int(font_bg_alpha * 255))
|
| 465 |
+
|
| 466 |
+
# 1. Define a maximum width for the caption. 90% of the video width is a good choice.
|
| 467 |
+
caption_width = int(WIDTH * 0.9)
|
| 468 |
|
| 469 |
+
# --- Get font metrics to calculate dynamic padding ---
|
| 470 |
+
try:
|
| 471 |
+
# Load the font with Pillow to access its metrics
|
| 472 |
+
pil_font = ImageFont.truetype(font_path, size=font_size)
|
| 473 |
+
_, descent = pil_font.getmetrics()
|
| 474 |
+
# Calculate a bottom margin to compensate for the font's descent.
|
| 475 |
+
# A small constant is added as a safety buffer.
|
| 476 |
+
# This prevents clipping on fonts with large descenders (like 'g', 'p').
|
| 477 |
+
bottom_margin = int(descent * 0.5) + 2
|
| 478 |
+
print(f"Font '{font_name}' descent: {descent}. Applying dynamic bottom margin of {bottom_margin}px.")
|
| 479 |
+
except Exception as e:
|
| 480 |
+
# Fallback in case of any font loading error
|
| 481 |
+
print(f"Warning: Could not get font metrics for '{font_name}'. Using fixed margin. Error: {e}")
|
| 482 |
+
bottom_margin = int(WIDTH * 0.01) # A small fixed fallback
|
| 483 |
|
| 484 |
+
for track in master_track_list:
|
| 485 |
+
text_duration = track['end_time'] - track['start_time']
|
| 486 |
+
if text_duration <= 0:
|
| 487 |
+
continue
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 488 |
|
| 489 |
+
# Construct display text based on pre-formatted number string
|
| 490 |
+
num_str = f"{track['global_index']:02d}" if format_double_digits else str(track['global_index'])
|
| 491 |
+
display_text = f"{num_str}. {track['title']}"
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 492 |
|
|
|
|
|
|
|
|
|
|
|
|
|
| 493 |
|
| 494 |
+
# 1. Create the TextClip first without positioning to get its size
|
| 495 |
+
txt_clip = TextClip(
|
| 496 |
+
text=display_text.strip(),
|
| 497 |
+
font_size=font_size,
|
| 498 |
+
color=font_color,
|
| 499 |
+
font=font_path,
|
| 500 |
+
bg_color=bg_color_tuple,
|
| 501 |
+
method='caption', # <-- Set method to caption
|
| 502 |
+
size=(caption_width, None), # <-- Provide size for wrapping
|
| 503 |
+
margin=(0, 0, 0, bottom_margin)
|
| 504 |
+
).with_position(position).with_duration(text_duration).with_start(track['start_time'])
|
| 505 |
+
|
| 506 |
+
text_clips.append(txt_clip)
|
| 507 |
+
|
| 508 |
+
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 509 |
|
| 510 |
N_FFT, HOP_LENGTH, N_BANDS = 2048, 512, 32
|
| 511 |
MIN_DB, MAX_DB = -80.0, 0.0
|
|
|
|
| 552 |
|
| 553 |
video_clip = VideoClip(frame_function=frame_generator, duration=duration)
|
| 554 |
|
| 555 |
+
# --- Set Spectrogram Opacity ---
|
| 556 |
# If image clips were created, make the spectrogram layer 50% transparent.
|
| 557 |
if image_clips:
|
| 558 |
print("Applying 50% opacity to spectrogram layer.")
|
| 559 |
video_clip = video_clip.with_opacity(0.5)
|
| 560 |
+
|
| 561 |
# --- Use fractional progress (current/total) ---
|
| 562 |
+
progress(3 / TOTAL_STEPS, desc=f"Stage 4/{TOTAL_STEPS}: Rendering Base Video")
|
| 563 |
|
| 564 |
+
# --- Composition and Rendering ---
|
| 565 |
audio_clip = AudioFileClip(temp_audio_path)
|
| 566 |
|
| 567 |
# --- Clip Composition ---
|
|
|
|
| 588 |
audio_bitrate="320k", fps=RENDER_FPS,
|
| 589 |
logger='bar', threads=os.cpu_count(), preset='ultrafast')
|
| 590 |
print("High-quality AAC audio encoding complete.")
|
| 591 |
+
|
| 592 |
final_clip.close()
|
| 593 |
|
| 594 |
# Step 2: Use FFmpeg to quickly increase the framerate to 24 FPS
|
|
|
|
| 596 |
|
| 597 |
# --- Use fractional progress (current/total) ---
|
| 598 |
progress(4 / TOTAL_STEPS, desc=f"Stage 5/{TOTAL_STEPS}: Finalizing Video")
|
| 599 |
+
|
| 600 |
+
# --- Finalizing ---
|
| 601 |
increase_video_framerate(temp_fps1_path, final_output_path, target_fps=PLAYBACK_FPS)
|
| 602 |
|
| 603 |
return final_output_path
|
|
|
|
| 619 |
with gr.Column(scale=1):
|
| 620 |
# --- Changed to gr.Files for multi-upload ---
|
| 621 |
audio_inputs = gr.Files(
|
| 622 |
+
label="Upload Audio File(s)",
|
| 623 |
file_count="multiple",
|
| 624 |
file_types=["audio"]
|
| 625 |
)
|
| 626 |
|
| 627 |
+
# --- Grouped Image Section ---
|
| 628 |
+
with gr.Accordion("Grouped Image Backgrounds (Advanced)", open=False):
|
| 629 |
+
gr.Markdown("Define groups of tracks and assign specific images to them. Tracks are numbered globally starting from 1 across all uploaded files.")
|
| 630 |
+
|
| 631 |
+
MAX_GROUPS = 10
|
| 632 |
+
group_track_inputs = []
|
| 633 |
+
group_image_inputs = []
|
| 634 |
+
group_accordions = []
|
| 635 |
+
|
| 636 |
+
# --- Create a centralized update function ---
|
| 637 |
+
def update_group_visibility(target_count: int):
    """Show the first `target_count` group accordions and sync the control buttons.

    Returns a Gradio component->update mapping covering the visible-count
    state, every group accordion, and both add/remove buttons.
    """
    # Keep the requested count inside the valid range [1, MAX_GROUPS].
    clamped = min(max(target_count, 1), MAX_GROUPS)

    # Accordions before the cutoff are shown; the rest are hidden.
    result = {
        acc: gr.update(visible=(idx < clamped))
        for idx, acc in enumerate(group_accordions)
    }
    result[visible_groups_state] = clamped
    # "+" disappears once the cap is reached; "-" is disabled when
    # only a single group remains visible.
    result[add_group_btn] = gr.update(visible=(clamped < MAX_GROUPS))
    result[remove_group_btn] = gr.update(interactive=(clamped > 1))
    return result
|
| 652 |
+
|
| 653 |
+
# --- Create simple wrapper functions for adding and removing ---
|
| 654 |
+
def add_group(current_count: int):
    """'+ Add Image Group' handler: reveal one more group definition."""
    desired = current_count + 1
    return update_group_visibility(desired)
|
| 656 |
+
|
| 657 |
+
def remove_group(current_count: int):
    """'- Remove Last Group' handler: hide the last visible group definition."""
    desired = current_count - 1
    return update_group_visibility(desired)
|
| 659 |
+
|
| 660 |
+
# Pre-build all group components
|
| 661 |
+
for i in range(MAX_GROUPS):
|
| 662 |
+
with gr.Accordion(f"Image Group {i+1}", visible=(i==0)) as acc:
|
| 663 |
+
track_input = gr.Textbox(label=f"Tracks for Group {i+1} (e.g., '1-4, 7')")
|
| 664 |
+
image_input = gr.Files(label=f"Images for Group {i+1}", file_count="multiple", file_types=[".png", ".jpg", ".jpeg", ".webp", ".avif"])
|
| 665 |
+
group_track_inputs.append(track_input)
|
| 666 |
+
group_image_inputs.append(image_input)
|
| 667 |
+
group_accordions.append(acc)
|
| 668 |
+
|
| 669 |
+
visible_groups_state = gr.State(1)
|
| 670 |
+
# --- Add a remove button and put both in a row ---
|
| 671 |
+
with gr.Row():
|
| 672 |
+
remove_group_btn = gr.Button("- Remove Last Group", variant="secondary", interactive=False)
|
| 673 |
+
add_group_btn = gr.Button("+ Add Image Group", variant="secondary")
|
| 674 |
+
|
| 675 |
+
with gr.Accordion("Fallback / Default Images", open=True):
|
| 676 |
+
gr.Markdown("These images will be used for any tracks not assigned to a specific group above.")
|
| 677 |
+
fallback_image_input = gr.Files(label="Fallback Images", file_count="multiple", file_types=[".png", ".jpg", ".jpeg", ".webp", ".avif"])
|
| 678 |
+
|
| 679 |
with gr.Accordion("Visualizer Options", open=True):
|
| 680 |
with gr.Row():
|
| 681 |
width_input = gr.Number(value=1920, label="Video Width (px)", precision=0)
|
|
|
|
| 691 |
# --- Checkbox for number formatting ---
|
| 692 |
format_double_digits_checkbox = gr.Checkbox(label="Format track numbers as double digits (e.g., 01, 05-09)", value=True)
|
| 693 |
gr.Markdown("If the CUE sheet or filenames contain non-English characters, please select a compatible font.")
|
| 694 |
+
|
| 695 |
# Define a priority list for default fonts, starting with common Japanese ones.
|
| 696 |
# This list can include multiple names for the same font to improve matching.
|
| 697 |
preferred_fonts = [
|
|
|
|
| 714 |
default_font = FONT_DISPLAY_NAMES[0]
|
| 715 |
|
| 716 |
font_name_dd = gr.Dropdown(choices=FONT_DISPLAY_NAMES, value=default_font, label="Font Family")
|
| 717 |
+
|
| 718 |
with gr.Row():
|
| 719 |
font_size_slider = gr.Slider(minimum=12, maximum=256, value=80, step=1, label="Font Size")
|
| 720 |
font_color_picker = gr.ColorPicker(value="#FFFFFF", label="Font Color")
|
| 721 |
+
|
| 722 |
with gr.Row():
|
| 723 |
font_bg_color_picker = gr.ColorPicker(value="#000000", label="Text BG Color")
|
| 724 |
font_bg_alpha_slider = gr.Slider(minimum=0.0, maximum=1.0, value=0.6, step=0.05, label="Text BG Opacity")
|
| 725 |
+
|
| 726 |
gr.Markdown("Text Position")
|
| 727 |
with gr.Row():
|
| 728 |
pos_h_radio = gr.Radio(["left", "center", "right"], value="center", label="Horizontal Align")
|
|
|
|
| 732 |
|
| 733 |
with gr.Column(scale=2):
|
| 734 |
video_output = gr.Video(label="Generated Video")
|
| 735 |
+
|
| 736 |
+
# --- Define the full list of outputs for the update functions ---
|
| 737 |
+
group_update_outputs = [visible_groups_state, add_group_btn, remove_group_btn] + group_accordions
|
| 738 |
+
|
| 739 |
+
# Connect the "Add Group" button to its update function
|
| 740 |
+
add_group_btn.click(
|
| 741 |
+
fn=add_group,
|
| 742 |
+
inputs=visible_groups_state,
|
| 743 |
+
outputs=group_update_outputs
|
| 744 |
+
)
|
| 745 |
+
|
| 746 |
+
remove_group_btn.click(
|
| 747 |
+
fn=remove_group,
|
| 748 |
+
inputs=visible_groups_state,
|
| 749 |
+
outputs=group_update_outputs
|
| 750 |
+
)
|
| 751 |
+
|
| 752 |
+
# --- Define the master list of all inputs for the main button ---
|
| 753 |
+
all_inputs = [audio_inputs] + group_track_inputs + group_image_inputs + [
|
| 754 |
+
fallback_image_input,
|
| 755 |
+
format_double_digits_checkbox,
|
| 756 |
+
width_input, height_input,
|
| 757 |
+
fg_color, bg_color,
|
| 758 |
+
font_name_dd, font_size_slider, font_color_picker,
|
| 759 |
+
font_bg_color_picker, font_bg_alpha_slider,
|
| 760 |
+
pos_h_radio, pos_v_radio
|
| 761 |
+
]
|
| 762 |
+
|
| 763 |
submit_btn.click(
|
| 764 |
fn=process_audio_to_video,
|
| 765 |
+
inputs=all_inputs,
|
| 766 |
+
outputs=video_output,
|
| 767 |
+
show_progress="full"
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 768 |
)
|
| 769 |
|
| 770 |
if __name__ == "__main__":
|