feat(ui, core): Implement advanced grouped image backgrounds
This major refactoring introduces a powerful new feature allowing users to assign specific sets of images to designated groups of tracks. This transforms the application from a linear visualizer into a tool capable of creating dynamic, context-aware videos with thematic sections.
The implementation required a complete overhaul of the UI for image uploads and a fundamental rewrite of the backend logic for track processing and image distribution.
Dynamic Group Management:
The single image uploader has been replaced with a dynamic interface for defining up to 10 distinct image groups.
Users can now click "+ Add Image Group" and "- Remove Last Group" buttons to manage the number of visible group definitions.
This is simulated by managing the visibility of a pre-defined maximum number of groups, providing a seamless user experience.
Group Definition:
Each group consists of a Textbox for defining track ranges (e.g., "1-4, 7, 10-13") and a dedicated Files uploader for that group's specific images.
Fallback Images:
A separate "Fallback / Default Images" uploader is provided for any tracks that are not explicitly assigned to a group.
|
@@ -9,7 +9,7 @@ import subprocess
|
|
| 9 |
import soundfile as sf
|
| 10 |
import matplotlib.font_manager as fm
|
| 11 |
from PIL import ImageFont
|
| 12 |
-
from typing import Tuple, List, Dict
|
| 13 |
from mutagen.flac import FLAC
|
| 14 |
from moviepy import CompositeVideoClip, TextClip, VideoClip, AudioFileClip, ImageClip
|
| 15 |
|
|
@@ -28,7 +28,7 @@ def get_font_display_name(font_path: str) -> Tuple[str, str]:
|
|
| 28 |
elif platform_id == 1 and encoding_id == 0: # Macintosh, Roman
|
| 29 |
return name_bytes.decode('mac_roman').strip('\x00')
|
| 30 |
elif platform_id == 0: # Unicode
|
| 31 |
-
|
| 32 |
else: # Fallback
|
| 33 |
return name_bytes.decode('utf_8', errors='ignore').strip('\x00')
|
| 34 |
except Exception:
|
|
@@ -36,9 +36,10 @@ def get_font_display_name(font_path: str) -> Tuple[str, str]:
|
|
| 36 |
|
| 37 |
try:
|
| 38 |
with open(font_path, 'rb') as f: data = f.read()
|
| 39 |
-
def read_ushort(offset):
|
| 40 |
-
|
| 41 |
-
|
|
|
|
| 42 |
font_offsets = [0]
|
| 43 |
# Check for TTC (TrueType Collection) header
|
| 44 |
if data[:4] == b'ttcf':
|
|
@@ -47,7 +48,7 @@ def get_font_display_name(font_path: str) -> Tuple[str, str]:
|
|
| 47 |
|
| 48 |
# For simplicity, we only parse the first font in a TTC
|
| 49 |
font_offset = font_offsets[0]
|
| 50 |
-
|
| 51 |
num_tables = read_ushort(font_offset + 4)
|
| 52 |
name_table_offset = -1
|
| 53 |
# Locate the 'name' table
|
|
@@ -55,38 +56,50 @@ def get_font_display_name(font_path: str) -> Tuple[str, str]:
|
|
| 55 |
entry_offset = font_offset + 12 + i * 16
|
| 56 |
tag = data[entry_offset:entry_offset+4]
|
| 57 |
if tag == b'name':
|
| 58 |
-
name_table_offset = read_ulong(entry_offset + 8)
|
| 59 |
-
|
| 60 |
-
|
| 61 |
-
|
|
|
|
|
|
|
| 62 |
count, string_offset = read_ushort(name_table_offset + 2), read_ushort(name_table_offset + 4)
|
| 63 |
name_candidates = {}
|
| 64 |
# Iterate through all name records
|
| 65 |
for i in range(count):
|
| 66 |
rec_offset = name_table_offset + 6 + i * 12
|
| 67 |
platform_id, encoding_id, language_id, name_id, length, offset = struct.unpack('>HHHHHH', data[rec_offset:rec_offset+12])
|
| 68 |
-
|
| 69 |
if name_id == 4: # We only care about the "Full Font Name"
|
| 70 |
string_pos = name_table_offset + string_offset + offset
|
| 71 |
value = decode_name_string(data[string_pos : string_pos + length], platform_id, encoding_id)
|
| 72 |
-
|
| 73 |
if value:
|
| 74 |
# Store candidates based on language ID
|
| 75 |
-
if language_id in [1028, 2052, 3076, 4100, 5124]:
|
| 76 |
-
|
| 77 |
-
elif language_id ==
|
| 78 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
| 79 |
else:
|
| 80 |
-
if "other" not in name_candidates:
|
| 81 |
-
|
|
|
|
| 82 |
# Return the best candidate based on language priority
|
| 83 |
-
if name_candidates.get("zh"):
|
| 84 |
-
|
| 85 |
-
if name_candidates.get("
|
| 86 |
-
|
| 87 |
-
if name_candidates.get("
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 88 |
return None, None
|
| 89 |
-
|
| 90 |
except Exception:
|
| 91 |
return None, None
|
| 92 |
|
|
@@ -106,22 +119,22 @@ def get_font_data() -> Tuple[Dict[str, str], List[str]]:
|
|
| 106 |
for path in all_font_files:
|
| 107 |
display_name, lang_tag = get_font_display_name(path)
|
| 108 |
is_fallback = display_name is None
|
| 109 |
-
|
| 110 |
if is_fallback:
|
| 111 |
# Create a fallback name from the filename
|
| 112 |
display_name = os.path.splitext(os.path.basename(path))[0].replace('-', ' ').replace('_', ' ').title()
|
| 113 |
lang_tag = 'fallback'
|
| 114 |
-
|
| 115 |
if display_name and display_name not in font_map:
|
| 116 |
font_map[display_name] = path
|
| 117 |
found_names.append((display_name, is_fallback, lang_tag))
|
| 118 |
-
|
| 119 |
# Define sort priority for languages
|
| 120 |
sort_order = {'zh': 0, 'ja': 1, 'ko': 2, 'en': 3, 'other': 4, 'fallback': 5}
|
| 121 |
|
| 122 |
# Sort by priority, then alphabetically
|
| 123 |
found_names.sort(key=lambda x: (sort_order.get(x[2], 99), x[0]))
|
| 124 |
-
|
| 125 |
sorted_display_names = [name for name, _, _ in found_names]
|
| 126 |
return font_map, sorted_display_names
|
| 127 |
|
|
@@ -188,7 +201,7 @@ def increase_video_framerate(input_path: str, output_path: str, target_fps: int
|
|
| 188 |
'-c:a', 'copy', # Copy audio without re-encoding
|
| 189 |
output_path
|
| 190 |
]
|
| 191 |
-
|
| 192 |
try:
|
| 193 |
# Execute the command
|
| 194 |
# Using capture_output to hide ffmpeg logs from the main console unless an error occurs
|
|
@@ -203,24 +216,75 @@ def increase_video_framerate(input_path: str, output_path: str, target_fps: int
|
|
| 203 |
raise gr.Error(f"FFmpeg failed to increase the framerate. See console for details. Error: {e.stderr}")
|
| 204 |
|
| 205 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 206 |
# --- Main Processing Function ---
|
| 207 |
-
def process_audio_to_video(
|
| 208 |
-
|
| 209 |
-
|
| 210 |
-
|
| 211 |
-
|
| 212 |
-
|
| 213 |
-
|
| 214 |
-
|
| 215 |
-
|
| 216 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 217 |
if not audio_files:
|
| 218 |
raise gr.Error("Please upload at least one audio file.")
|
| 219 |
if not font_name:
|
| 220 |
raise gr.Error("Please select a font from the list.")
|
| 221 |
|
| 222 |
progress(0, desc="Initializing...")
|
| 223 |
-
|
| 224 |
# Define paths for temporary and final files
|
| 225 |
timestamp = int(time.time())
|
| 226 |
temp_fps1_path = f"temp_{timestamp}_fps1.mp4"
|
|
@@ -254,7 +318,7 @@ def process_audio_to_video(
|
|
| 254 |
raise ValueError(f"Could not parse rgb color string: {color_str}")
|
| 255 |
else:
|
| 256 |
raise ValueError(f"Unknown color format: {color_str}")
|
| 257 |
-
|
| 258 |
# Use the new robust parser for all color inputs
|
| 259 |
fg_rgb, bg_rgb = parse_color_to_rgb(spec_fg_color), parse_color_to_rgb(spec_bg_color)
|
| 260 |
grid_rgb = tuple(min(c + 40, 255) for c in bg_rgb)
|
|
@@ -264,11 +328,9 @@ def process_audio_to_video(
|
|
| 264 |
# --- Define total steps for the progress bar ---
|
| 265 |
TOTAL_STEPS = 5
|
| 266 |
|
| 267 |
-
# --- 1
|
| 268 |
-
|
| 269 |
-
total_duration = 0.0
|
| 270 |
-
y_accumulator = []
|
| 271 |
-
current_sr = None
|
| 272 |
|
| 273 |
# --- Use `progress.tqdm` to create a progress bar for this loop ---
|
| 274 |
for file_idx, audio_path in enumerate(progress.tqdm(audio_files, desc=f"Stage 1/{TOTAL_STEPS}: Analyzing Audio Files")):
|
|
@@ -301,48 +363,20 @@ def process_audio_to_video(
|
|
| 301 |
|
| 302 |
print(f"Successfully parsed {len(cue_tracks)} tracks from CUE sheet.")
|
| 303 |
except Exception as e:
|
| 304 |
-
print(f"Warning: Could not
|
| 305 |
-
|
| 306 |
-
|
| 307 |
-
|
| 308 |
-
|
| 309 |
-
|
| 310 |
-
|
| 311 |
-
|
| 312 |
-
|
| 313 |
-
|
| 314 |
-
|
| 315 |
-
|
| 316 |
-
"end_time": total_duration + (cue_tracks[track_idx+1].get('start_time', file_duration) if track_idx + 1 < len(cue_tracks) else file_duration),
|
| 317 |
-
"number_str": number_str
|
| 318 |
-
})
|
| 319 |
-
else: # Scenario 2: Multiple files, this one has NO CUE
|
| 320 |
-
number_str = f"{file_num:02d}" if format_double_digits else str(file_num)
|
| 321 |
-
all_tracks_info.append({
|
| 322 |
-
"title": os.path.splitext(os.path.basename(audio_path))[0],
|
| 323 |
-
"start_time": total_duration, "end_time": total_duration + file_duration,
|
| 324 |
-
"number_str": number_str
|
| 325 |
-
})
|
| 326 |
-
else: # Scenario 1: Single file upload
|
| 327 |
-
if cue_tracks: # With CUE
|
| 328 |
-
for track_idx, track in enumerate(cue_tracks):
|
| 329 |
-
track_num = track_idx + 1
|
| 330 |
-
number_str = f"{track_num:02d}" if format_double_digits else str(track_num)
|
| 331 |
-
all_tracks_info.append({
|
| 332 |
-
"title": track.get('title', 'Unknown Track'),
|
| 333 |
-
"start_time": total_duration + track.get('start_time', 0),
|
| 334 |
-
"end_time": total_duration + (cue_tracks[track_idx+1].get('start_time', file_duration) if track_idx + 1 < len(cue_tracks) else file_duration),
|
| 335 |
-
"number_str": f"{number_str}." # Add a dot for single file CUE tracks
|
| 336 |
-
})
|
| 337 |
-
else: # No CUE
|
| 338 |
-
all_tracks_info.append({
|
| 339 |
-
"title": os.path.splitext(os.path.basename(audio_path))[0],
|
| 340 |
-
"start_time": total_duration, "end_time": total_duration + file_duration,
|
| 341 |
-
"number_str": None # Signal to not show any number
|
| 342 |
-
})
|
| 343 |
-
|
| 344 |
total_duration += file_duration
|
| 345 |
-
|
| 346 |
# --- Concatenate along the time axis (axis=1) for stereo arrays ---
|
| 347 |
y_combined = np.concatenate(y_accumulator, axis=1)
|
| 348 |
duration = total_duration
|
|
@@ -350,116 +384,128 @@ def process_audio_to_video(
|
|
| 350 |
# --- Transpose the array for soundfile to write stereo correctly ---
|
| 351 |
sf.write(temp_audio_path, y_combined.T, current_sr)
|
| 352 |
print(f"Combined all audio files into one. Total duration: {duration:.2f}s")
|
| 353 |
-
|
| 354 |
# --- Update progress to the next stage, use fractional progress (current/total) ---
|
| 355 |
-
progress(1 / TOTAL_STEPS, desc=f"Stage 2/{TOTAL_STEPS}:
|
| 356 |
|
| 357 |
-
# --- 2
|
| 358 |
-
|
| 359 |
-
|
| 360 |
-
|
| 361 |
-
|
| 362 |
-
|
| 363 |
-
|
| 364 |
-
|
| 365 |
-
|
| 366 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 367 |
|
| 368 |
-
|
|
|
|
|
|
|
| 369 |
|
| 370 |
-
|
| 371 |
-
|
| 372 |
-
|
|
|
|
|
|
|
|
|
|
| 373 |
|
| 374 |
-
|
| 375 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 376 |
|
| 377 |
-
|
| 378 |
-
|
| 379 |
-
|
| 380 |
-
|
| 381 |
-
_, descent = pil_font.getmetrics()
|
| 382 |
-
# Calculate a bottom margin to compensate for the font's descent.
|
| 383 |
-
# A small constant is added as a safety buffer.
|
| 384 |
-
# This prevents clipping on fonts with large descenders (like 'g', 'p').
|
| 385 |
-
bottom_margin = int(descent * 0.5) + 2
|
| 386 |
-
print(f"Font '{font_name}' descent: {descent}. Applying dynamic bottom margin of {bottom_margin}px.")
|
| 387 |
-
except Exception as e:
|
| 388 |
-
# Fallback in case of any font loading error
|
| 389 |
-
print(f"Warning: Could not get font metrics for '{font_name}'. Using fixed margin. Error: {e}")
|
| 390 |
-
bottom_margin = int(WIDTH * 0.01) # A small fixed fallback
|
| 391 |
|
| 392 |
-
|
| 393 |
-
|
| 394 |
-
|
| 395 |
-
continue
|
| 396 |
-
|
| 397 |
-
# Construct display text based on pre-formatted number string
|
| 398 |
-
display_text = f"{track['number_str']} {track['title']}" if track['number_str'] else track['title']
|
| 399 |
-
|
| 400 |
-
|
| 401 |
-
# 1. Create the TextClip first without positioning to get its size
|
| 402 |
-
txt_clip = TextClip(
|
| 403 |
-
text=display_text.strip(),
|
| 404 |
-
font_size=font_size,
|
| 405 |
-
color=font_color,
|
| 406 |
-
font=font_path,
|
| 407 |
-
bg_color=bg_color_tuple,
|
| 408 |
-
method='caption', # <-- Set method to caption
|
| 409 |
-
size=(caption_width, None), # <-- Provide size for wrapping
|
| 410 |
-
margin=(0, 0, 0, bottom_margin)
|
| 411 |
-
).with_position(position).with_duration(text_duration).with_start(track['start_time'])
|
| 412 |
-
|
| 413 |
-
text_clips.append(txt_clip)
|
| 414 |
-
|
| 415 |
-
# --- Update progress to the next stage, use fractional progress (current/total) ---
|
| 416 |
-
progress(2 / TOTAL_STEPS, desc=f"Stage 3/{TOTAL_STEPS}: Generating Visual Layers")
|
| 417 |
|
| 418 |
-
# --- 3. Image and Spectrogram Logic ---
|
| 419 |
-
image_clips = []
|
| 420 |
-
if image_paths and len(image_paths) > 0:
|
| 421 |
-
print(f"Found {len(image_paths)} images to process.")
|
| 422 |
|
| 423 |
-
#
|
| 424 |
-
|
| 425 |
-
|
| 426 |
-
|
| 427 |
-
|
| 428 |
-
|
| 429 |
-
|
| 430 |
-
|
| 431 |
-
|
| 432 |
-
|
| 433 |
-
|
| 434 |
-
|
| 435 |
-
|
| 436 |
-
|
| 437 |
-
|
| 438 |
-
# 1. Calculate scaling factor to "contain" the image (fit inside).
|
| 439 |
-
# We use min() to find the ratio that requires the most shrinkage,
|
| 440 |
-
# ensuring the whole image fits without being cropped.
|
| 441 |
-
scale_factor = min(WIDTH / img_clip_raw.w, HEIGHT / img_clip_raw.h)
|
| 442 |
-
|
| 443 |
-
# 2. Resize the image so it fits perfectly within the video dimensions.
|
| 444 |
-
resized_clip = img_clip_raw.resized(scale_factor)
|
| 445 |
-
|
| 446 |
-
# 3. Create a composite clip to position the resized image on a
|
| 447 |
-
# correctly-sized transparent canvas. This is the key to preventing overflow.
|
| 448 |
-
final_layer = CompositeVideoClip(
|
| 449 |
-
[resized_clip.with_position("center")],
|
| 450 |
-
size=(WIDTH, HEIGHT)
|
| 451 |
-
)
|
| 452 |
-
|
| 453 |
-
# 4. Set the timing on the final composite layer.
|
| 454 |
-
return final_layer.with_duration(dur).with_start(start)
|
| 455 |
-
except Exception as e:
|
| 456 |
-
print(f"Warning: Failed to process image '{img_path}'. Skipping. Error: {e}")
|
| 457 |
-
return None
|
| 458 |
-
|
| 459 |
-
# Create an ImageClip for the duration of the track.
|
| 460 |
-
clip = create_image_layer(img_path, i * img_duration, img_duration)
|
| 461 |
-
if clip:
|
| 462 |
-
image_clips.append(clip)
|
| 463 |
|
| 464 |
N_FFT, HOP_LENGTH, N_BANDS = 2048, 512, 32
|
| 465 |
MIN_DB, MAX_DB = -80.0, 0.0
|
|
@@ -506,16 +552,16 @@ def process_audio_to_video(
|
|
| 506 |
|
| 507 |
video_clip = VideoClip(frame_function=frame_generator, duration=duration)
|
| 508 |
|
| 509 |
-
# ---
|
| 510 |
# If image clips were created, make the spectrogram layer 50% transparent.
|
| 511 |
if image_clips:
|
| 512 |
print("Applying 50% opacity to spectrogram layer.")
|
| 513 |
video_clip = video_clip.with_opacity(0.5)
|
| 514 |
-
|
| 515 |
# --- Use fractional progress (current/total) ---
|
| 516 |
-
progress(3 / TOTAL_STEPS, desc=f"Stage 4/{TOTAL_STEPS}: Rendering Base Video
|
| 517 |
|
| 518 |
-
# ---
|
| 519 |
audio_clip = AudioFileClip(temp_audio_path)
|
| 520 |
|
| 521 |
# --- Clip Composition ---
|
|
@@ -542,7 +588,7 @@ def process_audio_to_video(
|
|
| 542 |
audio_bitrate="320k", fps=RENDER_FPS,
|
| 543 |
logger='bar', threads=os.cpu_count(), preset='ultrafast')
|
| 544 |
print("High-quality AAC audio encoding complete.")
|
| 545 |
-
|
| 546 |
final_clip.close()
|
| 547 |
|
| 548 |
# Step 2: Use FFmpeg to quickly increase the framerate to 24 FPS
|
|
@@ -550,8 +596,8 @@ def process_audio_to_video(
|
|
| 550 |
|
| 551 |
# --- Use fractional progress (current/total) ---
|
| 552 |
progress(4 / TOTAL_STEPS, desc=f"Stage 5/{TOTAL_STEPS}: Finalizing Video")
|
| 553 |
-
|
| 554 |
-
# ---
|
| 555 |
increase_video_framerate(temp_fps1_path, final_output_path, target_fps=PLAYBACK_FPS)
|
| 556 |
|
| 557 |
return final_output_path
|
|
@@ -573,29 +619,63 @@ with gr.Blocks(title="Spectrogram Video Generator") as iface:
|
|
| 573 |
with gr.Column(scale=1):
|
| 574 |
# --- Changed to gr.Files for multi-upload ---
|
| 575 |
audio_inputs = gr.Files(
|
| 576 |
-
label="Upload Audio File(s)",
|
| 577 |
file_count="multiple",
|
| 578 |
file_types=["audio"]
|
| 579 |
)
|
| 580 |
|
| 581 |
-
# --- Image
|
| 582 |
-
gr.
|
| 583 |
-
|
| 584 |
-
|
| 585 |
-
|
| 586 |
-
|
| 587 |
-
|
| 588 |
-
|
| 589 |
-
|
| 590 |
-
|
| 591 |
-
|
| 592 |
-
|
| 593 |
-
|
| 594 |
-
|
| 595 |
-
|
| 596 |
-
|
| 597 |
-
|
| 598 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 599 |
with gr.Accordion("Visualizer Options", open=True):
|
| 600 |
with gr.Row():
|
| 601 |
width_input = gr.Number(value=1920, label="Video Width (px)", precision=0)
|
|
@@ -611,7 +691,7 @@ with gr.Blocks(title="Spectrogram Video Generator") as iface:
|
|
| 611 |
# --- Checkbox for number formatting ---
|
| 612 |
format_double_digits_checkbox = gr.Checkbox(label="Format track numbers as double digits (e.g., 01, 05-09)", value=True)
|
| 613 |
gr.Markdown("If the CUE sheet or filenames contain non-English characters, please select a compatible font.")
|
| 614 |
-
|
| 615 |
# Define a priority list for default fonts, starting with common Japanese ones.
|
| 616 |
# This list can include multiple names for the same font to improve matching.
|
| 617 |
preferred_fonts = [
|
|
@@ -634,15 +714,15 @@ with gr.Blocks(title="Spectrogram Video Generator") as iface:
|
|
| 634 |
default_font = FONT_DISPLAY_NAMES[0]
|
| 635 |
|
| 636 |
font_name_dd = gr.Dropdown(choices=FONT_DISPLAY_NAMES, value=default_font, label="Font Family")
|
| 637 |
-
|
| 638 |
with gr.Row():
|
| 639 |
font_size_slider = gr.Slider(minimum=12, maximum=256, value=80, step=1, label="Font Size")
|
| 640 |
font_color_picker = gr.ColorPicker(value="#FFFFFF", label="Font Color")
|
| 641 |
-
|
| 642 |
with gr.Row():
|
| 643 |
font_bg_color_picker = gr.ColorPicker(value="#000000", label="Text BG Color")
|
| 644 |
font_bg_alpha_slider = gr.Slider(minimum=0.0, maximum=1.0, value=0.6, step=0.05, label="Text BG Opacity")
|
| 645 |
-
|
| 646 |
gr.Markdown("Text Position")
|
| 647 |
with gr.Row():
|
| 648 |
pos_h_radio = gr.Radio(["left", "center", "right"], value="center", label="Horizontal Align")
|
|
@@ -652,20 +732,39 @@ with gr.Blocks(title="Spectrogram Video Generator") as iface:
|
|
| 652 |
|
| 653 |
with gr.Column(scale=2):
|
| 654 |
video_output = gr.Video(label="Generated Video")
|
| 655 |
-
|
| 656 |
-
# ---
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 657 |
submit_btn.click(
|
| 658 |
fn=process_audio_to_video,
|
| 659 |
-
inputs=
|
| 660 |
-
|
| 661 |
-
|
| 662 |
-
width_input, height_input,
|
| 663 |
-
fg_color, bg_color,
|
| 664 |
-
font_name_dd, font_size_slider, font_color_picker,
|
| 665 |
-
font_bg_color_picker, font_bg_alpha_slider,
|
| 666 |
-
pos_h_radio, pos_v_radio
|
| 667 |
-
],
|
| 668 |
-
outputs=video_output
|
| 669 |
)
|
| 670 |
|
| 671 |
if __name__ == "__main__":
|
|
|
|
| 9 |
import soundfile as sf
|
| 10 |
import matplotlib.font_manager as fm
|
| 11 |
from PIL import ImageFont
|
| 12 |
+
from typing import Tuple, List, Dict, Set
|
| 13 |
from mutagen.flac import FLAC
|
| 14 |
from moviepy import CompositeVideoClip, TextClip, VideoClip, AudioFileClip, ImageClip
|
| 15 |
|
|
|
|
| 28 |
elif platform_id == 1 and encoding_id == 0: # Macintosh, Roman
|
| 29 |
return name_bytes.decode('mac_roman').strip('\x00')
|
| 30 |
elif platform_id == 0: # Unicode
|
| 31 |
+
return name_bytes.decode('utf_16_be').strip('\x00')
|
| 32 |
else: # Fallback
|
| 33 |
return name_bytes.decode('utf_8', errors='ignore').strip('\x00')
|
| 34 |
except Exception:
|
|
|
|
| 36 |
|
| 37 |
try:
|
| 38 |
with open(font_path, 'rb') as f: data = f.read()
|
| 39 |
+
def read_ushort(offset):
|
| 40 |
+
return struct.unpack('>H', data[offset:offset+2])[0]
|
| 41 |
+
def read_ulong(offset):
|
| 42 |
+
return struct.unpack('>I', data[offset:offset+4])[0]
|
| 43 |
font_offsets = [0]
|
| 44 |
# Check for TTC (TrueType Collection) header
|
| 45 |
if data[:4] == b'ttcf':
|
|
|
|
| 48 |
|
| 49 |
# For simplicity, we only parse the first font in a TTC
|
| 50 |
font_offset = font_offsets[0]
|
| 51 |
+
|
| 52 |
num_tables = read_ushort(font_offset + 4)
|
| 53 |
name_table_offset = -1
|
| 54 |
# Locate the 'name' table
|
|
|
|
| 56 |
entry_offset = font_offset + 12 + i * 16
|
| 57 |
tag = data[entry_offset:entry_offset+4]
|
| 58 |
if tag == b'name':
|
| 59 |
+
name_table_offset = read_ulong(entry_offset + 8)
|
| 60 |
+
break
|
| 61 |
+
|
| 62 |
+
if name_table_offset == -1:
|
| 63 |
+
return None, None
|
| 64 |
+
|
| 65 |
count, string_offset = read_ushort(name_table_offset + 2), read_ushort(name_table_offset + 4)
|
| 66 |
name_candidates = {}
|
| 67 |
# Iterate through all name records
|
| 68 |
for i in range(count):
|
| 69 |
rec_offset = name_table_offset + 6 + i * 12
|
| 70 |
platform_id, encoding_id, language_id, name_id, length, offset = struct.unpack('>HHHHHH', data[rec_offset:rec_offset+12])
|
| 71 |
+
|
| 72 |
if name_id == 4: # We only care about the "Full Font Name"
|
| 73 |
string_pos = name_table_offset + string_offset + offset
|
| 74 |
value = decode_name_string(data[string_pos : string_pos + length], platform_id, encoding_id)
|
| 75 |
+
|
| 76 |
if value:
|
| 77 |
# Store candidates based on language ID
|
| 78 |
+
if language_id in [1028, 2052, 3076, 4100, 5124]:
|
| 79 |
+
name_candidates["zh"] = value
|
| 80 |
+
elif language_id == 1041:
|
| 81 |
+
name_candidates["ja"] = value
|
| 82 |
+
elif language_id == 1042:
|
| 83 |
+
name_candidates["ko"] = value
|
| 84 |
+
elif language_id in [1033, 0]:
|
| 85 |
+
name_candidates["en"] = value
|
| 86 |
else:
|
| 87 |
+
if "other" not in name_candidates:
|
| 88 |
+
name_candidates["other"] = value
|
| 89 |
+
|
| 90 |
# Return the best candidate based on language priority
|
| 91 |
+
if name_candidates.get("zh"):
|
| 92 |
+
return name_candidates.get("zh"), "zh"
|
| 93 |
+
if name_candidates.get("ja"):
|
| 94 |
+
return name_candidates.get("ja"), "ja"
|
| 95 |
+
if name_candidates.get("ko"):
|
| 96 |
+
return name_candidates.get("ko"), "ko"
|
| 97 |
+
if name_candidates.get("other"):
|
| 98 |
+
return name_candidates.get("other"), "other"
|
| 99 |
+
if name_candidates.get("en"):
|
| 100 |
+
return name_candidates.get("en"), "en"
|
| 101 |
return None, None
|
| 102 |
+
|
| 103 |
except Exception:
|
| 104 |
return None, None
|
| 105 |
|
|
|
|
| 119 |
for path in all_font_files:
|
| 120 |
display_name, lang_tag = get_font_display_name(path)
|
| 121 |
is_fallback = display_name is None
|
| 122 |
+
|
| 123 |
if is_fallback:
|
| 124 |
# Create a fallback name from the filename
|
| 125 |
display_name = os.path.splitext(os.path.basename(path))[0].replace('-', ' ').replace('_', ' ').title()
|
| 126 |
lang_tag = 'fallback'
|
| 127 |
+
|
| 128 |
if display_name and display_name not in font_map:
|
| 129 |
font_map[display_name] = path
|
| 130 |
found_names.append((display_name, is_fallback, lang_tag))
|
| 131 |
+
|
| 132 |
# Define sort priority for languages
|
| 133 |
sort_order = {'zh': 0, 'ja': 1, 'ko': 2, 'en': 3, 'other': 4, 'fallback': 5}
|
| 134 |
|
| 135 |
# Sort by priority, then alphabetically
|
| 136 |
found_names.sort(key=lambda x: (sort_order.get(x[2], 99), x[0]))
|
| 137 |
+
|
| 138 |
sorted_display_names = [name for name, _, _ in found_names]
|
| 139 |
return font_map, sorted_display_names
|
| 140 |
|
|
|
|
| 201 |
'-c:a', 'copy', # Copy audio without re-encoding
|
| 202 |
output_path
|
| 203 |
]
|
| 204 |
+
|
| 205 |
try:
|
| 206 |
# Execute the command
|
| 207 |
# Using capture_output to hide ffmpeg logs from the main console unless an error occurs
|
|
|
|
| 216 |
raise gr.Error(f"FFmpeg failed to increase the framerate. See console for details. Error: {e.stderr}")
|
| 217 |
|
| 218 |
|
| 219 |
+
# --- HELPER FUNCTION for parsing track ranges ---
|
| 220 |
+
def parse_track_ranges(range_str: str) -> Set[int]:
|
| 221 |
+
"""Parses a string like '1-4, 7, 10-13' into a set of integers."""
|
| 222 |
+
if not range_str:
|
| 223 |
+
return set()
|
| 224 |
+
|
| 225 |
+
indices = set()
|
| 226 |
+
parts = range_str.split(',')
|
| 227 |
+
for part in parts:
|
| 228 |
+
part = part.strip()
|
| 229 |
+
if not part:
|
| 230 |
+
continue
|
| 231 |
+
if '-' in part:
|
| 232 |
+
try:
|
| 233 |
+
start, end = map(int, part.split('-'))
|
| 234 |
+
indices.update(range(start, end + 1))
|
| 235 |
+
except ValueError:
|
| 236 |
+
print(f"Warning: Could not parse range '{part}'. Skipping.")
|
| 237 |
+
else:
|
| 238 |
+
try:
|
| 239 |
+
indices.add(int(part))
|
| 240 |
+
except ValueError:
|
| 241 |
+
print(f"Warning: Could not parse track number '{part}'. Skipping.")
|
| 242 |
+
return indices
|
| 243 |
+
|
| 244 |
+
|
| 245 |
# --- Main Processing Function ---
|
| 246 |
+
def process_audio_to_video(*args, progress=gr.Progress(track_tqdm=True)):
|
| 247 |
+
# --- Correctly unpack all arguments from *args using slicing ---
|
| 248 |
+
MAX_GROUPS = 10 # This MUST match the UI definition
|
| 249 |
+
|
| 250 |
+
# Define the structure of the *args tuple based on the `all_inputs` list
|
| 251 |
+
audio_files = args[0]
|
| 252 |
+
|
| 253 |
+
# Slice the args tuple to get the continuous blocks of inputs
|
| 254 |
+
all_track_strs = args[1 : 1 + MAX_GROUPS]
|
| 255 |
+
all_image_lists = args[1 + MAX_GROUPS : 1 + MAX_GROUPS * 2]
|
| 256 |
+
|
| 257 |
+
# Group inputs are packed in pairs (track_str, image_list)
|
| 258 |
+
group_definitions = []
|
| 259 |
+
for i in range(MAX_GROUPS):
|
| 260 |
+
group_definitions.append({
|
| 261 |
+
"tracks_str": all_track_strs[i],
|
| 262 |
+
"images": all_image_lists[i]
|
| 263 |
+
})
|
| 264 |
+
|
| 265 |
+
# Unpack the remaining arguments with correct indexing
|
| 266 |
+
arg_offset = 1 + MAX_GROUPS * 2
|
| 267 |
+
fallback_images = args[arg_offset]
|
| 268 |
+
format_double_digits = args[arg_offset + 1]
|
| 269 |
+
video_width = args[arg_offset + 2]
|
| 270 |
+
video_height = args[arg_offset + 3]
|
| 271 |
+
spec_fg_color = args[arg_offset + 4]
|
| 272 |
+
spec_bg_color = args[arg_offset + 5]
|
| 273 |
+
font_name = args[arg_offset + 6]
|
| 274 |
+
font_size = args[arg_offset + 7]
|
| 275 |
+
font_color = args[arg_offset + 8]
|
| 276 |
+
font_bg_color = args[arg_offset + 9]
|
| 277 |
+
font_bg_alpha = args[arg_offset + 10]
|
| 278 |
+
pos_h = args[arg_offset + 11]
|
| 279 |
+
pos_v = args[arg_offset + 12]
|
| 280 |
+
|
| 281 |
if not audio_files:
|
| 282 |
raise gr.Error("Please upload at least one audio file.")
|
| 283 |
if not font_name:
|
| 284 |
raise gr.Error("Please select a font from the list.")
|
| 285 |
|
| 286 |
progress(0, desc="Initializing...")
|
| 287 |
+
|
| 288 |
# Define paths for temporary and final files
|
| 289 |
timestamp = int(time.time())
|
| 290 |
temp_fps1_path = f"temp_{timestamp}_fps1.mp4"
|
|
|
|
| 318 |
raise ValueError(f"Could not parse rgb color string: {color_str}")
|
| 319 |
else:
|
| 320 |
raise ValueError(f"Unknown color format: {color_str}")
|
| 321 |
+
|
| 322 |
# Use the new robust parser for all color inputs
|
| 323 |
fg_rgb, bg_rgb = parse_color_to_rgb(spec_fg_color), parse_color_to_rgb(spec_bg_color)
|
| 324 |
grid_rgb = tuple(min(c + 40, 255) for c in bg_rgb)
|
|
|
|
| 328 |
# --- Define total steps for the progress bar ---
|
| 329 |
TOTAL_STEPS = 5
|
| 330 |
|
| 331 |
+
# --- Stage 1: Audio Processing & Master Track List Creation ---
|
| 332 |
+
master_track_list, y_accumulator, current_sr = [], [], None
|
| 333 |
+
total_duration, global_track_counter = 0.0, 0
|
|
|
|
|
|
|
| 334 |
|
| 335 |
# --- Use `progress.tqdm` to create a progress bar for this loop ---
|
| 336 |
for file_idx, audio_path in enumerate(progress.tqdm(audio_files, desc=f"Stage 1/{TOTAL_STEPS}: Analyzing Audio Files")):
|
|
|
|
| 363 |
|
| 364 |
print(f"Successfully parsed {len(cue_tracks)} tracks from CUE sheet.")
|
| 365 |
except Exception as e:
|
| 366 |
+
print(f"Warning: Could not parse CUE sheet for {os.path.basename(audio_path)}: {e}")
|
| 367 |
+
|
| 368 |
+
if cue_tracks:
|
| 369 |
+
for track_idx, track in enumerate(cue_tracks):
|
| 370 |
+
global_track_counter += 1
|
| 371 |
+
start_time = track.get('start_time', 0)
|
| 372 |
+
end_time = cue_tracks[track_idx+1].get('start_time', file_duration) if track_idx + 1 < len(cue_tracks) else file_duration
|
| 373 |
+
master_track_list.append({"global_index": global_track_counter, "title": track.get('title', 'Unknown'), "start_time": total_duration + start_time, "end_time": total_duration + end_time})
|
| 374 |
+
else:
|
| 375 |
+
global_track_counter += 1
|
| 376 |
+
master_track_list.append({"global_index": global_track_counter, "title": os.path.splitext(os.path.basename(audio_path))[0], "start_time": total_duration, "end_time": total_duration + file_duration})
|
| 377 |
+
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 378 |
total_duration += file_duration
|
| 379 |
+
|
| 380 |
# --- Concatenate along the time axis (axis=1) for stereo arrays ---
|
| 381 |
y_combined = np.concatenate(y_accumulator, axis=1)
|
| 382 |
duration = total_duration
|
|
|
|
| 384 |
# --- Transpose the array for soundfile to write stereo correctly ---
|
| 385 |
sf.write(temp_audio_path, y_combined.T, current_sr)
|
| 386 |
print(f"Combined all audio files into one. Total duration: {duration:.2f}s")
|
| 387 |
+
|
| 388 |
# --- Update progress to the next stage, use fractional progress (current/total) ---
|
| 389 |
+
progress(1 / TOTAL_STEPS, desc=f"Stage 2/{TOTAL_STEPS}: Mapping Images to Tracks")
|
| 390 |
|
| 391 |
+
# --- Stage 2: Map Tracks to Image Groups ---
|
| 392 |
+
parsed_groups = [parse_track_ranges(g['tracks_str']) for g in group_definitions]
|
| 393 |
+
track_to_images_map = {}
|
| 394 |
+
for track_info in master_track_list:
|
| 395 |
+
track_idx = track_info['global_index']
|
| 396 |
+
assigned = False
|
| 397 |
+
for i, group_indices in enumerate(parsed_groups):
|
| 398 |
+
if track_idx in group_indices:
|
| 399 |
+
track_to_images_map[track_idx] = group_definitions[i]['images']
|
| 400 |
+
assigned = True
|
| 401 |
+
break
|
| 402 |
+
if not assigned:
|
| 403 |
+
track_to_images_map[track_idx] = fallback_images
|
| 404 |
+
|
| 405 |
+
# --- Stage 3: Generate ImageClips based on contiguous blocks ---
|
| 406 |
+
image_clips = []
|
| 407 |
+
if any(track_to_images_map.values()):
|
| 408 |
+
current_track_cursor = 0
|
| 409 |
+
while current_track_cursor < len(master_track_list):
|
| 410 |
+
start_track_info = master_track_list[current_track_cursor]
|
| 411 |
+
image_set_for_block = track_to_images_map.get(start_track_info['global_index'])
|
| 412 |
+
|
| 413 |
+
# Find the end of the contiguous block of tracks that use the same image set
|
| 414 |
+
end_track_cursor = current_track_cursor
|
| 415 |
+
while (end_track_cursor + 1 < len(master_track_list) and
|
| 416 |
+
track_to_images_map.get(master_track_list[end_track_cursor + 1]['global_index']) == image_set_for_block):
|
| 417 |
+
end_track_cursor += 1
|
| 418 |
+
|
| 419 |
+
end_track_info = master_track_list[end_track_cursor]
|
| 420 |
+
|
| 421 |
+
block_start_time = start_track_info['start_time']
|
| 422 |
+
block_end_time = end_track_info['end_time']
|
| 423 |
+
block_duration = block_end_time - block_start_time
|
| 424 |
+
|
| 425 |
+
if image_set_for_block and block_duration > 0:
|
| 426 |
+
print(f"Creating image block for tracks {start_track_info['global_index']}-{end_track_info['global_index']} (Time: {block_start_time:.2f}s - {block_end_time:.2f}s)")
|
| 427 |
+
time_per_image = block_duration / len(image_set_for_block)
|
| 428 |
+
for i, img_path in enumerate(image_set_for_block):
|
| 429 |
+
def create_image_layer(path, start, dur):
    """Build a full-frame, centered background layer from a single image file.

    The image is uniformly scaled to fit entirely inside the video frame
    (letterboxed), centered on a (WIDTH, HEIGHT) canvas, and scheduled to
    appear at `start` for `dur` seconds. Returns None if the image cannot
    be loaded or processed, so the caller can simply skip it.
    """
    try:
        source = ImageClip(path)
        # Fit-inside scale factor: the smaller ratio guarantees the
        # whole image remains visible within the frame.
        fit_factor = min(WIDTH / source.w, HEIGHT / source.h)
        centered = source.resized(fit_factor).with_position("center")
        layer = CompositeVideoClip([centered], size=(WIDTH, HEIGHT))
        return layer.with_duration(dur).with_start(start)
    except Exception as e:
        print(f"Warning: Failed to process image '{path}'. Skipping. Error: {e}")
        return None
|
| 438 |
+
|
| 439 |
+
clip = create_image_layer(img_path, block_start_time + i * time_per_image, time_per_image)
|
| 440 |
+
if clip:
|
| 441 |
+
image_clips.append(clip)
|
| 442 |
+
|
| 443 |
+
current_track_cursor = end_track_cursor + 1
|
| 444 |
+
|
| 445 |
+
progress(2 / TOTAL_STEPS, desc=f"Stage 3/{TOTAL_STEPS}: Generating Text & Spectrogram")
|
| 446 |
+
|
| 447 |
+
# --- Stage 4: Generate Text and Spectrogram ---
|
| 448 |
+
# --- Text Overlay Logic using the aggregated track info
|
| 449 |
+
text_clips = [] # Text clips are now simpler as they don't depend on complex file logic anymore
|
| 450 |
+
|
| 451 |
+
font_path = SYSTEM_FONTS_MAP.get(font_name)
|
| 452 |
+
if not font_path:
|
| 453 |
+
raise gr.Error(f"Font path for '{font_name}' not found!")
|
| 454 |
+
|
| 455 |
+
# Use the robust parser for text colors as well
|
| 456 |
+
font_bg_rgb = parse_color_to_rgb(font_bg_color)
|
| 457 |
|
| 458 |
+
position = (pos_h.lower(), pos_v.lower())
|
| 459 |
+
|
| 460 |
+
print(f"Using font: {font_name}, Size: {font_size}, Position: {position}")
|
| 461 |
|
| 462 |
+
# Create the RGBA tuple for the background color.
|
| 463 |
+
# The alpha value is converted from a 0.0-1.0 float to a 0-255 integer.
|
| 464 |
+
bg_color_tuple = (font_bg_rgb[0], font_bg_rgb[1], font_bg_rgb[2], int(font_bg_alpha * 255))
|
| 465 |
+
|
| 466 |
+
# 1. Define a maximum width for the caption. 90% of the video width is a good choice.
|
| 467 |
+
caption_width = int(WIDTH * 0.9)
|
| 468 |
|
| 469 |
+
# --- Get font metrics to calculate dynamic padding ---
|
| 470 |
+
try:
|
| 471 |
+
# Load the font with Pillow to access its metrics
|
| 472 |
+
pil_font = ImageFont.truetype(font_path, size=font_size)
|
| 473 |
+
_, descent = pil_font.getmetrics()
|
| 474 |
+
# Calculate a bottom margin to compensate for the font's descent.
|
| 475 |
+
# A small constant is added as a safety buffer.
|
| 476 |
+
# This prevents clipping on fonts with large descenders (like 'g', 'p').
|
| 477 |
+
bottom_margin = int(descent * 0.5) + 2
|
| 478 |
+
print(f"Font '{font_name}' descent: {descent}. Applying dynamic bottom margin of {bottom_margin}px.")
|
| 479 |
+
except Exception as e:
|
| 480 |
+
# Fallback in case of any font loading error
|
| 481 |
+
print(f"Warning: Could not get font metrics for '{font_name}'. Using fixed margin. Error: {e}")
|
| 482 |
+
bottom_margin = int(WIDTH * 0.01) # A small fixed fallback
|
| 483 |
|
| 484 |
+
for track in master_track_list:
|
| 485 |
+
text_duration = track['end_time'] - track['start_time']
|
| 486 |
+
if text_duration <= 0:
|
| 487 |
+
continue
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 488 |
|
| 489 |
+
# Construct display text based on pre-formatted number string
|
| 490 |
+
num_str = f"{track['global_index']:02d}" if format_double_digits else str(track['global_index'])
|
| 491 |
+
display_text = f"{num_str}. {track['title']}"
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 492 |
|
|
|
|
|
|
|
|
|
|
|
|
|
| 493 |
|
| 494 |
+
# 1. Create the TextClip first without positioning to get its size
|
| 495 |
+
txt_clip = TextClip(
|
| 496 |
+
text=display_text.strip(),
|
| 497 |
+
font_size=font_size,
|
| 498 |
+
color=font_color,
|
| 499 |
+
font=font_path,
|
| 500 |
+
bg_color=bg_color_tuple,
|
| 501 |
+
method='caption', # <-- Set method to caption
|
| 502 |
+
size=(caption_width, None), # <-- Provide size for wrapping
|
| 503 |
+
margin=(0, 0, 0, bottom_margin)
|
| 504 |
+
).with_position(position).with_duration(text_duration).with_start(track['start_time'])
|
| 505 |
+
|
| 506 |
+
text_clips.append(txt_clip)
|
| 507 |
+
|
| 508 |
+
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 509 |
|
| 510 |
N_FFT, HOP_LENGTH, N_BANDS = 2048, 512, 32
|
| 511 |
MIN_DB, MAX_DB = -80.0, 0.0
|
|
|
|
| 552 |
|
| 553 |
video_clip = VideoClip(frame_function=frame_generator, duration=duration)
|
| 554 |
|
| 555 |
+
# --- Set Spectrogram Opacity ---
|
| 556 |
# If image clips were created, make the spectrogram layer 50% transparent.
|
| 557 |
if image_clips:
|
| 558 |
print("Applying 50% opacity to spectrogram layer.")
|
| 559 |
video_clip = video_clip.with_opacity(0.5)
|
| 560 |
+
|
| 561 |
# --- Use fractional progress (current/total) ---
|
| 562 |
+
progress(3 / TOTAL_STEPS, desc=f"Stage 4/{TOTAL_STEPS}: Rendering Base Video")
|
| 563 |
|
| 564 |
+
# --- Composition and Rendering ---
|
| 565 |
audio_clip = AudioFileClip(temp_audio_path)
|
| 566 |
|
| 567 |
# --- Clip Composition ---
|
|
|
|
| 588 |
audio_bitrate="320k", fps=RENDER_FPS,
|
| 589 |
logger='bar', threads=os.cpu_count(), preset='ultrafast')
|
| 590 |
print("High-quality AAC audio encoding complete.")
|
| 591 |
+
|
| 592 |
final_clip.close()
|
| 593 |
|
| 594 |
# Step 2: Use FFmpeg to quickly increase the framerate to 24 FPS
|
|
|
|
| 596 |
|
| 597 |
# --- Use fractional progress (current/total) ---
|
| 598 |
progress(4 / TOTAL_STEPS, desc=f"Stage 5/{TOTAL_STEPS}: Finalizing Video")
|
| 599 |
+
|
| 600 |
+
# --- Finalizing ---
|
| 601 |
increase_video_framerate(temp_fps1_path, final_output_path, target_fps=PLAYBACK_FPS)
|
| 602 |
|
| 603 |
return final_output_path
|
|
|
|
| 619 |
with gr.Column(scale=1):
|
| 620 |
# --- Changed to gr.Files for multi-upload ---
|
| 621 |
audio_inputs = gr.Files(
|
| 622 |
+
label="Upload Audio File(s)",
|
| 623 |
file_count="multiple",
|
| 624 |
file_types=["audio"]
|
| 625 |
)
|
| 626 |
|
| 627 |
+
# --- Grouped Image Section ---
|
| 628 |
+
with gr.Accordion("Grouped Image Backgrounds (Advanced)", open=False):
|
| 629 |
+
gr.Markdown("Define groups of tracks and assign specific images to them. Tracks are numbered globally starting from 1 across all uploaded files.")
|
| 630 |
+
|
| 631 |
+
MAX_GROUPS = 10
|
| 632 |
+
group_track_inputs = []
|
| 633 |
+
group_image_inputs = []
|
| 634 |
+
group_accordions = []
|
| 635 |
+
|
| 636 |
+
# --- Create a centralized update function ---
|
| 637 |
+
def update_group_visibility(target_count: int):
    """Show the first `target_count` group accordions and sync the control buttons.

    Returns a Gradio component->update mapping covering the visible-count
    state, every group accordion, and both add/remove buttons.
    """
    # Keep the requested count inside the valid range [1, MAX_GROUPS].
    clamped = min(max(target_count, 1), MAX_GROUPS)

    # Accordions before the cutoff are shown; the rest are hidden.
    result = {
        acc: gr.update(visible=(idx < clamped))
        for idx, acc in enumerate(group_accordions)
    }
    result[visible_groups_state] = clamped
    # "+" disappears once the cap is reached; "-" is disabled when
    # only a single group remains visible.
    result[add_group_btn] = gr.update(visible=(clamped < MAX_GROUPS))
    result[remove_group_btn] = gr.update(interactive=(clamped > 1))
    return result
|
| 652 |
+
|
| 653 |
+
# --- Create simple wrapper functions for adding and removing ---
|
| 654 |
+
def add_group(current_count: int):
    """'+ Add Image Group' handler: reveal one more group definition."""
    desired = current_count + 1
    return update_group_visibility(desired)
|
| 656 |
+
|
| 657 |
+
def remove_group(current_count: int):
    """'- Remove Last Group' handler: hide the last visible group definition."""
    desired = current_count - 1
    return update_group_visibility(desired)
|
| 659 |
+
|
| 660 |
+
# Pre-build all group components
|
| 661 |
+
for i in range(MAX_GROUPS):
|
| 662 |
+
with gr.Accordion(f"Image Group {i+1}", visible=(i==0)) as acc:
|
| 663 |
+
track_input = gr.Textbox(label=f"Tracks for Group {i+1} (e.g., '1-4, 7')")
|
| 664 |
+
image_input = gr.Files(label=f"Images for Group {i+1}", file_count="multiple", file_types=[".png", ".jpg", ".jpeg", ".webp", ".avif"])
|
| 665 |
+
group_track_inputs.append(track_input)
|
| 666 |
+
group_image_inputs.append(image_input)
|
| 667 |
+
group_accordions.append(acc)
|
| 668 |
+
|
| 669 |
+
visible_groups_state = gr.State(1)
|
| 670 |
+
# --- Add a remove button and put both in a row ---
|
| 671 |
+
with gr.Row():
|
| 672 |
+
remove_group_btn = gr.Button("- Remove Last Group", variant="secondary", interactive=False)
|
| 673 |
+
add_group_btn = gr.Button("+ Add Image Group", variant="secondary")
|
| 674 |
+
|
| 675 |
+
with gr.Accordion("Fallback / Default Images", open=True):
|
| 676 |
+
gr.Markdown("These images will be used for any tracks not assigned to a specific group above.")
|
| 677 |
+
fallback_image_input = gr.Files(label="Fallback Images", file_count="multiple", file_types=[".png", ".jpg", ".jpeg", ".webp", ".avif"])
|
| 678 |
+
|
| 679 |
with gr.Accordion("Visualizer Options", open=True):
|
| 680 |
with gr.Row():
|
| 681 |
width_input = gr.Number(value=1920, label="Video Width (px)", precision=0)
|
|
|
|
| 691 |
# --- Checkbox for number formatting ---
|
| 692 |
format_double_digits_checkbox = gr.Checkbox(label="Format track numbers as double digits (e.g., 01, 05-09)", value=True)
|
| 693 |
gr.Markdown("If the CUE sheet or filenames contain non-English characters, please select a compatible font.")
|
| 694 |
+
|
| 695 |
# Define a priority list for default fonts, starting with common Japanese ones.
|
| 696 |
# This list can include multiple names for the same font to improve matching.
|
| 697 |
preferred_fonts = [
|
|
|
|
| 714 |
default_font = FONT_DISPLAY_NAMES[0]
|
| 715 |
|
| 716 |
font_name_dd = gr.Dropdown(choices=FONT_DISPLAY_NAMES, value=default_font, label="Font Family")
|
| 717 |
+
|
| 718 |
with gr.Row():
|
| 719 |
font_size_slider = gr.Slider(minimum=12, maximum=256, value=80, step=1, label="Font Size")
|
| 720 |
font_color_picker = gr.ColorPicker(value="#FFFFFF", label="Font Color")
|
| 721 |
+
|
| 722 |
with gr.Row():
|
| 723 |
font_bg_color_picker = gr.ColorPicker(value="#000000", label="Text BG Color")
|
| 724 |
font_bg_alpha_slider = gr.Slider(minimum=0.0, maximum=1.0, value=0.6, step=0.05, label="Text BG Opacity")
|
| 725 |
+
|
| 726 |
gr.Markdown("Text Position")
|
| 727 |
with gr.Row():
|
| 728 |
pos_h_radio = gr.Radio(["left", "center", "right"], value="center", label="Horizontal Align")
|
|
|
|
| 732 |
|
| 733 |
with gr.Column(scale=2):
|
| 734 |
video_output = gr.Video(label="Generated Video")
|
| 735 |
+
|
| 736 |
+
# --- Define the full list of outputs for the update functions ---
|
| 737 |
+
group_update_outputs = [visible_groups_state, add_group_btn, remove_group_btn] + group_accordions
|
| 738 |
+
|
| 739 |
+
# Connect the "Add Group" button to its update function
|
| 740 |
+
add_group_btn.click(
|
| 741 |
+
fn=add_group,
|
| 742 |
+
inputs=visible_groups_state,
|
| 743 |
+
outputs=group_update_outputs
|
| 744 |
+
)
|
| 745 |
+
|
| 746 |
+
remove_group_btn.click(
|
| 747 |
+
fn=remove_group,
|
| 748 |
+
inputs=visible_groups_state,
|
| 749 |
+
outputs=group_update_outputs
|
| 750 |
+
)
|
| 751 |
+
|
| 752 |
+
# --- Define the master list of all inputs for the main button ---
|
| 753 |
+
all_inputs = [audio_inputs] + group_track_inputs + group_image_inputs + [
|
| 754 |
+
fallback_image_input,
|
| 755 |
+
format_double_digits_checkbox,
|
| 756 |
+
width_input, height_input,
|
| 757 |
+
fg_color, bg_color,
|
| 758 |
+
font_name_dd, font_size_slider, font_color_picker,
|
| 759 |
+
font_bg_color_picker, font_bg_alpha_slider,
|
| 760 |
+
pos_h_radio, pos_v_radio
|
| 761 |
+
]
|
| 762 |
+
|
| 763 |
submit_btn.click(
|
| 764 |
fn=process_audio_to_video,
|
| 765 |
+
inputs=all_inputs,
|
| 766 |
+
outputs=video_output,
|
| 767 |
+
show_progress="full"
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 768 |
)
|
| 769 |
|
| 770 |
if __name__ == "__main__":
|