TMBoeren committed
Commit df8fe58 · 1 Parent(s): 65d9759

Initial commit

.gitignore ADDED
@@ -0,0 +1,10 @@
+ output_videos/*
+ *.mp4
+
+ .gradio/
+ .venv/
+ venv/
+
+ **/__pycache__/
+ *.pyc
+ *.pyo
LICENSE ADDED
@@ -0,0 +1,21 @@
+ MIT License
+
+ Copyright (c) 2025 DODI-Research
+
+ Permission is hereby granted, free of charge, to any person obtaining a copy
+ of this software and associated documentation files (the "Software"), to deal
+ in the Software without restriction, including without limitation the rights
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ copies of the Software, and to permit persons to whom the Software is
+ furnished to do so, subject to the following conditions:
+
+ The above copyright notice and this permission notice shall be included in all
+ copies or substantial portions of the Software.
+
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ SOFTWARE.
app.py ADDED
@@ -0,0 +1,762 @@
+ import gradio as gr
+ import json
+ import math
+ import os
+ import shutil
+ from auto_foley import run_auto_foley as af
+ from datetime import datetime
+ from gradio_vistimeline import VisTimeline
+
+ TIMELINE_ID = "editor-tab-timeline"
+ OUTPUT_VIDEO_ID = "output-video-player"
+ TRACK_LENGTH_ID = "track-length-item"
+
+ # --- Demo-specific helper functions ---
+ def parse_date_to_milliseconds(date):
+     if isinstance(date, int):  # Input is already in milliseconds (Unix timestamp)
+         return date
+     elif isinstance(date, str):  # Input is an ISO 8601 datetime string
+         dt = datetime.fromisoformat(date.replace("Z", "+00:00"))
+         epoch = datetime(1970, 1, 1, tzinfo=dt.tzinfo)
+         return int((dt - epoch).total_seconds() * 1000)
+     else:
+         return 0
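+
+ # Examples (values follow from the logic above):
+ # parse_date_to_milliseconds("1970-01-01T00:00:01Z") -> 1000
+ # parse_date_to_milliseconds(2500) -> 2500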
+
+ def parse_frame_to_timestamp(frame, framerate):
+     # Convert a frame number at the given framerate to the corresponding time in milliseconds
+     exact_ms = (frame / framerate) * 1000
+     # Round up to the nearest 50 ms, the smallest time step at the timeline's maximum zoom
+     rounded_ms = math.ceil(exact_ms / 50) * 50
+     return int(rounded_ms)
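+
+ # Examples at 30 fps: frame 30 lands exactly on 1000 ms, while frame 31
+ # (1033.3 ms) is rounded up to the 50 ms grid:
+ # parse_frame_to_timestamp(30, 30) -> 1000
+ # parse_frame_to_timestamp(31, 30) -> 1050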
+
+ def format_video_info(video_info):
+     if video_info is None:
+         return ""
+
+     label_names = {
+         "Width": "Width",
+         "Height": "Height",
+         "Duration": "Length in seconds",
+         "FrameCount": "Number of frames",
+         "FrameRate": "Frame rate (fps)",
+         "FrameInterval": "Frame interval"
+     }
+
+     info = ""
+     for key, label in label_names.items():
+         if key in video_info:
+             info += f"{label}: {video_info[key]}\n"
+     return info
+
+ def update_video_info_advanced_input(frame_interval, downscale_samples, downscale_target, video_info):
+     """
+     Save the frame interval to the current job state and update the markdown text above the frame interval slider.
+     """
+     if video_info is None:
+         return "Upload a video first.", video_info
+
+     video_info['DownScaleSamples'] = downscale_samples
+     video_info['FrameInterval'] = frame_interval
+     try:
+         if downscale_samples:
+             max_side = int(downscale_target[:-2])  # Strip the "px" suffix from the dropdown value (e.g. "512px")
+             video_info['DownscaledWidth'], video_info['DownscaledHeight'] = af.downscale_dimensions(video_info['Width'], video_info['Height'], max_side)
+         else:
+             video_info['DownscaledWidth'] = video_info['Width']
+             video_info['DownscaledHeight'] = video_info['Height']
+
+         frame_count = video_info['FrameCount']
+         frame_rate = video_info['FrameRate']
+
+         if not frame_count or not frame_rate:
+             return "Video information not available.", video_info
+
+         samples_count = (frame_count // frame_interval) + 2
+         samples_per_second = frame_rate / frame_interval
+         cost = af.calculate_video_input_cost(video_info['DownscaledWidth'], video_info['DownscaledHeight'], samples_count)
+         return f"Minimum input cost: {cost}<br />Video will be split into {samples_count} samples total, or approximately {samples_per_second:.1f} samples per second.", video_info
+     except Exception as e:
+         return f"Error calculating frame interval: {str(e)}", video_info
+
+ # --- Tab 1 UI State Management ---
+ def trigger_frame_interval_slider_rerender(on_video_uploaded_state):
+     return not on_video_uploaded_state
+
+ def get_slider_config(video_info):
+     if not video_info:
+         return None
+
+     # get_video_info stores FrameCount and FrameRate as plain values
+     total_frames = video_info.get('FrameCount', 0)
+     framerate = video_info.get('FrameRate', 0)
+
+     if not total_frames or not framerate:
+         return None
+
+     max_interval = total_frames // 2
+
+     return {
+         'minimum': 1,
+         'maximum': max_interval,
+         'step': 1,
+         'value': framerate,
+         'label': f"Frame Interval (1-{max_interval})"
+     }
+
+ def get_generate_descriptions_button(is_interactive):
+     return gr.Button("Generate Video Description and Audio Sources", interactive=is_interactive)
+
+ def get_generate_audio_button(is_interactive):
+     return gr.Button("Generate Audio", variant="primary", interactive=is_interactive)
+
+ def set_generate_buttons_active():
+     return get_generate_descriptions_button(True), get_generate_audio_button(True)
+
+ def set_generate_buttons_inactive():
+     return get_generate_descriptions_button(False), get_generate_audio_button(False)
+
+ def go_to_tab(id):
+     return gr.Tabs(selected=id)
+
+ # --- Tab 1 Functionality ---
+ def on_video_upload(video):
+     if video is None:
+         return get_generate_descriptions_button(False), get_generate_audio_button(False), None, "", None, ""
+     try:
+         video_info = af.get_video_info(video)
+     except Exception as e:
+         gr.Warning(f"Error: {e}")
+         return get_generate_descriptions_button(False), get_generate_audio_button(False), None, "", None, ""
+     return get_generate_descriptions_button(True), get_generate_audio_button(True), video_info, "", None, ""
+
+ def generate_descriptions(video, video_info, prompt_instruction, vision_lm_api_key):
+     if not video or video_info is None:
+         return None, "", {}
+     try:
+         audio_sources, _ = af.process_video(video, video_info['FrameInterval'], video_info['DownscaledWidth'], video_info['DownscaledHeight'], prompt_instruction, vision_lm_api_key)
+         json_output = json.dumps(audio_sources, indent=4)
+         return json_output, json_output, audio_sources
+     except Exception as e:
+         gr.Warning(f"Error: {e}")
+         return None, "", {}
+
+ def generate_all_audio(video, video_info, prompt_instruction, generate_descriptions_json_output, generate_descriptions_json_textbox, vision_lm_api_key, ttsfx_api_key, progress=gr.Progress()):
+     # Check if the user has provided their own descriptions through the advanced input textbox
+     valid_json = True
+     if generate_descriptions_json_textbox and not generate_descriptions_json_textbox.isspace():
+         # Validate the expected structure
+         try:
+             audio_sources = json.loads(generate_descriptions_json_textbox)
+             if audio_sources and not isinstance(audio_sources, dict):
+                 valid_json = False
+             else:
+                 required_keys = {'AudioSources', 'AmbientAudioSources'}
+                 if not all(key in audio_sources for key in required_keys):
+                     valid_json = False
+                 elif not isinstance(audio_sources['AudioSources'], list) or not isinstance(audio_sources['AmbientAudioSources'], list):
+                     valid_json = False
+         except Exception:
+             valid_json = False
+     else:
+         valid_json = False
+     # If descriptions weren't given, generate them now
+     if not valid_json:
+         progress((1, 3), desc="Processing video")
+         try:
+             audio_sources, _ = af.process_video(video, video_info['FrameInterval'], video_info['DownscaledWidth'], video_info['DownscaledHeight'], prompt_instruction, vision_lm_api_key)
+             json_output = json.dumps(audio_sources, indent=4)
+             generate_descriptions_json_output = json_output
+             generate_descriptions_json_textbox = json_output
+         except Exception as e:
+             raise gr.Error(f"Could not generate descriptions: {e}")
+     # Generate audio files for all the audio sources
+     progress((2, 3), desc="Generating audio")
+     try:
+         audio_sources = af.generate_all_audio(audio_sources, ttsfx_api_key)
+     except Exception as e:
+         raise gr.Error(f"Could not generate audio: {e}")
+     return "", audio_sources, generate_descriptions_json_output, generate_descriptions_json_textbox
+
+ # --- Tab 2 UI State Management ---
+ def copy_video_info_to_edit_tab(video_path, video_info):
+     video_info['VideoPath'] = video_path
+     return video_info
+
+ def copy_video_info_to_edit_tab_if_none(video_path, video_input_info, video_edit_info):
+     if not video_edit_info and video_input_info:
+         video_edit_info = copy_video_info_to_edit_tab(video_path, video_input_info)
+     return video_edit_info
+
+ def set_render_button_state(unrendered_changes_flag):
+     return gr.Button("Combine All Audio & Render Video", variant="primary", interactive=unrendered_changes_flag)
+
+ def reset_new_audio_source_counter():
+     return 0
+
+ def set_buttons_state_selected_audio_source(selected_audio_source):
+     set_interactive = selected_audio_source is not None
+     return gr.Button(value="Delete Selected Audio Source", variant="stop", interactive=set_interactive), gr.Button("Generate", variant="primary", interactive=set_interactive), gr.Button("Save Changes", interactive=set_interactive)
+
+ def sync_form_to_selected_audio_source(selected_audio_source):
+     accordion_label = "Edit Audio Source Properties"
+     if selected_audio_source is None:
+         return gr.Accordion(label=accordion_label, open=False), 1.0, None, ""
+     return gr.Accordion(label=accordion_label, open=True), selected_audio_source.get('Volume', 1.0), selected_audio_source.get('AudioPath', None), selected_audio_source['SoundDescription']
+
+ # --- Tab 2 VisTimelineData & AudioSource helper functions ---
+ def parse_single_audio_source(audio_source, video_fps, group_id):
+     timeline_item = {
+         "id": audio_source['SourceSlugID'],
+         "content": audio_source['SoundDescription'],
+         "group": group_id,
+         "start": parse_frame_to_timestamp(audio_source['StartFrameIndex'], video_fps),
+         "end": parse_frame_to_timestamp(audio_source['EndFrameIndex'], video_fps)
+     }
+     return timeline_item
+
+ def parse_audio_sources_to_timeline_data(audio_sources, video_info):
+     video_fps = video_info['FrameRate']
+     last_frame = video_info['FrameCount'] - 1
+     timeline_data = {
+         "groups": [{"id": "track-length", "content": ""}, {"id": 1, "content": ""}, {"id": 2, "content": ""}],
+         "items": [
+             {
+                 "id": TRACK_LENGTH_ID,
+                 "content": "",
+                 "group": "track-length",
+                 "selectable": False,
+                 "type": "background",
+                 "start": 0,
+                 "end": parse_frame_to_timestamp(last_frame, video_fps),
+                 "className": "color-primary-600"
+             }
+         ]
+     }
+     for audio_source in audio_sources.get('AudioSources', []):
+         timeline_data['items'].append(parse_single_audio_source(audio_source, video_fps, 1))
+     for ambient_audio_source in audio_sources.get('AmbientAudioSources', []):
+         timeline_data['items'].append(parse_single_audio_source(ambient_audio_source, video_fps, 2))
+     return timeline_data
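+
+ # For example, an audio source spanning frames 0-75 at 25 fps becomes a timeline
+ # item with "start": 0 and "end": 3000 (milliseconds), placed in group 1.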
+
+ def get_audio_source_by_slug(audio_sources, slug):
+     for audio_source in audio_sources.get('AudioSources', []):
+         if audio_source['SourceSlugID'] == slug:
+             return audio_source
+     for audio_source in audio_sources.get('AmbientAudioSources', []):
+         if audio_source['SourceSlugID'] == slug:
+             return audio_source
+     return None
+
+ def update_audio_source_with_timeline_item_data(audio_source, timeline_item, max_duration, frame_rate):
+     start_ms = max(0, parse_date_to_milliseconds(timeline_item["start"]))
+     end_ms = min(max_duration, parse_date_to_milliseconds(timeline_item["end"]))
+     audio_source['StartFrameIndex'] = int((start_ms / 1000) * frame_rate)
+     audio_source['EndFrameIndex'] = int((end_ms / 1000) * frame_rate)
+     audio_source['Duration'] = (end_ms - start_ms) / 1000
+     return audio_source
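+
+ # Example: a timeline item dragged to span 0-2000 ms at 25 fps updates the
+ # source to StartFrameIndex 0, EndFrameIndex 50, and Duration 2.0 seconds.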
+
+ # --- Tab 2 Timeline ---
+ def focus_timeline_on_tab_select(set_timeline_window_on_next_tab_change, trigger_timeline_window_focus):
+     if set_timeline_window_on_next_tab_change:
+         return False, not trigger_timeline_window_focus
+     return set_timeline_window_on_next_tab_change, trigger_timeline_window_focus
+
+ def focus_timeline_on_new_source_added(audio_sources, trigger_timeline_window_focus):
+     # Only focus the timeline when this is the first audio source, i.e. there were none before
+     audio_count = len(audio_sources.get('AudioSources', []) + audio_sources.get('AmbientAudioSources', []))
+     if audio_count > 1:
+         return trigger_timeline_window_focus
+     return not trigger_timeline_window_focus
+
+ def on_timeline_item_select(audio_sources, event_data: gr.EventData):
+     selected_ids = event_data._data
+     if not selected_ids:
+         return None
+     # Timeline items are instantiated with their ids set to the audio source's slug
+     return get_audio_source_by_slug(audio_sources, selected_ids[0])
+
+ def on_timeline_input(timeline, all_audio_sources, video_info):
+     if hasattr(timeline, "model_dump"):
+         data = timeline.model_dump(exclude_none=True)
+     else:
+         data = timeline
+
+     video_duration_ms = video_info['Duration'] * 1000
+     frame_rate = video_info['FrameRate']
+
+     for audio_source in all_audio_sources.get('AudioSources', []):
+         for timeline_item in data['items']:
+             if timeline_item['id'] == audio_source['SourceSlugID']:
+                 audio_source = update_audio_source_with_timeline_item_data(audio_source, timeline_item, video_duration_ms, frame_rate)
+                 break
+     for ambient_audio_source in all_audio_sources.get('AmbientAudioSources', []):
+         for timeline_item in data['items']:
+             if timeline_item['id'] == ambient_audio_source['SourceSlugID']:
+                 ambient_audio_source = update_audio_source_with_timeline_item_data(ambient_audio_source, timeline_item, video_duration_ms, frame_rate)
+                 break
+     return all_audio_sources
+
+ # --- Tab 2 Functionality ---
+ def comp_all_audio_to_video(audio_sources, video_info):
+     try:
+         input_video_path = video_info['VideoPath']
+         if not audio_sources:
+             return input_video_path
+
+         output_directory = "output_videos"
+         # Ensure the output directory exists
+         if not os.path.exists(output_directory):
+             os.makedirs(output_directory)
+         else:
+             # Clear the output directory before saving the new video
+             for filename in os.listdir(output_directory):
+                 file_path = os.path.join(output_directory, filename)
+                 try:
+                     if os.path.isfile(file_path) or os.path.islink(file_path):
+                         os.unlink(file_path)
+                     elif os.path.isdir(file_path):
+                         shutil.rmtree(file_path)
+                 except Exception as e:
+                     raise OSError(f"Failed to delete {file_path}. Reason: {e}")
+
+         # Generate a unique output filename
+         input_filename = os.path.basename(input_video_path)
+         file_name_without_extension, file_extension = os.path.splitext(input_filename)
+         output_video_name = f"{file_name_without_extension}_output{file_extension}"
+         output_video_path = os.path.join(output_directory, output_video_name)
+         output_video_path = af.combine_video_and_audio(audio_sources, input_video_path, output_video_path)
+     except Exception as e:
+         gr.Warning(f"Failed to add the audio to the video: {e}")
+         return None
+     return output_video_path
+
+ def generate_new_audio(prompt, audio_player, selected_audio_source, ttsfx_api_key):
+     if not selected_audio_source:
+         return audio_player
+     try:
+         new_audio_file_path = af.generate_audio(prompt, selected_audio_source.get("Duration"), ttsfx_api_key)
+         if new_audio_file_path:
+             return new_audio_file_path
+         return audio_player
+     except Exception:
+         return audio_player
+
+ def add_new_audio_source(all_audio_sources, new_audio_sources_counter):
+     new_audio_sources_counter += 1
+     new_audio_source = {
+         'SourceSlugID': f"NewAudioSource{new_audio_sources_counter}",
+         'StartFrameIndex': 0,
+         'EndFrameIndex': 75,
+         'Duration': 3.0,
+         'AudioPath': None,
+         'SoundDescription': f"New audio source {new_audio_sources_counter}",
+         'Volume': 1.0
+     }
+     audio_sources = all_audio_sources.get('AudioSources', [])
+     audio_sources.append(new_audio_source)
+     return {"AudioSources": audio_sources, "AmbientAudioSources": all_audio_sources.get('AmbientAudioSources', [])}, new_audio_sources_counter
+
+ def delete_selected_audio_source(selected_audio_source, all_audio_sources):
+     if not selected_audio_source:
+         return None, all_audio_sources
+
+     slug_to_delete = selected_audio_source['SourceSlugID']
+     audio_sources = all_audio_sources.get('AudioSources', [])
+     audio_sources = [source for source in audio_sources if source['SourceSlugID'] != slug_to_delete]
+
+     ambient_audio_sources = all_audio_sources.get('AmbientAudioSources', [])
+     ambient_audio_sources = [source for source in ambient_audio_sources if source['SourceSlugID'] != slug_to_delete]
+
+     return None, {'AudioSources': audio_sources, 'AmbientAudioSources': ambient_audio_sources}
+
+ def overwrite_changes_to_selected_audio_source(volume, audio_path, prompt, selected_audio_source, all_audio_sources):
+     if not selected_audio_source:
+         return all_audio_sources, selected_audio_source
+
+     selected_slug = selected_audio_source['SourceSlugID']
+     updated_source = selected_audio_source.copy()
+     updated_source.update({
+         'SoundDescription': prompt,
+         'AudioPath': audio_path,
+         'Volume': float(volume)
+     })
+
+     audio_sources = all_audio_sources.get('AudioSources', [])
+     ambient_sources = all_audio_sources.get('AmbientAudioSources', [])
+
+     for i, source in enumerate(audio_sources):
+         if source['SourceSlugID'] == selected_slug:
+             audio_sources[i] = updated_source
+             return {'AudioSources': audio_sources, 'AmbientAudioSources': ambient_sources}, selected_audio_source  # Return early to skip the second loop
+     for i, source in enumerate(ambient_sources):
+         if source['SourceSlugID'] == selected_slug:
+             ambient_sources[i] = updated_source
+             break
+     return {'AudioSources': audio_sources, 'AmbientAudioSources': ambient_sources}, selected_audio_source
+
+ # --- Custom JS and CSS ---
+ current_dir = os.path.dirname(os.path.abspath(__file__))
+ js_path = os.path.join(current_dir, 'custom_script.js')
+ css_path = os.path.join(current_dir, 'custom_style.css')
+
+ with open(js_path, 'r') as f:
+     js_content = f.read()
+
+ with open(css_path, 'r') as f:
+     css_content = f.read()
+
+ head = f"""<script>{js_content}</script><style>{css_content}.vis-custom-time.{TIMELINE_ID} {{pointer-events: none !important;}}</style>"""
+
+ # --- Gradio UI ---
+ with gr.Blocks(head=head) as ui:
+     # Initialize per-user Gradio states with default values
+     ttsfx_api_key_state = gr.State(value=None)
+     vision_lm_api_key_state = gr.State(value=None)
+
+     video_input_info_state = gr.State(value={})
+     video_edit_info_state = gr.State(value={})
+
+     audio_sources_state = gr.State(value={})
+     selected_audio_source_state = gr.State(value={})
+     new_audio_sources_counter = gr.State(value=0)
+
+     trigger_frame_interval_slider_render = gr.State(value=False)
+     trigger_timeline_window_focus = gr.State(value=False)
+     set_timeline_window_on_next_tab_change = gr.State(value=True)
+     unrendered_changes_flag = gr.State(value=False)
+
+     gr.Markdown("### Auto-Foley Editor")
+
+     with gr.Tabs() as tabs:
+         # --- Tab 1 ---
+         with gr.TabItem("Input", id=0) as input_tab:
+             with gr.Row(equal_height=True):
+                 video_input = gr.Video(label="Upload a Video", height=206, sources='upload')
+                 video_info_display = gr.Textbox(label="Video Information", lines=6, interactive=False)
+
+             with gr.Accordion("Input control", open=False):
+                 @gr.render(inputs=[video_input_info_state], triggers=[trigger_frame_interval_slider_render.change])
+                 def render_frame_interval_slider(video_info):
+                     total_frames = video_info.get('FrameCount', 0)
+                     frame_rate = video_info.get('FrameRate', 0)
+                     max_interval = total_frames // 2
+
+                     with gr.Row(equal_height=True):
+                         with gr.Column():
+                             cost_and_frame_interval_info = gr.Markdown("Upload a video")
+                             frame_interval_slider = gr.Slider(
+                                 elem_id="frame_interval_slider",
+                                 minimum=1,
+                                 maximum=max_interval,
+                                 step=1,
+                                 value=frame_rate,
+                                 label=f"Frame Interval (1-{max_interval})"
+                             )
+
+                         with gr.Column(scale=0):
+                             downscale_samples_checkbox = gr.Checkbox(
+                                 value=True,
+                                 interactive=True,
+                                 label="Downscale samples"
+                             )
+
+                             downscale_resolution_dropdown = gr.Dropdown(
+                                 choices=["512px", "768px", "1024px"],
+                                 value="512px",
+                                 type="value",
+                                 interactive=True,
+                                 label="Max side"
+                             )
+
+                     frame_interval_slider.change(
+                         fn=update_video_info_advanced_input,
+                         inputs=[frame_interval_slider, downscale_samples_checkbox, downscale_resolution_dropdown, video_input_info_state],
+                         outputs=[cost_and_frame_interval_info, video_input_info_state]
+                     )
+
+                     downscale_samples_checkbox.change(
+                         fn=update_video_info_advanced_input,
+                         inputs=[frame_interval_slider, downscale_samples_checkbox, downscale_resolution_dropdown, video_input_info_state],
+                         outputs=[cost_and_frame_interval_info, video_input_info_state]
+                     )
+
+                     downscale_resolution_dropdown.change(
+                         fn=update_video_info_advanced_input,
+                         inputs=[frame_interval_slider, downscale_samples_checkbox, downscale_resolution_dropdown, video_input_info_state],
+                         outputs=[cost_and_frame_interval_info, video_input_info_state]
+                     )
+
+             custom_instruction_textbox = gr.Textbox(label="Optional custom instruction for the LLM:", interactive=True)
+
+             with gr.Accordion("Observe or edit the LLM's response before generating audio with it:", open=False):
+                 generate_descriptions_button = get_generate_descriptions_button(False)
+                 with gr.Tabs():
+                     with gr.Tab("View"):
+                         generate_descriptions_json_output = gr.JSON(label="JSON")
+                     with gr.Tab("Edit"):
+                         generate_descriptions_json_textbox = gr.Textbox(label="JSON", lines=22, interactive=True)
+
+             generate_all_audio_button = get_generate_audio_button(False)
+             generate_all_progress_textbox = gr.Textbox(show_label=False, visible=False)
+
+         # --- Tab 2 ---
+         with gr.TabItem("Output & Edit", id=1) as output_tab:
+             video_comp_output = gr.Video(label="Result", height=480, interactive=False, elem_id=OUTPUT_VIDEO_ID)
+
+             with gr.Row():
+                 with gr.Column():
+                     with gr.Row():
+                         add_audio_source_button = gr.Button("Add New Audio Source")
+                         delete_audio_source_button = gr.Button(value="Delete Selected Audio Source", variant="stop", interactive=False)
+                 with gr.Column():
+                     comp_audio_button = gr.Button("Combine All Audio & Render Video", variant="primary", interactive=False)
+
+             timeline = VisTimeline(
+                 value={"groups": [{"id": "track-length", "content": ""}, {"id": 1, "content": ""}, {"id": 2, "content": ""}], "items": []},
+                 options={
+                     "moment": "+00:00",
+                     "showCurrentTime": False,
+                     "editable": {
+                         "add": False,
+                         "remove": False,
+                         "updateGroup": False,
+                         "updateTime": True
+                     },
+                     "itemsAlwaysDraggable": {
+                         "item": True,
+                         "range": True
+                     },
+                     "showMajorLabels": False,
+                     "format": {
+                         "minorLabels": {
+                             "millisecond": "mm:ss.SSS",
+                             "second": "mm:ss",
+                             "minute": "mm:ss",
+                             "hour": "HH:mm:ss"
+                         }
+                     },
+                     "start": 0,
+                     "end": 10000,
+                     "min": 0,
+                     "max": 22000,
+                     "zoomMin": 1000,
+                 },
+                 elem_id=TIMELINE_ID
+             )
+
+             with gr.Accordion("Edit Audio Source Properties", open=False) as selected_source_accordion:
+                 with gr.Group():
+                     selected_audio_volume_slider = gr.Slider(label="Volume", minimum=0.0, maximum=1.0, step=0.01, value=1.0)
+                     selected_audio_player = gr.Audio(label="Audio", type="filepath")
+                     with gr.Accordion("Generate New Audio", open=False):
+                         selected_audio_prompt_textbox = gr.Textbox(label="Prompt")
+                         selected_audio_overwrite_audio_button = gr.Button("Generate", variant="primary", interactive=False)
+                 save_changes_button = gr.Button("Save Changes", interactive=False)
+
+         # --- Tab 3 ---
+         with gr.TabItem("Set API Keys", id=2) as settings_tab:
+             vision_lm_api_key_textbox = gr.Textbox(label="OpenAI API Key", type='password')
+             ttsfx_api_key_textbox = gr.Textbox(label="ElevenLabs API Key", type='password')
+
+     # Tab 1 interactions
+     input_tab.select(
+         fn=lambda: False, outputs=set_timeline_window_on_next_tab_change
+     )
+
+     video_input.change(
+         fn=on_video_upload,
+         inputs=video_input,
+         outputs=[generate_descriptions_button, generate_all_audio_button, video_input_info_state, custom_instruction_textbox, generate_descriptions_json_output, generate_descriptions_json_textbox]
+     ).then(
+         fn=trigger_frame_interval_slider_rerender,
+         inputs=trigger_frame_interval_slider_render,
+         outputs=trigger_frame_interval_slider_render
+     )
+
+     video_input_info_state.change(
+         fn=format_video_info,
+         inputs=video_input_info_state,
+         outputs=video_info_display
+     )
+
+     generate_descriptions_button.click(
+         fn=set_generate_buttons_inactive, outputs=[generate_descriptions_button, generate_all_audio_button]
+     ).then(
+         fn=generate_descriptions,
+         inputs=[video_input, video_input_info_state, custom_instruction_textbox, vision_lm_api_key_state],
+         outputs=[generate_descriptions_json_output, generate_descriptions_json_textbox, audio_sources_state],
+         concurrency_id="long_job"
+     ).then(
+         fn=set_generate_buttons_active, outputs=[generate_descriptions_button, generate_all_audio_button]
+     )
+
+     generate_all_audio_button.click(fn=lambda: 0, outputs=new_audio_sources_counter)
+
+     generate_all_audio_button.click(
+         fn=copy_video_info_to_edit_tab,
+         inputs=[video_input, video_input_info_state],
+         outputs=video_edit_info_state
+     ).then(
+         fn=lambda: 0, outputs=new_audio_sources_counter
+     ).then(
+         fn=lambda: True, outputs=set_timeline_window_on_next_tab_change
+     ).then(
+         fn=lambda: gr.Textbox(show_label=False, visible=True), outputs=generate_all_progress_textbox
+     ).then(
+         fn=set_generate_buttons_inactive, outputs=[generate_descriptions_button, generate_all_audio_button]
+     ).then(
+         fn=generate_all_audio,
+         inputs=[video_input, video_edit_info_state, custom_instruction_textbox, generate_descriptions_json_output, generate_descriptions_json_textbox, vision_lm_api_key_state, ttsfx_api_key_state],
+         outputs=[generate_all_progress_textbox, audio_sources_state, generate_descriptions_json_output, generate_descriptions_json_textbox],
+         concurrency_id="long_job"
+     ).then(
+         fn=parse_audio_sources_to_timeline_data,
+         inputs=[audio_sources_state, video_edit_info_state],
+         outputs=timeline
+     ).then(
+         fn=lambda: go_to_tab(1),
+         inputs=[],
+         outputs=tabs
+     ).then(
+         fn=set_generate_buttons_active, outputs=[generate_descriptions_button, generate_all_audio_button]
+     ).then(
+         fn=lambda: gr.Textbox(show_label=False, visible=False), outputs=generate_all_progress_textbox
+     ).then(
+         fn=comp_all_audio_to_video,
+         inputs=[audio_sources_state, video_edit_info_state],
+         outputs=video_comp_output,
+         concurrency_id="comp"
+     ).then(
+         fn=lambda: False, outputs=unrendered_changes_flag
+     ).then(
+         fn=None,
+         js=f'() => initVideoSync("{OUTPUT_VIDEO_ID}", "{TIMELINE_ID}", "{TRACK_LENGTH_ID}")'
+     )
+
+     # Tab 2 interactions
+     output_tab.select(
+         fn=copy_video_info_to_edit_tab_if_none,
+         inputs=[video_input, video_input_info_state, video_edit_info_state],
+         outputs=[video_edit_info_state]
+     ).then(
+         fn=focus_timeline_on_tab_select,
+         inputs=[set_timeline_window_on_next_tab_change, trigger_timeline_window_focus],
+         outputs=[set_timeline_window_on_next_tab_change, trigger_timeline_window_focus]
+     )
+
+     unrendered_changes_flag.change(
+         fn=set_render_button_state,
+         inputs=unrendered_changes_flag,
+         outputs=comp_audio_button
+     )
+
+     trigger_timeline_window_focus.change(
+         fn=None,
+         js=f'() => setTimelineWindowToItemLength("{TIMELINE_ID}", "{TRACK_LENGTH_ID}")'
+     )
+
+     comp_audio_button.click(
+         fn=comp_all_audio_to_video,
+         inputs=[audio_sources_state, video_edit_info_state],
+         outputs=video_comp_output,
+         concurrency_id="comp"
+     ).then(
+         fn=lambda: False, outputs=unrendered_changes_flag
+     ).then(
+         fn=None,
+         js=f'() => initVideoSync("{OUTPUT_VIDEO_ID}", "{TIMELINE_ID}", "{TRACK_LENGTH_ID}")'
+     )
+
+     add_audio_source_button.click(
+         fn=add_new_audio_source,
+         inputs=[audio_sources_state, new_audio_sources_counter],
+         outputs=[audio_sources_state, new_audio_sources_counter]
+     ).then(
+         fn=parse_audio_sources_to_timeline_data,
+         inputs=[audio_sources_state, video_edit_info_state],
+         outputs=timeline
+     ).then(
+         fn=focus_timeline_on_new_source_added,
+         inputs=[audio_sources_state, trigger_timeline_window_focus],
+         outputs=trigger_timeline_window_focus
+     ).then(
+         fn=lambda: True, outputs=unrendered_changes_flag
+     )
+
+     delete_audio_source_button.click(
+         fn=delete_selected_audio_source,
+         inputs=[selected_audio_source_state, audio_sources_state],
+         outputs=[selected_audio_source_state, audio_sources_state]
+     ).then(
+         fn=parse_audio_sources_to_timeline_data,
+         inputs=[audio_sources_state, video_edit_info_state],
+         outputs=timeline
+     ).then(
+         fn=lambda: True, outputs=unrendered_changes_flag
+     )
+
+     timeline.item_select(
+         fn=on_timeline_item_select,
+         inputs=[audio_sources_state],
+         outputs=selected_audio_source_state
+     )
+
+     timeline.input(
+         fn=on_timeline_input,
+         inputs=[timeline, audio_sources_state, video_edit_info_state],
+         outputs=audio_sources_state
+     ).then(
+         fn=lambda: True, outputs=unrendered_changes_flag
+     )
+
+     selected_audio_source_state.change(
+         fn=sync_form_to_selected_audio_source,
+         inputs=selected_audio_source_state,
+         outputs=[selected_source_accordion, selected_audio_volume_slider, selected_audio_player, selected_audio_prompt_textbox]
+     ).then(
+         fn=set_buttons_state_selected_audio_source,
+         inputs=selected_audio_source_state,
+         outputs=[delete_audio_source_button, selected_audio_overwrite_audio_button, save_changes_button]
+     )
+
+     selected_audio_overwrite_audio_button.click(
+         fn=generate_new_audio,
+         inputs=[selected_audio_prompt_textbox, selected_audio_player, selected_audio_source_state, ttsfx_api_key_state],
+         outputs=selected_audio_player
+     )
+
+     save_changes_button.click(
+         fn=overwrite_changes_to_selected_audio_source,
+         inputs=[
+             selected_audio_volume_slider,
+             selected_audio_player,
+             selected_audio_prompt_textbox,
+             selected_audio_source_state,
+             audio_sources_state
+         ],
+         outputs=[audio_sources_state, selected_audio_source_state]
+     ).then(
+         fn=parse_audio_sources_to_timeline_data,
+         inputs=[audio_sources_state, video_edit_info_state],
+         outputs=timeline
+     ).then(
+         fn=lambda: True, outputs=unrendered_changes_flag
+     )
+
+     # Tab 3 interactions
+     vision_lm_api_key_textbox.input(
+         fn=lambda a: a,
+         inputs=vision_lm_api_key_textbox,
+         outputs=vision_lm_api_key_state
+     )
+
+     ttsfx_api_key_textbox.input(
+         fn=lambda a: a,
+         inputs=ttsfx_api_key_textbox,
+         outputs=ttsfx_api_key_state
+     )
+
+     ui.load(
+         fn=lambda: (os.getenv('AUTO_FOLEY_DEFAULT_VISION_LM_API_KEY'), os.getenv('AUTO_FOLEY_DEFAULT_TTSFX_API_KEY')),
+         outputs=[vision_lm_api_key_state, ttsfx_api_key_state]
+     )
+
+ if __name__ == "__main__":
+     ui.launch(show_api=False)
auto_foley/__init__.py ADDED
File without changes
auto_foley/run_auto_foley.py ADDED
@@ -0,0 +1,682 @@
+ import argparse
+ import base64
+ import cv2
+ import math
+ import os
+ import sys
+ import tempfile
+ from PIL import Image
+ from pydantic import BaseModel
+ from typing import Optional
+
+ if __name__ == "__main__" or 'auto_foley' not in sys.modules:
+     import video_comping
+     from services import (VisionLMService, TTSFXService, set_default_vision_lm, set_default_ttsfx, get_services, get_api_keys)
+ else:
+     from auto_foley import video_comping
+     from auto_foley.services import (VisionLMService, TTSFXService, set_default_vision_lm, set_default_ttsfx, get_services, get_api_keys)
+
+ class AudioSource(BaseModel):
+     SourceSlugID: str
+     SoundDescription: str
+     StartFrameIndex: int
+     EndFrameIndex: int
+     Duration: float
+
+ AUDIO_SOURCES_KEY = 'AudioSources'
+ AMBIENT_AUDIO_SOURCES_KEY = 'AmbientAudioSources'
+ MINIMUM_AUDIO_DURATION = 0.5
+
+ # --- Low-level Helpers ---
+ def downscale_dimensions(width: int, height: int, max_side: int) -> tuple[int, int]:
+     """
+     Resize dimensions down while maintaining aspect ratio and respecting the maximum side length.
+
+     Args:
+         width: Original width of the image/video
+         height: Original height of the image/video
+         max_side: Maximum allowed length for either dimension
+
+     Returns:
+         tuple: (new_width, new_height) maintaining the original aspect ratio
+
+     Raises:
+         ValueError: If width, height, or max_side are less than or equal to 0
+     """
+     if width <= 0 or height <= 0 or max_side <= 0:
+         raise ValueError("Width, height, and max_side must all be positive integers")
+     if width <= max_side and height <= max_side:
+         return (width, height)
+
+     aspect_ratio = width / height
+     if width > height:
+         new_width = max_side
+         new_height = int(max_side / aspect_ratio)
+     else:
+         new_height = max_side
+         new_width = int(max_side * aspect_ratio)
+     return (new_width, new_height)
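+
+ # Example: downscale_dimensions(1920, 1080, 512) -> (512, 288);
+ # dimensions already within the limit are returned unchanged.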
+
+ def calculate_duration(start_frame: int, end_frame: int, fps: int, frame_count: int) -> tuple[float, int, int]:
+     """
+     Calculate duration in seconds based on frame indices and FPS.
+
+     Args:
+         start_frame: Starting frame index
+         end_frame: Ending frame index
+         fps: Frames per second of the video
+         frame_count: Total number of frames in the video
+
+     Returns:
+         tuple: A tuple containing:
+             float: Duration in seconds (rounded to 3 decimal places)
+             int: Adjusted start frame index
+             int: Adjusted end frame index
+
+     Raises:
+         ValueError: If fps is less than or equal to 0
+         ValueError: If frame_count is less than or equal to 0
+     """
+     # Input validation
+     if fps <= 0:
+         raise ValueError("FPS must be greater than 0")
+     if frame_count <= 0:
+         raise ValueError("Frame count must be greater than 0")
+
+     start = max(0, start_frame)
+     end = min(frame_count - 1, end_frame) if end_frame is not None else frame_count - 1
+     if end <= start:
+         end = start + fps
+
+     duration_frames = end - start
+     duration_seconds = duration_frames / fps
+     # If duration is less than the minimum, adjust the frames
+     # (math.ceil so integer truncation can't leave the result below the minimum)
+     if duration_seconds < MINIMUM_AUDIO_DURATION:
+         frames_needed = math.ceil(MINIMUM_AUDIO_DURATION * fps) - duration_frames
+         # First try to extend the end frame
+         target_end = end + frames_needed
+         if target_end <= frame_count - 1:
+             end = target_end
+         else:
+             # Can't extend the end frame, adjust the start frame instead
+             end = frame_count - 1
+             remaining_frames_needed = math.ceil(MINIMUM_AUDIO_DURATION * fps) - (end - start)
+             potential_start = start - remaining_frames_needed
+             start = max(0, potential_start)
+
+     final_duration_frames = end - start
+     final_duration = final_duration_frames / fps
+     return round(final_duration, 3), start, end
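+
+ # Examples: calculate_duration(10, 10, 25, 100) -> (1.0, 10, 35), since an empty
+ # range is padded to one second; calculate_duration(0, 5, 25, 100) -> (0.52, 0, 13),
+ # extended to satisfy MINIMUM_AUDIO_DURATION.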
+
+ def combine_all_audio_sources(all_audio_sources: dict) -> list:
+     """
+     Helper function to flatten the audio sources structure into a single combined list.
+
+     Args:
+         all_audio_sources: Dictionary containing AudioSources and AmbientAudioSources lists
+             Expected format: {'AudioSources': [], 'AmbientAudioSources': []}
+
+     Returns:
+         list: Combined list of all audio sources from both categories
+     """
+     audio_sources = all_audio_sources.get(AUDIO_SOURCES_KEY, [])
+     ambient_audio_sources = all_audio_sources.get(AMBIENT_AUDIO_SOURCES_KEY, [])
+     return audio_sources + ambient_audio_sources
+
+ # --- Video ---
+ def get_video_path(video) -> str:
+     """
+     Converts video input into a file path, creating a temporary file if needed.
+
+     Args:
+         video: Video input that can be a file path string, a dictionary with video data, or a dictionary with a video name
+
+     Returns:
+         str: Path to the video file
+
+     Raises:
+         ValueError: If no video is provided or if the video format is invalid
+         RuntimeError: If the video data cannot be processed
+     """
+     if not video:
+         raise ValueError("No video file provided.")
+     if isinstance(video, str):
+         return video
+     elif isinstance(video, dict):
+         video_data = video.get('data')
+         video_name = video.get('name')
+         if video_name:
+             return video_name
+         else:
+             try:
+                 # Save base64-encoded video data to a temporary file
+                 header, encoded = video_data.split(',', 1)
+                 file_ext = header.split(';')[0].split('/')[1]
+                 video_bytes = base64.b64decode(encoded)
+
+                 temp_file = tempfile.NamedTemporaryFile(delete=False, suffix='.' + file_ext)
+                 temp_file.write(video_bytes)
+                 temp_file.close()
+                 return temp_file.name
+             except Exception as e:
+                 raise RuntimeError(f"Failed to process video data: {e}")
+     else:
+         raise ValueError("Invalid video format")
+
+ def get_video_info(video_path: str) -> dict:
+     """
+     Extracts information about a video.
+
+     Args:
+         video_path: Path to the video file
+
+     Returns:
+         A dictionary containing video metadata:
+         {
+             "FilePath": str,
+             "Width": int,
+             "Height": int,
+             "Duration": float,
+             "FrameCount": int,
+             "FrameRate": float,
+             "FrameInterval": float
+         }
+
+     Raises:
+         ValueError: If video_path is None or empty
+         FileNotFoundError: If the video file cannot be opened
+         RuntimeError: If there's an error processing the video
+     """
+     if not video_path:
+         raise ValueError("Video file path was not given")
+     try:
+         cap = cv2.VideoCapture(video_path)
+         if not cap.isOpened():
+             raise FileNotFoundError(f"Unable to open the video file at '{video_path}'. Please verify the file path and format.")
+
+         total_frames = int(cap.get(cv2.CAP_PROP_FRAME_COUNT))
+         fps = cap.get(cv2.CAP_PROP_FPS)
+         width = int(cap.get(cv2.CAP_PROP_FRAME_WIDTH))
+         height = int(cap.get(cv2.CAP_PROP_FRAME_HEIGHT))
+         cap.release()
+     except Exception as e:
+         raise RuntimeError(f"{e}")
+     return {
+         "FilePath": video_path,
+         "Width": width,
+         "Height": height,
+         "Duration": total_frames / fps if fps != 0 else 0,
+         "FrameCount": total_frames,
+         "FrameRate": fps,
+         "FrameInterval": fps
+     }
+
+ def extract_frames(video_path: str, frame_interval: int, target_width: int = None, target_height: int = None) -> list:
+     """
+     Extracts frames from the video at the specified interval and includes the last frame.
+     Optionally downscales the frames to the target resolution.
+
+     Args:
+         video_path (str): Path to the video file
+         frame_interval (int): Interval between extracted frames
+         target_width (int, optional): Desired width of output frames. If None, the original width is used
+         target_height (int, optional): Desired height of output frames. If None, the original height is used
+
+     Returns:
+         list: List of tuples (frame_index, PIL Image)
+
+     Raises:
+         FileNotFoundError: If the video file cannot be opened
+         ValueError: If the video contains no frames
+     """
+     cap = cv2.VideoCapture(video_path)
+     if not cap.isOpened():
+         raise FileNotFoundError("Could not open video.")
+     total_frames = int(cap.get(cv2.CAP_PROP_FRAME_COUNT))
+     if total_frames == 0:
+         cap.release()
+         raise ValueError("Video contains no frames.")
+
+     # Calculate target dimensions, maintaining aspect ratio if only one dimension is specified
+     original_width = int(cap.get(cv2.CAP_PROP_FRAME_WIDTH))
+     original_height = int(cap.get(cv2.CAP_PROP_FRAME_HEIGHT))
+
+     if target_width is not None and target_height is None:
+         scale_factor = target_width / original_width
+         target_height = int(original_height * scale_factor)
+     elif target_height is not None and target_width is None:
+         scale_factor = target_height / original_height
+         target_width = int(original_width * scale_factor)
+
+     need_resize = (target_width is not None and target_height is not None and (target_width != original_width or target_height != original_height))
+
+     # Compute the frame indices to extract
+     if frame_interval <= 0 or frame_interval >= total_frames:
+         frame_indices = [0, total_frames - 1] if total_frames > 1 else [0]
+     else:
+         frame_indices = list(range(0, total_frames, frame_interval))
+         if frame_indices[-1] != total_frames - 1:
+             frame_indices.append(total_frames - 1)
+
+     frames = []
+     for frame_idx in frame_indices:
+         cap.set(cv2.CAP_PROP_POS_FRAMES, frame_idx)
+         ret, frame = cap.read()
+         if not ret:
+             continue  # Skip if the frame could not be read
+         if need_resize:
+             frame = cv2.resize(frame, (target_width, target_height), interpolation=cv2.INTER_AREA)
+         pil_image = Image.fromarray(cv2.cvtColor(frame, cv2.COLOR_BGR2RGB))
+         frames.append((frame_idx, pil_image))
+
+     cap.release()
+     return frames
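+
+ # Example: for a 100-frame video with frame_interval=30, the extracted indices
+ # are [0, 30, 60, 90, 99] -- the final frame is always appended.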
+
+ def calculate_video_input_cost(width: int, height: int, sample_count: int, overwrite_vision_lm: Optional[VisionLMService] = None) -> float:
+     """
+     Calculates the input cost for video processing based on dimensions and sample count.
+
+     Args:
+         width: Width of the video in pixels
+         height: Height of the video in pixels
+         sample_count: Number of samples/frames being processed
+         overwrite_vision_lm: Optional custom object instance that implements the VisionLMService protocol to use instead of the default (ChatGPT)
+
+     Returns:
+         float: The calculated input cost for video processing
+
+     Raises:
+         ValueError: If width, height, or sample_count are not positive numbers
+         RuntimeError: If there's an error during cost calculation
+     """
+     vision_lm, _ = get_services(vision_lm=overwrite_vision_lm)
+     if width <= 0 or height <= 0 or sample_count <= 0:
+         raise ValueError("Width, height, and sample count must be positive numbers")
+     try:
+         return vision_lm.calculate_video_input_cost(width, height, sample_count)
+     except Exception as e:
+         raise RuntimeError(str(e))
+
+ # --- Audio ---
+ def validate_audio_sources(audio_sources: list, fps: int, frame_count: int) -> list:
+     """
+     Helper function to validate a list of audio sources while calculating durations
+     and adjusting frame indices to ensure the minimum audio duration.
+
+     Args:
+         audio_sources: List of dictionaries containing audio source information
+         fps: Frames per second of the video
+         frame_count: Total number of frames in the video
+
+     Returns:
+         List of validated audio source dictionaries with adjusted durations and frame indices
+
+     Raises:
+         ValueError: If audio_sources is None or empty
+         TypeError: If any audio source item is missing required fields
+         ValueError: If frame indices are invalid or if fps is 0
+     """
+     if not audio_sources:
+         raise ValueError("Audio sources list cannot be empty")
+     if fps <= 0:
+         raise ValueError("FPS must be greater than 0")
+
+     validated_sources = []
+     for audio_source in audio_sources:
+         try:
+             # Calculate the audio duration and get adjusted frame indices
+             duration, start_frame, end_frame = calculate_duration(
+                 audio_source.get("StartFrameIndex", 0),
+                 audio_source.get("EndFrameIndex", frame_count - 1),
+                 fps,
+                 frame_count
+             )
+             audio_source["Duration"] = duration
+             audio_source["StartFrameIndex"] = start_frame
+             audio_source["EndFrameIndex"] = end_frame
+             validated_audio_source = AudioSource(**audio_source)
+             validated_sources.append(validated_audio_source.model_dump())
+         except (KeyError, TypeError) as e:
+             raise TypeError(f"Invalid audio source format: {str(e)}")
+     return validated_sources
+
+ def generate_audio(prompt: str, duration: float, ttsfx_api_key: str = None, overwrite_ttsfx: Optional[TTSFXService] = None) -> str:
+     """
+     Generates an audio file for the given prompt and duration.
+
+     Args:
+         prompt: Description of the sound to generate
+         duration: Length of the audio in seconds
+         ttsfx_api_key: API key for the text-to-speech service
+         overwrite_ttsfx: Optional custom object instance that implements the TTSFXService protocol to use instead of the default (ElevenLabs)
+
+     Returns:
+         str: Path to the generated audio file
+
+     Raises:
+         RuntimeError: If there's an error during audio generation
+     """
+     _, ttsfx = get_services(ttsfx=overwrite_ttsfx)
+     _, ttsfx_api_key = get_api_keys(ttsfx_api_key=ttsfx_api_key)
+     try:
+         audio_data = ttsfx.generate_sound_effect(prompt, duration, api_key=ttsfx_api_key)
+         temp_audio_file = tempfile.NamedTemporaryFile(delete=False, suffix='.mp3')
+         temp_audio_file.write(audio_data.read())
+         temp_audio_file.close()
+         return temp_audio_file.name
+     except Exception as e:
+         raise RuntimeError(str(e))
+
+ def generate_audio_for_audio_source(audio_source: dict, ttsfx_api_key: str = None, overwrite_ttsfx: Optional[TTSFXService] = None) -> dict:
+     """
+     Generates an audio file for a single audio source with a description.
+
+     Args:
+         audio_source: Dictionary containing sound description and duration
+         ttsfx_api_key: API key for the text-to-speech service
+         overwrite_ttsfx: Optional custom object instance that implements the TTSFXService protocol to use instead of the default (ElevenLabs)
+
+     Returns:
+         dict: Audio source dictionary with added AudioPath and Volume fields
+     """
+     sound_description = audio_source.get("SoundDescription", "")
+     duration = audio_source.get("Duration")
+     audio_source["AudioPath"] = generate_audio(sound_description, duration, ttsfx_api_key, overwrite_ttsfx)
+     audio_source["Volume"] = 1.0
+     return audio_source
+
+ def generate_all_audio(all_audio_sources: dict, ttsfx_api_key: str = None, overwrite_ttsfx: Optional[TTSFXService] = None) -> dict:
+     """
+     Generates audio files for audio sources with descriptions.
+
+     Args:
+         all_audio_sources: Dictionary containing AudioSources and AmbientAudioSources lists
+         ttsfx_api_key: API key for the text-to-speech service
+         overwrite_ttsfx: Optional custom object instance that implements the TTSFXService protocol to use instead of the default (ElevenLabs)
+
+     Returns:
+         dict: Audio sources dictionary with generated audio paths
+
+     Raises:
+         ValueError: If the audio sources dictionary is None
+         RuntimeError: If there's an error during audio generation
+     """
+     if all_audio_sources is None:
+         raise ValueError("Missing audio sources.")
+     try:
+         for audio_source in all_audio_sources[AUDIO_SOURCES_KEY]:
+             audio_source = generate_audio_for_audio_source(audio_source, ttsfx_api_key, overwrite_ttsfx)
+         for audio_source in all_audio_sources[AMBIENT_AUDIO_SOURCES_KEY]:
+             audio_source = generate_audio_for_audio_source(audio_source, ttsfx_api_key, overwrite_ttsfx)
+         return all_audio_sources
+     except Exception as e:
+         raise RuntimeError(str(e))
+
+ # --- High-level Processing ---
+ def process_video(video, frame_interval=None, target_width=None, target_height=None, prompt_instruction=None, vision_lm_api_key=None, overwrite_vision_lm: Optional[VisionLMService] = None) -> tuple[dict, str]:
+     """
+     Processes the video, extracts frames at the specified interval, and generates the video description and audio sources.
+
+     Args:
+         video: Video input as either a file path string, or a dictionary containing video data
+         frame_interval: Interval between extracted frames. Defaults to None
+         target_width (optional): Target width for frame extraction. Defaults to None
+         target_height (optional): Target height for frame extraction. Defaults to None
+         prompt_instruction (optional): Custom instruction to include in the prompt. Defaults to None
+         vision_lm_api_key (optional): API key for the vision language model. Defaults to None
+         overwrite_vision_lm (optional): Custom object instance that implements the VisionLMService protocol to use instead of the default (ChatGPT)
+
+     Returns:
+         tuple: Contains (audio_sources, video_description)
+             audio_sources: Dictionary containing 'AudioSources' and 'AmbientAudioSources' lists
+             video_description: String containing a detailed description of the video
+
+     Raises:
+         ValueError: If the video input is invalid or missing
+         FileNotFoundError: If the video file cannot be opened or accessed
+         RuntimeError: If frame extraction fails, the API returns an error, or audio source generation fails
+         Exception: If data validation fails
+     """
+     # Add frame info
+     video_path = get_video_path(video)
+     cap = cv2.VideoCapture(video_path)
+     fps = 25  # Default value
+     frame_count = fps
+     frame_info_prompt = ""
+
+     if cap.isOpened():
+         frame_count = int(cap.get(cv2.CAP_PROP_FRAME_COUNT))
+         fps = cap.get(cv2.CAP_PROP_FPS)
+         if frame_interval is None:
+             frame_interval = math.ceil(fps)  # Default to one sample per second
+         seconds = frame_interval / fps
+         frame_info_prompt += f"\nThe framerate of the video is {fps} FPS."
+         frame_info_prompt += f"\nThe frame interval between each image is {frame_interval} frames."
+         frame_info_prompt += f" Which means there's approximately {seconds:.2f} seconds between each image in real time."
+         cap.release()
+
+     # Extract frames
+     frames = extract_frames(video_path, frame_interval, target_width, target_height)
+     if not frames:
+         raise RuntimeError("No frames extracted from video.")
+
+     images = []
+     frame_numbers = []
+     for frame_idx, pil_image in frames:
+         images.append(pil_image)
+         frame_numbers.append(frame_idx)
+
+     # Build prompt
+     prompt = (
+         "You will be given a list of images representing certain frames of a video that has no audio."
+         "\nYou must provide a general description of the video and a list of audio sources you recognize"
+         " in these frames, so that it can be used to create an audio file for each audio source in the video."
+
+         "\n\nThe VideoProperties 'Description' property must include a very thorough and detailed description of everything that is visible in the frames."
+         " Make sure not a single thing is missing, the context of this description will help define the audio sources."
+         "\nThe VideoProperties 'Description' property may also include descriptions of abstract things,"
+         " like the vibe, the medium (if it's animated, cartoony or recorded) or the time period,"
+         " but only if it's relevant to the possible audio that could fit underneath the video."
+
+         "\n\nThe AudioSource 'SoundDescription' property is implied to already be a description of audio so it does not contain words like \"The sound of ...\", etc."
+         "\nThe AudioSource 'SoundDescription' property should only contain the direct description of the audio, and not any unnecessary visual descriptions like color."
+         "\nThe AudioSource 'SoundDescription' property should be able to be understood without outside context. For example, you should never describe it as"
+         " \"The sound of footsteps as the mayor is walking towards the lamp post.\". Because \"the mayor\" or \"the lamp post\" are unknown entities within this description. They're also irrelevant to the sound this would make."
+         " The correct description for that example would have been: \"Footsteps of a man on a sidewalk.\""
+         "\nThe AudioSource 'SoundDescription' property also does not know about the other properties, like the AudioSource 'Slug' property."
+         " If the audio source is a bird, for example, the 'SoundDescription' property can't just be \"Wings flapping.\", but must still include the subject if it would be relevant for the sound,"
+         " so a better description would be \"A bird flapping its wings.\""
+
+         "\n\nAnother example of a very bad sound description, this is an example of things you need to avoid: \"A gray owl with a hat is making sounds on the walls of the castle, adding to the eerie vibe of the video.\""
+         "\nThe correct way to describe that would be: \"An owl hooting in the distance.\""
+
+         "\n\nLastly, you will also include a list of ambient audio sources."
+         "\nThe ambient audio sources are similar to the normal audio sources, but are for elements that are either invisible in the frames or just not a single source of audio."
+         "\nThe sounds from the ambient audio sources should not already exist in the normal audio sources. Because that would mean that sound would be duplicate in the end result."
+         "\nTo create a good ambient audio source description, you must imagine what you could hear in the video that's not necessarily depicted."
+         "\nExample: if a video depicts a squirrel playing in a forest, then you can imagine the forest making lots of noises that aren't directly visible,"
+         " such as chirping birds or crickets. The ambient sound description will not include the noises the squirrel makes because these will be found in the normal audio sources.\n"
+     )
+
+     if frame_info_prompt is not None and frame_info_prompt.strip():
+         prompt += frame_info_prompt
+     if prompt_instruction is not None and prompt_instruction.strip():
+         prompt += f"\n\nThe user included the following custom instruction for you, try to abide if possible:\n<user-instruction>\n{prompt_instruction}\n</user-instruction>"
+     prompt += f"\nThis array shows the frame index (which frame of the video they represent), for each image you are about to see in order: {frame_numbers}"
+
+     # Generate the video description and audio sources
+     try:
+         vision_lm, _ = get_services(vision_lm=overwrite_vision_lm)
+         vision_lm_api_key, _ = get_api_keys(vision_lm_api_key=vision_lm_api_key)
+         result_data = vision_lm.generate_audio_sources(prompt, images, vision_lm_api_key)
+     except Exception as e:
+         raise RuntimeError(str(e))
+
+     # Validate the returned data using the AudioSource model
+     video_description = result_data['VideoProperties']['Description']
+     audio_sources_data = result_data[AUDIO_SOURCES_KEY]
+     ambient_audio_sources_data = result_data[AMBIENT_AUDIO_SOURCES_KEY]
+
+     try:
+         validated_audio_sources = validate_audio_sources(audio_sources_data, fps, frame_count)
+         validated_ambient_audio_sources = validate_audio_sources(ambient_audio_sources_data, fps, frame_count)
+         all_audio_sources = {AUDIO_SOURCES_KEY: validated_audio_sources, AMBIENT_AUDIO_SOURCES_KEY: validated_ambient_audio_sources}
+         return all_audio_sources, video_description
+     except Exception as e:
526
+ raise Exception(str(e))
527
+
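For orientation, here is a minimal sketch of calling process_video directly. The module path, the plain-file-path input, and the literal 'AudioSources' key are assumptions inferred from the surrounding code, not a confirmed API:

    # Hypothetical usage sketch (module path and key names assumed).
    from auto_foley import run_auto_foley

    sources, description = run_auto_foley.process_video("clip.mp4")  # frame_interval defaults to roughly one sample per second
    print(description)
    for src in sources["AudioSources"]:  # assumes AUDIO_SOURCES_KEY == "AudioSources", per the tool schema below
        print(src["SourceSlugID"], src["SoundDescription"])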
+ def combine_video_and_audio(audio_sources: list, input_video: str | dict, output_video_path: str = None) -> str:
+     """
+     Composes audio sources onto a video file, creating a new video with the combined audio.
+
+     Args:
+         audio_sources (list): List of audio source dictionaries containing audio paths and timing
+         input_video (str | dict): Either a path to a video file or a dictionary containing video data
+         output_video_path (str): Path for the output video to be saved to; if None, the output will be adjacent to the input_video path with an '_output' suffix. Defaults to None
+
+     Returns:
+         str: Path to the output video file with composed audio
+
+     Raises:
+         ValueError: If video input is invalid or missing
+         FileNotFoundError: If video file cannot be accessed
+         RuntimeError: If an error occurs during video composition or audio processing
+         OSError: If there are file system errors during composition
+     """
+     input_video_path = get_video_path(input_video)
+     if not input_video_path:
+         raise ValueError("Invalid or missing video input")
+     try:
+         if output_video_path is None:
+             path_to_input_file_name, input_file_extension = os.path.splitext(input_video_path)
+             output_video_path = path_to_input_file_name + "_output" + input_file_extension
+         return video_comping.combine_video_and_audio(input_video_path, output_video_path, combine_all_audio_sources(audio_sources))
+     except Exception as e:
+         raise RuntimeError(str(e))
+
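To make the default output path concrete, this is what the os.path.splitext logic above produces:

    # Worked example of the default output-path derivation.
    import os
    name, ext = os.path.splitext("videos/clip.mp4")
    print(name + "_output" + ext)  # videos/clip_output.mp4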
+ def add_audio_to_video(input_video_path, output_video_path=None, frame_interval=None, downscale_to_max_side=None, prompt_instruction=None, vision_lm_api_key=None, ttsfx_api_key: str = None, overwrite_vision_lm: Optional[VisionLMService] = None, overwrite_ttsfx: Optional[TTSFXService] = None, quiet: bool = False) -> str:
+     """
+     Processes the video, extracts frames at the specified interval, generates a video description and audio sources, then generates audio from those descriptions and adds it back to the video.
+
+     Args:
+         input_video_path: Path to the input video file
+         output_video_path (optional): Path for the output video; if None, appends '_output' to the input filename. Defaults to None
+         frame_interval (optional): Interval between extracted frames. Defaults to None
+         downscale_to_max_side (optional): Downscale samples so their longest side is this many pixels; if None, 512px is used. Defaults to None
+         prompt_instruction (optional): Custom instruction to include in the prompt. Defaults to None
+         vision_lm_api_key (optional): API key for the vision language model. Defaults to None
+         ttsfx_api_key (optional): API key for the text-to-sound-effect service. Defaults to None
+         overwrite_vision_lm (optional): Custom object instance that implements the VisionLMService protocol to use instead of the default (ChatGPT)
+         overwrite_ttsfx (optional): Custom object instance that implements the TTSFXService protocol to use instead of the default (ElevenLabs)
+         quiet (optional): If True, suppresses progress output. Defaults to False
+
+     Returns:
+         str: Path to the output video file with composed audio
+
+     Raises:
+         ValueError: If video input is invalid or missing
+         FileNotFoundError: If video file cannot be opened or accessed
+         RuntimeError: If frame extraction fails, the API returns an error, audio source generation fails, or an error occurs during video composition or audio processing
+         Exception: If data validation fails
+         OSError: If there are file system errors during composition
+     """
+     try:
+         if downscale_to_max_side is None:
+             downscale_to_max_side = 512
+         input_video_info = get_video_info(input_video_path)
+         target_width, target_height = downscale_dimensions(input_video_info['Width'], input_video_info['Height'], downscale_to_max_side)
+         duration = input_video_info.get("Duration", 0.0)
+         if duration > 20.0 and overwrite_ttsfx is None:
+             if not quiet:
+                 print("WARNING: Input video is longer than 20 seconds. This process works best for 3-10 second clips and becomes unstable with longer videos. Consider trimming your video into shorter segments.")
+
+     except Exception as e:
+         raise RuntimeError(f"Error determining downscaled sample resolution: {e}")
+
+     try:
+         if not quiet:
+             print(f"Processing {input_video_path}...")
+         all_audio_sources, _ = process_video(input_video_path, frame_interval, target_width, target_height, prompt_instruction, vision_lm_api_key, overwrite_vision_lm)
+     except Exception as e:
+         raise RuntimeError(f"Error during video processing: {e}")
+
+     try:
+         if not quiet:
+             print("Generating audio...")
+         all_audio_sources = generate_all_audio(all_audio_sources, ttsfx_api_key, overwrite_ttsfx)
+     except Exception as e:
+         raise RuntimeError(f"Error during audio generation: {e}")
+
+     try:
+         if not quiet:
+             print("Adding generated audio to the video...")
+         output = combine_video_and_audio(all_audio_sources, input_video_path, output_video_path)
+         if not quiet:
+             print(f"Output saved to: {output}")
+         return output
+     except Exception as e:
+         raise RuntimeError(f"Error while combining video and audio: {e}")
+
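A minimal end-to-end sketch of using add_audio_to_video as a library call, assuming this file is importable as auto_foley.run_auto_foley and that both API keys come from the AUTO_FOLEY_DEFAULT_* environment variables resolved by get_api_keys:

    # Hypothetical library usage; API keys supplied via environment variables.
    from auto_foley import run_auto_foley as af

    output_path = af.add_audio_to_video("clip.mp4", quiet=True)
    print(output_path)  # clip_output.mp4 by default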
+ if __name__ == '__main__':
+     parser = argparse.ArgumentParser(
+         description='Auto-foley: Automatically add sound effects to a video without sound',
+         formatter_class=argparse.ArgumentDefaultsHelpFormatter
+     )
+
+     parser.add_argument(
+         '--input', '-i',
+         required=True,
+         help='Path to the input video file'
+     )
+
+     # Optional arguments
+     parser.add_argument(
+         '--output', '-o',
+         help='Path for the output video file. If not specified, will append "_output" to the input filename'
+     )
+
+     parser.add_argument(
+         '--frame-interval',
+         type=int,
+         help='Interval between processed frames'
+     )
+
+     parser.add_argument(
+         '--downscale-to-max-side',
+         type=int,
+         help='Target max side for downscaling video samples'
+     )
+
+     parser.add_argument(
+         '--prompt-instruction',
+         help='Custom instruction for the video processing'
+     )
+
+     parser.add_argument(
+         '--vision-lm-api-key',
+         help='API key for the vision language model'
+     )
+
+     parser.add_argument(
+         '--ttsfx-api-key',
+         help='API key for the text-to-sound-effects service'
+     )
+
+     parser.add_argument(
+         '--quiet', '-q',
+         action='store_true',
+         help='Suppress progress output'
+     )
+
+     args = parser.parse_args()
+
+     try:
+         add_audio_to_video(
+             input_video_path=args.input,
+             output_video_path=args.output,
+             frame_interval=args.frame_interval,
+             downscale_to_max_side=args.downscale_to_max_side,
+             prompt_instruction=args.prompt_instruction,
+             vision_lm_api_key=args.vision_lm_api_key,
+             ttsfx_api_key=args.ttsfx_api_key,
+             quiet=args.quiet
+         )
+     except Exception as e:
+         print(str(e), file=sys.stderr)
+         sys.exit(1)
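From the command line, the equivalent invocation would look something like `python auto_foley/run_auto_foley.py --input clip.mp4 --output clip_foley.mp4 --frame-interval 12` (the script path is an assumption; adjust it to wherever this entry point lives).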
auto_foley/services.py ADDED
@@ -0,0 +1,87 @@
+ import os
+ import sys
+ from io import BytesIO
+ from typing import Protocol, Optional
+
+ class VisionLMService(Protocol):
+     def calculate_video_input_cost(self, width: int, height: int, sample_count: int) -> str:
+         ...
+     def generate_audio_sources(self, prompt: str, images: list, api_key: str | None = None) -> dict:
+         ...
+
+ class TTSFXService(Protocol):
+     def generate_sound_effect(self, text_prompt: str, duration_seconds: float, prompt_influence: float, api_key: str | None = None) -> BytesIO:
+         ...
+
+ if __name__ == "__main__" or 'auto_foley' not in sys.modules:
+     import vision_lm_chatgpt
+     import ttsfx_elevenlabs
+ else:
+     from auto_foley import vision_lm_chatgpt
+     from auto_foley import ttsfx_elevenlabs
+
+ # Module-level service instances
+ _current_vision_lm: VisionLMService = vision_lm_chatgpt.ChatGPTVisionLM()
+ _current_ttsfx: TTSFXService = ttsfx_elevenlabs.ElevenLabsTTSFX()
+
+ def set_default_vision_lm(service: VisionLMService) -> None:
+     """
+     Sets the default global vision language model service for all functions to use.
+
+     Args:
+         service: Object instance that implements the VisionLMService protocol
+     """
+     global _current_vision_lm
+     _current_vision_lm = service
+
+ def set_default_ttsfx(service: TTSFXService) -> None:
+     """
+     Sets the default global text-to-sound-effect service for all functions to use.
+
+     Args:
+         service: Object instance that implements the TTSFXService protocol
+     """
+     global _current_ttsfx
+     _current_ttsfx = service
+
+ def get_services(vision_lm: Optional[VisionLMService] = None, ttsfx: Optional[TTSFXService] = None) -> tuple[VisionLMService, TTSFXService]:
+     """
+     Gets the service instances to use, either from provided arguments or falling back to global defaults.
+
+     Args:
+         vision_lm (optional): The vision language model service instance to use.
+             If None, uses the global default. Defaults to None
+         ttsfx (optional): The text-to-sound-effect service instance to use.
+             If None, uses the global default. Defaults to None
+
+     Returns:
+         tuple: Contains (vision_lm_service, ttsfx_service)
+             vision_lm_service: VisionLMService instance to use
+             ttsfx_service: TTSFXService instance to use
+     """
+     return (
+         vision_lm if vision_lm is not None else _current_vision_lm,
+         ttsfx if ttsfx is not None else _current_ttsfx
+     )
+
+ def get_api_keys(vision_lm_api_key: Optional[str] = None, ttsfx_api_key: Optional[str] = None) -> tuple[Optional[str], Optional[str]]:
+     """
+     Gets the API keys to use, either from provided arguments or falling back to environment variables.
+
+     Args:
+         vision_lm_api_key (optional): The vision language model API key to use.
+             If None or empty string, uses the AUTO_FOLEY_DEFAULT_VISION_LM_API_KEY environment variable.
+             Defaults to None
+         ttsfx_api_key (optional): The text-to-sound-effect API key to use.
+             If None or empty string, uses the AUTO_FOLEY_DEFAULT_TTSFX_API_KEY environment variable.
+             Defaults to None
+
+     Returns:
+         tuple: Contains (vision_lm_api_key, ttsfx_api_key)
+             vision_lm_api_key: API key to use for the vision language model service
+             ttsfx_api_key: API key to use for the text-to-sound-effect service
+     """
+     return (
+         vision_lm_api_key if vision_lm_api_key else os.getenv('AUTO_FOLEY_DEFAULT_VISION_LM_API_KEY'),
+         ttsfx_api_key if ttsfx_api_key else os.getenv('AUTO_FOLEY_DEFAULT_TTSFX_API_KEY')
+     )
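Because the pipeline resolves its backends through get_services, a custom implementation can be swapped in without touching the rest of the code. A minimal sketch of a stub that satisfies the VisionLMService protocol (hypothetical, for offline testing; the canned payload mirrors the analyze_video tool schema in vision_lm_chatgpt.py):

    # Hypothetical stub; structurally matches VisionLMService.
    class FakeVisionLM:
        def calculate_video_input_cost(self, width: int, height: int, sample_count: int) -> str:
            return "$0.000000"

        def generate_audio_sources(self, prompt: str, images: list, api_key: str | None = None) -> dict:
            return {
                "VideoProperties": {"Description": "A test clip."},
                "AudioSources": [],
                "AmbientAudioSources": [],
            }

    set_default_vision_lm(FakeVisionLM())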
auto_foley/ttsfx_elevenlabs.py ADDED
@@ -0,0 +1,45 @@
+ from elevenlabs.client import ElevenLabs
+ from io import BytesIO
+
+ class ElevenLabsTTSFX:
+     def generate_sound_effect(self, text_prompt: str, duration_seconds: float, prompt_influence: float = 0.3, api_key: str | None = None) -> BytesIO:
+         """
+         Generates a sound effect based on a text prompt using the ElevenLabs API.
+
+         Args:
+             text_prompt: Text description of the desired sound effect
+             duration_seconds: Length of the sound effect in seconds
+             prompt_influence: How strongly the prompt influences the generated sound (default: 0.3)
+             api_key: Optional API key for ElevenLabs authentication
+
+         Returns:
+             BytesIO: Audio data as a bytes stream
+
+         Raises:
+             ValueError: If text_prompt is empty or duration_seconds is invalid
+             RuntimeError: If there's an error during sound effect generation
+         """
+         if not text_prompt:
+             raise ValueError("Text prompt cannot be empty")
+         # The API only accepts durations between 0.5 and 22 seconds; clamp to avoid exceptions
+         if duration_seconds < 0.5:
+             duration_seconds = 0.5
+         if duration_seconds > 22.0:
+             duration_seconds = 22.0
+         try:
+             elevenlabs_client = ElevenLabs(api_key=api_key)
+             response = elevenlabs_client.text_to_sound_effects.convert(
+                 text=text_prompt,
+                 duration_seconds=duration_seconds,
+                 prompt_influence=prompt_influence,
+             )
+             # Collect the audio data into a BytesIO object
+             audio_data = BytesIO()
+             for chunk in response:
+                 audio_data.write(chunk)
+             audio_data.seek(0)
+             return audio_data
+
+         except Exception as e:
+             raise RuntimeError(str(e))
+
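A short usage sketch: the returned BytesIO stream can be written straight to disk. Both the API key and the output extension are assumptions here (the endpoint streams encoded audio; mp3 is a reasonable guess, not confirmed by this code):

    # Hypothetical usage; requires a valid ElevenLabs API key.
    ttsfx = ElevenLabsTTSFX()
    audio = ttsfx.generate_sound_effect("Footsteps of a man on a sidewalk.", duration_seconds=3.0, api_key="...")
    with open("footsteps.mp3", "wb") as f:  # extension assumed
        f.write(audio.read())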
auto_foley/video_comping.py ADDED
@@ -0,0 +1,125 @@
+ from moviepy.audio.AudioClip import AudioArrayClip
+ from pydub import AudioSegment
+ import moviepy.editor as mpe
+ import numpy as np
+
+ def combine_video_and_audio(input_video_path: str, output_video_path: str, audio_sources: list) -> str:
+     """
+     Combines audio sources with a video file to create a new video with audio.
+
+     Args:
+         input_video_path: Path to the video file
+         output_video_path: Path for the output video to be saved to
+         audio_sources: List of audio source dictionaries containing timing and filepath information
+
+     Returns:
+         str: Path to the output video file with combined audio
+
+     Raises:
+         ValueError: If video path or audio sources are invalid or missing
+         FileNotFoundError: If the video file cannot be accessed
+         OSError: If there are issues with file operations in the output directory
+         RuntimeError: If there's an error during the video composition process
+     """
+     if not input_video_path:
+         raise ValueError("No video file provided")
+     if not audio_sources:
+         raise ValueError("No audio sources provided")
+
+     try:
+         video_clip = mpe.VideoFileClip(input_video_path)
+         original_duration = video_clip.duration
+         fps = video_clip.fps
+         audio_clips = []
+
+         for idx, audio_source in enumerate(audio_sources):
+             audio_path = audio_source.get("AudioPath")
+             start_frame_index = audio_source.get("StartFrameIndex", 0)
+             volume = audio_source.get("Volume", 1.0)
+             start_time = start_frame_index / fps  # Convert start_frame_index to start_time in seconds
+
+             if audio_path is None:
+                 continue  # Skip if no audio path
+
+             # Convert the audio file to a numpy array
+             sample_rate, samples = convert_audio_file_to_numpy(audio_path)
+
+             if samples is None:
+                 continue  # Skip if samples are None
+
+             samples = np.array(samples)
+             if samples.ndim == 0:  # If samples is scalar, expand its dimensions
+                 samples = np.expand_dims(samples, axis=0)
+             if samples.ndim == 1:  # If samples is one-dimensional, reshape to (n_samples, 1)
+                 samples = samples.reshape(-1, 1)
+
+             # Apply volume adjustment
+             samples *= volume
+             duration = len(samples) / sample_rate
+
+             # Adjust duration if audio extends beyond video duration
+             audio_end_time = start_time + duration
+             if start_time >= original_duration:
+                 continue  # Skip if audio clip starts after the video ends
+             if audio_end_time > original_duration:
+                 # Adjust the duration to not exceed video duration
+                 duration = original_duration - start_time
+                 # Calculate the number of samples to keep
+                 num_samples = int(duration * sample_rate)
+                 samples = samples[:num_samples]
+
+             audio_clip = AudioArrayClip(samples, fps=sample_rate).set_start(start_time).set_duration(duration)
+             audio_clips.append(audio_clip)
+
+         # Composite all audio clips
+         if audio_clips:
+             composite_audio = mpe.CompositeAudioClip(audio_clips).set_duration(original_duration)
+             # Set the video's audio to the composite audio
+             video_clip = video_clip.set_audio(composite_audio)
+         else:
+             # Ensure the video has no audio if there are no audio clips
+             video_clip = video_clip.without_audio()
+
+         video_clip = video_clip.subclip(0, original_duration)  # Trim the video to its original duration to prevent any extension
+         video_clip.write_videofile(output_video_path, codec="libx264", audio_codec="aac", temp_audiofile=None, remove_temp=True, verbose=False, logger=None)
+
+         video_clip.close()
+         if audio_clips:
+             for clip in audio_clips:
+                 clip.close()
+             composite_audio.close()
+
+         return output_video_path
+     except Exception as e:
+         raise RuntimeError(str(e))
+
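The audio_source dictionaries are only read via .get() above, so the minimal shape they need is small. A sketch of one valid entry (key names taken from the code; values illustrative):

    # Illustrative entry; only these three keys are consulted above.
    audio_sources = [{
        "AudioPath": "owl_hooting.mp3",  # entry is skipped entirely if this is None
        "StartFrameIndex": 50,           # converted to seconds via the video's fps
        "Volume": 0.8,                   # linear gain applied to the samples
    }]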
+ def convert_audio_file_to_numpy(audio_filepath: str) -> tuple[int, np.ndarray]:
+     """
+     Converts an audio file to a numpy array with normalized samples.
+
+     Args:
+         audio_filepath: Path to the audio file to convert
+
+     Returns:
+         tuple: Contains (sample_rate, samples)
+             - sample_rate (int): The sample rate of the audio
+             - samples (np.ndarray): Normalized audio samples in range [-1, 1]
+
+     Raises:
+         FileNotFoundError: If the audio file cannot be accessed
+         RuntimeError: If there's an error processing the audio file
+     """
+     try:
+         audio_segment = AudioSegment.from_file(audio_filepath)
+         sample_width = audio_segment.sample_width  # in bytes
+         max_val = float(2 ** (8 * sample_width - 1))
+         # Normalize samples to [-1, 1]
+         samples = np.array(audio_segment.get_array_of_samples()).astype(np.float32) / max_val
+         if audio_segment.channels == 2:
+             samples = samples.reshape((-1, 2))
+         sample_rate = audio_segment.frame_rate
+         return (sample_rate, samples)
+     except FileNotFoundError:
+         raise
+     except Exception as e:
+         raise RuntimeError(str(e))
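To make the normalization concrete: for 16-bit audio, sample_width is 2, so max_val = 2^(8*2-1) = 32768, and a raw sample of -32768 maps exactly to -1.0:

    # Worked example of the scaling above (16-bit audio).
    sample_width = 2
    max_val = float(2 ** (8 * sample_width - 1))  # 32768.0
    print(-32768 / max_val, 32767 / max_val)      # -1.0 0.999969...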
auto_foley/vision_lm_chatgpt.py ADDED
@@ -0,0 +1,197 @@
+ import base64
+ import io
+ import json
+ import math
+ import openai
+ from openai import OpenAI
+
+ class ChatGPTVisionLM:
+     def calculate_video_input_cost(self, width: int, height: int, sample_count: int) -> str:
+         """
+         Calculates the estimated cost to process a video based on dimensions and number of samples.
+
+         Args:
+             width: Width of the video in pixels
+             height: Height of the video in pixels
+             sample_count: Number of frames to be processed
+
+         Returns:
+             str: Formatted string with the calculated cost in USD with 6 decimal places
+
+         Raises:
+             ValueError: If width, height or sample_count are negative or zero
+         """
+         if width <= 0 or height <= 0 or sample_count <= 0:
+             raise ValueError("Width, height and sample_count must be positive values")
+
+         # Constants for gpt-4o-mini
+         price_per_million_tokens = 0.15
+         base_tokens = 2833
+         tile_tokens = 5667
+         tile_size = 512
+
+         tiles_x = math.ceil(width / tile_size)
+         tiles_y = math.ceil(height / tile_size)
+         total_tiles = tiles_x * tiles_y
+         total_tokens = base_tokens + (tile_tokens * total_tiles)
+         cost_for_one_image = (total_tokens / 1000000) * price_per_million_tokens
+         total_cost = cost_for_one_image * sample_count
+         return f"${total_cost:.6f}"
+
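Plugging numbers into the formula above: a 512x512 frame is a single tile, so total_tokens = 2833 + 5667 = 8500, and at $0.15 per million tokens one frame costs $0.001275; ten frames cost $0.012750.

    # Worked example of the cost formula above.
    lm = ChatGPTVisionLM()
    print(lm.calculate_video_input_cost(512, 512, 10))  # $0.012750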
+     def generate_audio_sources(self, prompt: str, images: list, api_key: str | None = None) -> dict:
+         """
+         Generates a description of the video and a list of audio sources for the provided images using the OpenAI API.
+
+         Args:
+             prompt: The prompt to send to the model
+             images: List of PIL.Image.Image objects to process
+             api_key: Optional OpenAI API key. If None, will use the environment variable
+
+         Returns:
+             dict: The parsed response containing:
+                 - VideoProperties (dict): Contains the video description
+                 - AudioSources (list): List of audio source objects
+                 - AmbientAudioSources (list): List of ambient audio source objects
+
+         Raises:
+             ValueError: If images input is invalid
+             RuntimeError: If there's an error calling the OpenAI API
+             JSONDecodeError: If the API response cannot be parsed
+         """
+         gpt_model = "gpt-4o-mini"
+
+         if not isinstance(images, list):
+             images = [images]
+
+         # Build the content list
+         content_list = [{"type": "text", "text": prompt}]
+
+         for image in images:
+             # Encode the image to base64
+             buffered = io.BytesIO()
+             image.save(buffered, format="PNG")
+             base64_image = base64.b64encode(buffered.getvalue()).decode('utf-8')
+
+             content_list.append({
+                 "type": "image_url",
+                 "image_url": {
+                     "url": f"data:image/png;base64,{base64_image}",
+                     "detail": "auto"
+                 },
+             })
+
+         # Construct messages
+         messages = [
+             {
+                 "role": "user",
+                 "content": content_list
+             }
+         ]
+
+         # Define the tool (function) schema
+         tool = {
+             "type": "function",
+             "function": {
+                 "name": "analyze_video",
+                 "description": "Provide a description of the video and extract audio sources from video frames.",
+                 "strict": True,
+                 "parameters": {
+                     "type": "object",
+                     "properties": {
+                         "VideoProperties": {
+                             "type": "object",
+                             "properties": {
+                                 "Description": {
+                                     "type": "string",
+                                     "description": "A general description of the entire video."
+                                 }
+                             },
+                             "required": ["Description"],
+                             "additionalProperties": False
+                         },
+                         "AudioSources": {
+                             "type": "array",
+                             "items": {
+                                 "type": "object",
+                                 "properties": {
+                                     "SourceSlugID": {
+                                         "type": "string",
+                                         "description": "A unique ID in the format {Subject}{Number}{Activity}, e.g., \"Bear1Roaring\"."
+                                     },
+                                     "SoundDescription": {
+                                         "type": "string",
+                                         "description": "Description of the sound, e.g., \"A person using a chainsaw against a tree.\""
+                                     },
+                                     "StartFrameIndex": {
+                                         "type": "integer",
+                                         "description": "The frame index where the sound starts."
+                                     },
+                                     "EndFrameIndex": {
+                                         "type": "integer",
+                                         "description": "The frame index where the sound ends."
+                                     }
+                                 },
+                                 "required": ["SourceSlugID", "SoundDescription", "StartFrameIndex", "EndFrameIndex"],
+                                 "additionalProperties": False
+                             }
+                         },
+                         "AmbientAudioSources": {
+                             "type": "array",
+                             "items": {
+                                 "type": "object",
+                                 "properties": {
+                                     "SourceSlugID": {
+                                         "type": "string",
+                                         "description": "A unique ID in the format {Subject}{Number}{Activity}, e.g., \"Wind1Howling\"."
+                                     },
+                                     "SoundDescription": {
+                                         "type": "string",
+                                         "description": "Description of the ambient sound, e.g., \"An eerie wind howling through a ravine.\""
+                                     },
+                                     "StartFrameIndex": {
+                                         "type": "integer",
+                                         "description": "The frame index where the sound starts."
+                                     },
+                                     "EndFrameIndex": {
+                                         "type": "integer",
+                                         "description": "The frame index where the sound ends."
+                                     }
+                                 },
+                                 "required": ["SourceSlugID", "SoundDescription", "StartFrameIndex", "EndFrameIndex"],
+                                 "additionalProperties": False
+                             }
+                         }
+                     },
+                     "required": ["VideoProperties", "AudioSources", "AmbientAudioSources"],
+                     "additionalProperties": False
+                 }
+             }
+         }
+
+         try:
+             # Call the OpenAI API with the tool
+             openai_client = OpenAI(api_key=api_key)
+             response = openai_client.chat.completions.create(
+                 model=gpt_model,
+                 messages=messages,
+                 tools=[tool],
+                 tool_choice={"type": "function", "function": {"name": "analyze_video"}},
+                 max_tokens=12288
+             )
+
+             # Access the choices and messages properly
+             choice = response.choices[0]
+             message = choice.message
+
+             if hasattr(message, 'tool_calls') and message.tool_calls:
+                 tool_call = message.tool_calls[0]
+                 arguments = tool_call.function.arguments
+                 result_data = json.loads(arguments)
+                 return result_data
+             else:
+                 raise RuntimeError("LLM did not provide the expected structured output.")
+
+         except openai.OpenAIError as e:
+             raise RuntimeError(f"OpenAI API error: {str(e)}")
+         except json.JSONDecodeError as e:
+             raise json.JSONDecodeError(f"Error parsing API response: {str(e)}", e.doc, e.pos)
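With strict mode and a forced tool choice, the parsed arguments should always have this shape (a sketch built from the schema above; all values are illustrative):

    # Illustrative result_data, matching the analyze_video schema.
    result_data = {
        "VideoProperties": {"Description": "A squirrel playing in a sunny forest clearing."},
        "AudioSources": [
            {"SourceSlugID": "Squirrel1Rustling", "SoundDescription": "Small animal rustling through dry leaves.",
             "StartFrameIndex": 0, "EndFrameIndex": 120},
        ],
        "AmbientAudioSources": [
            {"SourceSlugID": "Birds1Chirping", "SoundDescription": "Songbirds chirping in a forest.",
             "StartFrameIndex": 0, "EndFrameIndex": 120},
        ],
    }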
custom_script.js ADDED
@@ -0,0 +1,165 @@
+ function manageTimeBar(elemId, time) {
+     if (!window.visTimelineInstances) {
+         console.error(`Timeline instances collection not found`);
+         return;
+     }
+
+     const timeline = window.visTimelineInstances[elemId];
+     if (!timeline) {
+         console.error(`Timeline instance ${elemId} not found`);
+         return;
+     }
+
+     if (!window.customTimeBarIds) {
+         window.customTimeBarIds = {};
+     }
+
+     try {
+         timeline.setCustomTime(time, elemId);
+     } catch (e) {
+         timeline.addCustomTime(time, elemId);
+     }
+ }
+
+ function setTimeBarDirect(elemId, time) {
+     manageTimeBar(elemId, time);
+ }
+
+ function setTimeBarNormalized(elemId, start, end, normalizedPos) {
+     const time = start + (end - start) * normalizedPos;
+     manageTimeBar(elemId, time);
+ }
+
+ class VideoTimelineSync {
+     constructor(videoId, timelineId, trackLengthItemId) {
+         this.timelineId = timelineId;
+
+         try {
+             const trackLengthItemData = getTimelineItemData(timelineId, trackLengthItemId);
+             if (trackLengthItemData != null) {
+                 const trackLengthStart = trackLengthItemData.start;
+                 const trackLengthEnd = trackLengthItemData.end;
+                 this.trackLength = trackLengthEnd - trackLengthStart;
+             }
+         } catch (error) {
+             console.error('Error setting up timeline video sync:', error);
+             return;
+         }
+
+         const container = document.getElementById(videoId);
+         if (!container) {
+             console.error('Video container not found');
+             return;
+         }
+
+         this.progressElement = container.querySelector('progress');
+         if (!this.progressElement) {
+             console.error('Progress element not found');
+             return;
+         }
+
+         this.setupProgressObserver();
+     }
+
+     setupProgressObserver() {
+         // Create a mutation observer to watch for value changes of the progress element
+         this.observer = new MutationObserver((mutations) => {
+             mutations.forEach((mutation) => {
+                 if (mutation.type === 'attributes' && mutation.attributeName === 'value') {
+                     this.onProgressUpdate();
+                 }
+             });
+         });
+
+         // Observe the progress element for value changes
+         this.observer.observe(this.progressElement, {
+             attributes: true,
+             attributeFilter: ['value']
+         });
+     }
+
+     onProgressUpdate() {
+         const value = this.progressElement.value;
+         if (value === undefined || value === null) return;
+
+         // Value is already normalized (between 0 and 1)
+         this.syncTimeBarToPlayback(value);
+     }
+
+     syncTimeBarToPlayback(normalizedPosition) {
+         const timeline = window.visTimelineInstances[this.timelineId];
+         if (timeline) {
+             setTimeBarNormalized(this.timelineId, 0, this.trackLength, normalizedPosition);
+         }
+     }
+
+     cleanup() {
+         // Disconnect the observer
+         if (this.observer) {
+             this.observer.disconnect();
+             this.observer = null;
+         }
+     }
+ }
+
+ function initVideoSync(videoId, timelineId, trackLengthItemId) {
+     try {
+         // Initialize the syncs container if it doesn't exist
+         if (!window.timelineSyncs) {
+             window.timelineSyncs = {};
+         }
+
+         // Clean up the existing sync, if any
+         if (window.timelineSyncs[timelineId]) {
+             window.timelineSyncs[timelineId].cleanup();
+         }
+
+         // Create a new sync instance
+         window.timelineSyncs[timelineId] = new VideoTimelineSync(videoId, timelineId, trackLengthItemId);
+     } catch (error) {
+         console.error('Error initializing video sync:', error);
+     }
+
+     return null;
+ }
+
+ function getTimelineItemData(timelineId, itemId) {
+     const timeline = window.visTimelineInstances[timelineId];
+     if (!timeline) {
+         console.error(`Timeline instance ${timelineId} not found`);
+         return null;
+     }
+
+     const items = timeline.itemSet?.items;
+     if (!items) {
+         console.error('Timeline items not found');
+         return null;
+     }
+
+     const item = items[itemId];
+     if (!item) {
+         return null;
+     }
+
+     const itemData = item.data;
+     if (!itemData) {
+         console.error('Track length item data not found');
+         return null;
+     }
+
+     return itemData;
+ }
+
+ function setTimelineWindowToItemLength(timelineId, itemId) {
+     const itemData = getTimelineItemData(timelineId, itemId);
+     if (!itemData) {
+         return;
+     }
+
+     try {
+         const timeline = window.visTimelineInstances[timelineId];
+         timeline.setWindow(itemData.start, new Date(itemData.end.getTime() + 20), {animation: false});
+     } catch (error) {
+         console.error('Error setting timeline window:', error);
+     }
+ }
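On the Python side, these helpers are presumably invoked through Gradio's js hooks; a hedged sketch of what that wiring could look like (the element IDs and event choice here are placeholders, not the app's actual ones):

    # Hypothetical wiring sketch; IDs and event are assumptions.
    import gradio as gr

    video = gr.Video(elem_id="my-video")
    video.change(
        None, [], [],
        js="() => initVideoSync('my-video', 'my-timeline', 'my-track-length-item')",
    )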
custom_style.css ADDED
@@ -0,0 +1,15 @@
+ #frame_interval_slider input[type="range"] {
+     --range_progress_inverted: calc(100% - var(--range_progress));
+ }
+
+ #frame_interval_slider input[type="range"]::-webkit-slider-runnable-track {
+     background: linear-gradient(to right, var(--color-accent) var(--range_progress_inverted), var(--neutral-200) var(--range_progress_inverted));
+ }
+
+ #frame_interval_slider input[type="range"]::-moz-range-track {
+     background: linear-gradient(to right, var(--color-accent) var(--range_progress_inverted), var(--neutral-200) var(--range_progress_inverted));
+ }
+
+ #frame_interval_slider .slider_input_container {
+     direction: rtl;
+ }
requirements.txt ADDED
@@ -0,0 +1,9 @@
+ gradio>=5.12.0
+ openai>=1.59.7
+ Pillow>=10.4.0
+ opencv-python-headless>=4.10.0.84
+ elevenlabs>=1.50.3
+ pydub>=0.25.1
+ numpy>=2.2.1
+ moviepy==1.0.3
+ gradio_vistimeline>=1.0.1