TMBoeren committed
Commit df8fe58 · 1 Parent(s): 65d9759

Initial commit

.gitignore ADDED
@@ -0,0 +1,10 @@
+ output_videos/*
+ *.mp4
+
+ .gradio/
+ .venv/
+ venv/
+
+ **/__pycache__/
+ *.pyc
+ *.pyo
LICENSE ADDED
@@ -0,0 +1,21 @@
+ MIT License
+
+ Copyright (c) 2025 DODI-Research
+
+ Permission is hereby granted, free of charge, to any person obtaining a copy
+ of this software and associated documentation files (the "Software"), to deal
+ in the Software without restriction, including without limitation the rights
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ copies of the Software, and to permit persons to whom the Software is
+ furnished to do so, subject to the following conditions:
+
+ The above copyright notice and this permission notice shall be included in all
+ copies or substantial portions of the Software.
+
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ SOFTWARE.
app.py ADDED
@@ -0,0 +1,762 @@
+ import gradio as gr
+ import json
+ import math
+ import os
+ import shutil
+ from auto_foley import run_auto_foley as af
+ from datetime import datetime
+ from gradio_vistimeline import VisTimeline
+
+ TIMELINE_ID = "editor-tab-timeline"
+ OUTPUT_VIDEO_ID = "output-video-player"
+ TRACK_LENGTH_ID = "track-length-item"
+
+ # --- Demo-specific helper functions ---
+ def parse_date_to_milliseconds(date):
+     if isinstance(date, int):  # Input is already in milliseconds (Unix timestamp)
+         return date
+     elif isinstance(date, str):  # Input is an ISO 8601 datetime string
+         dt = datetime.fromisoformat(date.replace("Z", "+00:00"))
+         epoch = datetime(1970, 1, 1, tzinfo=dt.tzinfo)
+         return int((dt - epoch).total_seconds() * 1000)
+     else:
+         return 0
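+
+ # Examples (values follow from the logic above):
+ # parse_date_to_milliseconds("1970-01-01T00:00:01Z") -> 1000
+ # parse_date_to_milliseconds(2500) -> 2500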
+
+ def parse_frame_to_timestamp(frame, framerate):
+     # Convert a frame number at the given framerate to the corresponding time in milliseconds
+     exact_ms = (frame / framerate) * 1000
+     # Round up to the nearest 50 ms, the smallest time step at the timeline's maximum zoom
+     rounded_ms = math.ceil(exact_ms / 50) * 50
+     return int(rounded_ms)
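+
+ # Examples at 30 fps: frame 30 lands exactly on 1000 ms, while frame 31
+ # (1033.3 ms) is rounded up to the 50 ms grid:
+ # parse_frame_to_timestamp(30, 30) -> 1000
+ # parse_frame_to_timestamp(31, 30) -> 1050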
+
+ def format_video_info(video_info):
+     if video_info is None:
+         return ""
+
+     label_names = {
+         "Width": "Width",
+         "Height": "Height",
+         "Duration": "Length in seconds",
+         "FrameCount": "Number of frames",
+         "FrameRate": "Frame rate (fps)",
+         "FrameInterval": "Frame interval"
+     }
+
+     info = ""
+     for key, label in label_names.items():
+         if key in video_info:
+             info += f"{label}: {video_info[key]}\n"
+     return info
+
+ def update_video_info_advanced_input(frame_interval, downscale_samples, downscale_target, video_info):
+     """
+     Save the frame interval to the current job state and update the markdown text above the frame interval slider.
+     """
+     if video_info is None:
+         return "Upload a video first.", video_info
+
+     video_info['DownScaleSamples'] = downscale_samples
+     video_info['FrameInterval'] = frame_interval
+     try:
+         if downscale_samples:
+             max_side = int(downscale_target[:-2])  # Strip the "px" suffix from the dropdown value (e.g. "512px")
+             video_info['DownscaledWidth'], video_info['DownscaledHeight'] = af.downscale_dimensions(video_info['Width'], video_info['Height'], max_side)
+         else:
+             video_info['DownscaledWidth'] = video_info['Width']
+             video_info['DownscaledHeight'] = video_info['Height']
+
+         frame_count = video_info['FrameCount']
+         frame_rate = video_info['FrameRate']
+
+         if not frame_count or not frame_rate:
+             return "Video information not available.", video_info
+
+         samples_count = (frame_count // frame_interval) + 2
+         samples_per_second = frame_rate / frame_interval
+         cost = af.calculate_video_input_cost(video_info['DownscaledWidth'], video_info['DownscaledHeight'], samples_count)
+         return f"Minimum input cost: {cost}<br />Video will be split into {samples_count} samples total, or approximately {samples_per_second:.1f} samples per second.", video_info
+     except Exception as e:
+         return f"Error calculating frame interval: {str(e)}", video_info
+
+ # --- Tab 1 UI State Management ---
+ def trigger_frame_interval_slider_rerender(on_video_uploaded_state):
+     return not on_video_uploaded_state
+
+ def get_slider_config(video_info):
+     if not video_info:
+         return None
+
+     # get_video_info stores FrameCount and FrameRate as plain values
+     total_frames = video_info.get('FrameCount', 0)
+     framerate = video_info.get('FrameRate', 0)
+
+     if not total_frames or not framerate:
+         return None
+
+     max_interval = total_frames // 2
+
+     return {
+         'minimum': 1,
+         'maximum': max_interval,
+         'step': 1,
+         'value': framerate,
+         'label': f"Frame Interval (1-{max_interval})"
+     }
+
+ def get_generate_descriptions_button(is_interactive):
+     return gr.Button("Generate Video Description and Audio Sources", interactive=is_interactive)
+
+ def get_generate_audio_button(is_interactive):
+     return gr.Button("Generate Audio", variant="primary", interactive=is_interactive)
+
+ def set_generate_buttons_active():
+     return get_generate_descriptions_button(True), get_generate_audio_button(True)
+
+ def set_generate_buttons_inactive():
+     return get_generate_descriptions_button(False), get_generate_audio_button(False)
+
+ def go_to_tab(id):
+     return gr.Tabs(selected=id)
+
+ # --- Tab 1 Functionality ---
+ def on_video_upload(video):
+     if video is None:
+         return get_generate_descriptions_button(False), get_generate_audio_button(False), None, "", None, ""
+     try:
+         video_info = af.get_video_info(video)
+     except Exception as e:
+         gr.Warning(f"Error: {e}")
+         return get_generate_descriptions_button(False), get_generate_audio_button(False), None, "", None, ""
+     return get_generate_descriptions_button(True), get_generate_audio_button(True), video_info, "", None, ""
+
+ def generate_descriptions(video, video_info, prompt_instruction, vision_lm_api_key):
+     if not video or video_info is None:
+         return None, "", {}
+     try:
+         audio_sources, _ = af.process_video(video, video_info['FrameInterval'], video_info['DownscaledWidth'], video_info['DownscaledHeight'], prompt_instruction, vision_lm_api_key)
+         json_output = json.dumps(audio_sources, indent=4)
+         return json_output, json_output, audio_sources
+     except Exception as e:
+         gr.Warning(f"Error: {e}")
+         return None, "", {}
+
+ def generate_all_audio(video, video_info, prompt_instruction, generate_descriptions_json_output, generate_descriptions_json_textbox, vision_lm_api_key, ttsfx_api_key, progress=gr.Progress()):
+     # Check if the user has provided their own descriptions through the advanced input textbox
+     valid_json = True
+     if generate_descriptions_json_textbox and not generate_descriptions_json_textbox.isspace():
+         # Validate the expected structure
+         try:
+             audio_sources = json.loads(generate_descriptions_json_textbox)
+             if audio_sources and not isinstance(audio_sources, dict):
+                 valid_json = False
+             else:
+                 required_keys = {'AudioSources', 'AmbientAudioSources'}
+                 if not all(key in audio_sources for key in required_keys):
+                     valid_json = False
+                 elif not isinstance(audio_sources['AudioSources'], list) or not isinstance(audio_sources['AmbientAudioSources'], list):
+                     valid_json = False
+         except Exception:
+             valid_json = False
+     else:
+         valid_json = False
+     # If descriptions weren't given, generate them now
+     if not valid_json:
+         progress((1, 3), desc="Processing video")
+         try:
+             audio_sources, _ = af.process_video(video, video_info['FrameInterval'], video_info['DownscaledWidth'], video_info['DownscaledHeight'], prompt_instruction, vision_lm_api_key)
+             json_output = json.dumps(audio_sources, indent=4)
+             generate_descriptions_json_output = json_output
+             generate_descriptions_json_textbox = json_output
+         except Exception as e:
+             raise gr.Error(f"Could not generate descriptions: {e}")
+     # Generate audio files for all the audio sources
+     progress((2, 3), desc="Generating audio")
+     try:
+         audio_sources = af.generate_all_audio(audio_sources, ttsfx_api_key)
+     except Exception as e:
+         raise gr.Error(f"Could not generate audio: {e}")
+     return "", audio_sources, generate_descriptions_json_output, generate_descriptions_json_textbox
+
+ # --- Tab 2 UI State Management ---
+ def copy_video_info_to_edit_tab(video_path, video_info):
+     video_info['VideoPath'] = video_path
+     return video_info
+
+ def copy_video_info_to_edit_tab_if_none(video_path, video_input_info, video_edit_info):
+     if not video_edit_info and video_input_info:
+         video_edit_info = copy_video_info_to_edit_tab(video_path, video_input_info)
+     return video_edit_info
+
+ def set_render_button_state(unrendered_changes_flag):
+     return gr.Button("Combine All Audio & Render Video", variant="primary", interactive=unrendered_changes_flag)
+
+ def reset_new_audio_source_counter():
+     return 0
+
+ def set_buttons_state_selected_audio_source(selected_audio_source):
+     set_interactive = selected_audio_source is not None
+     return gr.Button(value="Delete Selected Audio Source", variant="stop", interactive=set_interactive), gr.Button("Generate", variant="primary", interactive=set_interactive), gr.Button("Save Changes", interactive=set_interactive)
+
+ def sync_form_to_selected_audio_source(selected_audio_source):
+     accordion_label = "Edit Audio Source Properties"
+     if selected_audio_source is None:
+         return gr.Accordion(label=accordion_label, open=False), 1.0, None, ""
+     return gr.Accordion(label=accordion_label, open=True), selected_audio_source.get('Volume', 1.0), selected_audio_source.get('AudioPath', None), selected_audio_source['SoundDescription']
+
+ # --- Tab 2 VisTimelineData & AudioSource helper functions ---
+ def parse_single_audio_source(audio_source, video_fps, group_id):
+     timeline_item = {
+         "id": audio_source['SourceSlugID'],
+         "content": audio_source['SoundDescription'],
+         "group": group_id,
+         "start": parse_frame_to_timestamp(audio_source['StartFrameIndex'], video_fps),
+         "end": parse_frame_to_timestamp(audio_source['EndFrameIndex'], video_fps)
+     }
+     return timeline_item
+
+ def parse_audio_sources_to_timeline_data(audio_sources, video_info):
+     video_fps = video_info['FrameRate']
+     last_frame = video_info['FrameCount'] - 1
+     timeline_data = {
+         "groups": [{"id": "track-length", "content": ""}, {"id": 1, "content": ""}, {"id": 2, "content": ""}],
+         "items": [
+             {
+                 "id": TRACK_LENGTH_ID,
+                 "content": "",
+                 "group": "track-length",
+                 "selectable": False,
+                 "type": "background",
+                 "start": 0,
+                 "end": parse_frame_to_timestamp(last_frame, video_fps),
+                 "className": "color-primary-600"
+             }
+         ]
+     }
+     for audio_source in audio_sources.get('AudioSources', []):
+         timeline_data['items'].append(parse_single_audio_source(audio_source, video_fps, 1))
+     for ambient_audio_source in audio_sources.get('AmbientAudioSources', []):
+         timeline_data['items'].append(parse_single_audio_source(ambient_audio_source, video_fps, 2))
+     return timeline_data
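+
+ # For example, an audio source spanning frames 0-75 at 25 fps becomes a timeline
+ # item with "start": 0 and "end": 3000 (milliseconds), placed in group 1.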
+
+ def get_audio_source_by_slug(audio_sources, slug):
+     for audio_source in audio_sources.get('AudioSources', []):
+         if audio_source['SourceSlugID'] == slug:
+             return audio_source
+     for audio_source in audio_sources.get('AmbientAudioSources', []):
+         if audio_source['SourceSlugID'] == slug:
+             return audio_source
+     return None
+
+ def update_audio_source_with_timeline_item_data(audio_source, timeline_item, max_duration, frame_rate):
+     start_ms = max(0, parse_date_to_milliseconds(timeline_item["start"]))
+     end_ms = min(max_duration, parse_date_to_milliseconds(timeline_item["end"]))
+     audio_source['StartFrameIndex'] = int((start_ms / 1000) * frame_rate)
+     audio_source['EndFrameIndex'] = int((end_ms / 1000) * frame_rate)
+     audio_source['Duration'] = (end_ms - start_ms) / 1000
+     return audio_source
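+
+ # Example: a timeline item dragged to span 0-2000 ms at 25 fps updates the
+ # source to StartFrameIndex 0, EndFrameIndex 50, and Duration 2.0 seconds.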
+
+ # --- Tab 2 Timeline ---
+ def focus_timeline_on_tab_select(set_timeline_window_on_next_tab_change, trigger_timeline_window_focus):
+     if set_timeline_window_on_next_tab_change:
+         return False, not trigger_timeline_window_focus
+     return set_timeline_window_on_next_tab_change, trigger_timeline_window_focus
+
+ def focus_timeline_on_new_source_added(audio_sources, trigger_timeline_window_focus):
+     # Only focus the timeline when this is the first audio source, i.e. there were none before
+     audio_count = len(audio_sources.get('AudioSources', []) + audio_sources.get('AmbientAudioSources', []))
+     if audio_count > 1:
+         return trigger_timeline_window_focus
+     return not trigger_timeline_window_focus
+
+ def on_timeline_item_select(audio_sources, event_data: gr.EventData):
+     selected_ids = event_data._data
+     if not selected_ids:
+         return None
+     # Timeline items are instantiated with their ids set to the audio source's slug
+     return get_audio_source_by_slug(audio_sources, selected_ids[0])
+
+ def on_timeline_input(timeline, all_audio_sources, video_info):
+     if hasattr(timeline, "model_dump"):
+         data = timeline.model_dump(exclude_none=True)
+     else:
+         data = timeline
+
+     video_duration_ms = video_info['Duration'] * 1000
+     frame_rate = video_info['FrameRate']
+
+     for audio_source in all_audio_sources.get('AudioSources', []):
+         for timeline_item in data['items']:
+             if timeline_item['id'] == audio_source['SourceSlugID']:
+                 audio_source = update_audio_source_with_timeline_item_data(audio_source, timeline_item, video_duration_ms, frame_rate)
+                 break
+     for ambient_audio_source in all_audio_sources.get('AmbientAudioSources', []):
+         for timeline_item in data['items']:
+             if timeline_item['id'] == ambient_audio_source['SourceSlugID']:
+                 ambient_audio_source = update_audio_source_with_timeline_item_data(ambient_audio_source, timeline_item, video_duration_ms, frame_rate)
+                 break
+     return all_audio_sources
+
+ # --- Tab 2 Functionality ---
+ def comp_all_audio_to_video(audio_sources, video_info):
+     try:
+         input_video_path = video_info['VideoPath']
+         if not audio_sources:
+             return input_video_path
+
+         output_directory = "output_videos"
+         # Ensure the output directory exists
+         if not os.path.exists(output_directory):
+             os.makedirs(output_directory)
+         else:
+             # Clear the output directory before saving the new video
+             for filename in os.listdir(output_directory):
+                 file_path = os.path.join(output_directory, filename)
+                 try:
+                     if os.path.isfile(file_path) or os.path.islink(file_path):
+                         os.unlink(file_path)
+                     elif os.path.isdir(file_path):
+                         shutil.rmtree(file_path)
+                 except Exception as e:
+                     raise OSError(f"Failed to delete {file_path}. Reason: {e}")
+
+         # Generate a unique output filename
+         input_filename = os.path.basename(input_video_path)
+         file_name_without_extension, file_extension = os.path.splitext(input_filename)
+         output_video_name = f"{file_name_without_extension}_output{file_extension}"
+         output_video_path = os.path.join(output_directory, output_video_name)
+         output_video_path = af.combine_video_and_audio(audio_sources, input_video_path, output_video_path)
+     except Exception as e:
+         gr.Warning(f"Failed to add the audio to the video: {e}")
+         return None
+     return output_video_path
+
+ def generate_new_audio(prompt, audio_player, selected_audio_source, ttsfx_api_key):
+     if not selected_audio_source:
+         return audio_player
+     try:
+         new_audio_file_path = af.generate_audio(prompt, selected_audio_source.get("Duration"), ttsfx_api_key)
+         if new_audio_file_path:
+             return new_audio_file_path
+         return audio_player
+     except Exception:
+         return audio_player
+
+ def add_new_audio_source(all_audio_sources, new_audio_sources_counter):
+     new_audio_sources_counter += 1
+     new_audio_source = {
+         'SourceSlugID': f"NewAudioSource{new_audio_sources_counter}",
+         'StartFrameIndex': 0,
+         'EndFrameIndex': 75,
+         'Duration': 3.0,
+         'AudioPath': None,
+         'SoundDescription': f"New audio source {new_audio_sources_counter}",
+         'Volume': 1.0
+     }
+     audio_sources = all_audio_sources.get('AudioSources', [])
+     audio_sources.append(new_audio_source)
+     return {"AudioSources": audio_sources, "AmbientAudioSources": all_audio_sources.get('AmbientAudioSources', [])}, new_audio_sources_counter
+
+ def delete_selected_audio_source(selected_audio_source, all_audio_sources):
+     if not selected_audio_source:
+         return None, all_audio_sources
+
+     slug_to_delete = selected_audio_source['SourceSlugID']
+     audio_sources = all_audio_sources.get('AudioSources', [])
+     audio_sources = [source for source in audio_sources if source['SourceSlugID'] != slug_to_delete]
+
+     ambient_audio_sources = all_audio_sources.get('AmbientAudioSources', [])
+     ambient_audio_sources = [source for source in ambient_audio_sources if source['SourceSlugID'] != slug_to_delete]
+
+     return None, {'AudioSources': audio_sources, 'AmbientAudioSources': ambient_audio_sources}
+
+ def overwrite_changes_to_selected_audio_source(volume, audio_path, prompt, selected_audio_source, all_audio_sources):
+     if not selected_audio_source:
+         return all_audio_sources, selected_audio_source
+
+     selected_slug = selected_audio_source['SourceSlugID']
+     updated_source = selected_audio_source.copy()
+     updated_source.update({
+         'SoundDescription': prompt,
+         'AudioPath': audio_path,
+         'Volume': float(volume)
+     })
+
+     audio_sources = all_audio_sources.get('AudioSources', [])
+     ambient_sources = all_audio_sources.get('AmbientAudioSources', [])
+
+     for i, source in enumerate(audio_sources):
+         if source['SourceSlugID'] == selected_slug:
+             audio_sources[i] = updated_source
+             return {'AudioSources': audio_sources, 'AmbientAudioSources': ambient_sources}, selected_audio_source  # Return early to skip the second loop
+     for i, source in enumerate(ambient_sources):
+         if source['SourceSlugID'] == selected_slug:
+             ambient_sources[i] = updated_source
+             break
+     return {'AudioSources': audio_sources, 'AmbientAudioSources': ambient_sources}, selected_audio_source
+
+ # --- Custom JS and CSS ---
+ current_dir = os.path.dirname(os.path.abspath(__file__))
+ js_path = os.path.join(current_dir, 'custom_script.js')
+ css_path = os.path.join(current_dir, 'custom_style.css')
+
+ with open(js_path, 'r') as f:
+     js_content = f.read()
+
+ with open(css_path, 'r') as f:
+     css_content = f.read()
+
+ head = f"""<script>{js_content}</script><style>{css_content}.vis-custom-time.{TIMELINE_ID} {{pointer-events: none !important;}}</style>"""
+
+ # --- Gradio UI ---
+ with gr.Blocks(head=head) as ui:
+     # Initialize per-user Gradio states with default values
+     ttsfx_api_key_state = gr.State(value=None)
+     vision_lm_api_key_state = gr.State(value=None)
+
+     video_input_info_state = gr.State(value={})
+     video_edit_info_state = gr.State(value={})
+
+     audio_sources_state = gr.State(value={})
+     selected_audio_source_state = gr.State(value={})
+     new_audio_sources_counter = gr.State(value=0)
+
+     trigger_frame_interval_slider_render = gr.State(value=False)
+     trigger_timeline_window_focus = gr.State(value=False)
+     set_timeline_window_on_next_tab_change = gr.State(value=True)
+     unrendered_changes_flag = gr.State(value=False)
+
+     gr.Markdown("### Auto-Foley Editor")
+
+     with gr.Tabs() as tabs:
+         # --- Tab 1 ---
+         with gr.TabItem("Input", id=0) as input_tab:
+             with gr.Row(equal_height=True):
+                 video_input = gr.Video(label="Upload a Video", height=206, sources='upload')
+                 video_info_display = gr.Textbox(label="Video Information", lines=6, interactive=False)
+
+             with gr.Accordion("Input control", open=False):
+                 @gr.render(inputs=[video_input_info_state], triggers=[trigger_frame_interval_slider_render.change])
+                 def render_frame_interval_slider(video_info):
+                     total_frames = video_info.get('FrameCount', 0)
+                     frame_rate = video_info.get('FrameRate', 0)
+                     max_interval = total_frames // 2
+
+                     with gr.Row(equal_height=True):
+                         with gr.Column():
+                             cost_and_frame_interval_info = gr.Markdown("Upload a video")
+                             frame_interval_slider = gr.Slider(
+                                 elem_id="frame_interval_slider",
+                                 minimum=1,
+                                 maximum=max_interval,
+                                 step=1,
+                                 value=frame_rate,
+                                 label=f"Frame Interval (1-{max_interval})"
+                             )
+
+                         with gr.Column(scale=0):
+                             downscale_samples_checkbox = gr.Checkbox(
+                                 value=True,
+                                 interactive=True,
+                                 label="Downscale samples"
+                             )
+
+                             downscale_resolution_dropdown = gr.Dropdown(
+                                 choices=["512px", "768px", "1024px"],
+                                 value="512px",
+                                 type="value",
+                                 interactive=True,
+                                 label="Max side"
+                             )
+
+                     frame_interval_slider.change(
+                         fn=update_video_info_advanced_input,
+                         inputs=[frame_interval_slider, downscale_samples_checkbox, downscale_resolution_dropdown, video_input_info_state],
+                         outputs=[cost_and_frame_interval_info, video_input_info_state]
+                     )
+
+                     downscale_samples_checkbox.change(
+                         fn=update_video_info_advanced_input,
+                         inputs=[frame_interval_slider, downscale_samples_checkbox, downscale_resolution_dropdown, video_input_info_state],
+                         outputs=[cost_and_frame_interval_info, video_input_info_state]
+                     )
+
+                     downscale_resolution_dropdown.change(
+                         fn=update_video_info_advanced_input,
+                         inputs=[frame_interval_slider, downscale_samples_checkbox, downscale_resolution_dropdown, video_input_info_state],
+                         outputs=[cost_and_frame_interval_info, video_input_info_state]
+                     )
+
+             custom_instruction_textbox = gr.Textbox(label="Optional custom instruction for the LLM:", interactive=True)
+
+             with gr.Accordion("Observe or edit the LLM's response before generating audio with it:", open=False):
+                 generate_descriptions_button = get_generate_descriptions_button(False)
+                 with gr.Tabs():
+                     with gr.Tab("View"):
+                         generate_descriptions_json_output = gr.JSON(label="JSON")
+                     with gr.Tab("Edit"):
+                         generate_descriptions_json_textbox = gr.Textbox(label="JSON", lines=22, interactive=True)
+
+             generate_all_audio_button = get_generate_audio_button(False)
+             generate_all_progress_textbox = gr.Textbox(show_label=False, visible=False)
+
+         # --- Tab 2 ---
+         with gr.TabItem("Output & Edit", id=1) as output_tab:
+             video_comp_output = gr.Video(label="Result", height=480, interactive=False, elem_id=OUTPUT_VIDEO_ID)
+
+             with gr.Row():
+                 with gr.Column():
+                     with gr.Row():
+                         add_audio_source_button = gr.Button("Add New Audio Source")
+                         delete_audio_source_button = gr.Button(value="Delete Selected Audio Source", variant="stop", interactive=False)
+                 with gr.Column():
+                     comp_audio_button = gr.Button("Combine All Audio & Render Video", variant="primary", interactive=False)
+
+             timeline = VisTimeline(
+                 value={"groups": [{"id": "track-length", "content": ""}, {"id": 1, "content": ""}, {"id": 2, "content": ""}], "items": []},
+                 options={
+                     "moment": "+00:00",
+                     "showCurrentTime": False,
+                     "editable": {
+                         "add": False,
+                         "remove": False,
+                         "updateGroup": False,
+                         "updateTime": True
+                     },
+                     "itemsAlwaysDraggable": {
+                         "item": True,
+                         "range": True
+                     },
+                     "showMajorLabels": False,
+                     "format": {
+                         "minorLabels": {
+                             "millisecond": "mm:ss.SSS",
+                             "second": "mm:ss",
+                             "minute": "mm:ss",
+                             "hour": "HH:mm:ss"
+                         }
+                     },
+                     "start": 0,
+                     "end": 10000,
+                     "min": 0,
+                     "max": 22000,
+                     "zoomMin": 1000,
+                 },
+                 elem_id=TIMELINE_ID
+             )
+
+             with gr.Accordion("Edit Audio Source Properties", open=False) as selected_source_accordion:
+                 with gr.Group():
+                     selected_audio_volume_slider = gr.Slider(label="Volume", minimum=0.0, maximum=1.0, step=0.01, value=1.0)
+                     selected_audio_player = gr.Audio(label="Audio", type="filepath")
+                     with gr.Accordion("Generate New Audio", open=False):
+                         selected_audio_prompt_textbox = gr.Textbox(label="Prompt")
+                         selected_audio_overwrite_audio_button = gr.Button("Generate", variant="primary", interactive=False)
+                 save_changes_button = gr.Button("Save Changes", interactive=False)
+
+         # --- Tab 3 ---
+         with gr.TabItem("Set API Keys", id=2) as settings_tab:
+             vision_lm_api_key_textbox = gr.Textbox(label="OpenAI API Key", type='password')
+             ttsfx_api_key_textbox = gr.Textbox(label="ElevenLabs API Key", type='password')
+
+     # Tab 1 interactions
+     input_tab.select(
+         fn=lambda: False, outputs=set_timeline_window_on_next_tab_change
+     )
+
+     video_input.change(
+         fn=on_video_upload,
+         inputs=video_input,
+         outputs=[generate_descriptions_button, generate_all_audio_button, video_input_info_state, custom_instruction_textbox, generate_descriptions_json_output, generate_descriptions_json_textbox]
+     ).then(
+         fn=trigger_frame_interval_slider_rerender,
+         inputs=trigger_frame_interval_slider_render,
+         outputs=trigger_frame_interval_slider_render
+     )
+
+     video_input_info_state.change(
+         fn=format_video_info,
+         inputs=video_input_info_state,
+         outputs=video_info_display
+     )
+
+     generate_descriptions_button.click(
+         fn=set_generate_buttons_inactive, outputs=[generate_descriptions_button, generate_all_audio_button]
+     ).then(
+         fn=generate_descriptions,
+         inputs=[video_input, video_input_info_state, custom_instruction_textbox, vision_lm_api_key_state],
+         outputs=[generate_descriptions_json_output, generate_descriptions_json_textbox, audio_sources_state],
+         concurrency_id="long_job"
+     ).then(
+         fn=set_generate_buttons_active, outputs=[generate_descriptions_button, generate_all_audio_button]
+     )
+
+     generate_all_audio_button.click(fn=lambda: 0, outputs=new_audio_sources_counter)
+
+     generate_all_audio_button.click(
+         fn=copy_video_info_to_edit_tab,
+         inputs=[video_input, video_input_info_state],
+         outputs=video_edit_info_state
+     ).then(
+         fn=lambda: 0, outputs=new_audio_sources_counter
+     ).then(
+         fn=lambda: True, outputs=set_timeline_window_on_next_tab_change
+     ).then(
+         fn=lambda: gr.Textbox(show_label=False, visible=True), outputs=generate_all_progress_textbox
+     ).then(
+         fn=set_generate_buttons_inactive, outputs=[generate_descriptions_button, generate_all_audio_button]
+     ).then(
+         fn=generate_all_audio,
+         inputs=[video_input, video_edit_info_state, custom_instruction_textbox, generate_descriptions_json_output, generate_descriptions_json_textbox, vision_lm_api_key_state, ttsfx_api_key_state],
+         outputs=[generate_all_progress_textbox, audio_sources_state, generate_descriptions_json_output, generate_descriptions_json_textbox],
+         concurrency_id="long_job"
+     ).then(
+         fn=parse_audio_sources_to_timeline_data,
+         inputs=[audio_sources_state, video_edit_info_state],
+         outputs=timeline
+     ).then(
+         fn=lambda: go_to_tab(1),
+         inputs=[],
+         outputs=tabs
+     ).then(
+         fn=set_generate_buttons_active, outputs=[generate_descriptions_button, generate_all_audio_button]
+     ).then(
+         fn=lambda: gr.Textbox(show_label=False, visible=False), outputs=generate_all_progress_textbox
+     ).then(
+         fn=comp_all_audio_to_video,
+         inputs=[audio_sources_state, video_edit_info_state],
+         outputs=video_comp_output,
+         concurrency_id="comp"
+     ).then(
+         fn=lambda: False, outputs=unrendered_changes_flag
+     ).then(
+         fn=None,
+         js=f'() => initVideoSync("{OUTPUT_VIDEO_ID}", "{TIMELINE_ID}", "{TRACK_LENGTH_ID}")'
+     )
+
+     # Tab 2 interactions
+     output_tab.select(
+         fn=copy_video_info_to_edit_tab_if_none,
+         inputs=[video_input, video_input_info_state, video_edit_info_state],
+         outputs=[video_edit_info_state]
+     ).then(
+         fn=focus_timeline_on_tab_select,
+         inputs=[set_timeline_window_on_next_tab_change, trigger_timeline_window_focus],
+         outputs=[set_timeline_window_on_next_tab_change, trigger_timeline_window_focus]
+     )
+
+     unrendered_changes_flag.change(
+         fn=set_render_button_state,
+         inputs=unrendered_changes_flag,
+         outputs=comp_audio_button
+     )
+
+     trigger_timeline_window_focus.change(
+         fn=None,
+         js=f'() => setTimelineWindowToItemLength("{TIMELINE_ID}", "{TRACK_LENGTH_ID}")'
+     )
+
+     comp_audio_button.click(
+         fn=comp_all_audio_to_video,
+         inputs=[audio_sources_state, video_edit_info_state],
+         outputs=video_comp_output,
+         concurrency_id="comp"
+     ).then(
+         fn=lambda: False, outputs=unrendered_changes_flag
+     ).then(
+         fn=None,
+         js=f'() => initVideoSync("{OUTPUT_VIDEO_ID}", "{TIMELINE_ID}", "{TRACK_LENGTH_ID}")'
+     )
+
+     add_audio_source_button.click(
+         fn=add_new_audio_source,
+         inputs=[audio_sources_state, new_audio_sources_counter],
+         outputs=[audio_sources_state, new_audio_sources_counter]
+     ).then(
+         fn=parse_audio_sources_to_timeline_data,
+         inputs=[audio_sources_state, video_edit_info_state],
+         outputs=timeline
+     ).then(
+         fn=focus_timeline_on_new_source_added,
+         inputs=[audio_sources_state, trigger_timeline_window_focus],
+         outputs=trigger_timeline_window_focus
+     ).then(
+         fn=lambda: True, outputs=unrendered_changes_flag
+     )
+
+     delete_audio_source_button.click(
+         fn=delete_selected_audio_source,
+         inputs=[selected_audio_source_state, audio_sources_state],
+         outputs=[selected_audio_source_state, audio_sources_state]
+     ).then(
+         fn=parse_audio_sources_to_timeline_data,
+         inputs=[audio_sources_state, video_edit_info_state],
+         outputs=timeline
+     ).then(
+         fn=lambda: True, outputs=unrendered_changes_flag
+     )
+
+     timeline.item_select(
+         fn=on_timeline_item_select,
+         inputs=[audio_sources_state],
+         outputs=selected_audio_source_state
+     )
+
+     timeline.input(
+         fn=on_timeline_input,
+         inputs=[timeline, audio_sources_state, video_edit_info_state],
+         outputs=audio_sources_state
+     ).then(
+         fn=lambda: True, outputs=unrendered_changes_flag
+     )
+
+     selected_audio_source_state.change(
+         fn=sync_form_to_selected_audio_source,
+         inputs=selected_audio_source_state,
+         outputs=[selected_source_accordion, selected_audio_volume_slider, selected_audio_player, selected_audio_prompt_textbox]
+     ).then(
+         fn=set_buttons_state_selected_audio_source,
+         inputs=selected_audio_source_state,
+         outputs=[delete_audio_source_button, selected_audio_overwrite_audio_button, save_changes_button]
+     )
+
+     selected_audio_overwrite_audio_button.click(
+         fn=generate_new_audio,
+         inputs=[selected_audio_prompt_textbox, selected_audio_player, selected_audio_source_state, ttsfx_api_key_state],
+         outputs=selected_audio_player
+     )
+
+     save_changes_button.click(
+         fn=overwrite_changes_to_selected_audio_source,
+         inputs=[
+             selected_audio_volume_slider,
+             selected_audio_player,
+             selected_audio_prompt_textbox,
+             selected_audio_source_state,
+             audio_sources_state
+         ],
+         outputs=[audio_sources_state, selected_audio_source_state]
+     ).then(
+         fn=parse_audio_sources_to_timeline_data,
+         inputs=[audio_sources_state, video_edit_info_state],
+         outputs=timeline
+     ).then(
+         fn=lambda: True, outputs=unrendered_changes_flag
+     )
+
+     # Tab 3 interactions
+     vision_lm_api_key_textbox.input(
+         fn=lambda a: a,
+         inputs=vision_lm_api_key_textbox,
+         outputs=vision_lm_api_key_state
+     )
+
+     ttsfx_api_key_textbox.input(
+         fn=lambda a: a,
+         inputs=ttsfx_api_key_textbox,
+         outputs=ttsfx_api_key_state
+     )
+
+     ui.load(
+         fn=lambda: (os.getenv('AUTO_FOLEY_DEFAULT_VISION_LM_API_KEY'), os.getenv('AUTO_FOLEY_DEFAULT_TTSFX_API_KEY')),
+         outputs=[vision_lm_api_key_state, ttsfx_api_key_state]
+     )
+
+ if __name__ == "__main__":
+     ui.launch(show_api=False)
auto_foley/__init__.py ADDED
File without changes
auto_foley/run_auto_foley.py ADDED
@@ -0,0 +1,682 @@
+ import argparse
+ import base64
+ import cv2
+ import math
+ import os
+ import sys
+ import tempfile
+ from PIL import Image
+ from pydantic import BaseModel
+ from typing import Optional
+
+ if __name__ == "__main__" or 'auto_foley' not in sys.modules:
+     import video_comping
+     from services import (VisionLMService, TTSFXService, set_default_vision_lm, set_default_ttsfx, get_services, get_api_keys)
+ else:
+     from auto_foley import video_comping
+     from auto_foley.services import (VisionLMService, TTSFXService, set_default_vision_lm, set_default_ttsfx, get_services, get_api_keys)
+
+ class AudioSource(BaseModel):
+     SourceSlugID: str
+     SoundDescription: str
+     StartFrameIndex: int
+     EndFrameIndex: int
+     Duration: float
+
+ AUDIO_SOURCES_KEY = 'AudioSources'
+ AMBIENT_AUDIO_SOURCES_KEY = 'AmbientAudioSources'
+ MINIMUM_AUDIO_DURATION = 0.5
+
+ # --- Low-level Helpers ---
+ def downscale_dimensions(width: int, height: int, max_side: int) -> tuple[int, int]:
+     """
+     Resize dimensions down while maintaining aspect ratio and respecting the maximum side length.
+
+     Args:
+         width: Original width of the image/video
+         height: Original height of the image/video
+         max_side: Maximum allowed length for either dimension
+
+     Returns:
+         tuple: (new_width, new_height) maintaining the original aspect ratio
+
+     Raises:
+         ValueError: If width, height, or max_side are less than or equal to 0
+     """
+     if width <= 0 or height <= 0 or max_side <= 0:
+         raise ValueError("Width, height, and max_side must all be positive integers")
+     if width <= max_side and height <= max_side:
+         return (width, height)
+
+     aspect_ratio = width / height
+     if width > height:
+         new_width = max_side
+         new_height = int(max_side / aspect_ratio)
+     else:
+         new_height = max_side
+         new_width = int(max_side * aspect_ratio)
+     return (new_width, new_height)
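+
+ # Example: downscale_dimensions(1920, 1080, 512) -> (512, 288);
+ # dimensions already within the limit are returned unchanged.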
+
+ def calculate_duration(start_frame: int, end_frame: int, fps: int, frame_count: int) -> tuple[float, int, int]:
+     """
+     Calculate duration in seconds based on frame indices and FPS.
+
+     Args:
+         start_frame: Starting frame index
+         end_frame: Ending frame index
+         fps: Frames per second of the video
+         frame_count: Total number of frames in the video
+
+     Returns:
+         tuple: A tuple containing:
+             float: Duration in seconds (rounded to 3 decimal places)
+             int: Adjusted start frame index
+             int: Adjusted end frame index
+
+     Raises:
+         ValueError: If fps is less than or equal to 0
+         ValueError: If frame_count is less than or equal to 0
+     """
+     # Input validation
+     if fps <= 0:
+         raise ValueError("FPS must be greater than 0")
+     if frame_count <= 0:
+         raise ValueError("Frame count must be greater than 0")
+
+     start = max(0, start_frame)
+     end = min(frame_count - 1, end_frame) if end_frame is not None else frame_count - 1
+     if end <= start:
+         end = start + fps
+
+     duration_frames = end - start
+     duration_seconds = duration_frames / fps
+     # If duration is less than the minimum, adjust the frames
+     # (math.ceil so integer truncation can't leave the result below the minimum)
+     if duration_seconds < MINIMUM_AUDIO_DURATION:
+         frames_needed = math.ceil(MINIMUM_AUDIO_DURATION * fps) - duration_frames
+         # First try to extend the end frame
+         target_end = end + frames_needed
+         if target_end <= frame_count - 1:
+             end = target_end
+         else:
+             # Can't extend the end frame, adjust the start frame instead
+             end = frame_count - 1
+             remaining_frames_needed = math.ceil(MINIMUM_AUDIO_DURATION * fps) - (end - start)
+             potential_start = start - remaining_frames_needed
+             start = max(0, potential_start)
+
+     final_duration_frames = end - start
+     final_duration = final_duration_frames / fps
+     return round(final_duration, 3), start, end
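+
+ # Examples: calculate_duration(10, 10, 25, 100) -> (1.0, 10, 35), since an empty
+ # range is padded to one second; calculate_duration(0, 5, 25, 100) -> (0.52, 0, 13),
+ # extended to satisfy MINIMUM_AUDIO_DURATION.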
+
+ def combine_all_audio_sources(all_audio_sources: dict) -> list:
+     """
+     Helper function to flatten the audio sources structure into a single combined list.
+
+     Args:
+         all_audio_sources: Dictionary containing AudioSources and AmbientAudioSources lists
+             Expected format: {'AudioSources': [], 'AmbientAudioSources': []}
+
+     Returns:
+         list: Combined list of all audio sources from both categories
+     """
+     audio_sources = all_audio_sources.get(AUDIO_SOURCES_KEY, [])
+     ambient_audio_sources = all_audio_sources.get(AMBIENT_AUDIO_SOURCES_KEY, [])
+     return audio_sources + ambient_audio_sources
+
+ # --- Video ---
+ def get_video_path(video) -> str:
+     """
+     Converts video input into a file path, creating a temporary file if needed.
+
+     Args:
+         video: Video input that can be a file path string, a dictionary with video data, or a dictionary with a video name
+
+     Returns:
+         str: Path to the video file
+
+     Raises:
+         ValueError: If no video is provided or if the video format is invalid
+         RuntimeError: If the video data cannot be processed
+     """
+     if not video:
+         raise ValueError("No video file provided.")
+     if isinstance(video, str):
+         return video
+     elif isinstance(video, dict):
+         video_data = video.get('data')
+         video_name = video.get('name')
+         if video_name:
+             return video_name
+         else:
+             try:
+                 # Save base64-encoded video data to a temporary file
+                 header, encoded = video_data.split(',', 1)
+                 file_ext = header.split(';')[0].split('/')[1]
+                 video_bytes = base64.b64decode(encoded)
+
+                 temp_file = tempfile.NamedTemporaryFile(delete=False, suffix='.' + file_ext)
+                 temp_file.write(video_bytes)
+                 temp_file.close()
+                 return temp_file.name
+             except Exception as e:
+                 raise RuntimeError(f"Failed to process video data: {e}")
+     else:
+         raise ValueError("Invalid video format")
+
+ def get_video_info(video_path: str) -> dict:
+     """
+     Extracts information about a video.
+
+     Args:
+         video_path: Path to the video file
+
+     Returns:
+         A dictionary containing video metadata:
+         {
+             "FilePath": str,
+             "Width": int,
+             "Height": int,
+             "Duration": float,
+             "FrameCount": int,
+             "FrameRate": float,
+             "FrameInterval": float
+         }
+
+     Raises:
+         ValueError: If video_path is None or empty
+         FileNotFoundError: If the video file cannot be opened
+         RuntimeError: If there's an error processing the video
+     """
+     if not video_path:
+         raise ValueError("Video file path was not given")
+     try:
+         cap = cv2.VideoCapture(video_path)
+         if not cap.isOpened():
+             raise FileNotFoundError(f"Unable to open the video file at '{video_path}'. Please verify the file path and format.")
+
+         total_frames = int(cap.get(cv2.CAP_PROP_FRAME_COUNT))
+         fps = cap.get(cv2.CAP_PROP_FPS)
+         width = int(cap.get(cv2.CAP_PROP_FRAME_WIDTH))
+         height = int(cap.get(cv2.CAP_PROP_FRAME_HEIGHT))
+         cap.release()
+     except Exception as e:
+         raise RuntimeError(f"{e}")
+     return {
+         "FilePath": video_path,
+         "Width": width,
+         "Height": height,
+         "Duration": total_frames / fps if fps != 0 else 0,
+         "FrameCount": total_frames,
+         "FrameRate": fps,
+         "FrameInterval": fps
+     }
+
+ def extract_frames(video_path: str, frame_interval: int, target_width: int = None, target_height: int = None) -> list:
+     """
+     Extracts frames from the video at the specified interval and includes the last frame.
+     Optionally downscales the frames to the target resolution.
+
+     Args:
+         video_path (str): Path to the video file
+         frame_interval (int): Interval between extracted frames
+         target_width (int, optional): Desired width of output frames. If None, the original width is used
+         target_height (int, optional): Desired height of output frames. If None, the original height is used
+
+     Returns:
+         list: List of tuples (frame_index, PIL Image)
+
+     Raises:
+         FileNotFoundError: If the video file cannot be opened
+         ValueError: If the video contains no frames
+     """
+     cap = cv2.VideoCapture(video_path)
+     if not cap.isOpened():
+         raise FileNotFoundError("Could not open video.")
+     total_frames = int(cap.get(cv2.CAP_PROP_FRAME_COUNT))
+     if total_frames == 0:
+         cap.release()
+         raise ValueError("Video contains no frames.")
+
+     # Calculate target dimensions, maintaining aspect ratio if only one dimension is specified
+     original_width = int(cap.get(cv2.CAP_PROP_FRAME_WIDTH))
+     original_height = int(cap.get(cv2.CAP_PROP_FRAME_HEIGHT))
+
+     if target_width is not None and target_height is None:
+         scale_factor = target_width / original_width
+         target_height = int(original_height * scale_factor)
+     elif target_height is not None and target_width is None:
+         scale_factor = target_height / original_height
+         target_width = int(original_width * scale_factor)
+
+     need_resize = (target_width is not None and target_height is not None and (target_width != original_width or target_height != original_height))
+
+     # Compute the frame indices to extract
+     if frame_interval <= 0 or frame_interval >= total_frames:
+         frame_indices = [0, total_frames - 1] if total_frames > 1 else [0]
+     else:
+         frame_indices = list(range(0, total_frames, frame_interval))
+         if frame_indices[-1] != total_frames - 1:
+             frame_indices.append(total_frames - 1)
+
+     frames = []
+     for frame_idx in frame_indices:
+         cap.set(cv2.CAP_PROP_POS_FRAMES, frame_idx)
+         ret, frame = cap.read()
+         if not ret:
+             continue  # Skip if the frame could not be read
+         if need_resize:
+             frame = cv2.resize(frame, (target_width, target_height), interpolation=cv2.INTER_AREA)
+         pil_image = Image.fromarray(cv2.cvtColor(frame, cv2.COLOR_BGR2RGB))
+         frames.append((frame_idx, pil_image))
+
+     cap.release()
+     return frames
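+
+ # Example: for a 100-frame video with frame_interval=30, the extracted indices
+ # are [0, 30, 60, 90, 99] -- the final frame is always appended.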
+
+ def calculate_video_input_cost(width: int, height: int, sample_count: int, overwrite_vision_lm: Optional[VisionLMService] = None) -> float:
+     """
+     Calculates the input cost for video processing based on dimensions and sample count.
+
+     Args:
+         width: Width of the video in pixels
+         height: Height of the video in pixels
+         sample_count: Number of samples/frames being processed
+         overwrite_vision_lm: Optional custom object instance that implements the VisionLMService protocol to use instead of the default (ChatGPT)
+
+     Returns:
+         float: The calculated input cost for video processing
+
+     Raises:
+         ValueError: If width, height, or sample_count are not positive numbers
+         RuntimeError: If there's an error during cost calculation
+     """
+     vision_lm, _ = get_services(vision_lm=overwrite_vision_lm)
+     if width <= 0 or height <= 0 or sample_count <= 0:
+         raise ValueError("Width, height, and sample count must be positive numbers")
+     try:
+         return vision_lm.calculate_video_input_cost(width, height, sample_count)
+     except Exception as e:
+         raise RuntimeError(str(e))
+
+ # --- Audio ---
+ def validate_audio_sources(audio_sources: list, fps: int, frame_count: int) -> list:
+     """
+     Helper function to validate a list of audio sources while calculating durations
+     and adjusting frame indices to ensure the minimum audio duration.
+
+     Args:
+         audio_sources: List of dictionaries containing audio source information
+         fps: Frames per second of the video
+         frame_count: Total number of frames in the video
+
+     Returns:
+         List of validated audio source dictionaries with adjusted durations and frame indices
+
+     Raises:
+         ValueError: If audio_sources is None or empty
+         TypeError: If any audio source item is missing required fields
+         ValueError: If frame indices are invalid or if fps is 0
+     """
+     if not audio_sources:
+         raise ValueError("Audio sources list cannot be empty")
+     if fps <= 0:
+         raise ValueError("FPS must be greater than 0")
+
+     validated_sources = []
+     for audio_source in audio_sources:
+         try:
+             # Calculate the audio duration and get adjusted frame indices
+             duration, start_frame, end_frame = calculate_duration(
+                 audio_source.get("StartFrameIndex", 0),
+                 audio_source.get("EndFrameIndex", frame_count - 1),
+                 fps,
+                 frame_count
+             )
+             audio_source["Duration"] = duration
+             audio_source["StartFrameIndex"] = start_frame
+             audio_source["EndFrameIndex"] = end_frame
+             validated_audio_source = AudioSource(**audio_source)
+             validated_sources.append(validated_audio_source.model_dump())
+         except (KeyError, TypeError) as e:
+             raise TypeError(f"Invalid audio source format: {str(e)}")
+     return validated_sources
+
+ def generate_audio(prompt: str, duration: float, ttsfx_api_key: str = None, overwrite_ttsfx: Optional[TTSFXService] = None) -> str:
+     """
+     Generates an audio file for the given prompt and duration.
+
+     Args:
+         prompt: Description of the sound to generate
+         duration: Length of the audio in seconds
+         ttsfx_api_key: API key for the text-to-speech service
+         overwrite_ttsfx: Optional custom object instance that implements the TTSFXService protocol to use instead of the default (ElevenLabs)
+
+     Returns:
+         str: Path to the generated audio file
+
+     Raises:
+         RuntimeError: If there's an error during audio generation
+     """
+     _, ttsfx = get_services(ttsfx=overwrite_ttsfx)
+     _, ttsfx_api_key = get_api_keys(ttsfx_api_key=ttsfx_api_key)
+     try:
+         audio_data = ttsfx.generate_sound_effect(prompt, duration, api_key=ttsfx_api_key)
+         temp_audio_file = tempfile.NamedTemporaryFile(delete=False, suffix='.mp3')
+         temp_audio_file.write(audio_data.read())
+         temp_audio_file.close()
+         return temp_audio_file.name
+     except Exception as e:
+         raise RuntimeError(str(e))
+
+ def generate_audio_for_audio_source(audio_source: dict, ttsfx_api_key: str = None, overwrite_ttsfx: Optional[TTSFXService] = None) -> dict:
+     """
+     Generates an audio file for a single audio source with a description.
+
+     Args:
+         audio_source: Dictionary containing sound description and duration
+         ttsfx_api_key: API key for the text-to-speech service
+         overwrite_ttsfx: Optional custom object instance that implements the TTSFXService protocol to use instead of the default (ElevenLabs)
+
+     Returns:
+         dict: Audio source dictionary with added AudioPath and Volume fields
+     """
+     sound_description = audio_source.get("SoundDescription", "")
+     duration = audio_source.get("Duration")
+     audio_source["AudioPath"] = generate_audio(sound_description, duration, ttsfx_api_key, overwrite_ttsfx)
+     audio_source["Volume"] = 1.0
+     return audio_source
+
+ def generate_all_audio(all_audio_sources: dict, ttsfx_api_key: str = None, overwrite_ttsfx: Optional[TTSFXService] = None) -> dict:
+     """
+     Generates audio files for audio sources with descriptions.
+
+     Args:
+         all_audio_sources: Dictionary containing AudioSources and AmbientAudioSources lists
+         ttsfx_api_key: API key for the text-to-speech service
+         overwrite_ttsfx: Optional custom object instance that implements the TTSFXService protocol to use instead of the default (ElevenLabs)
+
+     Returns:
+         dict: Audio sources dictionary with generated audio paths
+
+     Raises:
+         ValueError: If the audio sources dictionary is None
+         RuntimeError: If there's an error during audio generation
+     """
+     if all_audio_sources is None:
+         raise ValueError("Missing audio sources.")
+     try:
+         for audio_source in all_audio_sources[AUDIO_SOURCES_KEY]:
+             audio_source = generate_audio_for_audio_source(audio_source, ttsfx_api_key, overwrite_ttsfx)
+         for audio_source in all_audio_sources[AMBIENT_AUDIO_SOURCES_KEY]:
+             audio_source = generate_audio_for_audio_source(audio_source, ttsfx_api_key, overwrite_ttsfx)
+         return all_audio_sources
+     except Exception as e:
+         raise RuntimeError(str(e))
+
+ # --- High-level Processing ---
+ def process_video(video, frame_interval=None, target_width=None, target_height=None, prompt_instruction=None, vision_lm_api_key=None, overwrite_vision_lm: Optional[VisionLMService] = None) -> tuple[dict, str]:
+     """
+     Processes the video, extracts frames at the specified interval, and generates the video description and audio sources.
+
+     Args:
+         video: Video input as either a file path string, or a dictionary containing video data
+         frame_interval: Interval between extracted frames. Defaults to None
+         target_width (optional): Target width for frame extraction. Defaults to None
+         target_height (optional): Target height for frame extraction. Defaults to None
+         prompt_instruction (optional): Custom instruction to include in the prompt. Defaults to None
+         vision_lm_api_key (optional): API key for the vision language model. Defaults to None
+         overwrite_vision_lm (optional): Custom object instance that implements the VisionLMService protocol to use instead of the default (ChatGPT)
+
+     Returns:
+         tuple: Contains (audio_sources, video_description)
+             audio_sources: Dictionary containing 'AudioSources' and 'AmbientAudioSources' lists
+             video_description: String containing a detailed description of the video
+
+     Raises:
+         ValueError: If the video input is invalid or missing
+         FileNotFoundError: If the video file cannot be opened or accessed
+         RuntimeError: If frame extraction fails, the API returns an error, or audio source generation fails
+         Exception: If data validation fails
+     """
+     # Add frame info
+     video_path = get_video_path(video)
+     cap = cv2.VideoCapture(video_path)
+     fps = 25  # Default value
+     frame_count = fps
+     frame_info_prompt = ""
+
+     if cap.isOpened():
+         frame_count = int(cap.get(cv2.CAP_PROP_FRAME_COUNT))
+         fps = cap.get(cv2.CAP_PROP_FPS)
+         if frame_interval is None:
+             frame_interval = math.ceil(fps)  # Default to one sample per second
+         seconds = frame_interval / fps
+         frame_info_prompt += f"\nThe framerate of the video is {fps} FPS."
+         frame_info_prompt += f"\nThe frame interval between each image is {frame_interval} frames."
+         frame_info_prompt += f" Which means there's approximately {seconds:.2f} seconds between each image in real time."
+         cap.release()
+
+     # Extract frames
+     frames = extract_frames(video_path, frame_interval, target_width, target_height)
+     if not frames:
+         raise RuntimeError("No frames extracted from video.")
+
+     images = []
+     frame_numbers = []
+     for frame_idx, pil_image in frames:
+         images.append(pil_image)
+         frame_numbers.append(frame_idx)
+
+     # Build prompt
+     prompt = (
+         "You will be given a list of images representing certain frames of a video that has no audio."
+         "\nYou must provide a general description of the video and a list of audio sources you recognize"
+         " in these frames, so that it can be used to create an audio file for each audio source in the video."
+
+         "\n\nThe VideoProperties 'Description' property must include a very thorough and detailed description of everything that is visible in the frames."
+         " Make sure not a single thing is missing, the context of this description will help define the audio sources."
+         "\nThe VideoProperties 'Description' property may also include descriptions of abstract things,"
+         " like the vibe, the medium (if it's animated, cartoony or recorded) or the time period,"
+         " but only if it's relevant to the possible audio that could fit underneath the video."
+
+         "\n\nThe AudioSource 'SoundDescription' property is implied to already be a description of audio so it does not contain words like \"The sound of ...\", etc."
+         "\nThe AudioSource 'SoundDescription' property should only contain the direct description of the audio, and not any unnecessary visual descriptions like color."
+         "\nThe AudioSource 'SoundDescription' property should be able to be understood without outside context. For example, you should never describe it as"
+         " \"The sound of footsteps as the mayor is walking towards the lamp post.\". Because \"the mayor\" or \"the lamp post\" are unknown entities within this description. They're also irrelevant to the sound this would make."
+         " The correct description for that example would have been: \"Footsteps of a man on a sidewalk.\""
+         "\nThe AudioSource 'SoundDescription' property also does not know about the other properties, like the AudioSource 'Slug' property."
+         " If the audio source is a bird, for example, the 'SoundDescription' property can't just be \"Wings flapping.\", but must still include the subject if it would be relevant for the sound,"
+         " so a better description would be \"A bird flapping its wings.\""
+
+         "\n\nAnother example of a very bad sound description, this is an example of things you need to avoid: \"A gray owl with a hat is making sounds on the walls of the castle, adding to the eerie vibe of the video.\""
+         "\nThe correct way to describe that would be: \"An owl hooting in the distance.\""
+
+         "\n\nLastly, you will also include a list of ambient audio sources."
+         "\nThe ambient audio sources are similar to the normal audio sources, but are for elements that are either invisible in the frames or just not a single source of audio."
+         "\nThe sounds from the ambient audio sources should not already exist in the normal audio sources. Because that would mean that sound would be duplicate in the end result."
+         "\nTo create a good ambient audio source description, you must imagine what you could hear in the video that's not necessarily depicted."
+         "\nExample: if a video depicts a squirrel playing in a forest, then you can imagine the forest making lots of noises that aren't directly visible,"
+         " such as chirping birds or crickets. The ambient sound description will not include the noises the squirrel makes because these will be found in the normal audio sources.\n"
+     )
+
+     if frame_info_prompt is not None and frame_info_prompt.strip():
+         prompt += frame_info_prompt
+     if prompt_instruction is not None and prompt_instruction.strip():
+         prompt += f"\n\nThe user included the following custom instruction for you, try to abide if possible:\n<user-instruction>\n{prompt_instruction}\n</user-instruction>"
+     prompt += f"\nThis array shows the frame index (which frame of the video they represent), for each image you are about to see in order: {frame_numbers}"
+
+     # Generate the video description and audio sources
+     try:
+         vision_lm, _ = get_services(vision_lm=overwrite_vision_lm)
+         vision_lm_api_key, _ = get_api_keys(vision_lm_api_key=vision_lm_api_key)
+         result_data = vision_lm.generate_audio_sources(prompt, images, vision_lm_api_key)
+     except Exception as e:
+         raise RuntimeError(str(e))
+
+     # Validate the returned data using the AudioSource model
+     video_description = result_data['VideoProperties']['Description']
+     audio_sources_data = result_data[AUDIO_SOURCES_KEY]
+     ambient_audio_sources_data = result_data[AMBIENT_AUDIO_SOURCES_KEY]
+
+     try:
+         validated_audio_sources = validate_audio_sources(audio_sources_data, fps, frame_count)
+         validated_ambient_audio_sources = validate_audio_sources(ambient_audio_sources_data, fps, frame_count)
+         all_audio_sources = {AUDIO_SOURCES_KEY: validated_audio_sources, AMBIENT_AUDIO_SOURCES_KEY: validated_ambient_audio_sources}
+         return all_audio_sources, video_description
+     except Exception as e:
526
+ raise Exception(str(e))
527
+
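For orientation, here is a minimal sketch of calling process_video directly. The module path, the plain-file-path input, and the literal 'AudioSources' key are assumptions inferred from the surrounding code, not a confirmed API:

    # Hypothetical usage sketch (module path and key names assumed).
    from auto_foley import run_auto_foley

    sources, description = run_auto_foley.process_video("clip.mp4")  # frame_interval defaults to roughly one sample per second
    print(description)
    for src in sources["AudioSources"]:  # assumes AUDIO_SOURCES_KEY == "AudioSources", per the tool schema below
        print(src["SourceSlugID"], src["SoundDescription"])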
+ def combine_video_and_audio(audio_sources: list, input_video: str | dict, output_video_path: str = None) -> str:
+     """
+     Composes audio sources onto a video file, creating a new video with the combined audio.
+
+     Args:
+         audio_sources (list): List of audio source dictionaries containing audio paths and timing
+         input_video (str | dict): Either a path to a video file or a dictionary containing video data
+         output_video_path (str): Path for the output video to be saved to; if None, the output will be adjacent to the input_video path with an '_output' suffix. Defaults to None
+
+     Returns:
+         str: Path to the output video file with composed audio
+
+     Raises:
+         ValueError: If video input is invalid or missing
+         FileNotFoundError: If video file cannot be accessed
+         RuntimeError: If an error occurs during video composition or audio processing
+         OSError: If there are file system errors during composition
+     """
+     input_video_path = get_video_path(input_video)
+     if not input_video_path:
+         raise ValueError("Invalid or missing video input")
+     try:
+         if output_video_path is None:
+             path_to_input_file_name, input_file_extension = os.path.splitext(input_video_path)
+             output_video_path = path_to_input_file_name + "_output" + input_file_extension
+         return video_comping.combine_video_and_audio(input_video_path, output_video_path, combine_all_audio_sources(audio_sources))
+     except Exception as e:
+         raise RuntimeError(str(e))
+
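To make the default output path concrete, this is what the os.path.splitext logic above produces:

    # Worked example of the default output-path derivation.
    import os
    name, ext = os.path.splitext("videos/clip.mp4")
    print(name + "_output" + ext)  # videos/clip_output.mp4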
+ def add_audio_to_video(input_video_path, output_video_path=None, frame_interval=None, downscale_to_max_side=None, prompt_instruction=None, vision_lm_api_key=None, ttsfx_api_key: str = None, overwrite_vision_lm: Optional[VisionLMService] = None, overwrite_ttsfx: Optional[TTSFXService] = None, quiet: bool = False) -> str:
+     """
+     Processes the video, extracts frames at the specified interval, generates a video description and audio sources, then generates audio from those descriptions and adds it back to the video.
+
+     Args:
+         input_video_path: Path to the input video file
+         output_video_path (optional): Path for the output video; if None, appends '_output' to the input filename. Defaults to None
+         frame_interval (optional): Interval between extracted frames. Defaults to None
+         downscale_to_max_side (optional): Downscale samples so their longest side is this many pixels; if None, 512px is used. Defaults to None
+         prompt_instruction (optional): Custom instruction to include in the prompt. Defaults to None
+         vision_lm_api_key (optional): API key for the vision language model. Defaults to None
+         ttsfx_api_key (optional): API key for the text-to-sound-effect service. Defaults to None
+         overwrite_vision_lm (optional): Custom object instance that implements the VisionLMService protocol to use instead of the default (ChatGPT)
+         overwrite_ttsfx (optional): Custom object instance that implements the TTSFXService protocol to use instead of the default (ElevenLabs)
+         quiet (optional): If True, suppresses progress output. Defaults to False
+
+     Returns:
+         str: Path to the output video file with composed audio
+
+     Raises:
+         ValueError: If video input is invalid or missing
+         FileNotFoundError: If video file cannot be opened or accessed
+         RuntimeError: If frame extraction fails, the API returns an error, audio source generation fails, or an error occurs during video composition or audio processing
+         Exception: If data validation fails
+         OSError: If there are file system errors during composition
+     """
+     try:
+         if downscale_to_max_side is None:
+             downscale_to_max_side = 512
+         input_video_info = get_video_info(input_video_path)
+         target_width, target_height = downscale_dimensions(input_video_info['Width'], input_video_info['Height'], downscale_to_max_side)
+         duration = input_video_info.get("Duration", 0.0)
+         if duration > 20.0 and overwrite_ttsfx is None:
+             if not quiet:
+                 print("WARNING: Input video is longer than 20 seconds. This process works best for 3-10 second clips and becomes unstable with longer videos. Consider trimming your video into shorter segments.")
+
+     except Exception as e:
+         raise RuntimeError(f"Error determining downscaled sample resolution: {e}")
+
+     try:
+         if not quiet:
+             print(f"Processing {input_video_path}...")
+         all_audio_sources, _ = process_video(input_video_path, frame_interval, target_width, target_height, prompt_instruction, vision_lm_api_key, overwrite_vision_lm)
+     except Exception as e:
+         raise RuntimeError(f"Error during video processing: {e}")
+
+     try:
+         if not quiet:
+             print("Generating audio...")
+         all_audio_sources = generate_all_audio(all_audio_sources, ttsfx_api_key, overwrite_ttsfx)
+     except Exception as e:
+         raise RuntimeError(f"Error during audio generation: {e}")
+
+     try:
+         if not quiet:
+             print("Adding generated audio to the video...")
+         output = combine_video_and_audio(all_audio_sources, input_video_path, output_video_path)
+         if not quiet:
+             print(f"Output saved to: {output}")
+         return output
+     except Exception as e:
+         raise RuntimeError(f"Error while combining video and audio: {e}")
+
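A minimal end-to-end sketch of using add_audio_to_video as a library call, assuming this file is importable as auto_foley.run_auto_foley and that both API keys come from the AUTO_FOLEY_DEFAULT_* environment variables resolved by get_api_keys:

    # Hypothetical library usage; API keys supplied via environment variables.
    from auto_foley import run_auto_foley as af

    output_path = af.add_audio_to_video("clip.mp4", quiet=True)
    print(output_path)  # clip_output.mp4 by default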
+ if __name__ == '__main__':
+     parser = argparse.ArgumentParser(
+         description='Auto-foley: Automatically add sound effects to a video without sound',
+         formatter_class=argparse.ArgumentDefaultsHelpFormatter
+     )
+
+     parser.add_argument(
+         '--input', '-i',
+         required=True,
+         help='Path to the input video file'
+     )
+
+     # Optional arguments
+     parser.add_argument(
+         '--output', '-o',
+         help='Path for the output video file. If not specified, will append "_output" to the input filename'
+     )
+
+     parser.add_argument(
+         '--frame-interval',
+         type=int,
+         help='Interval between processed frames'
+     )
+
+     parser.add_argument(
+         '--downscale-to-max-side',
+         type=int,
+         help='Target max side for downscaling video samples'
+     )
+
+     parser.add_argument(
+         '--prompt-instruction',
+         help='Custom instruction for the video processing'
+     )
+
+     parser.add_argument(
+         '--vision-lm-api-key',
+         help='API key for the vision language model'
+     )
+
+     parser.add_argument(
+         '--ttsfx-api-key',
+         help='API key for the text-to-sound-effects service'
+     )
+
+     parser.add_argument(
+         '--quiet', '-q',
+         action='store_true',
+         help='Suppress progress output'
+     )
+
+     args = parser.parse_args()
+
+     try:
+         add_audio_to_video(
+             input_video_path=args.input,
+             output_video_path=args.output,
+             frame_interval=args.frame_interval,
+             downscale_to_max_side=args.downscale_to_max_side,
+             prompt_instruction=args.prompt_instruction,
+             vision_lm_api_key=args.vision_lm_api_key,
+             ttsfx_api_key=args.ttsfx_api_key,
+             quiet=args.quiet
+         )
+     except Exception as e:
+         print(str(e), file=sys.stderr)
+         sys.exit(1)
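From the command line, the equivalent invocation would look something like `python auto_foley/run_auto_foley.py --input clip.mp4 --output clip_foley.mp4 --frame-interval 12` (the script path is an assumption; adjust it to wherever this entry point lives).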
auto_foley/services.py ADDED
@@ -0,0 +1,87 @@
+ import os
+ import sys
+ from io import BytesIO
+ from typing import Protocol, Optional
+
+ class VisionLMService(Protocol):
+     def calculate_video_input_cost(self, width: int, height: int, sample_count: int) -> str:
+         ...
+     def generate_audio_sources(self, prompt: str, images: list, api_key: str | None = None) -> dict:
+         ...
+
+ class TTSFXService(Protocol):
+     def generate_sound_effect(self, text_prompt: str, duration_seconds: float, prompt_influence: float, api_key: str | None = None) -> BytesIO:
+         ...
+
+ if __name__ == "__main__" or 'auto_foley' not in sys.modules:
+     import vision_lm_chatgpt
+     import ttsfx_elevenlabs
+ else:
+     from auto_foley import vision_lm_chatgpt
+     from auto_foley import ttsfx_elevenlabs
+
+ # Module-level service instances
+ _current_vision_lm: VisionLMService = vision_lm_chatgpt.ChatGPTVisionLM()
+ _current_ttsfx: TTSFXService = ttsfx_elevenlabs.ElevenLabsTTSFX()
+
+ def set_default_vision_lm(service: VisionLMService) -> None:
+     """
+     Sets the default global vision language model service for all functions to use.
+
+     Args:
+         service: Object instance that implements the VisionLMService protocol
+     """
+     global _current_vision_lm
+     _current_vision_lm = service
+
+ def set_default_ttsfx(service: TTSFXService) -> None:
+     """
+     Sets the default global text-to-sound-effect service for all functions to use.
+
+     Args:
+         service: Object instance that implements the TTSFXService protocol
+     """
+     global _current_ttsfx
+     _current_ttsfx = service
+
+ def get_services(vision_lm: Optional[VisionLMService] = None, ttsfx: Optional[TTSFXService] = None) -> tuple[VisionLMService, TTSFXService]:
+     """
+     Gets the service instances to use, either from provided arguments or falling back to global defaults.
+
+     Args:
+         vision_lm (optional): The vision language model service instance to use.
+             If None, uses the global default. Defaults to None
+         ttsfx (optional): The text-to-sound-effect service instance to use.
+             If None, uses the global default. Defaults to None
+
+     Returns:
+         tuple: Contains (vision_lm_service, ttsfx_service)
+             vision_lm_service: VisionLMService instance to use
+             ttsfx_service: TTSFXService instance to use
+     """
+     return (
+         vision_lm if vision_lm is not None else _current_vision_lm,
+         ttsfx if ttsfx is not None else _current_ttsfx
+     )
+
+ def get_api_keys(vision_lm_api_key: Optional[str] = None, ttsfx_api_key: Optional[str] = None) -> tuple[Optional[str], Optional[str]]:
+     """
+     Gets the API keys to use, either from provided arguments or falling back to environment variables.
+
+     Args:
+         vision_lm_api_key (optional): The vision language model API key to use.
+             If None or empty string, uses the AUTO_FOLEY_DEFAULT_VISION_LM_API_KEY environment variable.
+             Defaults to None
+         ttsfx_api_key (optional): The text-to-sound-effect API key to use.
+             If None or empty string, uses the AUTO_FOLEY_DEFAULT_TTSFX_API_KEY environment variable.
+             Defaults to None
+
+     Returns:
+         tuple: Contains (vision_lm_api_key, ttsfx_api_key)
+             vision_lm_api_key: API key to use for the vision language model service
+             ttsfx_api_key: API key to use for the text-to-sound-effect service
+     """
+     return (
+         vision_lm_api_key if vision_lm_api_key else os.getenv('AUTO_FOLEY_DEFAULT_VISION_LM_API_KEY'),
+         ttsfx_api_key if ttsfx_api_key else os.getenv('AUTO_FOLEY_DEFAULT_TTSFX_API_KEY')
+     )
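Because the pipeline resolves its backends through get_services, a custom implementation can be swapped in without touching the rest of the code. A minimal sketch of a stub that satisfies the VisionLMService protocol (hypothetical, for offline testing; the canned payload mirrors the analyze_video tool schema in vision_lm_chatgpt.py):

    # Hypothetical stub; structurally matches VisionLMService.
    class FakeVisionLM:
        def calculate_video_input_cost(self, width: int, height: int, sample_count: int) -> str:
            return "$0.000000"

        def generate_audio_sources(self, prompt: str, images: list, api_key: str | None = None) -> dict:
            return {
                "VideoProperties": {"Description": "A test clip."},
                "AudioSources": [],
                "AmbientAudioSources": [],
            }

    set_default_vision_lm(FakeVisionLM())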
auto_foley/ttsfx_elevenlabs.py ADDED
@@ -0,0 +1,45 @@
+ from elevenlabs.client import ElevenLabs
+ from io import BytesIO
+
+ class ElevenLabsTTSFX:
+     def generate_sound_effect(self, text_prompt: str, duration_seconds: float, prompt_influence: float = 0.3, api_key: str | None = None) -> BytesIO:
+         """
+         Generates a sound effect based on a text prompt using the ElevenLabs API.
+
+         Args:
+             text_prompt: Text description of the desired sound effect
+             duration_seconds: Length of the sound effect in seconds
+             prompt_influence: How strongly the prompt influences the generated sound (default: 0.3)
+             api_key: Optional API key for ElevenLabs authentication
+
+         Returns:
+             BytesIO: Audio data as a bytes stream
+
+         Raises:
+             ValueError: If text_prompt is empty or duration_seconds is invalid
+             RuntimeError: If there's an error during sound effect generation
+         """
+         if not text_prompt:
+             raise ValueError("Text prompt cannot be empty")
+         # The API only accepts durations between 0.5 and 22 seconds; clamp to avoid exceptions
+         if duration_seconds < 0.5:
+             duration_seconds = 0.5
+         if duration_seconds > 22.0:
+             duration_seconds = 22.0
+         try:
+             elevenlabs_client = ElevenLabs(api_key=api_key)
+             response = elevenlabs_client.text_to_sound_effects.convert(
+                 text=text_prompt,
+                 duration_seconds=duration_seconds,
+                 prompt_influence=prompt_influence,
+             )
+             # Collect the audio data into a BytesIO object
+             audio_data = BytesIO()
+             for chunk in response:
+                 audio_data.write(chunk)
+             audio_data.seek(0)
+             return audio_data
+
+         except Exception as e:
+             raise RuntimeError(str(e))
+
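A short usage sketch: the returned BytesIO stream can be written straight to disk. Both the API key and the output extension are assumptions here (the endpoint streams encoded audio; mp3 is a reasonable guess, not confirmed by this code):

    # Hypothetical usage; requires a valid ElevenLabs API key.
    ttsfx = ElevenLabsTTSFX()
    audio = ttsfx.generate_sound_effect("Footsteps of a man on a sidewalk.", duration_seconds=3.0, api_key="...")
    with open("footsteps.mp3", "wb") as f:  # extension assumed
        f.write(audio.read())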
auto_foley/video_comping.py ADDED
@@ -0,0 +1,125 @@
+ from moviepy.audio.AudioClip import AudioArrayClip
+ from pydub import AudioSegment
+ import moviepy.editor as mpe
+ import numpy as np
+
+ def combine_video_and_audio(input_video_path: str, output_video_path: str, audio_sources: list) -> str:
+     """
+     Combines audio sources with a video file to create a new video with audio.
+
+     Args:
+         input_video_path: Path to the video file
+         output_video_path: Path for the output video to be saved to
+         audio_sources: List of audio source dictionaries containing timing and filepath information
+
+     Returns:
+         str: Path to the output video file with combined audio
+
+     Raises:
+         ValueError: If video path or audio sources are invalid or missing
+         FileNotFoundError: If the video file cannot be accessed
+         OSError: If there are issues with file operations in the output directory
+         RuntimeError: If there's an error during the video composition process
+     """
+     if not input_video_path:
+         raise ValueError("No video file provided")
+     if not audio_sources:
+         raise ValueError("No audio sources provided")
+
+     try:
+         video_clip = mpe.VideoFileClip(input_video_path)
+         original_duration = video_clip.duration
+         fps = video_clip.fps
+         audio_clips = []
+
+         for idx, audio_source in enumerate(audio_sources):
+             audio_path = audio_source.get("AudioPath")
+             start_frame_index = audio_source.get("StartFrameIndex", 0)
+             volume = audio_source.get("Volume", 1.0)
+             start_time = start_frame_index / fps  # Convert start_frame_index to start_time in seconds
+
+             if audio_path is None:
+                 continue  # Skip if no audio path
+
+             # Convert the audio file to a numpy array
+             sample_rate, samples = convert_audio_file_to_numpy(audio_path)
+
+             if samples is None:
+                 continue  # Skip if samples are None
+
+             samples = np.array(samples)
+             if samples.ndim == 0:  # If samples is scalar, expand its dimensions
+                 samples = np.expand_dims(samples, axis=0)
+             if samples.ndim == 1:  # If samples is one-dimensional, reshape to (n_samples, 1)
+                 samples = samples.reshape(-1, 1)
+
+             # Apply volume adjustment
+             samples *= volume
+             duration = len(samples) / sample_rate
+
+             # Adjust duration if audio extends beyond video duration
+             audio_end_time = start_time + duration
+             if start_time >= original_duration:
+                 continue  # Skip if audio clip starts after the video ends
+             if audio_end_time > original_duration:
+                 # Adjust the duration to not exceed video duration
+                 duration = original_duration - start_time
+                 # Calculate the number of samples to keep
+                 num_samples = int(duration * sample_rate)
+                 samples = samples[:num_samples]
+
+             audio_clip = AudioArrayClip(samples, fps=sample_rate).set_start(start_time).set_duration(duration)
+             audio_clips.append(audio_clip)
+
+         # Composite all audio clips
+         if audio_clips:
+             composite_audio = mpe.CompositeAudioClip(audio_clips).set_duration(original_duration)
+             # Set the video's audio to the composite audio
+             video_clip = video_clip.set_audio(composite_audio)
+         else:
+             # Ensure the video has no audio if there are no audio clips
+             video_clip = video_clip.without_audio()
+
+         video_clip = video_clip.subclip(0, original_duration)  # Trim the video to its original duration to prevent any extension
+         video_clip.write_videofile(output_video_path, codec="libx264", audio_codec="aac", temp_audiofile=None, remove_temp=True, verbose=False, logger=None)
+
+         video_clip.close()
+         if audio_clips:
+             for clip in audio_clips:
+                 clip.close()
+             composite_audio.close()
+
+         return output_video_path
+     except Exception as e:
+         raise RuntimeError(str(e))
+
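The audio_source dictionaries are only read via .get() above, so the minimal shape they need is small. A sketch of one valid entry (key names taken from the code; values illustrative):

    # Illustrative entry; only these three keys are consulted above.
    audio_sources = [{
        "AudioPath": "owl_hooting.mp3",  # entry is skipped entirely if this is None
        "StartFrameIndex": 50,           # converted to seconds via the video's fps
        "Volume": 0.8,                   # linear gain applied to the samples
    }]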
+ def convert_audio_file_to_numpy(audio_filepath: str) -> tuple[int, np.ndarray]:
+     """
+     Converts an audio file to a numpy array with normalized samples.
+
+     Args:
+         audio_filepath: Path to the audio file to convert
+
+     Returns:
+         tuple: Contains (sample_rate, samples)
+             - sample_rate (int): The sample rate of the audio
+             - samples (np.ndarray): Normalized audio samples in range [-1, 1]
+
+     Raises:
+         FileNotFoundError: If the audio file cannot be accessed
+         RuntimeError: If there's an error processing the audio file
+     """
+     try:
+         audio_segment = AudioSegment.from_file(audio_filepath)
+         sample_width = audio_segment.sample_width  # in bytes
+         max_val = float(2 ** (8 * sample_width - 1))
+         # Normalize samples to [-1, 1]
+         samples = np.array(audio_segment.get_array_of_samples()).astype(np.float32) / max_val
+         if audio_segment.channels == 2:
+             samples = samples.reshape((-1, 2))
+         sample_rate = audio_segment.frame_rate
+         return (sample_rate, samples)
+     except FileNotFoundError:
+         raise
+     except Exception as e:
+         raise RuntimeError(str(e))
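To make the normalization concrete: for 16-bit audio, sample_width is 2, so max_val = 2^(8*2-1) = 32768, and a raw sample of -32768 maps exactly to -1.0:

    # Worked example of the scaling above (16-bit audio).
    sample_width = 2
    max_val = float(2 ** (8 * sample_width - 1))  # 32768.0
    print(-32768 / max_val, 32767 / max_val)      # -1.0 0.999969...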
auto_foley/vision_lm_chatgpt.py ADDED
@@ -0,0 +1,197 @@
+ import base64
+ import io
+ import json
+ import math
+ import openai
+ from openai import OpenAI
+
+ class ChatGPTVisionLM:
+     def calculate_video_input_cost(self, width: int, height: int, sample_count: int) -> str:
+         """
+         Calculates the estimated cost to process a video based on dimensions and number of samples.
+
+         Args:
+             width: Width of the video in pixels
+             height: Height of the video in pixels
+             sample_count: Number of frames to be processed
+
+         Returns:
+             str: Formatted string with the calculated cost in USD with 6 decimal places
+
+         Raises:
+             ValueError: If width, height or sample_count are negative or zero
+         """
+         if width <= 0 or height <= 0 or sample_count <= 0:
+             raise ValueError("Width, height and sample_count must be positive values")
+
+         # Constants for gpt-4o-mini
+         price_per_million_tokens = 0.15
+         base_tokens = 2833
+         tile_tokens = 5667
+         tile_size = 512
+
+         tiles_x = math.ceil(width / tile_size)
+         tiles_y = math.ceil(height / tile_size)
+         total_tiles = tiles_x * tiles_y
+         total_tokens = base_tokens + (tile_tokens * total_tiles)
+         cost_for_one_image = (total_tokens / 1000000) * price_per_million_tokens
+         total_cost = cost_for_one_image * sample_count
+         return f"${total_cost:.6f}"
+
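Plugging numbers into the formula above: a 512x512 frame is a single tile, so total_tokens = 2833 + 5667 = 8500, and at $0.15 per million tokens one frame costs $0.001275; ten frames cost $0.012750.

    # Worked example of the cost formula above.
    lm = ChatGPTVisionLM()
    print(lm.calculate_video_input_cost(512, 512, 10))  # $0.012750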
+     def generate_audio_sources(self, prompt: str, images: list, api_key: str | None = None) -> dict:
+         """
+         Generates a description of the video and a list of audio sources for the provided images using the OpenAI API.
+
+         Args:
+             prompt: The prompt to send to the model
+             images: List of PIL.Image.Image objects to process
+             api_key: Optional OpenAI API key. If None, will use the environment variable
+
+         Returns:
+             dict: The parsed response containing:
+                 - VideoProperties (dict): Contains the video description
+                 - AudioSources (list): List of audio source objects
+                 - AmbientAudioSources (list): List of ambient audio source objects
+
+         Raises:
+             ValueError: If images input is invalid
+             RuntimeError: If there's an error calling the OpenAI API
+             JSONDecodeError: If the API response cannot be parsed
+         """
+         gpt_model = "gpt-4o-mini"
+
+         if not isinstance(images, list):
+             images = [images]
+
+         # Build the content list
+         content_list = [{"type": "text", "text": prompt}]
+
+         for image in images:
+             # Encode the image to base64
+             buffered = io.BytesIO()
+             image.save(buffered, format="PNG")
+             base64_image = base64.b64encode(buffered.getvalue()).decode('utf-8')
+
+             content_list.append({
+                 "type": "image_url",
+                 "image_url": {
+                     "url": f"data:image/png;base64,{base64_image}",
+                     "detail": "auto"
+                 },
+             })
+
+         # Construct messages
+         messages = [
+             {
+                 "role": "user",
+                 "content": content_list
+             }
+         ]
+
+         # Define the tool (function) schema
+         tool = {
+             "type": "function",
+             "function": {
+                 "name": "analyze_video",
+                 "description": "Provide a description of the video and extract audio sources from video frames.",
+                 "strict": True,
+                 "parameters": {
+                     "type": "object",
+                     "properties": {
+                         "VideoProperties": {
+                             "type": "object",
+                             "properties": {
+                                 "Description": {
+                                     "type": "string",
+                                     "description": "A general description of the entire video."
+                                 }
+                             },
+                             "required": ["Description"],
+                             "additionalProperties": False
+                         },
+                         "AudioSources": {
+                             "type": "array",
+                             "items": {
+                                 "type": "object",
+                                 "properties": {
+                                     "SourceSlugID": {
+                                         "type": "string",
+                                         "description": "A unique ID in the format {Subject}{Number}{Activity}, e.g., \"Bear1Roaring\"."
+                                     },
+                                     "SoundDescription": {
+                                         "type": "string",
+                                         "description": "Description of the sound, e.g., \"A person using a chainsaw against a tree.\""
+                                     },
+                                     "StartFrameIndex": {
+                                         "type": "integer",
+                                         "description": "The frame index where the sound starts."
+                                     },
+                                     "EndFrameIndex": {
+                                         "type": "integer",
+                                         "description": "The frame index where the sound ends."
+                                     }
+                                 },
+                                 "required": ["SourceSlugID", "SoundDescription", "StartFrameIndex", "EndFrameIndex"],
+                                 "additionalProperties": False
+                             }
+                         },
+                         "AmbientAudioSources": {
+                             "type": "array",
+                             "items": {
+                                 "type": "object",
+                                 "properties": {
+                                     "SourceSlugID": {
+                                         "type": "string",
+                                         "description": "A unique ID in the format {Subject}{Number}{Activity}, e.g., \"Wind1Howling\"."
+                                     },
+                                     "SoundDescription": {
+                                         "type": "string",
+                                         "description": "Description of the ambient sound, e.g., \"An eerie wind howling through a ravine.\""
+                                     },
+                                     "StartFrameIndex": {
+                                         "type": "integer",
+                                         "description": "The frame index where the sound starts."
+                                     },
+                                     "EndFrameIndex": {
+                                         "type": "integer",
+                                         "description": "The frame index where the sound ends."
+                                     }
+                                 },
+                                 "required": ["SourceSlugID", "SoundDescription", "StartFrameIndex", "EndFrameIndex"],
+                                 "additionalProperties": False
+                             }
+                         }
+                     },
+                     "required": ["VideoProperties", "AudioSources", "AmbientAudioSources"],
+                     "additionalProperties": False
+                 }
+             }
+         }
+
+         try:
+             # Call the OpenAI API with the tool
+             openai_client = OpenAI(api_key=api_key)
+             response = openai_client.chat.completions.create(
+                 model=gpt_model,
+                 messages=messages,
+                 tools=[tool],
+                 tool_choice={"type": "function", "function": {"name": "analyze_video"}},
+                 max_tokens=12288
+             )
+
+             # Access the choices and messages properly
+             choice = response.choices[0]
+             message = choice.message
+
+             if hasattr(message, 'tool_calls') and message.tool_calls:
+                 tool_call = message.tool_calls[0]
+                 arguments = tool_call.function.arguments
+                 result_data = json.loads(arguments)
+                 return result_data
+             else:
+                 raise RuntimeError("LLM did not provide the expected structured output.")
+
+         except openai.OpenAIError as e:
+             raise RuntimeError(f"OpenAI API error: {str(e)}")
+         except json.JSONDecodeError as e:
+             raise json.JSONDecodeError(f"Error parsing API response: {str(e)}", e.doc, e.pos)
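With strict mode and a forced tool choice, the parsed arguments should always have this shape (a sketch built from the schema above; all values are illustrative):

    # Illustrative result_data, matching the analyze_video schema.
    result_data = {
        "VideoProperties": {"Description": "A squirrel playing in a sunny forest clearing."},
        "AudioSources": [
            {"SourceSlugID": "Squirrel1Rustling", "SoundDescription": "Small animal rustling through dry leaves.",
             "StartFrameIndex": 0, "EndFrameIndex": 120},
        ],
        "AmbientAudioSources": [
            {"SourceSlugID": "Birds1Chirping", "SoundDescription": "Songbirds chirping in a forest.",
             "StartFrameIndex": 0, "EndFrameIndex": 120},
        ],
    }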
custom_script.js ADDED
@@ -0,0 +1,165 @@
+ function manageTimeBar(elemId, time) {
+     if (!window.visTimelineInstances) {
+         console.error(`Timeline instances collection not found`);
+         return;
+     }
+
+     const timeline = window.visTimelineInstances[elemId];
+     if (!timeline) {
+         console.error(`Timeline instance ${elemId} not found`);
+         return;
+     }
+
+     if (!window.customTimeBarIds) {
+         window.customTimeBarIds = {};
+     }
+
+     try {
+         timeline.setCustomTime(time, elemId);
+     } catch (e) {
+         timeline.addCustomTime(time, elemId);
+     }
+ }
+
+ function setTimeBarDirect(elemId, time) {
+     manageTimeBar(elemId, time);
+ }
+
+ function setTimeBarNormalized(elemId, start, end, normalizedPos) {
+     const time = start + (end - start) * normalizedPos;
+     manageTimeBar(elemId, time);
+ }
+
+ class VideoTimelineSync {
+     constructor(videoId, timelineId, trackLengthItemId) {
+         this.timelineId = timelineId;
+
+         try {
+             const trackLengthItemData = getTimelineItemData(timelineId, trackLengthItemId);
+             if (trackLengthItemData != null) {
+                 const trackLengthStart = trackLengthItemData.start;
+                 const trackLengthEnd = trackLengthItemData.end;
+                 this.trackLength = trackLengthEnd - trackLengthStart;
+             }
+         } catch (error) {
+             console.error('Error setting up timeline video sync:', error);
+             return;
+         }
+
+         const container = document.getElementById(videoId);
+         if (!container) {
+             console.error('Video container not found');
+             return;
+         }
+
+         this.progressElement = container.querySelector('progress');
+         if (!this.progressElement) {
+             console.error('Progress element not found');
+             return;
+         }
+
+         this.setupProgressObserver();
+     }
+
+     setupProgressObserver() {
+         // Create a mutation observer to watch for value changes of the progress element
+         this.observer = new MutationObserver((mutations) => {
+             mutations.forEach((mutation) => {
+                 if (mutation.type === 'attributes' && mutation.attributeName === 'value') {
+                     this.onProgressUpdate();
+                 }
+             });
+         });
+
+         // Observe the progress element for value changes
+         this.observer.observe(this.progressElement, {
+             attributes: true,
+             attributeFilter: ['value']
+         });
+     }
+
+     onProgressUpdate() {
+         const value = this.progressElement.value;
+         if (value === undefined || value === null) return;
+
+         // Value is already normalized (between 0 and 1)
+         this.syncTimeBarToPlayback(value);
+     }
+
+     syncTimeBarToPlayback(normalizedPosition) {
+         const timeline = window.visTimelineInstances[this.timelineId];
+         if (timeline) {
+             setTimeBarNormalized(this.timelineId, 0, this.trackLength, normalizedPosition);
+         }
+     }
+
+     cleanup() {
+         // Disconnect the observer
+         if (this.observer) {
+             this.observer.disconnect();
+             this.observer = null;
+         }
+     }
+ }
+
+ function initVideoSync(videoId, timelineId, trackLengthItemId) {
+     try {
+         // Initialize the syncs container if it doesn't exist
+         if (!window.timelineSyncs) {
+             window.timelineSyncs = {};
+         }
+
+         // Clean up the existing sync, if any
+         if (window.timelineSyncs[timelineId]) {
+             window.timelineSyncs[timelineId].cleanup();
+         }
+
+         // Create a new sync instance
+         window.timelineSyncs[timelineId] = new VideoTimelineSync(videoId, timelineId, trackLengthItemId);
+     } catch (error) {
+         console.error('Error initializing video sync:', error);
+     }
+
+     return null;
+ }
+
+ function getTimelineItemData(timelineId, itemId) {
+     const timeline = window.visTimelineInstances[timelineId];
+     if (!timeline) {
+         console.error(`Timeline instance ${timelineId} not found`);
+         return null;
+     }
+
+     const items = timeline.itemSet?.items;
+     if (!items) {
+         console.error('Timeline items not found');
+         return null;
+     }
+
+     const item = items[itemId];
+     if (!item) {
+         return null;
+     }
+
+     const itemData = item.data;
+     if (!itemData) {
+         console.error('Track length item data not found');
+         return null;
+     }
+
+     return itemData;
+ }
+
+ function setTimelineWindowToItemLength(timelineId, itemId) {
+     const itemData = getTimelineItemData(timelineId, itemId);
+     if (!itemData) {
+         return;
+     }
+
+     try {
+         const timeline = window.visTimelineInstances[timelineId];
+         timeline.setWindow(itemData.start, new Date(itemData.end.getTime() + 20), {animation: false});
+     } catch (error) {
+         console.error('Error setting timeline window:', error);
+     }
+ }
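On the Python side, these helpers are presumably invoked through Gradio's js hooks; a hedged sketch of what that wiring could look like (the element IDs and event choice here are placeholders, not the app's actual ones):

    # Hypothetical wiring sketch; IDs and event are assumptions.
    import gradio as gr

    video = gr.Video(elem_id="my-video")
    video.change(
        None, [], [],
        js="() => initVideoSync('my-video', 'my-timeline', 'my-track-length-item')",
    )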
custom_style.css ADDED
@@ -0,0 +1,15 @@
+ #frame_interval_slider input[type="range"] {
+     --range_progress_inverted: calc(100% - var(--range_progress));
+ }
+
+ #frame_interval_slider input[type="range"]::-webkit-slider-runnable-track {
+     background: linear-gradient(to right, var(--color-accent) var(--range_progress_inverted), var(--neutral-200) var(--range_progress_inverted));
+ }
+
+ #frame_interval_slider input[type="range"]::-moz-range-track {
+     background: linear-gradient(to right, var(--color-accent) var(--range_progress_inverted), var(--neutral-200) var(--range_progress_inverted));
+ }
+
+ #frame_interval_slider .slider_input_container {
+     direction: rtl;
+ }
requirements.txt ADDED
@@ -0,0 +1,9 @@
+ gradio>=5.12.0
+ openai>=1.59.7
+ Pillow>=10.4.0
+ opencv-python-headless>=4.10.0.84
+ elevenlabs>=1.50.3
+ pydub>=0.25.1
+ numpy>=2.2.1
+ moviepy==1.0.3
+ gradio_vistimeline>=1.0.1