Shreevathsam committed on
Commit
f94f7b0
·
verified ·
1 Parent(s): d9704d6

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +697 -697
app.py CHANGED
@@ -1,698 +1,698 @@
1
- import os
2
- from datetime import datetime
3
- import random
4
- import whisper
5
- import shutil
6
- import wave
7
- import base64
8
- from moviepy.editor import (VideoFileClip, AudioFileClip, TextClip,
9
- concatenate_videoclips, CompositeVideoClip, CompositeAudioClip, ImageClip)
10
- import moviepy.audio.fx.all as afx
11
- import moviepy.video.fx.all as vfx
12
- import gradio as gr
13
- from PIL import Image, ImageDraw, ImageFilter, ImageFont
14
- import numpy as np
15
- from functools import lru_cache
16
- import urllib.request
17
- from google import genai
18
- from google.genai import types
19
-
20
# Working directories for input assets and rendered output.
for _workdir in ('video_clips', 'background_music', 'voice_over', 'exports'):
    os.makedirs(_workdir, exist_ok=True)

# Propagate the API key configured in the Space settings so the genai
# client can read it from the environment at request time.
GOOGLE_API_KEY = os.getenv('GOOGLE_API_KEY', '')
if GOOGLE_API_KEY:
    os.environ['GOOGLE_API_KEY'] = GOOGLE_API_KEY

# Cross-request state: cooperative-cancellation flag polled by the worker,
# and a handle to the clip currently being built (for cleanup on cancel).
generation_cancelled = False
current_video_clip = None

# Prebuilt Gemini TTS voices exposed in the UI dropdown.
AVAILABLE_VOICES = {
    voice: {"name": voice, "description": blurb}
    for voice, blurb in (
        ("Puck", "Young adult female (US)"),
        ("Charon", "Young adult male (US)"),
        ("Kore", "Young adult female (US)"),
        ("Fenrir", "Young adult male (US)"),
        ("Aoede", "Young adult female (US)"),
    )
}
41
-
42
def wave_file(filename, pcm_data, channels=1, rate=24000, sample_width=2):
    """Write raw PCM bytes to *filename* as a WAV container.

    The defaults (mono, 24 kHz, 16-bit) match the PCM payload returned by
    the Gemini TTS endpoint.
    """
    wf = wave.open(filename, "wb")
    try:
        wf.setnchannels(channels)
        wf.setsampwidth(sample_width)
        wf.setframerate(rate)
        wf.writeframes(pcm_data)
    finally:
        wf.close()
48
-
49
def generate_tts_audio(text_input, voice_name="Puck"):
    """Synthesize speech for *text_input* with the Gemini TTS model.

    Args:
        text_input: Text to speak.
        voice_name: A prebuilt Gemini voice name (see AVAILABLE_VOICES).

    Returns:
        (path, message): path to a temporary WAV file and a status string
        on success, or (None, error/cancellation message) on failure.
    """
    global generation_cancelled
    try:
        # Bail out early if the user pressed Stop before we started.
        if generation_cancelled:
            return None, "Generation cancelled"
        # Client reads GOOGLE_API_KEY from the environment (set at module load).
        client = genai.Client()
        response = client.models.generate_content(
            model="gemini-2.5-flash-preview-tts",
            contents=text_input,
            config=types.GenerateContentConfig(
                response_modalities=["AUDIO"],
                speech_config=types.SpeechConfig(
                    voice_config=types.VoiceConfig(
                        prebuilt_voice_config=types.PrebuiltVoiceConfig(voice_name=voice_name)
                    )
                ),
            )
        )
        # Re-check after the (slow) network round-trip.
        if generation_cancelled:
            return None, "Generation cancelled"
        # First candidate / first part carries the audio payload.
        audio_data = response.candidates[0].content.parts[0].inline_data.data
        # inline_data.data may arrive base64-encoded as str; decode to bytes.
        if isinstance(audio_data, str):
            audio_data = base64.b64decode(audio_data)
        timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
        temp_audio_path = f'/tmp/tts_audio_{timestamp}.wav'
        # Wrap raw PCM in a WAV header (mono/24kHz/16-bit defaults).
        wave_file(temp_audio_path, audio_data)
        return temp_audio_path, "TTS generated"
    except Exception as e:
        # API/IO failures are reported to the UI as a string, never raised.
        return None, f"Error: {str(e)}"
78
-
79
def split_text_into_lines(data):
    """Group whisper word timings into subtitle lines.

    A line is flushed when it grows past 60 characters, spans more than
    2.5 s of speech, ends a sentence, or a silence gap over 1.5 s precedes
    the word just appended.

    Args:
        data: list of {'word', 'start', 'end'} dicts in time order.

    Returns:
        list of {'word', 'start', 'end', 'textcontents'} line dicts.
    """
    MAX_CHARS, MAX_DURATION, MAX_GAP = 60, 2.5, 1.5
    subtitles = []
    line = []
    line_duration = 0

    def flush():
        # Emit the accumulated words as one subtitle line and reset.
        nonlocal line, line_duration
        if line:
            subtitles.append({
                "word": " ".join(item["word"] for item in line),
                "start": line[0]["start"],
                "end": line[-1]["end"],
                "textcontents": line,
            })
        line = []
        line_duration = 0

    for idx, word_info in enumerate(data):
        line.append(word_info)
        line_duration += word_info["end"] - word_info["start"]
        too_long = len(" ".join(w["word"] for w in line)) > MAX_CHARS
        too_slow = line_duration > MAX_DURATION
        ends_sentence = word_info["word"].rstrip().endswith(('.', '!', '?'))
        big_gap = idx > 0 and word_info['start'] - data[idx - 1]['end'] > MAX_GAP
        if too_long or too_slow or ends_sentence or big_gap:
            flush()

    # Trailing words that never hit a flush condition.
    flush()
    return subtitles
106
-
107
# Memoize TextClip construction: identical (text, font, size, color) tuples
# recur for every word of every subtitle line, and TextClip rendering is slow.
# NOTE(review): the cached clip object is shared across call sites; callers
# use set_position/set_start, which return copies — verify no caller mutates
# the cached instance in place.
@lru_cache(maxsize=1000)
def get_cached_text_clip(text, font, fontsize, color):
    """Return a (cached) moviepy TextClip for the given text and style."""
    return TextClip(text, font=font, fontsize=fontsize, color=color)
110
-
111
def create_title_overlay(title_text, framesize, duration=4):
    """Render *title_text* as an uppercase, word-wrapped title near the top
    of the frame and return it as a one-element list of ImageClips.

    Args:
        title_text: Title string; blank/whitespace yields no overlay.
        framesize: (width, height) of the target video frame in pixels.
        duration: Seconds the title stays on screen.

    Returns:
        [] when there is no title, else [ImageClip] sized to the frame.
    """
    if not title_text or not title_text.strip():
        return []
    frame_width, frame_height = framesize
    # Download Poppins-Bold once into /tmp; on failure fall back below.
    FONT_URL = "https://github.com/google/fonts/raw/main/ofl/poppins/Poppins-Bold.ttf"
    FONT_PATH = "/tmp/Poppins-Bold.ttf"
    if not os.path.exists(FONT_PATH):
        try:
            urllib.request.urlretrieve(FONT_URL, FONT_PATH)
        except:
            FONT_PATH = None
    # All metrics scale with frame height so the title looks the same at
    # every output resolution.
    TOP_MARGIN = int(frame_height * 0.115)
    FONT_SIZE = int(frame_height * 0.042)
    STROKE_WIDTH = max(1, int(frame_height * 0.003))
    LINE_SPACING = max(4, int(frame_height * 0.008))
    def load_font(size):
        # Preference order: downloaded Poppins, system Liberation Sans,
        # finally PIL's built-in bitmap font.
        try:
            if FONT_PATH and os.path.exists(FONT_PATH):
                return ImageFont.truetype(FONT_PATH, size)
            return ImageFont.truetype("/usr/share/fonts/truetype/liberation/LiberationSans-Bold.ttf", size)
        except:
            return ImageFont.load_default()
    font_obj = load_font(FONT_SIZE)
    # `base` is the transparent canvas we draw on; `temp_img` exists only to
    # measure text extents before layout.
    base = Image.new("RGBA", (frame_width, frame_height), (0, 0, 0, 0))
    temp_img = Image.new("RGBA", (frame_width, frame_height), (0,0,0,0))
    temp_draw = ImageDraw.Draw(temp_img)
    def measure_text(text, font):
        # textbbox needs a TrueType font; the bitmap fallback may raise,
        # in which case we return a rough fixed estimate.
        try:
            bbox = temp_draw.textbbox((0,0), text, font=font, stroke_width=STROKE_WIDTH)
            return bbox[2]-bbox[0], bbox[3]-bbox[1]
        except:
            return 100, 50
    def wrap_text(text, font, max_width):
        # Greedy word wrap on the uppercased title, capped at 4 lines.
        words = text.upper().split()
        lines, current = [], []
        for word in words:
            test_line = " ".join(current + [word])
            w, _ = measure_text(test_line, font)
            if w <= max_width:
                current.append(word)
            else:
                if current:
                    lines.append(" ".join(current))
                    current = [word]
                else:
                    # Single word wider than the line: put it on its own line.
                    lines.append(word)
                    current = []
        if current:
            lines.append(" ".join(current))
        return lines[:4]
    lines = wrap_text(title_text, font_obj, frame_width * 0.90)
    line_heights = [measure_text(line, font_obj)[1] for line in lines]
    y_start = TOP_MARGIN
    x_center = frame_width // 2
    draw = ImageDraw.Draw(base)
    y = y_start
    for i, line in enumerate(lines):
        # NOTE(review): h is unused; vertical advance uses line_heights[i].
        w, h = measure_text(line, font_obj)
        x = x_center - w // 2
        # Soft drop shadow offset by 2px, then the stroked white text.
        draw.text((x+2, y+2), line, font=font_obj, fill=(0,0,0,180))
        draw.text((x, y), line, font=font_obj, fill=(255,255,255,255), stroke_width=STROKE_WIDTH, stroke_fill=(0,0,0,255))
        y += line_heights[i] + LINE_SPACING
    # Full-frame RGBA clip so it can be composited directly over the video.
    return [ImageClip(np.array(base), duration=duration)]
174
-
175
def create_caption(textJSON, framesize, font="Helvetica-Bold", fontsize=14, color='white'):
    """Build karaoke-style subtitle clips for one subtitle line.

    Produces, in z-order: a rounded translucent background panel, per-word
    shadow + text clips shown for the whole line's duration, and a purple
    highlight (background + white text) shown only during each word's own
    [start, end] window.

    Args:
        textJSON: line dict from split_text_into_lines ('start', 'end',
            'textcontents' with per-word timings).
        framesize: (width, height) of the video frame.
        font, fontsize, color: base text style.

    Returns:
        List of moviepy clips, each already positioned and timed.
    """
    full_duration = textJSON['end'] - textJSON['start']
    word_clips = []
    xy_textclips_positions = []
    frame_width, frame_height = framesize
    max_line_width = frame_width * 0.8
    # --- Pass 1: greedy-wrap the words into lines no wider than 80% frame.
    lines, current_line, current_line_width = [], [], 0
    for wordJSON in textJSON['textcontents']:
        word_upper = wordJSON['word'].upper()
        temp_word = get_cached_text_clip(word_upper, font, fontsize, color)
        temp_space = get_cached_text_clip(" ", font, fontsize, color)
        word_width, word_height = temp_word.size
        space_width, _ = temp_space.size
        if current_line_width + word_width + space_width > max_line_width and current_line:
            lines.append({'words': current_line.copy(), 'width': current_line_width, 'height': word_height})
            current_line = [wordJSON]
            current_line_width = word_width + space_width
        else:
            current_line.append(wordJSON)
            current_line_width += word_width + space_width
    if current_line:
        # Flush the last partial line; its height comes from its first word.
        word_upper = current_line[0]['word'].upper()
        temp_word = get_cached_text_clip(word_upper, font, fontsize, color)
        _, word_height = temp_word.size
        lines.append({'words': current_line, 'width': current_line_width, 'height': word_height})
    # 3px inter-line gap, block anchored at 65% of the frame height.
    total_text_height = sum(line['height'] for line in lines) + (len(lines) - 1) * 3
    subtitle_y_position = int(frame_height * 0.65)
    current_y = subtitle_y_position
    # --- Background panel: rounded, half-transparent black, padded around
    # the widest line, visible for the whole subtitle duration.
    if lines:
        shadow_padding = 25
        shadow_height_extra = 15
        total_subtitle_width = max(line['width'] for line in lines)
        bg_width = int(total_subtitle_width + shadow_padding * 2)
        bg_height = int(total_text_height + shadow_height_extra * 2)
        img = Image.new('RGBA', (bg_width, bg_height), (0, 0, 0, 0))
        draw = ImageDraw.Draw(img)
        draw.rounded_rectangle([(0, 0), (bg_width-1, bg_height-1)], radius=15, fill=(0, 0, 0, 128))
        img_array = np.array(img)
        shadow_bg = ImageClip(img_array, duration=full_duration).set_start(textJSON['start'])
        shadow_x = (frame_width - total_subtitle_width) / 2 - shadow_padding
        shadow_y = subtitle_y_position - shadow_height_extra
        shadow_bg = shadow_bg.set_position((shadow_x, shadow_y))
        word_clips.append(shadow_bg)
    # --- Pass 2: lay out each line centered, emitting shadow + text + space
    # clips per word and recording positions for the highlight pass.
    for line in lines:
        line_words = line['words']
        word_dimensions = []
        for wordJSON in line_words:
            word_upper = wordJSON['word'].upper()
            temp_word = get_cached_text_clip(word_upper, font, fontsize, color)
            temp_space = get_cached_text_clip(" ", font, fontsize, color)
            word_width, word_height = temp_word.size
            space_width, _ = temp_space.size
            word_dimensions.append({
                'word_data': wordJSON,
                'word_width': word_width,
                'word_height': word_height,
                'space_width': space_width,
                'word_upper': word_upper
            })
        line_start_x = (frame_width - line['width']) / 2
        current_x = line_start_x
        for word_dim in word_dimensions:
            wordJSON = word_dim['word_data']
            word_width = word_dim['word_width']
            word_height = word_dim['word_height']
            space_width = word_dim['space_width']
            word_upper = word_dim['word_upper']
            # 1px-offset black shadow at 30% opacity behind the word.
            shadow_text = get_cached_text_clip(word_upper, font, fontsize, 'black')
            shadow_text = shadow_text.set_start(textJSON['start']).set_duration(full_duration)
            shadow_text = shadow_text.set_position((current_x + 1, current_y + 1)).set_opacity(0.3)
            word_clips.append(shadow_text)
            word_clip = get_cached_text_clip(word_upper, font, fontsize, color)
            word_clip = word_clip.set_start(textJSON['start']).set_duration(full_duration)
            word_clip = word_clip.set_position((current_x, current_y))
            space_clip = get_cached_text_clip(" ", font, fontsize, color)
            space_clip = space_clip.set_start(textJSON['start']).set_duration(full_duration)
            space_clip = space_clip.set_position((current_x + word_width, current_y))
            # Remember geometry + timing so the highlight pass can overlay
            # this exact word during its spoken interval.
            xy_textclips_positions.append({
                "x_pos": current_x,
                "y_pos": current_y,
                "width": word_width,
                "height": word_height,
                "word": word_upper,
                "start": wordJSON['start'],
                "end": wordJSON['end'],
                "duration": wordJSON['end'] - wordJSON['start']
            })
            word_clips.append(word_clip)
            word_clips.append(space_clip)
            current_x += word_width + space_width
        current_y += line['height'] + 3
    # --- Pass 3: per-word karaoke highlight — purple rounded box, shadow,
    # and white text, timed to the word's own start/duration.
    for highlight_word in xy_textclips_positions:
        bg_width = int(highlight_word['width'] + 16)
        bg_height = int(highlight_word['height'] + 8)
        img = Image.new('RGBA', (bg_width, bg_height), (0, 0, 0, 0))
        draw = ImageDraw.Draw(img)
        draw.rounded_rectangle([(0, 0), (bg_width-1, bg_height-1)], radius=8, fill=(147, 0, 211, 180))
        img_array = np.array(img)
        bg_clip = ImageClip(img_array, duration=highlight_word['duration'])
        bg_clip = bg_clip.set_start(highlight_word['start'])
        bg_x = highlight_word['x_pos'] - 8
        bg_y = highlight_word['y_pos'] - 4
        bg_clip = bg_clip.set_position((bg_x, bg_y))
        shadow_highlight = get_cached_text_clip(highlight_word['word'], font, fontsize, 'black')
        shadow_highlight = shadow_highlight.set_start(highlight_word['start']).set_duration(highlight_word['duration'])
        shadow_highlight = shadow_highlight.set_position((highlight_word['x_pos'] + 1, highlight_word['y_pos'] + 1)).set_opacity(0.4)
        word_clip_highlight = get_cached_text_clip(highlight_word['word'], font, fontsize, 'white')
        word_clip_highlight = word_clip_highlight.set_start(highlight_word['start']).set_duration(highlight_word['duration'])
        word_clip_highlight = word_clip_highlight.set_position((highlight_word['x_pos'], highlight_word['y_pos']))
        word_clips.append(bg_clip)
        word_clips.append(shadow_highlight)
        word_clips.append(word_clip_highlight)
    return word_clips
288
-
289
def get_random_subclip_and_slow(clip):
    """Pick a random 2–4 s window of *clip* and play it at half speed.

    Clips shorter than the chosen window are slowed down whole instead of
    being sliced.
    """
    # Candidate window lengths in seconds (before the 0.5x slowdown).
    window = random.choice([2, 3, 4])
    if clip.duration < window:
        return clip.speedx(0.5)
    start = random.uniform(0, clip.duration - window)
    return clip.subclip(start, start + window).speedx(0.5)
297
-
298
def ensure_even_dimensions(clip):
    """Return *clip*, resized if needed so width and height are both even.

    libx264 with yuv420p requires even frame dimensions; odd axes are
    shrunk by one pixel.
    """
    even_w = clip.size[0] - clip.size[0] % 2
    even_h = clip.size[1] - clip.size[1] % 2
    if (even_w, even_h) != clip.size:
        return clip.resize((even_w, even_h))
    return clip
307
-
308
def apply_transition_effect(clip1, clip2, transition_type, duration=0.5):
    """Dress a pair of adjacent clips for the requested transition.

    Returns:
        (outgoing, incoming): clip1 with its exit effect applied and clip2
        with its entrance effect applied. Unrecognized types fall back to a
        plain crossfade.
    """
    if transition_type == "Snap Cut":
        # Hard cut: no fades at all.
        return clip1, clip2
    if transition_type == "Whip Pan":
        # Quick half-duration fade to black and back.
        return clip1.fadeout(duration * 0.5), clip2.fadein(duration * 0.5)
    if transition_type == "Dreamy Fade":
        # Extended (1.2x) crossfade for a softer blend.
        return clip1.crossfadeout(duration * 1.2), clip2.crossfadein(duration * 1.2)
    if transition_type == "Ken Burns Zoom":
        # Outgoing clip slowly zooms in up to 1.15x over its whole length…
        def zoom_in(t):
            return 1 + (0.15 * min(t / clip1.duration, 1))
        outgoing = clip1.resize(zoom_in).crossfadeout(duration)
        # …incoming clip starts at 1.15x and settles to 1x over the
        # transition window (skipped for clips shorter than the window).
        def zoom_out(t):
            return 1.15 - (0.15 * min(t / duration, 1))
        incoming_base = clip2.resize(zoom_out) if clip2.duration >= duration else clip2
        return outgoing, incoming_base.crossfadein(duration)
    # "Smooth Blend" and any unknown type: symmetric crossfade.
    return clip1.crossfadeout(duration), clip2.crossfadein(duration)
329
-
330
def process_voiceover_to_subtitles(voice_over_path):
    """Transcribe a voice-over file and group it into subtitle lines.

    Uses the whisper "tiny" model with word-level timestamps (fp16 off for
    CPU inference), then feeds the flat word list to split_text_into_lines.

    Args:
        voice_over_path: Path to the audio file to transcribe.

    Returns:
        (subtitle_lines, full_text); ([], "") if cancelled mid-way.

    Raises:
        Re-raises any transcription error unless cancellation was requested.
    """
    global generation_cancelled
    try:
        if generation_cancelled:
            return [], ""
        model = whisper.load_model("tiny")
        result = model.transcribe(voice_over_path, word_timestamps=True, fp16=False)
        # Cancellation is polled again after the long transcription call
        # and between segments so Stop stays responsive.
        if generation_cancelled:
            return [], ""
        wordlevel_info = []
        for segment in result['segments']:
            if generation_cancelled:
                return [], ""
            # Segments without word timestamps are skipped entirely.
            if 'words' in segment:
                for word in segment['words']:
                    wordlevel_info.append({'word': word['word'].strip(), 'start': word['start'], 'end': word['end']})
        return split_text_into_lines(wordlevel_info), result['text']
    except Exception as e:
        # A cancel may surface as an exception from whisper; treat it as a
        # clean cancellation rather than an error.
        if generation_cancelled:
            return [], ""
        raise e
351
-
352
def cleanup_resources():
    """Close and drop the in-progress video clip, if any.

    Best-effort: closing a partially-built moviepy clip can raise, and
    cancellation must never crash, so close errors are swallowed.

    Fixes vs. original: the bare ``except:`` (which also swallowed
    KeyboardInterrupt/SystemExit) is narrowed to ``except Exception``, and
    the global is cleared in ``finally`` so a failing ``close()`` no longer
    leaves a dangling reference to the dead clip.
    """
    global current_video_clip
    try:
        if current_video_clip:
            current_video_clip.close()
    except Exception:
        # close() on a half-constructed clip may fail; ignore it.
        pass
    finally:
        # Always release the reference, even when close() raised.
        current_video_clip = None
360
-
361
def cancel_generation():
    """Request cooperative cancellation of the running generation.

    merge_videos_with_subtitles polls `generation_cancelled` between stages,
    so setting the flag here makes the worker bail out at its next check;
    cleanup_resources() closes whatever clip it was building.

    Returns:
        (status message, None) — wired to the status textbox and the video
        widget (cleared) in the Gradio UI.
    """
    global generation_cancelled
    generation_cancelled = True
    cleanup_resources()
    return "Generation cancelled", None
366
-
367
def merge_videos_with_subtitles(text_input, voice_selection, audio_input, title_text, duration_minutes, video_quality, transition_type, progress=gr.Progress(track_tqdm=True)):
    """End-to-end pipeline: assemble random slowed clips, add voice-over or
    background music, word-timed subtitles and a title, and export an MP4.

    Args:
        text_input: Optional text to synthesize as the voice-over (TTS).
        voice_selection: Key into AVAILABLE_VOICES for the TTS voice.
        audio_input: Optional path to an uploaded voice-over (used only
            when text_input is empty).
        title_text: Optional title overlaid for the first 4 seconds.
        duration_minutes: Target length when there is no voice-over.
        video_quality: "High" / "Standard" / anything-else ("Preview").
        transition_type: One of the UI transition names.
        progress: Gradio progress reporter.

    Returns:
        (output_path, summary) on success, or (None, error message).
        `generation_cancelled` is polled between every stage so the Stop
        button can abort with (None, "Generation cancelled").
    """
    global generation_cancelled, current_video_clip
    generation_cancelled = False
    current_video_clip = None
    progress(0, desc="Starting...")
    if generation_cancelled:
        return None, "Generation cancelled"
    timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")

    # Updated paths for Hugging Face
    source_path = 'video_clips'
    if not os.path.isdir(source_path):
        return None, "Video clips folder not found. Please upload video clips to the 'video_clips' folder."
    output_path = 'exports'
    os.makedirs(output_path, exist_ok=True)

    # --- Gather source clips in random order.
    video_extensions = ('.mp4', '.avi', '.mkv', '.mov')
    all_files = [f for f in os.listdir(source_path) if f.lower().endswith(video_extensions)]
    if not all_files:
        return None, "No video files found in 'video_clips' folder"
    random.shuffle(all_files)
    if generation_cancelled:
        return None, "Generation cancelled"
    # --- Optional background music: first matching file in the folder.
    bg_music_path = None
    bg_music_folder_path = 'background_music'
    if os.path.isdir(bg_music_folder_path):
        audio_extensions = ('.mp3', '.wav', '.m4a', '.aac')
        possible_files = [f for f in os.listdir(bg_music_folder_path) if f.lower().endswith(audio_extensions) and not f.startswith('voiceover_')]
        if len(possible_files) >= 1:
            bg_music_path = os.path.join(bg_music_folder_path, possible_files[0])
    # --- Voice-over source: TTS text takes precedence over uploaded audio.
    target_duration_seconds = 0
    voice_over_audio = None
    linelevel_subtitles = None
    voice_over_path = None
    if text_input and text_input.strip():
        progress(0.1, desc="Generating TTS...")
        voice_name = AVAILABLE_VOICES[voice_selection]["name"] if voice_selection in AVAILABLE_VOICES else "Puck"
        tts_path, tts_message = generate_tts_audio(text_input, voice_name)
        if generation_cancelled:
            return None, "Generation cancelled"
        if tts_path:
            # Keep a copy of the generated voice-over alongside other assets.
            voice_over_folder_path = 'voice_over'
            os.makedirs(voice_over_folder_path, exist_ok=True)
            voice_filename = f"tts_voiceover_{timestamp}.wav"
            saved_voice_path = os.path.join(voice_over_folder_path, voice_filename)
            shutil.copy2(tts_path, saved_voice_path)
            voice_over_path = saved_voice_path
        else:
            return None, f"TTS failed: {tts_message}"
    elif audio_input:
        if generation_cancelled:
            return None, "Generation cancelled"
        voice_over_folder_path = 'voice_over'
        os.makedirs(voice_over_folder_path, exist_ok=True)
        voice_filename = f"uploaded_voiceover_{timestamp}.mp3"
        saved_voice_path = os.path.join(voice_over_folder_path, voice_filename)
        shutil.copy2(audio_input, saved_voice_path)
        voice_over_path = saved_voice_path
    if voice_over_path:
        try:
            progress(0.2, desc="Processing audio...")
            if generation_cancelled:
                return None, "Generation cancelled"
            # The voice-over dictates the final video length.
            voice_over_audio = AudioFileClip(voice_over_path)
            target_duration_seconds = voice_over_audio.duration
            linelevel_subtitles, _ = process_voiceover_to_subtitles(voice_over_path)
            if generation_cancelled:
                voice_over_audio.close()
                return None, "Generation cancelled"
        except Exception as e:
            return None, f"Audio error: {str(e)}"
    else:
        # No voice-over: require background music and use the slider length.
        if not bg_music_path:
            return None, "Need text/audio or background music"
        target_duration_seconds = duration_minutes * 60
    progress(0.3, desc="Preparing audio...")
    if generation_cancelled:
        if voice_over_audio:
            voice_over_audio.close()
        return None, "Generation cancelled"
    # --- Mix voice-over with quiet (1.5% volume), looped background music.
    audio_tracks = []
    if voice_over_audio:
        audio_tracks.append(voice_over_audio)
    if bg_music_path:
        try:
            background_audio = AudioFileClip(bg_music_path)
            background_audio = background_audio.fx(afx.volumex, 0.015)
            background_audio = background_audio.fx(afx.audio_loop, duration=target_duration_seconds)
            audio_tracks.append(background_audio)
        except Exception as e:
            # Music is optional: log and continue without it.
            print(f"Background music error: {e}")
    final_audio = CompositeAudioClip(audio_tracks) if len(audio_tracks) > 1 else (audio_tracks[0] if audio_tracks else None)
    progress(0.4, desc="Setting up video...")
    if generation_cancelled:
        cleanup_resources()
        return None, "Generation cancelled"
    # --- Encoder settings per quality tier (height, bitrate, x264 preset, CRF).
    if video_quality == "High":
        target_height, bitrate, preset, crf = 1080, "8000k", "veryfast", "20"
    elif video_quality == "Standard":
        target_height, bitrate, preset, crf = 720, "4000k", "veryfast", "24"
    else:
        target_height, bitrate, preset, crf = 480, "1000k", "ultrafast", "28"
    progress(0.5, desc="Processing clips...")
    # --- Accumulate random slowed subclips until the target duration is
    # reached; safety_counter caps the loop at 3 passes over the files.
    video_clips = []
    current_duration = 0
    file_index = 0
    safety_counter = 0
    max_iterations = len(all_files) * 3
    while current_duration < target_duration_seconds and safety_counter < max_iterations:
        if generation_cancelled:
            for clip in video_clips:
                try:
                    clip.close()
                except:
                    pass
            cleanup_resources()
            return None, "Generation cancelled"
        # Exhausted the file list: reshuffle and start over.
        if file_index >= len(all_files):
            file_index = 0
            random.shuffle(all_files)
        video_file = all_files[file_index]
        file_index += 1
        safety_counter += 1
        try:
            full_clip = VideoFileClip(os.path.join(source_path, video_file))
            current_video_clip = full_clip
            if generation_cancelled:
                full_clip.close()
                cleanup_resources()
                return None, "Generation cancelled"
            # Normalize to the target height (preserving aspect ratio) and
            # force even dimensions for the yuv420p encoder.
            if full_clip.h != target_height:
                aspect_ratio = full_clip.w / full_clip.h
                new_width = int(target_height * aspect_ratio)
                if new_width % 2 != 0:
                    new_width -= 1
                adjusted_height = target_height if target_height % 2 == 0 else target_height - 1
                full_clip = full_clip.resize((new_width, adjusted_height))
            else:
                full_clip = ensure_even_dimensions(full_clip)
            subclip = get_random_subclip_and_slow(full_clip)
            # Never overshoot: trim the final subclip to what is left.
            remaining_duration = target_duration_seconds - current_duration
            if subclip.duration > remaining_duration:
                subclip = subclip.subclip(0, remaining_duration)
            video_clips.append(ensure_even_dimensions(subclip))
            current_duration += subclip.duration
            progress(0.5 + (safety_counter * 0.1 / max_iterations), desc=f"Clip {len(video_clips)}")
        except Exception as e:
            # A single unreadable file should not kill the whole run.
            print(f"Error: {e}")
            continue
    if generation_cancelled:
        for clip in video_clips:
            try:
                clip.close()
            except:
                pass
        cleanup_resources()
        return None, "Generation cancelled"
    if not video_clips:
        return None, "No clips processed"

    # --- Nudge total length onto the target: trim the last clip if long,
    # loop-extend it if short (tolerance 0.1 s).
    total_video_duration = sum(clip.duration for clip in video_clips)
    duration_diff = total_video_duration - target_duration_seconds
    if abs(duration_diff) > 0.1:
        if duration_diff > 0:
            trim_amount = duration_diff
            new_last_clip = video_clips[-1].subclip(0, video_clips[-1].duration - trim_amount)
            video_clips[-1] = new_last_clip
        else:
            extend_amount = abs(duration_diff)
            new_last_clip = video_clips[-1].fx(vfx.loop, duration=video_clips[-1].duration + extend_amount)
            video_clips[-1] = new_last_clip
    progress(0.6, desc="Applying transitions...")
    transition_duration = {"Snap Cut": 0.1, "Whip Pan": 0.3, "Dreamy Fade": 0.8, "Smooth Blend": 0.5, "Ken Burns Zoom": 0.5}.get(transition_type, 0.5)
    # --- Each clip gets its entrance effect from the transition with its
    # predecessor; the first clip gets only its exit effect.
    processed_clips = []
    for i in range(len(video_clips)):
        if i == 0:
            if len(video_clips) > 1:
                clip_out, _ = apply_transition_effect(video_clips[i], video_clips[i+1], transition_type, transition_duration)
                processed_clips.append(clip_out)
            else:
                processed_clips.append(video_clips[i])
        elif i == len(video_clips) - 1:
            _, clip_in = apply_transition_effect(video_clips[i-1], video_clips[i], transition_type, transition_duration)
            processed_clips.append(clip_in)
        else:
            _, clip_with_transition = apply_transition_effect(video_clips[i-1], video_clips[i], transition_type, transition_duration)
            processed_clips.append(clip_with_transition)
    progress(0.7, desc="Concatenating...")
    if generation_cancelled:
        for c in processed_clips:
            try:
                c.close()
            except:
                pass
        cleanup_resources()
        return None, "Generation cancelled"
    # Negative padding overlaps adjacent clips so crossfades line up;
    # hard cuts concatenate back-to-back.
    if transition_type == "Snap Cut":
        final_video_only = concatenate_videoclips(processed_clips, method="compose")
    else:
        final_video_only = concatenate_videoclips(processed_clips, method="compose", padding=-transition_duration)
    final_video_only = ensure_even_dimensions(final_video_only)
    current_video_clip = final_video_only
    # Snap video length to the audio track so neither outlives the other.
    if final_audio:
        final_video_only = final_video_only.set_duration(final_audio.duration)
    progress(0.8, desc="Adding overlays...")
    if generation_cancelled:
        try:
            final_video_only.close()
        except:
            pass
        cleanup_resources()
        return None, "Generation cancelled"
    # --- Subtitles: one create_caption bundle per line; failures on a
    # single line are logged and skipped.
    all_subtitle_clips = []
    if linelevel_subtitles:
        for line in linelevel_subtitles:
            if generation_cancelled:
                try:
                    final_video_only.close()
                except:
                    pass
                cleanup_resources()
                return None, "Generation cancelled"
            try:
                subtitle_fontsize = min(42, final_video_only.size[1] // 25)
                all_subtitle_clips.extend(create_caption(line, final_video_only.size, font="Helvetica-Bold", fontsize=subtitle_fontsize, color='white'))
            except Exception as e:
                print(f"Subtitle error: {e}")
                continue
    # Dim the base video to 65% opacity so overlays stand out.
    all_clips = [final_video_only.set_opacity(0.65)]
    if all_subtitle_clips:
        all_clips.extend(all_subtitle_clips)
    if title_text and title_text.strip():
        title_clips = create_title_overlay(title_text, final_video_only.size, duration=4)
        all_clips.extend(title_clips)
    final_video = CompositeVideoClip(all_clips)
    current_video_clip = final_video
    if final_audio:
        final_video = final_video.set_audio(final_audio)
    progress(0.9, desc="Exporting...")
    if generation_cancelled:
        try:
            final_video.close()
        except:
            pass
        cleanup_resources()
        return None, "Generation cancelled"
    # --- Encode. faststart puts the moov atom up front for web playback.
    output_filename = f'video_{timestamp}.mp4'
    final_output_path = os.path.join(output_path, output_filename)
    try:
        final_video.write_videofile(
            final_output_path,
            codec="libx264",
            audio_codec="aac",
            fps=24,
            preset=preset,
            bitrate=bitrate,
            audio_bitrate="128k",
            threads=8,
            ffmpeg_params=["-crf", crf, "-pix_fmt", "yuv420p", "-movflags", "+faststart", "-tune", "fastdecode"]
        )
    except Exception as e:
        if generation_cancelled:
            return None, "Generation cancelled"
        return None, f"Export error: {str(e)}"
    progress(1.0, desc="Done")
    # A cancel that lands after the export still removes the output file.
    if generation_cancelled:
        try:
            if os.path.exists(final_output_path):
                os.remove(final_output_path)
        except:
            pass
        cleanup_resources()
        return None, "Generation cancelled"
    try:
        final_video.close()
        if voice_over_audio:
            voice_over_audio.close()
        current_video_clip = None
    except:
        pass
    # --- Human-readable summary for the status textbox.
    audio_source = ""
    if text_input and text_input.strip():
        audio_source = f"TTS ({AVAILABLE_VOICES[voice_selection]['name'] if voice_selection in AVAILABLE_VOICES else 'Puck'})"
    elif voice_over_path:
        audio_source = "Uploaded Audio"
    else:
        audio_source = "Background Music"
    summary = f"Complete\n{output_filename}\n{audio_source}\n{transition_type}\n{target_duration_seconds:.1f}s\n{len(linelevel_subtitles) if linelevel_subtitles else 0} subtitles"
    return final_output_path, summary
656
-
657
# --- Gradio UI: inputs on the left (TTS text / voice / uploaded audio,
# title, duration, quality, transition), outputs on the right.
with gr.Blocks(title="Video Generator", theme=gr.themes.Soft()) as interface:
    gr.Markdown("# 🎬 AI Video Generator")
    gr.Markdown("Upload video clips to `video_clips` folder and optionally background music to `background_music` folder.")

    with gr.Row():
        with gr.Column():
            text_input = gr.Textbox(label="Text for TTS", lines=4, placeholder="Enter text to convert to speech...")
            # Dropdown shows "Name - description" but submits the voice key.
            voice_dropdown = gr.Dropdown(
                choices=[(f"{v['name']} - {v['description']}", k) for k, v in AVAILABLE_VOICES.items()],
                value="Puck",
                label="Voice Selection"
            )
            audio_input = gr.Audio(type="filepath", label="Or Upload Audio File")
            title_input = gr.Textbox(label="Video Title (Optional)", lines=2, placeholder="Enter video title...")
            # Slider is ignored when a voice-over sets the duration.
            duration_slider = gr.Slider(0.5, 10, 2, 0.5, label="Duration (minutes) - only used if no audio")
            quality_radio = gr.Radio(["High", "Standard", "Preview"], value="High", label="Video Quality")
            transition_radio = gr.Radio(
                ["Smooth Blend", "Ken Burns Zoom", "Whip Pan", "Dreamy Fade", "Snap Cut"],
                value="Smooth Blend",
                label="Transition Effect"
            )
            with gr.Row():
                submit_btn = gr.Button("🎥 Generate Video", variant="primary", size="lg")
                stop_btn = gr.Button("⏹️ Stop", variant="stop", size="lg")

        with gr.Column():
            video_output = gr.Video(label="Generated Video")
            summary_output = gr.Textbox(label="Status", lines=8)

    # Generate runs the full pipeline; Stop flips the cancellation flag
    # polled inside merge_videos_with_subtitles.
    submit_btn.click(
        fn=merge_videos_with_subtitles,
        inputs=[text_input, voice_dropdown, audio_input, title_input, duration_slider, quality_radio, transition_radio],
        outputs=[video_output, summary_output]
    )
    stop_btn.click(fn=cancel_generation, outputs=[summary_output, video_output])
692
-
693
if __name__ == "__main__":
    # Bind to all interfaces on port 7860 (the port Hugging Face Spaces
    # expects); surface server errors in the UI for easier debugging.
    interface.launch(
        server_name="0.0.0.0",
        server_port=7860,
        show_error=True
    )
 
1
+ import os
2
+ from datetime import datetime
3
+ import random
4
+ import whisper
5
+ import shutil
6
+ import wave
7
+ import base64
8
+ from moviepy.editor import (VideoFileClip, AudioFileClip, TextClip,
9
+ concatenate_videoclips, CompositeVideoClip, CompositeAudioClip, ImageClip)
10
+ import moviepy.audio.fx.all as afx
11
+ import moviepy.video.fx.all as vfx
12
+ import gradio as gr
13
+ from PIL import Image, ImageDraw, ImageFilter, ImageFont
14
+ import numpy as np
15
+ from functools import lru_cache
16
+ import urllib.request
17
+ from google import genai
18
+ from google.genai import types
19
+
20
+ # Create necessary directories
21
+ os.makedirs('video_clips', exist_ok=True)
22
+ os.makedirs('background_music', exist_ok=True)
23
+ os.makedirs('voice_over', exist_ok=True)
24
+ os.makedirs('exports', exist_ok=True)
25
+
26
+ # Get API key from environment variable (will be set in Hugging Face Space settings)
27
+ GOOGLE_API_KEY = os.getenv('GOOGLE_API_KEY', '')
28
+ if GOOGLE_API_KEY:
29
+ os.environ['GOOGLE_API_KEY'] = GOOGLE_API_KEY
30
+
31
+ generation_cancelled = False
32
+ current_video_clip = None
33
+
34
+ AVAILABLE_VOICES = {
35
+ "Puck": {"name": "Puck", "description": "Young adult female (US)"},
36
+ "Charon": {"name": "Charon", "description": "Young adult male (US)"},
37
+ "Kore": {"name": "Kore", "description": "Young adult female (US)"},
38
+ "Fenrir": {"name": "Fenrir", "description": "Young adult male (US)"},
39
+ "Aoede": {"name": "Aoede", "description": "Young adult female (US)"}
40
+ }
41
+
42
+ def wave_file(filename, pcm_data, channels=1, rate=24000, sample_width=2):
43
+ with wave.open(filename, "wb") as wf:
44
+ wf.setnchannels(channels)
45
+ wf.setsampwidth(sample_width)
46
+ wf.setframerate(rate)
47
+ wf.writeframes(pcm_data)
48
+
49
def generate_tts_audio(text_input: str, voice_name: str = "Puck"):
    """Synthesize ``text_input`` to speech with the Gemini TTS model.

    Returns a ``(path, message)`` pair: ``path`` is a WAV file under /tmp on
    success, or ``None`` (with an error / cancellation message) on failure.
    Never raises — all errors are folded into the message.
    """
    global generation_cancelled
    try:
        # Bail out early if the user already pressed Stop.
        if generation_cancelled:
            return None, "Generation cancelled"
        # Client reads GOOGLE_API_KEY from the environment (set at module load).
        client = genai.Client()
        response = client.models.generate_content(
            model="gemini-2.5-flash-preview-tts",
            contents=text_input,
            config=types.GenerateContentConfig(
                response_modalities=["AUDIO"],
                speech_config=types.SpeechConfig(
                    voice_config=types.VoiceConfig(
                        prebuilt_voice_config=types.PrebuiltVoiceConfig(voice_name=voice_name)
                    )
                ),
            )
        )
        # The network call is the slow part; re-check for cancellation after it.
        if generation_cancelled:
            return None, "Generation cancelled"
        audio_data = response.candidates[0].content.parts[0].inline_data.data
        # inline_data may arrive base64-encoded (str) or as raw bytes.
        if isinstance(audio_data, str):
            audio_data = base64.b64decode(audio_data)
        timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
        temp_audio_path = f'/tmp/tts_audio_{timestamp}.wav'
        # wave_file defaults (mono/24kHz/16-bit) match the model's PCM output.
        wave_file(temp_audio_path, audio_data)
        return temp_audio_path, "TTS generated"
    except Exception as e:
        return None, f"Error: {str(e)}"
78
+
79
def split_text_into_lines(data):
    """Group word-level Whisper timings into subtitle lines.

    A line is flushed when it grows past 60 characters, accumulates more than
    2.5 s of spoken time, ends a sentence (., ! or ?), or a silence gap longer
    than 1.5 s precedes the current word. Each returned entry carries the
    joined text, the line's start/end times, and the raw word dicts.
    """
    MAX_CHARS, MAX_DURATION, MAX_GAP = 60, 2.5, 1.5

    def _as_subtitle(words):
        return {
            "word": " ".join(item["word"] for item in words),
            "start": words[0]["start"],
            "end": words[-1]["end"],
            "textcontents": words,
        }

    subtitles = []
    pending, spoken_time = [], 0
    for position, info in enumerate(data):
        pending.append(info)
        spoken_time += info["end"] - info["start"]
        too_long = len(" ".join(w["word"] for w in pending)) > MAX_CHARS
        too_slow = spoken_time > MAX_DURATION
        ends_sentence = info["word"].rstrip().endswith(('.', '!', '?'))
        big_gap = position > 0 and info['start'] - data[position - 1]['end'] > MAX_GAP
        if too_long or too_slow or ends_sentence or big_gap:
            if pending:
                subtitles.append(_as_subtitle(pending))
            pending, spoken_time = [], 0
    # Flush whatever is left after the final word.
    if pending:
        subtitles.append(_as_subtitle(pending))
    return subtitles
106
+
107
@lru_cache(maxsize=1000)
def get_cached_text_clip(text, font, fontsize, color):
    """Return a memoized MoviePy TextClip for (text, font, fontsize, color).

    Rendering text clips is expensive and the subtitle builder requests the
    same words/spaces repeatedly, so instances are cached and shared.
    NOTE(review): callers only use set_start/set_duration/set_position/
    set_opacity on the shared instance — presumably these return new clips
    rather than mutating; confirm against the MoviePy version in use.
    """
    return TextClip(text, font=font, fontsize=fontsize, color=color)
110
+
111
def create_title_overlay(title_text, framesize, duration=4):
    """Render ``title_text`` (uppercased, max 4 wrapped lines) near the top of
    the frame and return it as a single-element list of transparent ImageClips.

    Returns ``[]`` when the title is empty/blank. ``framesize`` is
    ``(width, height)`` in pixels; ``duration`` is how long the overlay shows.
    """
    if not title_text or not title_text.strip():
        return []
    frame_width, frame_height = framesize
    # Prefer a downloaded Poppins-Bold; fall back to a system font, then to
    # PIL's built-in default if neither is available.
    FONT_URL = "https://github.com/google/fonts/raw/main/ofl/poppins/Poppins-Bold.ttf"
    FONT_PATH = "/tmp/Poppins-Bold.ttf"
    if not os.path.exists(FONT_PATH):
        try:
            urllib.request.urlretrieve(FONT_URL, FONT_PATH)
        except:
            FONT_PATH = None
    # Layout metrics scale with frame height so the title looks the same at
    # every export resolution.
    TOP_MARGIN = int(frame_height * 0.115)
    FONT_SIZE = int(frame_height * 0.042)
    STROKE_WIDTH = max(1, int(frame_height * 0.003))
    LINE_SPACING = max(4, int(frame_height * 0.008))
    def load_font(size):
        # Best-effort font loading; never raises.
        try:
            if FONT_PATH and os.path.exists(FONT_PATH):
                return ImageFont.truetype(FONT_PATH, size)
            return ImageFont.truetype("/usr/share/fonts/truetype/liberation/LiberationSans-Bold.ttf", size)
        except:
            return ImageFont.load_default()
    font_obj = load_font(FONT_SIZE)
    # `base` receives the final drawing; `temp_draw` is only used to measure.
    base = Image.new("RGBA", (frame_width, frame_height), (0, 0, 0, 0))
    temp_img = Image.new("RGBA", (frame_width, frame_height), (0,0,0,0))
    temp_draw = ImageDraw.Draw(temp_img)
    def measure_text(text, font):
        # Width/height of `text` including the stroke; falls back to a rough
        # constant if textbbox is unavailable (e.g. default bitmap font).
        try:
            bbox = temp_draw.textbbox((0,0), text, font=font, stroke_width=STROKE_WIDTH)
            return bbox[2]-bbox[0], bbox[3]-bbox[1]
        except:
            return 100, 50
    def wrap_text(text, font, max_width):
        # Greedy word wrap on measured pixel width, capped at 4 lines
        # (overflow beyond 4 lines is silently dropped).
        words = text.upper().split()
        lines, current = [], []
        for word in words:
            test_line = " ".join(current + [word])
            w, _ = measure_text(test_line, font)
            if w <= max_width:
                current.append(word)
            else:
                if current:
                    lines.append(" ".join(current))
                    current = [word]
                else:
                    # Single word wider than the line: emit it on its own.
                    lines.append(word)
                    current = []
        if current:
            lines.append(" ".join(current))
        return lines[:4]
    lines = wrap_text(title_text, font_obj, frame_width * 0.90)
    line_heights = [measure_text(line, font_obj)[1] for line in lines]
    y_start = TOP_MARGIN
    x_center = frame_width // 2
    draw = ImageDraw.Draw(base)
    y = y_start
    for i, line in enumerate(lines):
        w, h = measure_text(line, font_obj)
        x = x_center - w // 2
        # Soft drop shadow first, then white text with a black outline.
        draw.text((x+2, y+2), line, font=font_obj, fill=(0,0,0,180))
        draw.text((x, y), line, font=font_obj, fill=(255,255,255,255), stroke_width=STROKE_WIDTH, stroke_fill=(0,0,0,255))
        y += line_heights[i] + LINE_SPACING
    return [ImageClip(np.array(base), duration=duration)]
174
+
175
def create_caption(textJSON, framesize, font="Helvetica-Bold", fontsize=14, color='white'):
    """Build the MoviePy clip layers for one subtitle line with karaoke-style
    per-word highlighting.

    ``textJSON`` is one entry from split_text_into_lines (``start``, ``end``,
    ``textcontents`` word list). Returns a flat list of clips: a rounded
    translucent background, per-word shadow + text clips shown for the whole
    line, and per-word highlight (purple pill + white text) clips shown only
    while each word is spoken.
    """
    full_duration = textJSON['end'] - textJSON['start']
    word_clips = []
    xy_textclips_positions = []
    frame_width, frame_height = framesize
    # Wrap words into rows no wider than 80% of the frame.
    max_line_width = frame_width * 0.8
    lines, current_line, current_line_width = [], [], 0
    for wordJSON in textJSON['textcontents']:
        word_upper = wordJSON['word'].upper()
        # Measured via cached TextClips so repeated words are cheap.
        temp_word = get_cached_text_clip(word_upper, font, fontsize, color)
        temp_space = get_cached_text_clip(" ", font, fontsize, color)
        word_width, word_height = temp_word.size
        space_width, _ = temp_space.size
        if current_line_width + word_width + space_width > max_line_width and current_line:
            lines.append({'words': current_line.copy(), 'width': current_line_width, 'height': word_height})
            current_line = [wordJSON]
            current_line_width = word_width + space_width
        else:
            current_line.append(wordJSON)
            current_line_width += word_width + space_width
    if current_line:
        # Flush the last (partial) row; row height taken from its first word.
        word_upper = current_line[0]['word'].upper()
        temp_word = get_cached_text_clip(word_upper, font, fontsize, color)
        _, word_height = temp_word.size
        lines.append({'words': current_line, 'width': current_line_width, 'height': word_height})
    # 3 px inter-row gap; block anchored at 65% of the frame height.
    total_text_height = sum(line['height'] for line in lines) + (len(lines) - 1) * 3
    subtitle_y_position = int(frame_height * 0.65)
    current_y = subtitle_y_position
    if lines:
        # One rounded, semi-transparent black panel behind the whole line.
        shadow_padding = 25
        shadow_height_extra = 15
        total_subtitle_width = max(line['width'] for line in lines)
        bg_width = int(total_subtitle_width + shadow_padding * 2)
        bg_height = int(total_text_height + shadow_height_extra * 2)
        img = Image.new('RGBA', (bg_width, bg_height), (0, 0, 0, 0))
        draw = ImageDraw.Draw(img)
        draw.rounded_rectangle([(0, 0), (bg_width-1, bg_height-1)], radius=15, fill=(0, 0, 0, 128))
        img_array = np.array(img)
        shadow_bg = ImageClip(img_array, duration=full_duration).set_start(textJSON['start'])
        shadow_x = (frame_width - total_subtitle_width) / 2 - shadow_padding
        shadow_y = subtitle_y_position - shadow_height_extra
        shadow_bg = shadow_bg.set_position((shadow_x, shadow_y))
        word_clips.append(shadow_bg)
    for line in lines:
        line_words = line['words']
        # Re-measure each word in this row (cache makes this cheap).
        word_dimensions = []
        for wordJSON in line_words:
            word_upper = wordJSON['word'].upper()
            temp_word = get_cached_text_clip(word_upper, font, fontsize, color)
            temp_space = get_cached_text_clip(" ", font, fontsize, color)
            word_width, word_height = temp_word.size
            space_width, _ = temp_space.size
            word_dimensions.append({
                'word_data': wordJSON,
                'word_width': word_width,
                'word_height': word_height,
                'space_width': space_width,
                'word_upper': word_upper
            })
        # Center the row horizontally, then lay words out left to right.
        line_start_x = (frame_width - line['width']) / 2
        current_x = line_start_x
        for word_dim in word_dimensions:
            wordJSON = word_dim['word_data']
            word_width = word_dim['word_width']
            word_height = word_dim['word_height']
            space_width = word_dim['space_width']
            word_upper = word_dim['word_upper']
            # 1 px offset black copy acts as a subtle drop shadow.
            shadow_text = get_cached_text_clip(word_upper, font, fontsize, 'black')
            shadow_text = shadow_text.set_start(textJSON['start']).set_duration(full_duration)
            shadow_text = shadow_text.set_position((current_x + 1, current_y + 1)).set_opacity(0.3)
            word_clips.append(shadow_text)
            word_clip = get_cached_text_clip(word_upper, font, fontsize, color)
            word_clip = word_clip.set_start(textJSON['start']).set_duration(full_duration)
            word_clip = word_clip.set_position((current_x, current_y))
            space_clip = get_cached_text_clip(" ", font, fontsize, color)
            space_clip = space_clip.set_start(textJSON['start']).set_duration(full_duration)
            space_clip = space_clip.set_position((current_x + word_width, current_y))
            # Record geometry + timing so the highlight pass below can overlay
            # each word exactly where it sits.
            xy_textclips_positions.append({
                "x_pos": current_x,
                "y_pos": current_y,
                "width": word_width,
                "height": word_height,
                "word": word_upper,
                "start": wordJSON['start'],
                "end": wordJSON['end'],
                "duration": wordJSON['end'] - wordJSON['start']
            })
            word_clips.append(word_clip)
            word_clips.append(space_clip)
            current_x += word_width + space_width
        current_y += line['height'] + 3
    # Highlight pass: while each word is being spoken, draw a purple rounded
    # pill plus a white copy of the word on top of the static layers.
    for highlight_word in xy_textclips_positions:
        bg_width = int(highlight_word['width'] + 16)
        bg_height = int(highlight_word['height'] + 8)
        img = Image.new('RGBA', (bg_width, bg_height), (0, 0, 0, 0))
        draw = ImageDraw.Draw(img)
        draw.rounded_rectangle([(0, 0), (bg_width-1, bg_height-1)], radius=8, fill=(147, 0, 211, 180))
        img_array = np.array(img)
        bg_clip = ImageClip(img_array, duration=highlight_word['duration'])
        bg_clip = bg_clip.set_start(highlight_word['start'])
        # Pill is 16x8 px larger than the word, so offset by half of that.
        bg_x = highlight_word['x_pos'] - 8
        bg_y = highlight_word['y_pos'] - 4
        bg_clip = bg_clip.set_position((bg_x, bg_y))
        shadow_highlight = get_cached_text_clip(highlight_word['word'], font, fontsize, 'black')
        shadow_highlight = shadow_highlight.set_start(highlight_word['start']).set_duration(highlight_word['duration'])
        shadow_highlight = shadow_highlight.set_position((highlight_word['x_pos'] + 1, highlight_word['y_pos'] + 1)).set_opacity(0.4)
        word_clip_highlight = get_cached_text_clip(highlight_word['word'], font, fontsize, 'white')
        word_clip_highlight = word_clip_highlight.set_start(highlight_word['start']).set_duration(highlight_word['duration'])
        word_clip_highlight = word_clip_highlight.set_position((highlight_word['x_pos'], highlight_word['y_pos']))
        word_clips.append(bg_clip)
        word_clips.append(shadow_highlight)
        word_clips.append(word_clip_highlight)
    return word_clips
288
+
289
def get_random_subclip_and_slow(clip):
    """Cut a random 2–4 second window from ``clip`` and play it at half speed.

    If the clip is shorter than the chosen window, the whole clip is slowed
    instead of being cut.
    """
    window = random.choice((2, 3, 4))
    if clip.duration < window:
        # Too short to carve a window out of — slow the entire clip.
        return clip.speedx(0.5)
    begin = random.uniform(0, clip.duration - window)
    return clip.subclip(begin, begin + window).speedx(0.5)
297
+
298
def ensure_even_dimensions(clip):
    """Return ``clip`` resized so width and height are both even.

    libx264 with yuv420p requires even frame dimensions; odd values are
    rounded down by one pixel. Clips that are already even pass through
    unchanged (same object).
    """
    width, height = clip.size
    even_w = width - (width % 2)
    even_h = height - (height % 2)
    if (even_w, even_h) == clip.size:
        return clip
    return clip.resize((even_w, even_h))
307
+
308
def apply_transition_effect(clip1, clip2, transition_type, duration=0.5):
    """Return ``(outgoing, incoming)`` clips prepared for the chosen transition.

    ``clip1`` is the clip that ends, ``clip2`` the one that begins. Unknown
    transition names fall back to a plain cross-fade (same as "Smooth Blend").
    """
    if transition_type == "Snap Cut":
        # Hard cut: both clips pass through untouched.
        return clip1, clip2
    if transition_type == "Whip Pan":
        half = duration * 0.5
        return clip1.fadeout(half), clip2.fadein(half)
    if transition_type == "Dreamy Fade":
        stretched = duration * 1.2
        return clip1.crossfadeout(stretched), clip2.crossfadein(stretched)
    if transition_type == "Ken Burns Zoom":
        # Outgoing clip zooms up to 1.15x over its full length, incoming clip
        # zooms back down from 1.15x over the transition window.
        total = clip1.duration
        outgoing = clip1.resize(lambda t: 1 + 0.15 * min(t / total, 1)).crossfadeout(duration)
        incoming = clip2
        if clip2.duration >= duration:
            incoming = clip2.resize(lambda t: 1.15 - 0.15 * min(t / duration, 1))
        return outgoing, incoming.crossfadein(duration)
    # "Smooth Blend" and anything unrecognized: symmetric cross-fade.
    return clip1.crossfadeout(duration), clip2.crossfadein(duration)
329
+
330
def process_voiceover_to_subtitles(voice_over_path):
    """Transcribe an audio file with Whisper and return subtitle data.

    Returns ``(linelevel_subtitles, full_transcript_text)``. On cancellation
    returns ``([], "")``; genuine transcription errors are re-raised to the
    caller.
    """
    global generation_cancelled
    try:
        if generation_cancelled:
            return [], ""
        # "tiny" model: fastest/least accurate tier, suitable for the
        # resource-limited Space; fp16=False forces CPU-safe inference.
        model = whisper.load_model("tiny")
        result = model.transcribe(voice_over_path, word_timestamps=True, fp16=False)
        if generation_cancelled:
            return [], ""
        # Flatten Whisper's segment/word structure to the flat word list
        # expected by split_text_into_lines.
        wordlevel_info = []
        for segment in result['segments']:
            if generation_cancelled:
                return [], ""
            if 'words' in segment:
                for word in segment['words']:
                    wordlevel_info.append({'word': word['word'].strip(), 'start': word['start'], 'end': word['end']})
        return split_text_into_lines(wordlevel_info), result['text']
    except Exception as e:
        # A cancellation mid-transcription can surface as an exception; treat
        # it as a clean cancel rather than an error.
        if generation_cancelled:
            return [], ""
        raise e
351
+
352
def cleanup_resources():
    """Best-effort close of the module-level MoviePy clip reference.

    Safe to call at any time and never raises. After the call,
    ``current_video_clip`` is always ``None`` — including when ``close()``
    itself fails (the previous version left a broken clip referenced in that
    case, so a later call would try to close it again).
    """
    global current_video_clip
    try:
        if current_video_clip:
            current_video_clip.close()
    except Exception:
        # Closing a partially-constructed clip can raise; this is
        # cancellation/teardown cleanup, so swallow and move on.
        pass
    finally:
        current_video_clip = None
360
+
361
def cancel_generation():
    """Flag the running pipeline to stop and release the in-flight clip.

    Returns the ``(status_message, video)`` pair wired to the Gradio
    outputs ``[summary_output, video_output]``.
    """
    global generation_cancelled
    generation_cancelled = True  # polled throughout the generation pipeline
    cleanup_resources()          # close whatever clip is currently held
    return ("Generation cancelled", None)
366
+
367
def merge_videos_with_subtitles(text_input, voice_selection, audio_input, title_text, duration_minutes, video_quality, transition_type, progress=gr.Progress(track_tqdm=True)):
    """Full generation pipeline: narration -> clip montage -> subtitles -> MP4.

    Inputs mirror the Gradio controls: TTS text (takes priority), voice name,
    an optional uploaded audio file, an optional title overlay, a fallback
    duration in minutes (used only when there is no narration), a quality
    preset, and a transition style. Returns ``(output_path, summary)`` on
    success or ``(None, error_message)`` on failure/cancellation.

    ``generation_cancelled`` is polled between every expensive step so the
    Stop button can abort mid-run; each abort path closes the clips opened
    so far before returning.
    """
    global generation_cancelled, current_video_clip
    # Fresh run: clear any leftover cancellation/clip state.
    generation_cancelled = False
    current_video_clip = None
    progress(0, desc="Starting...")
    if generation_cancelled:
        return None, "Generation cancelled"
    timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")

    # Updated paths for Hugging Face
    source_path = 'video_clips'
    if not os.path.isdir(source_path):
        return None, "Video clips folder not found. Please upload video clips to the 'video_clips' folder."
    output_path = 'exports'
    os.makedirs(output_path, exist_ok=True)

    # --- Gather source footage -------------------------------------------
    video_extensions = ('.mp4', '.avi', '.mkv', '.mov')
    all_files = [f for f in os.listdir(source_path) if f.lower().endswith(video_extensions)]
    if not all_files:
        return None, "No video files found in 'video_clips' folder"
    random.shuffle(all_files)
    if generation_cancelled:
        return None, "Generation cancelled"
    # --- Optional background music (first matching file wins) ------------
    bg_music_path = None
    bg_music_folder_path = 'background_music'
    if os.path.isdir(bg_music_folder_path):
        audio_extensions = ('.mp3', '.wav', '.m4a', '.aac')
        possible_files = [f for f in os.listdir(bg_music_folder_path) if f.lower().endswith(audio_extensions) and not f.startswith('voiceover_')]
        if len(possible_files) >= 1:
            bg_music_path = os.path.join(bg_music_folder_path, possible_files[0])
    # --- Narration: TTS text takes priority over an uploaded file --------
    target_duration_seconds = 0
    voice_over_audio = None
    linelevel_subtitles = None
    voice_over_path = None
    if text_input and text_input.strip():
        progress(0.1, desc="Generating TTS...")
        voice_name = AVAILABLE_VOICES[voice_selection]["name"] if voice_selection in AVAILABLE_VOICES else "Puck"
        tts_path, tts_message = generate_tts_audio(text_input, voice_name)
        if generation_cancelled:
            return None, "Generation cancelled"
        if tts_path:
            # Keep a copy under voice_over/ (the TTS file lives in /tmp).
            voice_over_folder_path = 'voice_over'
            os.makedirs(voice_over_folder_path, exist_ok=True)
            voice_filename = f"tts_voiceover_{timestamp}.wav"
            saved_voice_path = os.path.join(voice_over_folder_path, voice_filename)
            shutil.copy2(tts_path, saved_voice_path)
            voice_over_path = saved_voice_path
        else:
            return None, f"TTS failed: {tts_message}"
    elif audio_input:
        if generation_cancelled:
            return None, "Generation cancelled"
        voice_over_folder_path = 'voice_over'
        os.makedirs(voice_over_folder_path, exist_ok=True)
        # NOTE(review): the copy is always named .mp3 regardless of the
        # uploaded file's real format — downstream readers go by content,
        # but the extension can be misleading; confirm acceptable.
        voice_filename = f"uploaded_voiceover_{timestamp}.mp3"
        saved_voice_path = os.path.join(voice_over_folder_path, voice_filename)
        shutil.copy2(audio_input, saved_voice_path)
        voice_over_path = saved_voice_path
    if voice_over_path:
        try:
            progress(0.2, desc="Processing audio...")
            if generation_cancelled:
                return None, "Generation cancelled"
            # Narration defines the video length and drives the subtitles.
            voice_over_audio = AudioFileClip(voice_over_path)
            target_duration_seconds = voice_over_audio.duration
            linelevel_subtitles, _ = process_voiceover_to_subtitles(voice_over_path)
            if generation_cancelled:
                voice_over_audio.close()
                return None, "Generation cancelled"
        except Exception as e:
            return None, f"Audio error: {str(e)}"
    else:
        # No narration at all: music-only video using the requested duration.
        if not bg_music_path:
            return None, "Need text/audio or background music"
        target_duration_seconds = duration_minutes * 60
    progress(0.3, desc="Preparing audio...")
    if generation_cancelled:
        if voice_over_audio:
            voice_over_audio.close()
        return None, "Generation cancelled"
    # --- Mix narration with (quiet, looped) background music --------------
    audio_tracks = []
    if voice_over_audio:
        audio_tracks.append(voice_over_audio)
    if bg_music_path:
        try:
            background_audio = AudioFileClip(bg_music_path)
            background_audio = background_audio.fx(afx.volumex, 0.10)  # duck under narration
            background_audio = background_audio.fx(afx.audio_loop, duration=target_duration_seconds)
            audio_tracks.append(background_audio)
        except Exception as e:
            # Music is optional — log and continue without it.
            print(f"Background music error: {e}")
    final_audio = CompositeAudioClip(audio_tracks) if len(audio_tracks) > 1 else (audio_tracks[0] if audio_tracks else None)
    progress(0.4, desc="Setting up video...")
    if generation_cancelled:
        cleanup_resources()
        return None, "Generation cancelled"
    # --- Quality preset -> encoder parameters -----------------------------
    if video_quality == "High":
        target_height, bitrate, preset, crf = 1080, "8000k", "veryfast", "20"
    elif video_quality == "Standard":
        target_height, bitrate, preset, crf = 720, "4000k", "veryfast", "24"
    else:
        target_height, bitrate, preset, crf = 480, "1000k", "ultrafast", "28"
    progress(0.5, desc="Processing clips...")
    # --- Assemble random slowed subclips until the target length is filled.
    # safety_counter bounds the loop in case every file fails to load.
    video_clips = []
    current_duration = 0
    file_index = 0
    safety_counter = 0
    max_iterations = len(all_files) * 3
    while current_duration < target_duration_seconds and safety_counter < max_iterations:
        if generation_cancelled:
            for clip in video_clips:
                try:
                    clip.close()
                except:
                    pass
            cleanup_resources()
            return None, "Generation cancelled"
        if file_index >= len(all_files):
            # Exhausted the pool: reshuffle and reuse files.
            file_index = 0
            random.shuffle(all_files)
        video_file = all_files[file_index]
        file_index += 1
        safety_counter += 1
        try:
            full_clip = VideoFileClip(os.path.join(source_path, video_file))
            current_video_clip = full_clip
            if generation_cancelled:
                full_clip.close()
                cleanup_resources()
                return None, "Generation cancelled"
            # Normalize height to the quality preset, keeping aspect ratio and
            # even dimensions (libx264 requirement).
            if full_clip.h != target_height:
                aspect_ratio = full_clip.w / full_clip.h
                new_width = int(target_height * aspect_ratio)
                if new_width % 2 != 0:
                    new_width -= 1
                adjusted_height = target_height if target_height % 2 == 0 else target_height - 1
                full_clip = full_clip.resize((new_width, adjusted_height))
            else:
                full_clip = ensure_even_dimensions(full_clip)
            subclip = get_random_subclip_and_slow(full_clip)
            # Don't overshoot the target duration with the final piece.
            remaining_duration = target_duration_seconds - current_duration
            if subclip.duration > remaining_duration:
                subclip = subclip.subclip(0, remaining_duration)
            video_clips.append(ensure_even_dimensions(subclip))
            current_duration += subclip.duration
            progress(0.5 + (safety_counter * 0.1 / max_iterations), desc=f"Clip {len(video_clips)}")
        except Exception as e:
            # A bad source file shouldn't kill the whole run.
            print(f"Error: {e}")
            continue
    if generation_cancelled:
        for clip in video_clips:
            try:
                clip.close()
            except:
                pass
        cleanup_resources()
        return None, "Generation cancelled"
    if not video_clips:
        return None, "No clips processed"

    # --- Snap total length to the target: trim or loop the last clip ------
    total_video_duration = sum(clip.duration for clip in video_clips)
    duration_diff = total_video_duration - target_duration_seconds
    if abs(duration_diff) > 0.1:
        if duration_diff > 0:
            trim_amount = duration_diff
            new_last_clip = video_clips[-1].subclip(0, video_clips[-1].duration - trim_amount)
            video_clips[-1] = new_last_clip
        else:
            extend_amount = abs(duration_diff)
            new_last_clip = video_clips[-1].fx(vfx.loop, duration=video_clips[-1].duration + extend_amount)
            video_clips[-1] = new_last_clip
    progress(0.6, desc="Applying transitions...")
    # --- Apply transition styling to each clip boundary --------------------
    transition_duration = {"Snap Cut": 0.1, "Whip Pan": 0.3, "Dreamy Fade": 0.8, "Smooth Blend": 0.5, "Ken Burns Zoom": 0.5}.get(transition_type, 0.5)
    processed_clips = []
    for i in range(len(video_clips)):
        if i == 0:
            # First clip only fades out (into the second clip), never in.
            if len(video_clips) > 1:
                clip_out, _ = apply_transition_effect(video_clips[i], video_clips[i+1], transition_type, transition_duration)
                processed_clips.append(clip_out)
            else:
                processed_clips.append(video_clips[i])
        elif i == len(video_clips) - 1:
            _, clip_in = apply_transition_effect(video_clips[i-1], video_clips[i], transition_type, transition_duration)
            processed_clips.append(clip_in)
        else:
            _, clip_with_transition = apply_transition_effect(video_clips[i-1], video_clips[i], transition_type, transition_duration)
            processed_clips.append(clip_with_transition)
    progress(0.7, desc="Concatenating...")
    if generation_cancelled:
        for c in processed_clips:
            try:
                c.close()
            except:
                pass
        cleanup_resources()
        return None, "Generation cancelled"
    # Negative padding overlaps consecutive clips by the transition length so
    # the cross-fades actually blend; snap cuts need no overlap.
    if transition_type == "Snap Cut":
        final_video_only = concatenate_videoclips(processed_clips, method="compose")
    else:
        final_video_only = concatenate_videoclips(processed_clips, method="compose", padding=-transition_duration)
    final_video_only = ensure_even_dimensions(final_video_only)
    current_video_clip = final_video_only
    if final_audio:
        # Audio length wins (overlap padding may have shortened the video).
        final_video_only = final_video_only.set_duration(final_audio.duration)
    progress(0.8, desc="Adding overlays...")
    if generation_cancelled:
        try:
            final_video_only.close()
        except:
            pass
        cleanup_resources()
        return None, "Generation cancelled"
    # --- Subtitles + title overlay -----------------------------------------
    all_subtitle_clips = []
    if linelevel_subtitles:
        for line in linelevel_subtitles:
            if generation_cancelled:
                try:
                    final_video_only.close()
                except:
                    pass
                cleanup_resources()
                return None, "Generation cancelled"
            try:
                # Font size scales with frame height, capped at 42 px.
                subtitle_fontsize = min(42, final_video_only.size[1] // 25)
                all_subtitle_clips.extend(create_caption(line, final_video_only.size, font="Helvetica-Bold", fontsize=subtitle_fontsize, color='white'))
            except Exception as e:
                print(f"Subtitle error: {e}")
                continue
    # Dim the footage to 65% opacity so the white subtitles stay readable.
    all_clips = [final_video_only.set_opacity(0.65)]
    if all_subtitle_clips:
        all_clips.extend(all_subtitle_clips)
    if title_text and title_text.strip():
        title_clips = create_title_overlay(title_text, final_video_only.size, duration=4)
        all_clips.extend(title_clips)
    final_video = CompositeVideoClip(all_clips)
    current_video_clip = final_video
    if final_audio:
        final_video = final_video.set_audio(final_audio)
    progress(0.9, desc="Exporting...")
    if generation_cancelled:
        try:
            final_video.close()
        except:
            pass
        cleanup_resources()
        return None, "Generation cancelled"
    # --- Encode -------------------------------------------------------------
    output_filename = f'video_{timestamp}.mp4'
    final_output_path = os.path.join(output_path, output_filename)
    try:
        final_video.write_videofile(
            final_output_path,
            codec="libx264",
            audio_codec="aac",
            fps=24,
            preset=preset,
            bitrate=bitrate,
            audio_bitrate="128k",
            threads=8,
            ffmpeg_params=["-crf", crf, "-pix_fmt", "yuv420p", "-movflags", "+faststart", "-tune", "fastdecode"]
        )
    except Exception as e:
        # A cancel during export surfaces as a write error; report it as
        # a cancellation rather than a failure.
        if generation_cancelled:
            return None, "Generation cancelled"
        return None, f"Export error: {str(e)}"
    progress(1.0, desc="Done")
    if generation_cancelled:
        # Cancelled after a (possibly partial) export: discard the file.
        try:
            if os.path.exists(final_output_path):
                os.remove(final_output_path)
        except:
            pass
        cleanup_resources()
        return None, "Generation cancelled"
    try:
        final_video.close()
        if voice_over_audio:
            voice_over_audio.close()
        current_video_clip = None
    except:
        pass
    # --- Human-readable summary for the status box --------------------------
    audio_source = ""
    if text_input and text_input.strip():
        audio_source = f"TTS ({AVAILABLE_VOICES[voice_selection]['name'] if voice_selection in AVAILABLE_VOICES else 'Puck'})"
    elif voice_over_path:
        audio_source = "Uploaded Audio"
    else:
        audio_source = "Background Music"
    summary = f"Complete\n{output_filename}\n{audio_source}\n{transition_type}\n{target_duration_seconds:.1f}s\n{len(linelevel_subtitles) if linelevel_subtitles else 0} subtitles"
    return final_output_path, summary
656
+
657
# Gradio UI: left column holds all inputs, right column shows the result.
with gr.Blocks(title="Video Generator", theme=gr.themes.Soft()) as interface:
    gr.Markdown("# 🎬 AI Video Generator")
    gr.Markdown("Upload video clips to `video_clips` folder and optionally background music to `background_music` folder.")

    with gr.Row():
        with gr.Column():
            text_input = gr.Textbox(label="Text for TTS", lines=4, placeholder="Enter text to convert to speech...")
            # Dropdown values are the AVAILABLE_VOICES keys; labels combine
            # name + description.
            voice_dropdown = gr.Dropdown(
                choices=[(f"{v['name']} - {v['description']}", k) for k, v in AVAILABLE_VOICES.items()],
                value="Puck",
                label="Voice Selection"
            )
            audio_input = gr.Audio(type="filepath", label="Or Upload Audio File")
            title_input = gr.Textbox(label="Video Title (Optional)", lines=2, placeholder="Enter video title...")
            # Positional args: minimum, maximum, default value, step.
            duration_slider = gr.Slider(0.5, 10, 2, 0.5, label="Duration (minutes) - only used if no audio")
            quality_radio = gr.Radio(["High", "Standard", "Preview"], value="High", label="Video Quality")
            transition_radio = gr.Radio(
                ["Smooth Blend", "Ken Burns Zoom", "Whip Pan", "Dreamy Fade", "Snap Cut"],
                value="Smooth Blend",
                label="Transition Effect"
            )
            with gr.Row():
                submit_btn = gr.Button("🎥 Generate Video", variant="primary", size="lg")
                stop_btn = gr.Button("⏹️ Stop", variant="stop", size="lg")

        with gr.Column():
            video_output = gr.Video(label="Generated Video")
            summary_output = gr.Textbox(label="Status", lines=8)

    # Input order must match merge_videos_with_subtitles' signature.
    submit_btn.click(
        fn=merge_videos_with_subtitles,
        inputs=[text_input, voice_dropdown, audio_input, title_input, duration_slider, quality_radio, transition_radio],
        outputs=[video_output, summary_output]
    )
    # Stop sets the cancel flag; the running pipeline notices at its next poll.
    stop_btn.click(fn=cancel_generation, outputs=[summary_output, video_output])
692
+
693
if __name__ == "__main__":
    # Bind to all interfaces on 7860 — the port Hugging Face Spaces expects.
    interface.launch(
        server_name="0.0.0.0",
        server_port=7860,
        show_error=True
    )