Shreevathsam committed on
Commit
5e56d63
·
verified ·
1 Parent(s): 9b6da36

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +165 -320
app.py CHANGED
@@ -5,25 +5,22 @@ import whisper
5
  import shutil
6
  import wave
7
  import base64
8
- from moviepy.editor import (VideoFileClip, AudioFileClip, TextClip,
9
- concatenate_videoclips, CompositeVideoClip, CompositeAudioClip, ImageClip)
10
  import moviepy.audio.fx.all as afx
11
  import moviepy.video.fx.all as vfx
12
  import gradio as gr
13
- from PIL import Image, ImageDraw, ImageFilter, ImageFont
14
  import numpy as np
15
- from functools import lru_cache
16
  import urllib.request
17
  from google import genai
18
  from google.genai import types
19
 
20
- # Create necessary directories
21
  os.makedirs('video_clips', exist_ok=True)
22
  os.makedirs('background_music', exist_ok=True)
23
  os.makedirs('voice_over', exist_ok=True)
24
  os.makedirs('exports', exist_ok=True)
25
 
26
- # Get API key from environment variable (will be set in Hugging Face Space settings)
27
  GOOGLE_API_KEY = os.getenv('GOOGLE_API_KEY', '')
28
  if GOOGLE_API_KEY:
29
  os.environ['GOOGLE_API_KEY'] = GOOGLE_API_KEY
@@ -77,7 +74,7 @@ def generate_tts_audio(text_input, voice_name="Puck"):
77
  return None, f"Error: {str(e)}"
78
 
79
  def split_text_into_lines(data):
80
- MaxChars, MaxDuration, MaxGap = 60, 2.5, 1.5
81
  subtitles, line, line_duration = [], [], 0
82
  for idx, word_data in enumerate(data):
83
  line.append(word_data)
@@ -104,188 +101,100 @@ def split_text_into_lines(data):
104
  })
105
  return subtitles
106
 
107
- @lru_cache(maxsize=1000)
108
- def get_cached_text_clip(text, font, fontsize, color):
109
- return TextClip(text, font=font, fontsize=fontsize, color=color)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
110
 
111
  def create_title_overlay(title_text, framesize, duration=4):
112
  if not title_text or not title_text.strip():
113
  return []
114
  frame_width, frame_height = framesize
115
- FONT_URL = "https://github.com/google/fonts/raw/main/ofl/poppins/Poppins-Bold.ttf"
116
- FONT_PATH = "/tmp/Poppins-Bold.ttf"
117
- if not os.path.exists(FONT_PATH):
118
- try:
 
119
  urllib.request.urlretrieve(FONT_URL, FONT_PATH)
120
- except:
121
- FONT_PATH = None
122
- TOP_MARGIN = int(frame_height * 0.115)
123
- FONT_SIZE = int(frame_height * 0.042)
124
- STROKE_WIDTH = max(1, int(frame_height * 0.003))
125
- LINE_SPACING = max(4, int(frame_height * 0.008))
126
- def load_font(size):
127
- try:
128
- if FONT_PATH and os.path.exists(FONT_PATH):
129
- return ImageFont.truetype(FONT_PATH, size)
130
- return ImageFont.truetype("/usr/share/fonts/truetype/liberation/LiberationSans-Bold.ttf", size)
131
- except:
132
- return ImageFont.load_default()
133
- font_obj = load_font(FONT_SIZE)
134
  base = Image.new("RGBA", (frame_width, frame_height), (0, 0, 0, 0))
135
- temp_img = Image.new("RGBA", (frame_width, frame_height), (0,0,0,0))
136
- temp_draw = ImageDraw.Draw(temp_img)
137
- def measure_text(text, font):
138
- try:
139
- bbox = temp_draw.textbbox((0,0), text, font=font, stroke_width=STROKE_WIDTH)
140
- return bbox[2]-bbox[0], bbox[3]-bbox[1]
141
- except:
142
- return 100, 50
143
- def wrap_text(text, font, max_width):
144
- words = text.upper().split()
145
- lines, current = [], []
146
- for word in words:
147
- test_line = " ".join(current + [word])
148
- w, _ = measure_text(test_line, font)
149
- if w <= max_width:
150
- current.append(word)
151
- else:
152
- if current:
153
- lines.append(" ".join(current))
154
- current = [word]
155
- else:
156
- lines.append(word)
157
- current = []
158
- if current:
159
- lines.append(" ".join(current))
160
- return lines[:4]
161
- lines = wrap_text(title_text, font_obj, frame_width * 0.90)
162
- line_heights = [measure_text(line, font_obj)[1] for line in lines]
163
- y_start = TOP_MARGIN
164
- x_center = frame_width // 2
165
  draw = ImageDraw.Draw(base)
166
- y = y_start
167
- for i, line in enumerate(lines):
168
- w, h = measure_text(line, font_obj)
169
- x = x_center - w // 2
170
- draw.text((x+2, y+2), line, font=font_obj, fill=(0,0,0,180))
171
- draw.text((x, y), line, font=font_obj, fill=(255,255,255,255), stroke_width=STROKE_WIDTH, stroke_fill=(0,0,0,255))
172
- y += line_heights[i] + LINE_SPACING
 
 
 
 
 
 
173
  return [ImageClip(np.array(base), duration=duration)]
174
 
175
- def create_caption(textJSON, framesize, font="Helvetica-Bold", fontsize=14, color='white'):
176
- full_duration = textJSON['end'] - textJSON['start']
177
- word_clips = []
178
- xy_textclips_positions = []
179
- frame_width, frame_height = framesize
180
- max_line_width = frame_width * 0.8
181
- lines, current_line, current_line_width = [], [], 0
182
- for wordJSON in textJSON['textcontents']:
183
- word_upper = wordJSON['word'].upper()
184
- temp_word = get_cached_text_clip(word_upper, font, fontsize, color)
185
- temp_space = get_cached_text_clip(" ", font, fontsize, color)
186
- word_width, word_height = temp_word.size
187
- space_width, _ = temp_space.size
188
- if current_line_width + word_width + space_width > max_line_width and current_line:
189
- lines.append({'words': current_line.copy(), 'width': current_line_width, 'height': word_height})
190
- current_line = [wordJSON]
191
- current_line_width = word_width + space_width
192
- else:
193
- current_line.append(wordJSON)
194
- current_line_width += word_width + space_width
195
- if current_line:
196
- word_upper = current_line[0]['word'].upper()
197
- temp_word = get_cached_text_clip(word_upper, font, fontsize, color)
198
- _, word_height = temp_word.size
199
- lines.append({'words': current_line, 'width': current_line_width, 'height': word_height})
200
- total_text_height = sum(line['height'] for line in lines) + (len(lines) - 1) * 3
201
- subtitle_y_position = int(frame_height * 0.65)
202
- current_y = subtitle_y_position
203
- if lines:
204
- shadow_padding = 25
205
- shadow_height_extra = 15
206
- total_subtitle_width = max(line['width'] for line in lines)
207
- bg_width = int(total_subtitle_width + shadow_padding * 2)
208
- bg_height = int(total_text_height + shadow_height_extra * 2)
209
- img = Image.new('RGBA', (bg_width, bg_height), (0, 0, 0, 0))
210
- draw = ImageDraw.Draw(img)
211
- draw.rounded_rectangle([(0, 0), (bg_width-1, bg_height-1)], radius=15, fill=(0, 0, 0, 128))
212
- img_array = np.array(img)
213
- shadow_bg = ImageClip(img_array, duration=full_duration).set_start(textJSON['start'])
214
- shadow_x = (frame_width - total_subtitle_width) / 2 - shadow_padding
215
- shadow_y = subtitle_y_position - shadow_height_extra
216
- shadow_bg = shadow_bg.set_position((shadow_x, shadow_y))
217
- word_clips.append(shadow_bg)
218
- for line in lines:
219
- line_words = line['words']
220
- word_dimensions = []
221
- for wordJSON in line_words:
222
- word_upper = wordJSON['word'].upper()
223
- temp_word = get_cached_text_clip(word_upper, font, fontsize, color)
224
- temp_space = get_cached_text_clip(" ", font, fontsize, color)
225
- word_width, word_height = temp_word.size
226
- space_width, _ = temp_space.size
227
- word_dimensions.append({
228
- 'word_data': wordJSON,
229
- 'word_width': word_width,
230
- 'word_height': word_height,
231
- 'space_width': space_width,
232
- 'word_upper': word_upper
233
- })
234
- line_start_x = (frame_width - line['width']) / 2
235
- current_x = line_start_x
236
- for word_dim in word_dimensions:
237
- wordJSON = word_dim['word_data']
238
- word_width = word_dim['word_width']
239
- word_height = word_dim['word_height']
240
- space_width = word_dim['space_width']
241
- word_upper = word_dim['word_upper']
242
- shadow_text = get_cached_text_clip(word_upper, font, fontsize, 'black')
243
- shadow_text = shadow_text.set_start(textJSON['start']).set_duration(full_duration)
244
- shadow_text = shadow_text.set_position((current_x + 1, current_y + 1)).set_opacity(0.3)
245
- word_clips.append(shadow_text)
246
- word_clip = get_cached_text_clip(word_upper, font, fontsize, color)
247
- word_clip = word_clip.set_start(textJSON['start']).set_duration(full_duration)
248
- word_clip = word_clip.set_position((current_x, current_y))
249
- space_clip = get_cached_text_clip(" ", font, fontsize, color)
250
- space_clip = space_clip.set_start(textJSON['start']).set_duration(full_duration)
251
- space_clip = space_clip.set_position((current_x + word_width, current_y))
252
- xy_textclips_positions.append({
253
- "x_pos": current_x,
254
- "y_pos": current_y,
255
- "width": word_width,
256
- "height": word_height,
257
- "word": word_upper,
258
- "start": wordJSON['start'],
259
- "end": wordJSON['end'],
260
- "duration": wordJSON['end'] - wordJSON['start']
261
- })
262
- word_clips.append(word_clip)
263
- word_clips.append(space_clip)
264
- current_x += word_width + space_width
265
- current_y += line['height'] + 3
266
- for highlight_word in xy_textclips_positions:
267
- bg_width = int(highlight_word['width'] + 16)
268
- bg_height = int(highlight_word['height'] + 8)
269
- img = Image.new('RGBA', (bg_width, bg_height), (0, 0, 0, 0))
270
- draw = ImageDraw.Draw(img)
271
- draw.rounded_rectangle([(0, 0), (bg_width-1, bg_height-1)], radius=8, fill=(147, 0, 211, 180))
272
- img_array = np.array(img)
273
- bg_clip = ImageClip(img_array, duration=highlight_word['duration'])
274
- bg_clip = bg_clip.set_start(highlight_word['start'])
275
- bg_x = highlight_word['x_pos'] - 8
276
- bg_y = highlight_word['y_pos'] - 4
277
- bg_clip = bg_clip.set_position((bg_x, bg_y))
278
- shadow_highlight = get_cached_text_clip(highlight_word['word'], font, fontsize, 'black')
279
- shadow_highlight = shadow_highlight.set_start(highlight_word['start']).set_duration(highlight_word['duration'])
280
- shadow_highlight = shadow_highlight.set_position((highlight_word['x_pos'] + 1, highlight_word['y_pos'] + 1)).set_opacity(0.4)
281
- word_clip_highlight = get_cached_text_clip(highlight_word['word'], font, fontsize, 'white')
282
- word_clip_highlight = word_clip_highlight.set_start(highlight_word['start']).set_duration(highlight_word['duration'])
283
- word_clip_highlight = word_clip_highlight.set_position((highlight_word['x_pos'], highlight_word['y_pos']))
284
- word_clips.append(bg_clip)
285
- word_clips.append(shadow_highlight)
286
- word_clips.append(word_clip_highlight)
287
- return word_clips
288
-
289
  def get_random_subclip_and_slow(clip):
290
  subclip_durations = [2, 3, 4]
291
  subclip_duration = random.choice(subclip_durations)
@@ -369,41 +278,37 @@ def merge_videos_with_subtitles(text_input, voice_selection, audio_input, title_
369
  generation_cancelled = False
370
  current_video_clip = None
371
  progress(0, desc="Starting...")
372
- if generation_cancelled:
373
- return None, "Generation cancelled"
374
- timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
375
 
376
- # Updated paths for Hugging Face
377
  source_path = 'video_clips'
378
  if not os.path.isdir(source_path):
379
- return None, "Video clips folder not found. Please upload video clips to the 'video_clips' folder."
 
380
  output_path = 'exports'
381
  os.makedirs(output_path, exist_ok=True)
382
 
383
  video_extensions = ('.mp4', '.avi', '.mkv', '.mov')
384
  all_files = [f for f in os.listdir(source_path) if f.lower().endswith(video_extensions)]
385
  if not all_files:
386
- return None, "No video files found in 'video_clips' folder"
 
387
  random.shuffle(all_files)
388
- if generation_cancelled:
389
- return None, "Generation cancelled"
390
  bg_music_path = None
391
  bg_music_folder_path = 'background_music'
392
  if os.path.isdir(bg_music_folder_path):
393
- audio_extensions = ('.mp3', '.wav', '.m4a', '.aac')
394
- possible_files = [f for f in os.listdir(bg_music_folder_path) if f.lower().endswith(audio_extensions) and not f.startswith('voiceover_')]
395
- if len(possible_files) >= 1:
396
  bg_music_path = os.path.join(bg_music_folder_path, possible_files[0])
397
- target_duration_seconds = 0
398
- voice_over_audio = None
399
- linelevel_subtitles = None
400
  voice_over_path = None
 
 
401
  if text_input and text_input.strip():
402
  progress(0.1, desc="Generating TTS...")
403
  voice_name = AVAILABLE_VOICES[voice_selection]["name"] if voice_selection in AVAILABLE_VOICES else "Puck"
404
  tts_path, tts_message = generate_tts_audio(text_input, voice_name)
405
- if generation_cancelled:
406
- return None, "Generation cancelled"
407
  if tts_path:
408
  voice_over_folder_path = 'voice_over'
409
  os.makedirs(voice_over_folder_path, exist_ok=True)
@@ -414,39 +319,33 @@ def merge_videos_with_subtitles(text_input, voice_selection, audio_input, title_
414
  else:
415
  return None, f"TTS failed: {tts_message}"
416
  elif audio_input:
417
- if generation_cancelled:
418
- return None, "Generation cancelled"
419
  voice_over_folder_path = 'voice_over'
420
  os.makedirs(voice_over_folder_path, exist_ok=True)
421
  voice_filename = f"uploaded_voiceover_{timestamp}.mp3"
422
  saved_voice_path = os.path.join(voice_over_folder_path, voice_filename)
423
  shutil.copy2(audio_input, saved_voice_path)
424
  voice_over_path = saved_voice_path
 
425
  if voice_over_path:
426
  try:
427
  progress(0.2, desc="Processing audio...")
428
- if generation_cancelled:
429
- return None, "Generation cancelled"
430
  voice_over_audio = AudioFileClip(voice_over_path)
431
  target_duration_seconds = voice_over_audio.duration
432
  linelevel_subtitles, _ = process_voiceover_to_subtitles(voice_over_path)
433
- if generation_cancelled:
434
- voice_over_audio.close()
435
- return None, "Generation cancelled"
436
  except Exception as e:
437
  return None, f"Audio error: {str(e)}"
438
  else:
439
  if not bg_music_path:
440
  return None, "Need text/audio or background music"
441
  target_duration_seconds = duration_minutes * 60
 
 
442
  progress(0.3, desc="Preparing audio...")
443
- if generation_cancelled:
444
- if voice_over_audio:
445
- voice_over_audio.close()
446
- return None, "Generation cancelled"
447
  audio_tracks = []
448
  if voice_over_audio:
449
  audio_tracks.append(voice_over_audio)
 
450
  if bg_music_path:
451
  try:
452
  background_audio = AudioFileClip(bg_music_path)
@@ -455,45 +354,35 @@ def merge_videos_with_subtitles(text_input, voice_selection, audio_input, title_
455
  audio_tracks.append(background_audio)
456
  except Exception as e:
457
  print(f"Background music error: {e}")
 
458
  final_audio = CompositeAudioClip(audio_tracks) if len(audio_tracks) > 1 else (audio_tracks[0] if audio_tracks else None)
 
459
  progress(0.4, desc="Setting up video...")
460
- if generation_cancelled:
461
- cleanup_resources()
462
- return None, "Generation cancelled"
463
  if video_quality == "High":
464
  target_height, bitrate, preset, crf = 1080, "8000k", "veryfast", "20"
465
  elif video_quality == "Standard":
466
  target_height, bitrate, preset, crf = 720, "4000k", "veryfast", "24"
467
  else:
468
  target_height, bitrate, preset, crf = 480, "1000k", "ultrafast", "28"
 
469
  progress(0.5, desc="Processing clips...")
 
470
  video_clips = []
471
  current_duration = 0
472
  file_index = 0
473
- safety_counter = 0
474
- max_iterations = len(all_files) * 3
475
- while current_duration < target_duration_seconds and safety_counter < max_iterations:
476
- if generation_cancelled:
477
- for clip in video_clips:
478
- try:
479
- clip.close()
480
- except:
481
- pass
482
- cleanup_resources()
483
- return None, "Generation cancelled"
484
  if file_index >= len(all_files):
485
  file_index = 0
486
  random.shuffle(all_files)
 
487
  video_file = all_files[file_index]
488
  file_index += 1
489
- safety_counter += 1
490
  try:
491
  full_clip = VideoFileClip(os.path.join(source_path, video_file))
492
- current_video_clip = full_clip
493
- if generation_cancelled:
494
- full_clip.close()
495
- cleanup_resources()
496
- return None, "Generation cancelled"
497
  if full_clip.h != target_height:
498
  aspect_ratio = full_clip.w / full_clip.h
499
  new_width = int(target_height * aspect_ratio)
@@ -503,40 +392,37 @@ def merge_videos_with_subtitles(text_input, voice_selection, audio_input, title_
503
  full_clip = full_clip.resize((new_width, adjusted_height))
504
  else:
505
  full_clip = ensure_even_dimensions(full_clip)
 
506
  subclip = get_random_subclip_and_slow(full_clip)
507
  remaining_duration = target_duration_seconds - current_duration
 
508
  if subclip.duration > remaining_duration:
509
  subclip = subclip.subclip(0, remaining_duration)
 
510
  video_clips.append(ensure_even_dimensions(subclip))
511
  current_duration += subclip.duration
512
- progress(0.5 + (safety_counter * 0.1 / max_iterations), desc=f"Clip {len(video_clips)}")
513
  except Exception as e:
514
- print(f"Error: {e}")
515
  continue
516
- if generation_cancelled:
517
- for clip in video_clips:
518
- try:
519
- clip.close()
520
- except:
521
- pass
522
- cleanup_resources()
523
- return None, "Generation cancelled"
524
  if not video_clips:
525
  return None, "No clips processed"
526
 
 
527
  total_video_duration = sum(clip.duration for clip in video_clips)
528
  duration_diff = total_video_duration - target_duration_seconds
 
529
  if abs(duration_diff) > 0.1:
530
  if duration_diff > 0:
531
  trim_amount = duration_diff
532
- new_last_clip = video_clips[-1].subclip(0, video_clips[-1].duration - trim_amount)
533
- video_clips[-1] = new_last_clip
534
  else:
535
  extend_amount = abs(duration_diff)
536
- new_last_clip = video_clips[-1].fx(vfx.loop, duration=video_clips[-1].duration + extend_amount)
537
- video_clips[-1] = new_last_clip
538
  progress(0.6, desc="Applying transitions...")
539
  transition_duration = {"Snap Cut": 0.1, "Whip Pan": 0.3, "Dreamy Fade": 0.8, "Smooth Blend": 0.5, "Ken Burns Zoom": 0.5}.get(transition_type, 0.5)
 
540
  processed_clips = []
541
  for i in range(len(video_clips)):
542
  if i == 0:
@@ -551,67 +437,45 @@ def merge_videos_with_subtitles(text_input, voice_selection, audio_input, title_
551
  else:
552
  _, clip_with_transition = apply_transition_effect(video_clips[i-1], video_clips[i], transition_type, transition_duration)
553
  processed_clips.append(clip_with_transition)
 
554
  progress(0.7, desc="Concatenating...")
555
- if generation_cancelled:
556
- for c in processed_clips:
557
- try:
558
- c.close()
559
- except:
560
- pass
561
- cleanup_resources()
562
- return None, "Generation cancelled"
563
  if transition_type == "Snap Cut":
564
  final_video_only = concatenate_videoclips(processed_clips, method="compose")
565
  else:
566
  final_video_only = concatenate_videoclips(processed_clips, method="compose", padding=-transition_duration)
 
567
  final_video_only = ensure_even_dimensions(final_video_only)
568
- current_video_clip = final_video_only
569
- if final_audio:
570
- final_video_only = final_video_only.set_duration(final_audio.duration)
 
 
571
  progress(0.8, desc="Adding overlays...")
572
- if generation_cancelled:
573
- try:
574
- final_video_only.close()
575
- except:
576
- pass
577
- cleanup_resources()
578
- return None, "Generation cancelled"
579
- all_subtitle_clips = []
580
- if linelevel_subtitles:
581
- for line in linelevel_subtitles:
582
- if generation_cancelled:
583
- try:
584
- final_video_only.close()
585
- except:
586
- pass
587
- cleanup_resources()
588
- return None, "Generation cancelled"
589
- try:
590
- subtitle_fontsize = min(42, final_video_only.size[1] // 25)
591
- all_subtitle_clips.extend(create_caption(line, final_video_only.size, font="Helvetica-Bold", fontsize=subtitle_fontsize, color='white'))
592
- except Exception as e:
593
- print(f"Subtitle error: {e}")
594
- continue
595
  all_clips = [final_video_only.set_opacity(0.65)]
596
- if all_subtitle_clips:
597
- all_clips.extend(all_subtitle_clips)
 
 
 
 
 
598
  if title_text and title_text.strip():
599
  title_clips = create_title_overlay(title_text, final_video_only.size, duration=4)
600
  all_clips.extend(title_clips)
 
601
  final_video = CompositeVideoClip(all_clips)
602
- current_video_clip = final_video
603
  if final_audio:
604
  final_video = final_video.set_audio(final_audio)
 
605
  progress(0.9, desc="Exporting...")
606
- if generation_cancelled:
607
- try:
608
- final_video.close()
609
- except:
610
- pass
611
- cleanup_resources()
612
- return None, "Generation cancelled"
613
  output_filename = f'video_{timestamp}.mp4'
614
  final_output_path = os.path.join(output_path, output_filename)
 
615
  try:
616
  final_video.write_videofile(
617
  final_output_path,
@@ -622,65 +486,50 @@ def merge_videos_with_subtitles(text_input, voice_selection, audio_input, title_
622
  bitrate=bitrate,
623
  audio_bitrate="128k",
624
  threads=8,
625
- ffmpeg_params=["-crf", crf, "-pix_fmt", "yuv420p", "-movflags", "+faststart", "-tune", "fastdecode"]
626
  )
627
  except Exception as e:
628
- if generation_cancelled:
629
- return None, "Generation cancelled"
630
  return None, f"Export error: {str(e)}"
 
631
  progress(1.0, desc="Done")
632
- if generation_cancelled:
633
- try:
634
- if os.path.exists(final_output_path):
635
- os.remove(final_output_path)
636
- except:
637
- pass
638
- cleanup_resources()
639
- return None, "Generation cancelled"
640
  try:
641
  final_video.close()
642
  if voice_over_audio:
643
  voice_over_audio.close()
644
- current_video_clip = None
645
  except:
646
  pass
647
- audio_source = ""
648
- if text_input and text_input.strip():
649
- audio_source = f"TTS ({AVAILABLE_VOICES[voice_selection]['name'] if voice_selection in AVAILABLE_VOICES else 'Puck'})"
650
- elif voice_over_path:
651
- audio_source = "Uploaded Audio"
652
- else:
653
- audio_source = "Background Music"
654
- summary = f"Complete\n{output_filename}\n{audio_source}\n{transition_type}\n{target_duration_seconds:.1f}s\n{len(linelevel_subtitles) if linelevel_subtitles else 0} subtitles"
655
  return final_output_path, summary
656
 
657
  with gr.Blocks(title="Video Generator", theme=gr.themes.Soft()) as interface:
658
  gr.Markdown("# 🎬 AI Video Generator")
659
- gr.Markdown("Upload video clips to `video_clips` folder and optionally background music to `background_music` folder.")
660
 
661
  with gr.Row():
662
  with gr.Column():
663
- text_input = gr.Textbox(label="Text for TTS", lines=4, placeholder="Enter text to convert to speech...")
664
  voice_dropdown = gr.Dropdown(
665
  choices=[(f"{v['name']} - {v['description']}", k) for k, v in AVAILABLE_VOICES.items()],
666
  value="Puck",
667
- label="Voice Selection"
668
  )
669
- audio_input = gr.Audio(type="filepath", label="Or Upload Audio File")
670
- title_input = gr.Textbox(label="Video Title (Optional)", lines=2, placeholder="Enter video title...")
671
- duration_slider = gr.Slider(0.5, 10, 2, 0.5, label="Duration (minutes) - only used if no audio")
672
- quality_radio = gr.Radio(["High", "Standard", "Preview"], value="High", label="Video Quality")
673
  transition_radio = gr.Radio(
674
  ["Smooth Blend", "Ken Burns Zoom", "Whip Pan", "Dreamy Fade", "Snap Cut"],
675
  value="Smooth Blend",
676
- label="Transition Effect"
677
  )
678
  with gr.Row():
679
- submit_btn = gr.Button("🎥 Generate Video", variant="primary", size="lg")
680
- stop_btn = gr.Button("⏹️ Stop", variant="stop", size="lg")
681
 
682
  with gr.Column():
683
- video_output = gr.Video(label="Generated Video")
684
  summary_output = gr.Textbox(label="Status", lines=8)
685
 
686
  submit_btn.click(
@@ -691,8 +540,4 @@ with gr.Blocks(title="Video Generator", theme=gr.themes.Soft()) as interface:
691
  stop_btn.click(fn=cancel_generation, outputs=[summary_output, video_output])
692
 
693
  if __name__ == "__main__":
694
- interface.launch(
695
- server_name="0.0.0.0",
696
- server_port=7860,
697
- show_error=True
698
- )
 
5
  import shutil
6
  import wave
7
  import base64
8
+ from moviepy.editor import (VideoFileClip, AudioFileClip, concatenate_videoclips,
9
+ CompositeVideoClip, CompositeAudioClip, ImageClip)
10
  import moviepy.audio.fx.all as afx
11
  import moviepy.video.fx.all as vfx
12
  import gradio as gr
13
+ from PIL import Image, ImageDraw, ImageFont
14
  import numpy as np
 
15
  import urllib.request
16
  from google import genai
17
  from google.genai import types
18
 
 
19
  os.makedirs('video_clips', exist_ok=True)
20
  os.makedirs('background_music', exist_ok=True)
21
  os.makedirs('voice_over', exist_ok=True)
22
  os.makedirs('exports', exist_ok=True)
23
 
 
24
  GOOGLE_API_KEY = os.getenv('GOOGLE_API_KEY', '')
25
  if GOOGLE_API_KEY:
26
  os.environ['GOOGLE_API_KEY'] = GOOGLE_API_KEY
 
74
  return None, f"Error: {str(e)}"
75
 
76
  def split_text_into_lines(data):
77
+ MaxChars, MaxDuration, MaxGap = 40, 2.5, 1.5
78
  subtitles, line, line_duration = [], [], 0
79
  for idx, word_data in enumerate(data):
80
  line.append(word_data)
 
101
  })
102
  return subtitles
103
 
104
def create_subtitle_image(text, frame_size, fontsize=42):
    """Render one subtitle line onto a transparent, frame-sized RGBA image.

    Drawing with PIL is more reliable than MoviePy's TextClip (no
    ImageMagick dependency).

    Args:
        text: Subtitle text; rendered in upper case.
        frame_size: (width, height) of the video frame in pixels.
        fontsize: Font size in pixels.

    Returns:
        numpy uint8 array of shape (height, width, 4): the text drawn on a
        rounded, semi-transparent box near the bottom center of the frame.
    """
    frame_width, frame_height = frame_size

    # Load the Poppins font, downloading it once to /tmp; fall back to
    # PIL's built-in font if the download or load fails (e.g. no network).
    try:
        font_url = "https://github.com/google/fonts/raw/main/ofl/poppins/Poppins-Bold.ttf"
        font_path = "/tmp/Poppins-Bold.ttf"
        if not os.path.exists(font_path):
            urllib.request.urlretrieve(font_url, font_path)
        font = ImageFont.truetype(font_path, fontsize)
    except Exception:  # narrowed from bare except: don't swallow KeyboardInterrupt
        font = ImageFont.load_default()

    # Compute the display form once instead of calling .upper() per draw call.
    display_text = text.upper()

    # Create transparent image covering the whole frame.
    img = Image.new('RGBA', (frame_width, frame_height), (0, 0, 0, 0))
    draw = ImageDraw.Draw(img)

    # Measure the rendered text.
    bbox = draw.textbbox((0, 0), display_text, font=font)
    text_width = bbox[2] - bbox[0]
    text_height = bbox[3] - bbox[1]

    # Position at bottom center (75% of frame height).
    x = (frame_width - text_width) // 2
    y = int(frame_height * 0.75)

    # Semi-transparent rounded background behind the text.
    padding = 20
    draw.rounded_rectangle(
        [x - padding, y - padding, x + text_width + padding, y + text_height + padding],
        radius=15,
        fill=(0, 0, 0, 180),
    )

    # Drop shadow first, then the text itself.
    draw.text((x + 2, y + 2), display_text, font=font, fill=(0, 0, 0, 255))
    draw.text((x, y), display_text, font=font, fill=(255, 255, 255, 255))

    return np.array(img)
145
+
146
def create_simple_subtitles(subtitle_data, frame_size, total_duration):
    """Create simple, reliable subtitles using ImageClips.

    For each timed word in *subtitle_data* (dicts with 'word', 'start' and
    'end' keys), render a frame-sized image via create_subtitle_image and
    wrap it in an ImageClip scheduled at the word's start time.

    Args:
        subtitle_data: Iterable of {'word', 'start', 'end'} dicts.
        frame_size: (width, height) of the video frame in pixels.
        total_duration: Accepted for interface compatibility; not used here.

    Returns:
        List of ImageClips, one per subtitle entry.
    """
    clips = []
    for entry in subtitle_data:
        span = entry['end'] - entry['start']
        # Render this word's subtitle frame and schedule it on the timeline.
        frame = create_subtitle_image(entry['word'], frame_size)
        clips.append(ImageClip(frame, duration=span).set_start(entry['start']))
    return clips
166
 
167
def create_title_overlay(title_text, framesize, duration=4):
    """Return a list with one ImageClip showing the title near the top.

    Returns an empty list when *title_text* is None/empty/whitespace, so
    the result can be extended onto a clip list unconditionally.

    Args:
        title_text: Title string; rendered in upper case.
        framesize: (width, height) of the video frame in pixels.
        duration: Overlay duration in seconds.
    """
    if not title_text or not title_text.strip():
        return []
    frame_width, frame_height = framesize

    # Load the Poppins font (cached in /tmp); fall back to PIL's default
    # font if the download or load fails (e.g. no network).
    try:
        font_url = "https://github.com/google/fonts/raw/main/ofl/poppins/Poppins-Bold.ttf"
        font_path = "/tmp/Poppins-Bold.ttf"
        if not os.path.exists(font_path):
            urllib.request.urlretrieve(font_url, font_path)
        font = ImageFont.truetype(font_path, int(frame_height * 0.06))
    except Exception:  # narrowed from bare except: don't swallow KeyboardInterrupt
        font = ImageFont.load_default()

    base = Image.new("RGBA", (frame_width, frame_height), (0, 0, 0, 0))
    draw = ImageDraw.Draw(base)

    # Simple centered title near the top of the frame.
    text = title_text.upper()
    bbox = draw.textbbox((0, 0), text, font=font)
    text_width = bbox[2] - bbox[0]

    x = (frame_width - text_width) // 2
    y = int(frame_height * 0.1)

    # Drop shadow first, then the title text.
    draw.text((x + 3, y + 3), text, font=font, fill=(0, 0, 0, 200))
    draw.text((x, y), text, font=font, fill=(255, 255, 255, 255))

    return [ImageClip(np.array(base), duration=duration)]
197
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
198
  def get_random_subclip_and_slow(clip):
199
  subclip_durations = [2, 3, 4]
200
  subclip_duration = random.choice(subclip_durations)
 
278
  generation_cancelled = False
279
  current_video_clip = None
280
  progress(0, desc="Starting...")
 
 
 
281
 
282
+ timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
283
  source_path = 'video_clips'
284
  if not os.path.isdir(source_path):
285
+ return None, "Video clips folder not found"
286
+
287
  output_path = 'exports'
288
  os.makedirs(output_path, exist_ok=True)
289
 
290
  video_extensions = ('.mp4', '.avi', '.mkv', '.mov')
291
  all_files = [f for f in os.listdir(source_path) if f.lower().endswith(video_extensions)]
292
  if not all_files:
293
+ return None, "No video files found"
294
+
295
  random.shuffle(all_files)
296
+
 
297
  bg_music_path = None
298
  bg_music_folder_path = 'background_music'
299
  if os.path.isdir(bg_music_folder_path):
300
+ audio_extensions = ('.mp3', '.wav', '.aac')
301
+ possible_files = [f for f in os.listdir(bg_music_folder_path) if f.lower().endswith(audio_extensions)]
302
+ if possible_files:
303
  bg_music_path = os.path.join(bg_music_folder_path, possible_files[0])
304
+
 
 
305
  voice_over_path = None
306
+ linelevel_subtitles = None
307
+
308
  if text_input and text_input.strip():
309
  progress(0.1, desc="Generating TTS...")
310
  voice_name = AVAILABLE_VOICES[voice_selection]["name"] if voice_selection in AVAILABLE_VOICES else "Puck"
311
  tts_path, tts_message = generate_tts_audio(text_input, voice_name)
 
 
312
  if tts_path:
313
  voice_over_folder_path = 'voice_over'
314
  os.makedirs(voice_over_folder_path, exist_ok=True)
 
319
  else:
320
  return None, f"TTS failed: {tts_message}"
321
  elif audio_input:
 
 
322
  voice_over_folder_path = 'voice_over'
323
  os.makedirs(voice_over_folder_path, exist_ok=True)
324
  voice_filename = f"uploaded_voiceover_{timestamp}.mp3"
325
  saved_voice_path = os.path.join(voice_over_folder_path, voice_filename)
326
  shutil.copy2(audio_input, saved_voice_path)
327
  voice_over_path = saved_voice_path
328
+
329
  if voice_over_path:
330
  try:
331
  progress(0.2, desc="Processing audio...")
 
 
332
  voice_over_audio = AudioFileClip(voice_over_path)
333
  target_duration_seconds = voice_over_audio.duration
334
  linelevel_subtitles, _ = process_voiceover_to_subtitles(voice_over_path)
 
 
 
335
  except Exception as e:
336
  return None, f"Audio error: {str(e)}"
337
  else:
338
  if not bg_music_path:
339
  return None, "Need text/audio or background music"
340
  target_duration_seconds = duration_minutes * 60
341
+ voice_over_audio = None
342
+
343
  progress(0.3, desc="Preparing audio...")
344
+
 
 
 
345
  audio_tracks = []
346
  if voice_over_audio:
347
  audio_tracks.append(voice_over_audio)
348
+
349
  if bg_music_path:
350
  try:
351
  background_audio = AudioFileClip(bg_music_path)
 
354
  audio_tracks.append(background_audio)
355
  except Exception as e:
356
  print(f"Background music error: {e}")
357
+
358
  final_audio = CompositeAudioClip(audio_tracks) if len(audio_tracks) > 1 else (audio_tracks[0] if audio_tracks else None)
359
+
360
  progress(0.4, desc="Setting up video...")
361
+
 
 
362
  if video_quality == "High":
363
  target_height, bitrate, preset, crf = 1080, "8000k", "veryfast", "20"
364
  elif video_quality == "Standard":
365
  target_height, bitrate, preset, crf = 720, "4000k", "veryfast", "24"
366
  else:
367
  target_height, bitrate, preset, crf = 480, "1000k", "ultrafast", "28"
368
+
369
  progress(0.5, desc="Processing clips...")
370
+
371
  video_clips = []
372
  current_duration = 0
373
  file_index = 0
374
+
375
+ while current_duration < target_duration_seconds:
 
 
 
 
 
 
 
 
 
376
  if file_index >= len(all_files):
377
  file_index = 0
378
  random.shuffle(all_files)
379
+
380
  video_file = all_files[file_index]
381
  file_index += 1
382
+
383
  try:
384
  full_clip = VideoFileClip(os.path.join(source_path, video_file))
385
+
 
 
 
 
386
  if full_clip.h != target_height:
387
  aspect_ratio = full_clip.w / full_clip.h
388
  new_width = int(target_height * aspect_ratio)
 
392
  full_clip = full_clip.resize((new_width, adjusted_height))
393
  else:
394
  full_clip = ensure_even_dimensions(full_clip)
395
+
396
  subclip = get_random_subclip_and_slow(full_clip)
397
  remaining_duration = target_duration_seconds - current_duration
398
+
399
  if subclip.duration > remaining_duration:
400
  subclip = subclip.subclip(0, remaining_duration)
401
+
402
  video_clips.append(ensure_even_dimensions(subclip))
403
  current_duration += subclip.duration
 
404
  except Exception as e:
405
+ print(f"Error processing {video_file}: {e}")
406
  continue
407
+
 
 
 
 
 
 
 
408
  if not video_clips:
409
  return None, "No clips processed"
410
 
411
+ # Ensure exact duration match
412
  total_video_duration = sum(clip.duration for clip in video_clips)
413
  duration_diff = total_video_duration - target_duration_seconds
414
+
415
  if abs(duration_diff) > 0.1:
416
  if duration_diff > 0:
417
  trim_amount = duration_diff
418
+ video_clips[-1] = video_clips[-1].subclip(0, video_clips[-1].duration - trim_amount)
 
419
  else:
420
  extend_amount = abs(duration_diff)
421
+ video_clips[-1] = video_clips[-1].fx(vfx.loop, duration=video_clips[-1].duration + extend_amount)
422
+
423
  progress(0.6, desc="Applying transitions...")
424
  transition_duration = {"Snap Cut": 0.1, "Whip Pan": 0.3, "Dreamy Fade": 0.8, "Smooth Blend": 0.5, "Ken Burns Zoom": 0.5}.get(transition_type, 0.5)
425
+
426
  processed_clips = []
427
  for i in range(len(video_clips)):
428
  if i == 0:
 
437
  else:
438
  _, clip_with_transition = apply_transition_effect(video_clips[i-1], video_clips[i], transition_type, transition_duration)
439
  processed_clips.append(clip_with_transition)
440
+
441
  progress(0.7, desc="Concatenating...")
442
+
 
 
 
 
 
 
 
443
  if transition_type == "Snap Cut":
444
  final_video_only = concatenate_videoclips(processed_clips, method="compose")
445
  else:
446
  final_video_only = concatenate_videoclips(processed_clips, method="compose", padding=-transition_duration)
447
+
448
  final_video_only = ensure_even_dimensions(final_video_only)
449
+
450
+ # Fix black screen - loop if needed
451
+ if final_audio and final_video_only.duration < final_audio.duration:
452
+ final_video_only = final_video_only.fx(vfx.loop, duration=final_audio.duration)
453
+
454
  progress(0.8, desc="Adding overlays...")
455
+
456
+ # Create subtitle clips using reliable method
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
457
  all_clips = [final_video_only.set_opacity(0.65)]
458
+
459
+ if linelevel_subtitles:
460
+ print(f"Creating {len(linelevel_subtitles)} subtitle sections")
461
+ subtitle_clips = create_simple_subtitles(linelevel_subtitles, final_video_only.size, final_video_only.duration)
462
+ all_clips.extend(subtitle_clips)
463
+ print(f"Added {len(subtitle_clips)} subtitle clips")
464
+
465
  if title_text and title_text.strip():
466
  title_clips = create_title_overlay(title_text, final_video_only.size, duration=4)
467
  all_clips.extend(title_clips)
468
+
469
  final_video = CompositeVideoClip(all_clips)
470
+
471
  if final_audio:
472
  final_video = final_video.set_audio(final_audio)
473
+
474
  progress(0.9, desc="Exporting...")
475
+
 
 
 
 
 
 
476
  output_filename = f'video_{timestamp}.mp4'
477
  final_output_path = os.path.join(output_path, output_filename)
478
+
479
  try:
480
  final_video.write_videofile(
481
  final_output_path,
 
486
  bitrate=bitrate,
487
  audio_bitrate="128k",
488
  threads=8,
489
+ ffmpeg_params=["-crf", crf, "-pix_fmt", "yuv420p", "-movflags", "+faststart"]
490
  )
491
  except Exception as e:
 
 
492
  return None, f"Export error: {str(e)}"
493
+
494
  progress(1.0, desc="Done")
495
+
 
 
 
 
 
 
 
496
  try:
497
  final_video.close()
498
  if voice_over_audio:
499
  voice_over_audio.close()
 
500
  except:
501
  pass
502
+
503
+ audio_source = "TTS" if text_input else ("Uploaded" if audio_input else "BGM")
504
+ summary = f"Complete\n{output_filename}\n{audio_source}\n{target_duration_seconds:.1f}s\n{len(linelevel_subtitles) if linelevel_subtitles else 0} subs"
 
 
 
 
 
505
  return final_output_path, summary
506
 
507
  with gr.Blocks(title="Video Generator", theme=gr.themes.Soft()) as interface:
508
  gr.Markdown("# 🎬 AI Video Generator")
 
509
 
510
  with gr.Row():
511
  with gr.Column():
512
+ text_input = gr.Textbox(label="Text for TTS", lines=4)
513
  voice_dropdown = gr.Dropdown(
514
  choices=[(f"{v['name']} - {v['description']}", k) for k, v in AVAILABLE_VOICES.items()],
515
  value="Puck",
516
+ label="Voice"
517
  )
518
+ audio_input = gr.Audio(type="filepath", label="Or Upload Audio")
519
+ title_input = gr.Textbox(label="Title (Optional)", lines=2)
520
+ duration_slider = gr.Slider(0.5, 10, 2, 0.5, label="Duration (min)")
521
+ quality_radio = gr.Radio(["High", "Standard", "Preview"], value="High", label="Quality")
522
  transition_radio = gr.Radio(
523
  ["Smooth Blend", "Ken Burns Zoom", "Whip Pan", "Dreamy Fade", "Snap Cut"],
524
  value="Smooth Blend",
525
+ label="Transition"
526
  )
527
  with gr.Row():
528
+ submit_btn = gr.Button("Generate Video", variant="primary")
529
+ stop_btn = gr.Button("Stop", variant="stop")
530
 
531
  with gr.Column():
532
+ video_output = gr.Video(label="Output")
533
  summary_output = gr.Textbox(label="Status", lines=8)
534
 
535
  submit_btn.click(
 
540
  stop_btn.click(fn=cancel_generation, outputs=[summary_output, video_output])
541
 
542
# Entry point: launch the Gradio UI only when this file is executed directly
# (importing the module must not start a server).
if __name__ == "__main__":
    interface.launch(
        server_name="0.0.0.0",  # bind all interfaces so the hosted Space is reachable
        server_port=7860,       # the port Hugging Face Spaces expects
        show_error=True,        # surface tracebacks in the UI for easier debugging
    )