OrbitMC committed on
Commit
020c4c9
·
verified ·
1 Parent(s): 69c690d

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +316 -248
app.py CHANGED
@@ -1,29 +1,26 @@
 
1
  from kokoro import KPipeline
2
-
3
  import soundfile as sf
4
  import torch
5
-
6
- import soundfile as sf
7
  import os
8
- from moviepy.editor import VideoFileClip, AudioFileClip, ImageClip
9
- from PIL import Image
10
- import tempfile
 
11
  import random
12
  import cv2
13
  import math
14
- import os, requests, io, time, re, random
 
15
  from moviepy.editor import (
16
- VideoFileClip, concatenate_videoclips, AudioFileClip, ImageClip,
17
- CompositeVideoClip, TextClip, CompositeAudioClip
18
  )
19
  import gradio as gr
20
- import shutil
21
- import os
22
  import moviepy.video.fx.all as vfx
23
  import moviepy.config as mpy_config
24
  from pydub import AudioSegment
25
  from pydub.generators import Sine
26
-
27
  from PIL import Image, ImageDraw, ImageFont
28
  import numpy as np
29
  from bs4 import BeautifulSoup
@@ -31,46 +28,35 @@ import base64
31
  from urllib.parse import quote
32
  import pysrt
33
  from gtts import gTTS
34
- import gradio as gr # Import Gradio
35
 
36
- # Import Kokoro
37
- from kokoro import KPipeline
38
 
39
- # Import moviepy components
40
- from moviepy.editor import (
41
- VideoFileClip, concatenate_videoclips, AudioFileClip, ImageClip,
42
- CompositeVideoClip, TextClip, CompositeAudioClip, concatenate_audioclips
43
- )
44
- import moviepy.video.fx.all as vfx
45
- from PIL import Image, ImageDraw, ImageFont
46
- from pydub import AudioSegment
47
- from pydub.generators import Sine
48
 
49
- # ---------------- Secret Management ---------------- #
50
- # Get secrets from environment variables (set in Gradio Space settings)
51
- PEXELS_API_KEY = os.getenv('PEXELS_API_KEY', '')
52
- OPENROUTER_API_KEY = os.getenv('OPENROUTER_API_KEY', '')
53
- OPENROUTER_MODEL = os.getenv('OPENROUTER_MODEL', 'openai/gpt-oss-120b:free')
54
 
55
- # Check if secrets are loaded
56
- if not PEXELS_API_KEY or not OPENROUTER_API_KEY:
57
- print("WARNING: API keys not found in environment variables!")
58
- print("Please set PEXELS_API_KEY and OPENROUTER_API_KEY in your Gradio Space secrets.")
59
 
60
- # ---------------- Global Configuration ---------------- #
61
- OUTPUT_VIDEO_FILENAME = "final_video.mp4"
62
- USER_AGENT = "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36"
 
 
63
 
64
- # Initialize Kokoro TTS pipeline (using American English)
65
- try:
66
- pipeline = KPipeline(lang_code='a')
67
- except Exception as e:
68
- print(f"Warning: Kokoro initialization failed: {e}")
69
- pipeline = None
70
 
71
- # Global variables for video generation
72
  selected_voice = 'af_heart' # Default voice
73
- voice_speed = 0.9 # Default voice speed
74
  font_size = 45 # Default font size
75
  video_clip_probability = 0.25 # Default probability for video clips
76
  bg_music_volume = 0.08 # Default background music volume
@@ -80,17 +66,53 @@ TARGET_RESOLUTION = None
80
  CAPTION_COLOR = None
81
  TEMP_FOLDER = None
82
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
83
  # ---------------- Helper Functions ---------------- #
84
 
 
 
 
 
 
 
 
 
85
  def generate_script(user_input):
86
  """Generate documentary script with proper OpenRouter handling."""
87
- if not OPENROUTER_API_KEY:
88
- print("OpenRouter API key not configured")
89
- return None
90
-
91
  headers = {
92
  'Authorization': f'Bearer {OPENROUTER_API_KEY}',
93
- 'HTTP-Referer': 'https://huggingface.co', # Use HuggingFace as referer
94
  'X-Title': 'AI Documentary Maker'
95
  }
96
 
@@ -211,10 +233,6 @@ def parse_script(script_text):
211
 
212
  def search_pexels_videos(query, pexels_api_key):
213
  """Search for a video on Pexels by query and return a random HD video."""
214
- if not pexels_api_key:
215
- print("Pexels API key not configured")
216
- return None
217
-
218
  headers = {'Authorization': pexels_api_key}
219
  base_url = "https://api.pexels.com/videos/search"
220
  num_pages = 3
@@ -250,11 +268,11 @@ def search_pexels_videos(query, pexels_api_key):
250
  break
251
 
252
  elif response.status_code == 429:
253
- print(f"Rate limit hit (attempt {attempt+1}/{max_retries}). Retrying in {retry_delay} seconds...")
254
  time.sleep(retry_delay)
255
  retry_delay *= 2
256
  else:
257
- print(f"Error fetching videos: {response.status_code} {response.text}")
258
  break
259
 
260
  except requests.exceptions.RequestException as e:
@@ -266,15 +284,11 @@ def search_pexels_videos(query, pexels_api_key):
266
  print(f"Selected random video from {len(all_videos)} HD videos")
267
  return random_video
268
  else:
269
- print("No suitable videos found after searching all pages.")
270
  return None
271
 
272
  def search_pexels_images(query, pexels_api_key):
273
  """Search for an image on Pexels by query."""
274
- if not pexels_api_key:
275
- print("Pexels API key not configured")
276
- return None
277
-
278
  headers = {'Authorization': pexels_api_key}
279
  url = "https://api.pexels.com/v1/search"
280
  params = {"query": query, "per_page": 5, "orientation": "landscape"}
@@ -298,18 +312,18 @@ def search_pexels_images(query, pexels_api_key):
298
  return None
299
 
300
  elif response.status_code == 429:
301
- print(f"Rate limit hit (attempt {attempt+1}/{max_retries}). Retrying in {retry_delay} seconds...")
302
  time.sleep(retry_delay)
303
  retry_delay *= 2
304
  else:
305
- print(f"Error fetching images: {response.status_code} {response.text}")
306
- break
307
 
308
  except requests.exceptions.RequestException as e:
309
  print(f"Request exception: {e}")
310
- break
311
 
312
- print(f"No Pexels images found for query: {query} after all attempts")
313
  return None
314
 
315
  def search_google_images(query):
@@ -348,8 +362,6 @@ def download_image(image_url, filename):
348
  for chunk in response.iter_content(chunk_size=8192):
349
  f.write(chunk)
350
 
351
- print(f"Image downloaded successfully to: {filename}")
352
-
353
  try:
354
  img = Image.open(filename)
355
  img.verify()
@@ -359,19 +371,14 @@ def download_image(image_url, filename):
359
  img.save(filename)
360
  print(f"Image validated and processed: {filename}")
361
  return filename
362
- except Exception as e_validate:
363
- print(f"Downloaded file is not a valid image: {e_validate}")
364
  if os.path.exists(filename):
365
  os.remove(filename)
366
  return None
367
 
368
- except requests.exceptions.RequestException as e_download:
369
- print(f"Image download error: {e_download}")
370
- if os.path.exists(filename):
371
- os.remove(filename)
372
- return None
373
- except Exception as e_general:
374
- print(f"General error during image processing: {e_general}")
375
  if os.path.exists(filename):
376
  os.remove(filename)
377
  return None
@@ -393,7 +400,7 @@ def download_video(video_url, filename):
393
  return None
394
 
395
  def generate_media(prompt, user_image=None, current_index=0, total_segments=1):
396
- """Generate a visual asset by searching for videos or images."""
397
  safe_prompt = re.sub(r'[^\w\s-]', '', prompt).strip().replace(' ', '_')
398
 
399
  if "news" in prompt.lower():
@@ -450,41 +457,33 @@ def generate_silent_audio(duration, sample_rate=24000):
450
  def generate_tts(text, voice):
451
  """Generate TTS audio using Kokoro, falling back to gTTS or silent audio if needed."""
452
  safe_text = re.sub(r'[^\w\s-]', '', text[:10]).strip().replace(' ', '_')
453
- file_path = os.path.join(TEMP_FOLDER, f"tts_{safe_text}.wav")
454
 
455
- if os.path.exists(file_path):
456
- print(f"Using cached TTS for text '{text[:10]}...'")
 
 
 
 
 
 
 
457
  return file_path
458
-
459
- # Try Kokoro if available
460
- if pipeline:
461
  try:
462
- kokoro_voice = selected_voice if voice == 'en' else voice
463
- generator = pipeline(text, voice=kokoro_voice, speed=voice_speed, split_pattern=r'\n+')
464
- audio_segments = []
465
- for i, (gs, ps, audio) in enumerate(generator):
466
- audio_segments.append(audio)
467
- full_audio = np.concatenate(audio_segments) if len(audio_segments) > 1 else audio_segments[0]
468
- sf.write(file_path, full_audio, 24000)
469
- print(f"TTS audio saved to {file_path} (Kokoro)")
470
  return file_path
471
- except Exception as e:
472
- print(f"Error with Kokoro TTS: {e}")
473
-
474
- # Fallback to gTTS
475
- try:
476
- print("Falling back to gTTS...")
477
- tts = gTTS(text=text, lang='en')
478
- mp3_path = os.path.join(TEMP_FOLDER, f"tts_{safe_text}.mp3")
479
- tts.save(mp3_path)
480
- audio = AudioSegment.from_mp3(mp3_path)
481
- audio.export(file_path, format="wav")
482
- os.remove(mp3_path)
483
- print(f"Fallback TTS saved to {file_path} (gTTS)")
484
- return file_path
485
- except Exception as fallback_error:
486
- print(f"Both TTS methods failed: {fallback_error}")
487
- return generate_silent_audio(duration=max(3, len(text.split()) * 0.5))
488
 
489
  def apply_kenburns_effect(clip, target_resolution, effect_type=None):
490
  """Apply a smooth Ken Burns effect with a single movement pattern."""
@@ -505,8 +504,8 @@ def apply_kenburns_effect(clip, target_resolution, effect_type=None):
505
  new_height = int(new_height * base_scale)
506
  clip = clip.resize(newsize=(new_width, new_height))
507
 
508
- max_offset_x = new_width - target_w
509
- max_offset_y = new_height - target_h
510
 
511
  available_effects = ["zoom-in", "zoom-out", "pan-left", "pan-right", "up-left"]
512
  if effect_type is None or effect_type == "random":
@@ -525,17 +524,17 @@ def apply_kenburns_effect(clip, target_resolution, effect_type=None):
525
  elif effect_type == "pan-left":
526
  start_zoom = 1.0
527
  end_zoom = 1.0
528
- start_center = (max_offset_x + target_w / 2, (max_offset_y // 2) + target_h / 2)
529
- end_center = (target_w / 2, (max_offset_y // 2) + target_h / 2)
530
  elif effect_type == "pan-right":
531
  start_zoom = 1.0
532
  end_zoom = 1.0
533
- start_center = (target_w / 2, (max_offset_y // 2) + target_h / 2)
534
- end_center = (max_offset_x + target_w / 2, (max_offset_y // 2) + target_h / 2)
535
  elif effect_type == "up-left":
536
  start_zoom = 1.0
537
  end_zoom = 1.0
538
- start_center = (max_offset_x + target_w / 2, max_offset_y + target_h / 2)
539
  end_center = (target_w / 2, target_h / 2)
540
  else:
541
  raise ValueError(f"Unsupported effect_type: {effect_type}")
@@ -596,17 +595,16 @@ def add_background_music(final_video, bg_music_volume=0.10):
596
  final_video = final_video.set_audio(mixed_audio)
597
  print("Background music added successfully")
598
  else:
599
- print("No MP3 files found, skipping background music")
600
  return final_video
601
  except Exception as e:
602
  print(f"Error adding background music: {e}")
603
- print("Continuing without background music")
604
  return final_video
605
 
606
  def create_clip(media_path, asset_type, tts_path, duration=None, effects=None, narration_text=None, segment_index=0):
607
  """Create a video clip with synchronized subtitles and narration."""
608
  try:
609
- print(f"Creating clip #{segment_index} with asset_type: {asset_type}, media_path: {media_path}")
610
  if not os.path.exists(media_path) or not os.path.exists(tts_path):
611
  print("Missing media or TTS file")
612
  return None
@@ -635,7 +633,20 @@ def create_clip(media_path, asset_type, tts_path, duration=None, effects=None, n
635
  else:
636
  return None
637
 
638
- # Skip subtitles for now since captions are disabled
 
 
 
 
 
 
 
 
 
 
 
 
 
639
  clip = clip.set_audio(audio_clip)
640
  print(f"Clip created: {clip.duration:.1f}s")
641
  return clip
@@ -643,14 +654,54 @@ def create_clip(media_path, asset_type, tts_path, duration=None, effects=None, n
643
  print(f"Error in create_clip: {str(e)}")
644
  return None
645
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
646
  # ---------------- Main Video Generation Function ---------------- #
 
647
  def generate_video(user_input, resolution, caption_option):
648
- """Generate a video based on user input via Gradio."""
649
  global TARGET_RESOLUTION, CAPTION_COLOR, TEMP_FOLDER
650
 
651
- # Check if API keys are configured
652
- if not PEXELS_API_KEY or not OPENROUTER_API_KEY:
653
- return None
 
654
 
655
  # Set resolution
656
  if resolution == "Full":
@@ -658,7 +709,7 @@ def generate_video(user_input, resolution, caption_option):
658
  elif resolution == "Short":
659
  TARGET_RESOLUTION = (1080, 1920)
660
  else:
661
- TARGET_RESOLUTION = (1920, 1080) # Default
662
 
663
  # Set caption color
664
  CAPTION_COLOR = "white" if caption_option == "Yes" else "transparent"
@@ -666,119 +717,88 @@ def generate_video(user_input, resolution, caption_option):
666
  # Create a unique temporary folder
667
  TEMP_FOLDER = tempfile.mkdtemp()
668
 
669
- print("Generating script from API...")
670
- script = generate_script(user_input)
671
- if not script:
672
- print("Failed to generate script.")
673
- shutil.rmtree(TEMP_FOLDER)
674
- return None
675
- print("Generated Script:\n", script)
676
- elements = parse_script(script)
677
- if not elements:
678
- print("Failed to parse script into elements.")
679
- shutil.rmtree(TEMP_FOLDER)
680
- return None
681
- print(f"Parsed {len(elements)//2} script segments.")
682
-
683
- paired_elements = []
684
- for i in range(0, len(elements), 2):
685
- if i + 1 < len(elements):
686
- paired_elements.append((elements[i], elements[i + 1]))
687
-
688
- if not paired_elements:
689
- print("No valid script segments found.")
690
- shutil.rmtree(TEMP_FOLDER)
691
- return None
692
-
693
- clips = []
694
- for idx, (media_elem, tts_elem) in enumerate(paired_elements):
695
- print(f"\nProcessing segment {idx+1}/{len(paired_elements)} with prompt: '{media_elem['prompt']}'")
696
- media_asset = generate_media(media_elem['prompt'], current_index=idx, total_segments=len(paired_elements))
697
- if not media_asset:
698
- print(f"Skipping segment {idx+1} due to missing media asset.")
699
- continue
700
- tts_path = generate_tts(tts_elem['text'], tts_elem['voice'])
701
- if not tts_path:
702
- print(f"Skipping segment {idx+1} due to TTS generation failure.")
703
- continue
704
- clip = create_clip(
705
- media_path=media_asset['path'],
706
- asset_type=media_asset['asset_type'],
707
- tts_path=tts_path,
708
- duration=tts_elem['duration'],
709
- effects=media_elem.get('effects', 'fade-in'),
710
- narration_text=tts_elem['text'],
711
- segment_index=idx
712
- )
713
- if clip:
714
- clips.append(clip)
715
- else:
716
- print(f"Clip creation failed for segment {idx+1}.")
717
-
718
- if not clips:
719
- print("No clips were successfully created.")
720
- shutil.rmtree(TEMP_FOLDER)
721
- return None
722
-
723
- print("\nConcatenating clips...")
724
- final_video = concatenate_videoclips(clips, method="compose")
725
- final_video = add_background_music(final_video, bg_music_volume=bg_music_volume)
726
-
727
- print(f"Exporting final video to {OUTPUT_VIDEO_FILENAME}...")
728
- final_video.write_videofile(OUTPUT_VIDEO_FILENAME, codec='libx264', fps=fps, preset=preset)
729
- print(f"Final video saved as {OUTPUT_VIDEO_FILENAME}")
730
 
731
- # Clean up
732
- print("Cleaning up temporary files...")
733
- shutil.rmtree(TEMP_FOLDER)
734
- print("Temporary files removed.")
 
 
 
 
 
735
 
736
- return OUTPUT_VIDEO_FILENAME
737
-
738
- # ---------------- Gradio Interface ---------------- #
739
- VOICE_CHOICES = {
740
- 'Emma (Female)': 'af_heart',
741
- 'Bella (Female)': 'af_bella',
742
- 'Nicole (Female)': 'af_nicole',
743
- 'Aoede (Female)': 'af_aoede',
744
- 'Kore (Female)': 'af_kore',
745
- 'Sarah (Female)': 'af_sarah',
746
- 'Nova (Female)': 'af_nova',
747
- 'Sky (Female)': 'af_sky',
748
- 'Alloy (Female)': 'af_alloy',
749
- 'Jessica (Female)': 'af_jessica',
750
- 'River (Female)': 'af_river',
751
- 'Michael (Male)': 'am_michael',
752
- 'Fenrir (Male)': 'am_fenrir',
753
- 'Puck (Male)': 'am_puck',
754
- 'Echo (Male)': 'am_echo',
755
- 'Eric (Male)': 'am_eric',
756
- 'Liam (Male)': 'am_liam',
757
- 'Onyx (Male)': 'am_onyx',
758
- 'Santa (Male)': 'am_santa',
759
- 'Adam (Male)': 'am_adam',
760
- 'Emma 🇬🇧 (Female)': 'bf_emma',
761
- 'Isabella 🇬🇧 (Female)': 'bf_isabella',
762
- 'Alice 🇬🇧 (Female)': 'bf_alice',
763
- 'Lily 🇬🇧 (Female)': 'bf_lily',
764
- 'George 🇬🇧 (Male)': 'bm_george',
765
- 'Fable 🇬🇧 (Male)': 'bm_fable',
766
- 'Lewis 🇬🇧 (Male)': 'bm_lewis',
767
- 'Daniel 🇬🇧 (Male)': 'bm_daniel'
768
- }
769
 
770
  def generate_video_with_options(user_input, resolution, caption_option, music_file, voice, vclip_prob, bg_vol, video_fps, video_preset, v_speed, caption_size):
 
771
  global selected_voice, voice_speed, font_size, video_clip_probability, bg_music_volume, fps, preset
772
 
773
- # Check if API keys are configured
774
- if not PEXELS_API_KEY or not OPENROUTER_API_KEY:
775
- return gr.Error("API keys not configured. Please set PEXELS_API_KEY and OPENROUTER_API_KEY in Space secrets.")
776
-
777
  # Update global variables with user selections
778
  selected_voice = VOICE_CHOICES[voice]
779
  voice_speed = v_speed
780
  font_size = caption_size
781
- video_clip_probability = vclip_prob / 100 # Convert from percentage to decimal
782
  bg_music_volume = bg_vol
783
  fps = video_fps
784
  preset = video_preset
@@ -790,29 +810,77 @@ def generate_video_with_options(user_input, resolution, caption_option, music_fi
790
  print(f"Uploaded music saved as: {target_path}")
791
 
792
  # Generate the video
793
- return generate_video(user_input, resolution, caption_option)
 
 
 
 
 
794
 
795
  # Create the Gradio interface
796
- iface = gr.Interface(
797
- fn=generate_video_with_options,
798
- inputs=[
799
- gr.Textbox(label="Video Concept", placeholder="Enter your video concept here...", lines=3),
800
- gr.Radio(["Full", "Short"], label="Resolution", value="Full"),
801
- gr.Radio(["No"], label="Captions (Coming Soon)", value="No"),
802
- gr.File(label="Upload Background Music (MP3)", file_types=[".mp3"]),
803
- gr.Dropdown(choices=list(VOICE_CHOICES.keys()), label="Choose Voice", value="Emma (Female)"),
804
- gr.Slider(0, 100, value=25, step=1, label="Video Clip Usage Probability (%)"),
805
- gr.Slider(0.0, 1.0, value=0.08, step=0.01, label="Background Music Volume"),
806
- gr.Slider(10, 60, value=30, step=1, label="Video FPS"),
807
- gr.Dropdown(choices=["ultrafast", "superfast", "veryfast", "faster", "fast", "medium", "slow"],
808
- value="veryfast", label="Export Preset"),
809
- gr.Slider(0.5, 1.5, value=1.2, step=0.05, label="Voice Speed"),
810
- gr.Slider(20, 100, value=45, step=1, label="Caption Font Size")
811
- ],
812
- outputs=gr.Video(label="Generated Video"),
813
- title="AI Documentary Video Generator",
814
- description="Create short documentary videos with AI. Upload music, choose voice, and customize settings.\n\n⚠️ **Important**: Make sure to set your API keys in the Space secrets!"
815
- )
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
816
 
817
  # Launch the interface
818
  if __name__ == "__main__":
 
1
+ # Import necessary libraries
2
  from kokoro import KPipeline
 
3
  import soundfile as sf
4
  import torch
 
 
5
  import os
6
+ import requests
7
+ import io
8
+ import time
9
+ import re
10
  import random
11
  import cv2
12
  import math
13
+ import tempfile
14
+ import shutil
15
  from moviepy.editor import (
16
+ VideoFileClip, concatenate_videoclips, AudioFileClip, ImageClip,
17
+ CompositeVideoClip, TextClip, CompositeAudioClip, concatenate_audioclips
18
  )
19
  import gradio as gr
 
 
20
  import moviepy.video.fx.all as vfx
21
  import moviepy.config as mpy_config
22
  from pydub import AudioSegment
23
  from pydub.generators import Sine
 
24
  from PIL import Image, ImageDraw, ImageFont
25
  import numpy as np
26
  from bs4 import BeautifulSoup
 
28
  from urllib.parse import quote
29
  import pysrt
30
  from gtts import gTTS
 
31
 
32
+ # Initialize Kokoro TTS pipeline (using American English)
33
+ pipeline = KPipeline(lang_code='a')
34
 
35
# Try to set ImageMagick binary if available.
# This is best-effort: a failure here must not stop the app from starting,
# because subtitles are rendered with PIL rather than ImageMagick.
try:
    mpy_config.change_settings({"IMAGEMAGICK_BINARY": "/usr/bin/convert"})
except Exception:
    # Catch Exception instead of using a bare ``except:`` so that
    # KeyboardInterrupt and SystemExit still propagate during startup.
    print("ImageMagick not found, using alternative methods")
 
 
 
 
40
 
41
+ # ---------------- Global Configuration ---------------- #
 
 
 
 
42
 
43
+ # Get secrets from environment variables (Gradio Spaces)
44
+ PEXELS_API_KEY = os.environ.get('PEXELS_API_KEY', '')
45
+ OPENROUTER_API_KEY = os.environ.get('OPENROUTER_API_KEY', '')
 
46
 
47
+ # Fallback to hardcoded values if secrets not set (for local testing only)
48
+ if not PEXELS_API_KEY:
49
+ PEXELS_API_KEY = 'YOUR_PEXELS_KEY_HERE' # Replace with your key for local testing
50
+ if not OPENROUTER_API_KEY:
51
+ OPENROUTER_API_KEY = 'YOUR_OPENROUTER_KEY_HERE' # Replace with your key for local testing
52
 
53
+ OPENROUTER_MODEL = "moonshotai/kimi-k2:free"
54
+ OUTPUT_VIDEO_FILENAME = "final_video.mp4"
55
+ USER_AGENT = "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36"
 
 
 
56
 
57
+ # Global variables for Gradio interface
58
  selected_voice = 'af_heart' # Default voice
59
+ voice_speed = 1.2 # Default voice speed (changed from 0.9 to 1.2 to match slider default)
60
  font_size = 45 # Default font size
61
  video_clip_probability = 0.25 # Default probability for video clips
62
  bg_music_volume = 0.08 # Default background music volume
 
66
  CAPTION_COLOR = None
67
  TEMP_FOLDER = None
68
 
69
+ # Voice choices dictionary
70
+ VOICE_CHOICES = {
71
+ 'Emma (Female)': 'af_heart',
72
+ 'Bella (Female)': 'af_bella',
73
+ 'Nicole (Female)': 'af_nicole',
74
+ 'Aoede (Female)': 'af_aoede',
75
+ 'Kore (Female)': 'af_kore',
76
+ 'Sarah (Female)': 'af_sarah',
77
+ 'Nova (Female)': 'af_nova',
78
+ 'Sky (Female)': 'af_sky',
79
+ 'Alloy (Female)': 'af_alloy',
80
+ 'Jessica (Female)': 'af_jessica',
81
+ 'River (Female)': 'af_river',
82
+ 'Michael (Male)': 'am_michael',
83
+ 'Fenrir (Male)': 'am_fenrir',
84
+ 'Puck (Male)': 'am_puck',
85
+ 'Echo (Male)': 'am_echo',
86
+ 'Eric (Male)': 'am_eric',
87
+ 'Liam (Male)': 'am_liam',
88
+ 'Onyx (Male)': 'am_onyx',
89
+ 'Santa (Male)': 'am_santa',
90
+ 'Adam (Male)': 'am_adam',
91
+ 'Emma 🇬🇧 (Female)': 'bf_emma',
92
+ 'Isabella 🇬🇧 (Female)': 'bf_isabella',
93
+ 'Alice 🇬🇧 (Female)': 'bf_alice',
94
+ 'Lily 🇬🇧 (Female)': 'bf_lily',
95
+ 'George 🇬🇧 (Male)': 'bm_george',
96
+ 'Fable 🇬🇧 (Male)': 'bm_fable',
97
+ 'Lewis 🇬🇧 (Male)': 'bm_lewis',
98
+ 'Daniel 🇬🇧 (Male)': 'bm_daniel'
99
+ }
100
+
101
  # ---------------- Helper Functions ---------------- #
102
 
103
def check_api_keys():
    """Report whether both service API keys hold usable values.

    A key counts as unconfigured when it is empty or still equal to the
    placeholder string assigned by the local-testing fallback.

    Returns:
        A ``(ok, message)`` tuple. ``ok`` is False together with a message
        naming the first unconfigured key (Pexels is checked before
        OpenRouter); otherwise ``(True, "API keys configured")``.
    """
    # (current value, placeholder value, failure message), in check order.
    requirements = (
        (PEXELS_API_KEY, 'YOUR_PEXELS_KEY_HERE', "PEXELS_API_KEY not configured"),
        (OPENROUTER_API_KEY, 'YOUR_OPENROUTER_KEY_HERE', "OPENROUTER_API_KEY not configured"),
    )
    for value, placeholder, failure_message in requirements:
        if value == placeholder or not value:
            return False, failure_message
    return True, "API keys configured"
110
+
111
  def generate_script(user_input):
112
  """Generate documentary script with proper OpenRouter handling."""
 
 
 
 
113
  headers = {
114
  'Authorization': f'Bearer {OPENROUTER_API_KEY}',
115
+ 'HTTP-Referer': 'https://huggingface.co',
116
  'X-Title': 'AI Documentary Maker'
117
  }
118
 
 
233
 
234
  def search_pexels_videos(query, pexels_api_key):
235
  """Search for a video on Pexels by query and return a random HD video."""
 
 
 
 
236
  headers = {'Authorization': pexels_api_key}
237
  base_url = "https://api.pexels.com/videos/search"
238
  num_pages = 3
 
268
  break
269
 
270
  elif response.status_code == 429:
271
+ print(f"Rate limit hit. Retrying in {retry_delay} seconds...")
272
  time.sleep(retry_delay)
273
  retry_delay *= 2
274
  else:
275
+ print(f"Error fetching videos: {response.status_code}")
276
  break
277
 
278
  except requests.exceptions.RequestException as e:
 
284
  print(f"Selected random video from {len(all_videos)} HD videos")
285
  return random_video
286
  else:
287
+ print("No suitable videos found.")
288
  return None
289
 
290
  def search_pexels_images(query, pexels_api_key):
291
  """Search for an image on Pexels by query."""
 
 
 
 
292
  headers = {'Authorization': pexels_api_key}
293
  url = "https://api.pexels.com/v1/search"
294
  params = {"query": query, "per_page": 5, "orientation": "landscape"}
 
312
  return None
313
 
314
  elif response.status_code == 429:
315
+ print(f"Rate limit hit. Retrying in {retry_delay} seconds...")
316
  time.sleep(retry_delay)
317
  retry_delay *= 2
318
  else:
319
+ print(f"Error fetching images: {response.status_code}")
320
+ return None
321
 
322
  except requests.exceptions.RequestException as e:
323
  print(f"Request exception: {e}")
324
+ return None
325
 
326
+ print(f"No Pexels images found for query: {query}")
327
  return None
328
 
329
  def search_google_images(query):
 
362
  for chunk in response.iter_content(chunk_size=8192):
363
  f.write(chunk)
364
 
 
 
365
  try:
366
  img = Image.open(filename)
367
  img.verify()
 
371
  img.save(filename)
372
  print(f"Image validated and processed: {filename}")
373
  return filename
374
+ except Exception as e:
375
+ print(f"Downloaded file is not a valid image: {e}")
376
  if os.path.exists(filename):
377
  os.remove(filename)
378
  return None
379
 
380
+ except Exception as e:
381
+ print(f"Image download error: {e}")
 
 
 
 
 
382
  if os.path.exists(filename):
383
  os.remove(filename)
384
  return None
 
400
  return None
401
 
402
  def generate_media(prompt, user_image=None, current_index=0, total_segments=1):
403
+ """Generate a visual asset by searching for media."""
404
  safe_prompt = re.sub(r'[^\w\s-]', '', prompt).strip().replace(' ', '_')
405
 
406
  if "news" in prompt.lower():
 
457
def generate_tts(text, voice):
    """Generate TTS audio using Kokoro, falling back to gTTS or silent audio if needed.

    Args:
        text: Narration text to synthesize.
        voice: Kokoro voice id; the value ``'en'`` selects the module-level
            ``selected_voice`` instead.

    Returns:
        Path to a WAV file inside ``TEMP_FOLDER`` on success of either
        engine, or whatever ``generate_silent_audio`` returns when both
        engines fail.
    """
    # Local import so this block does not depend on the file's import list.
    import uuid

    # Readable prefix from the first characters of the text. Whitespace runs
    # (including newlines and tabs) collapse to "_" so the filename stays on
    # one line.
    prefix = re.sub(r'\s+', '_', re.sub(r'[^\w\s-]', '', text[:10]).strip())
    # A random suffix keeps names unique. A second-resolution timestamp let
    # two segments that start with the same words overwrite each other when
    # generated within the same second.
    stem = f"tts_{prefix}_{uuid.uuid4().hex}"
    file_path = os.path.join(TEMP_FOLDER, f"{stem}.wav")

    try:
        kokoro_voice = selected_voice if voice == 'en' else voice
        generator = pipeline(text, voice=kokoro_voice, speed=voice_speed, split_pattern=r'\n+')
        # Each yielded item unpacks to three values; only the audio is needed.
        audio_segments = [audio for _, _, audio in generator]
        if not audio_segments:
            # Fail with a clear message instead of an IndexError below.
            raise ValueError("Kokoro produced no audio segments")
        full_audio = np.concatenate(audio_segments) if len(audio_segments) > 1 else audio_segments[0]
        sf.write(file_path, full_audio, 24000)
        print(f"TTS audio saved to {file_path} (Kokoro)")
        return file_path
    except Exception as e:
        # Best-effort: any Kokoro failure falls through to the gTTS fallback.
        print(f"Error with Kokoro TTS: {e}")

    mp3_path = os.path.join(TEMP_FOLDER, f"{stem}.mp3")
    try:
        print("Falling back to gTTS...")
        tts = gTTS(text=text, lang='en')
        tts.save(mp3_path)
        audio = AudioSegment.from_mp3(mp3_path)
        audio.export(file_path, format="wav")
        print(f"Fallback TTS saved to {file_path} (gTTS)")
        return file_path
    except Exception as fallback_error:
        print(f"Both TTS methods failed: {fallback_error}")
        return generate_silent_audio(duration=max(3, len(text.split()) * 0.5))
    finally:
        # Remove the intermediate MP3 on success and on failure alike.
        if os.path.exists(mp3_path):
            os.remove(mp3_path)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
487
 
488
  def apply_kenburns_effect(clip, target_resolution, effect_type=None):
489
  """Apply a smooth Ken Burns effect with a single movement pattern."""
 
504
  new_height = int(new_height * base_scale)
505
  clip = clip.resize(newsize=(new_width, new_height))
506
 
507
+ max_offset_x = max(0, new_width - target_w)
508
+ max_offset_y = max(0, new_height - target_h)
509
 
510
  available_effects = ["zoom-in", "zoom-out", "pan-left", "pan-right", "up-left"]
511
  if effect_type is None or effect_type == "random":
 
524
  elif effect_type == "pan-left":
525
  start_zoom = 1.0
526
  end_zoom = 1.0
527
+ start_center = (max_offset_x + target_w / 2, (max_offset_y // 2) + target_h / 2) if max_offset_x > 0 else (new_width / 2, new_height / 2)
528
+ end_center = (target_w / 2, (max_offset_y // 2) + target_h / 2) if max_offset_x > 0 else start_center
529
  elif effect_type == "pan-right":
530
  start_zoom = 1.0
531
  end_zoom = 1.0
532
+ start_center = (target_w / 2, (max_offset_y // 2) + target_h / 2) if max_offset_x > 0 else (new_width / 2, new_height / 2)
533
+ end_center = (max_offset_x + target_w / 2, (max_offset_y // 2) + target_h / 2) if max_offset_x > 0 else start_center
534
  elif effect_type == "up-left":
535
  start_zoom = 1.0
536
  end_zoom = 1.0
537
+ start_center = (max_offset_x + target_w / 2, max_offset_y + target_h / 2) if max_offset_x > 0 and max_offset_y > 0 else (new_width / 2, new_height / 2)
538
  end_center = (target_w / 2, target_h / 2)
539
  else:
540
  raise ValueError(f"Unsupported effect_type: {effect_type}")
 
595
  final_video = final_video.set_audio(mixed_audio)
596
  print("Background music added successfully")
597
  else:
598
+ print("No music.mp3 file found, skipping background music")
599
  return final_video
600
  except Exception as e:
601
  print(f"Error adding background music: {e}")
 
602
  return final_video
603
 
604
  def create_clip(media_path, asset_type, tts_path, duration=None, effects=None, narration_text=None, segment_index=0):
605
  """Create a video clip with synchronized subtitles and narration."""
606
  try:
607
+ print(f"Creating clip #{segment_index} with asset_type: {asset_type}")
608
  if not os.path.exists(media_path) or not os.path.exists(tts_path):
609
  print("Missing media or TTS file")
610
  return None
 
633
  else:
634
  return None
635
 
636
+ # Simplified subtitle handling (no ImageMagick dependency)
637
+ if narration_text and CAPTION_COLOR != "transparent":
638
+ try:
639
+ # Create a simple subtitle without complex text effects
640
+ subtitle_text = narration_text
641
+ # Create subtitle as image overlay to avoid ImageMagick issues
642
+ subtitle_img = create_subtitle_image(subtitle_text, TARGET_RESOLUTION)
643
+ subtitle_clip = ImageClip(subtitle_img).set_duration(target_duration)
644
+ subtitle_clip = subtitle_clip.set_position(('center', 'bottom'))
645
+ clip = CompositeVideoClip([clip, subtitle_clip])
646
+ except Exception as sub_error:
647
+ print(f"Subtitle creation failed: {sub_error}")
648
+ # Continue without subtitles
649
+
650
  clip = clip.set_audio(audio_clip)
651
  print(f"Clip created: {clip.duration:.1f}s")
652
  return clip
 
654
  print(f"Error in create_clip: {str(e)}")
655
  return None
656
 
657
def create_subtitle_image(text, resolution):
    """Create a subtitle image using PIL instead of TextClip to avoid ImageMagick issues.

    The text is word-wrapped to the frame width and drawn in white with a
    semi-transparent black outline on a transparent background. The image is
    at least 100 px tall and grows when the wrapped text needs more room, so
    long narration or large fonts are no longer clipped.

    Args:
        text: Caption text to render.
        resolution: ``(width, height)`` of the target frame. Only the width
            is used; the image height is derived from the text.

    Returns:
        Path of the PNG file written inside ``TEMP_FOLDER``.
    """
    # Local import so this block does not depend on the file's import list.
    import uuid

    width, _ = resolution
    outline = 2        # outline radius in pixels (offsets -2..2, as before)
    side_margin = 20   # horizontal breathing room on each side
    padding = 5        # vertical padding above and below the text
    min_height = 100   # previous fixed strip height, kept as a floor

    # Try to use a font, fall back to default if not available.
    try:
        font = ImageFont.truetype("/usr/share/fonts/truetype/dejavu/DejaVuSans-Bold.ttf", font_size)
    except Exception:
        # Best-effort fallback; unlike a bare ``except:`` this lets
        # KeyboardInterrupt and SystemExit propagate.
        font = ImageFont.load_default()

    # Measure on a scratch canvas: the real canvas size depends on the text.
    probe = ImageDraw.Draw(Image.new('RGBA', (1, 1)))

    def line_width(candidate):
        left, _, right, _ = probe.textbbox((0, 0), candidate, font=font)
        return right - left

    # Greedy word wrap against the usable pixel width.
    max_line_width = max(1, width - 2 * (side_margin + outline))
    lines = []
    current = ""
    for word in text.split():
        candidate = f"{current} {word}" if current else word
        if current and line_width(candidate) > max_line_width:
            lines.append(current)
            current = word
        else:
            current = candidate
    if current:
        lines.append(current)
    wrapped = "\n".join(lines)

    if wrapped:
        left, top, right, bottom = probe.multiline_textbbox(
            (0, 0), wrapped, font=font, align="center"
        )
    else:
        left = top = right = bottom = 0
    text_width = right - left
    text_height = bottom - top

    img_height = max(min_height, int(text_height) + 2 * (outline + padding))
    img = Image.new('RGBA', (width, img_height), (0, 0, 0, 0))
    draw = ImageDraw.Draw(img)

    # Center the inked area; subtracting the bbox origin compensates for the
    # font's internal offset so the glyphs are not shifted off-center.
    x = int((width - text_width) / 2 - left)
    y = int((img_height - text_height) / 2 - top)

    if wrapped:
        # Draw text with outline effect.
        for dx in range(-outline, outline + 1):
            for dy in range(-outline, outline + 1):
                draw.multiline_text(
                    (x + dx, y + dy), wrapped, font=font,
                    fill=(0, 0, 0, 128), align="center",
                )
        draw.multiline_text((x, y), wrapped, font=font, fill=(255, 255, 255, 255), align="center")

    # Random suffix: a second-resolution timestamp could repeat when several
    # subtitles are rendered within the same second.
    temp_path = os.path.join(TEMP_FOLDER, f"subtitle_{uuid.uuid4().hex}.png")
    img.save(temp_path)
    return temp_path
688
+
689
def fix_imagemagick_policy():
    """Placeholder for relaxing ImageMagick's security policy.

    No change is attempted: per the original note, this cannot work in
    Gradio Spaces because sudo access is unavailable, so the situation is
    handled gracefully by reporting failure.

    Returns:
        Always ``False``.
    """
    policy_fixed = False
    return policy_fixed
694
+
695
  # ---------------- Main Video Generation Function ---------------- #
696
+
697
  def generate_video(user_input, resolution, caption_option):
698
+ """Generate a video based on user input."""
699
  global TARGET_RESOLUTION, CAPTION_COLOR, TEMP_FOLDER
700
 
701
+ # Check API keys
702
+ api_status, api_message = check_api_keys()
703
+ if not api_status:
704
+ return None, f"Error: {api_message}. Please configure the API keys in Space secrets."
705
 
706
  # Set resolution
707
  if resolution == "Full":
 
709
  elif resolution == "Short":
710
  TARGET_RESOLUTION = (1080, 1920)
711
  else:
712
+ TARGET_RESOLUTION = (1920, 1080)
713
 
714
  # Set caption color
715
  CAPTION_COLOR = "white" if caption_option == "Yes" else "transparent"
 
717
  # Create a unique temporary folder
718
  TEMP_FOLDER = tempfile.mkdtemp()
719
 
720
+ try:
721
+ print("Generating script from API...")
722
+ script = generate_script(user_input)
723
+ if not script:
724
+ print("Failed to generate script.")
725
+ return None, "Failed to generate script. Please check your API key and try again."
726
+
727
+ print("Generated Script:\n", script)
728
+ elements = parse_script(script)
729
+ if not elements:
730
+ print("Failed to parse script into elements.")
731
+ return None, "Failed to parse script. Please try again."
732
+
733
+ print(f"Parsed {len(elements)//2} script segments.")
734
+
735
+ paired_elements = []
736
+ for i in range(0, len(elements), 2):
737
+ if i + 1 < len(elements):
738
+ paired_elements.append((elements[i], elements[i + 1]))
739
+
740
+ if not paired_elements:
741
+ print("No valid script segments found.")
742
+ return None, "No valid script segments found."
743
+
744
+ clips = []
745
+ for idx, (media_elem, tts_elem) in enumerate(paired_elements):
746
+ print(f"\nProcessing segment {idx+1}/{len(paired_elements)}")
747
+ media_asset = generate_media(media_elem['prompt'], current_index=idx, total_segments=len(paired_elements))
748
+ if not media_asset:
749
+ print(f"Skipping segment {idx+1} due to missing media asset.")
750
+ continue
751
+ tts_path = generate_tts(tts_elem['text'], tts_elem['voice'])
752
+ if not tts_path:
753
+ print(f"Skipping segment {idx+1} due to TTS generation failure.")
754
+ continue
755
+ clip = create_clip(
756
+ media_path=media_asset['path'],
757
+ asset_type=media_asset['asset_type'],
758
+ tts_path=tts_path,
759
+ duration=tts_elem['duration'],
760
+ effects=media_elem.get('effects', 'fade-in'),
761
+ narration_text=tts_elem['text'],
762
+ segment_index=idx
763
+ )
764
+ if clip:
765
+ clips.append(clip)
766
+
767
+ if not clips:
768
+ print("No clips were successfully created.")
769
+ return None, "No clips were successfully created."
770
+
771
+ print("\nConcatenating clips...")
772
+ final_video = concatenate_videoclips(clips, method="compose")
773
+ final_video = add_background_music(final_video, bg_music_volume=bg_music_volume)
774
+
775
+ print(f"Exporting final video...")
776
+ final_video.write_videofile(OUTPUT_VIDEO_FILENAME, codec='libx264', fps=fps, preset=preset)
777
+ print(f"Final video saved as {OUTPUT_VIDEO_FILENAME}")
778
+
779
+ return OUTPUT_VIDEO_FILENAME, "Video generated successfully!"
 
780
 
781
+ except Exception as e:
782
+ print(f"Error during video generation: {str(e)}")
783
+ return None, f"Error: {str(e)}"
784
+ finally:
785
+ # Clean up
786
+ print("Cleaning up temporary files...")
787
+ if TEMP_FOLDER and os.path.exists(TEMP_FOLDER):
788
+ shutil.rmtree(TEMP_FOLDER)
789
+ print("Temporary files removed.")
790
 
791
+ # ---------------- Gradio Interface Functions ---------------- #
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
792
 
793
  def generate_video_with_options(user_input, resolution, caption_option, music_file, voice, vclip_prob, bg_vol, video_fps, video_preset, v_speed, caption_size):
794
+ """Wrapper function for Gradio interface."""
795
  global selected_voice, voice_speed, font_size, video_clip_probability, bg_music_volume, fps, preset
796
 
 
 
 
 
797
  # Update global variables with user selections
798
  selected_voice = VOICE_CHOICES[voice]
799
  voice_speed = v_speed
800
  font_size = caption_size
801
+ video_clip_probability = vclip_prob / 100
802
  bg_music_volume = bg_vol
803
  fps = video_fps
804
  preset = video_preset
 
810
  print(f"Uploaded music saved as: {target_path}")
811
 
812
  # Generate the video
813
+ video_path, message = generate_video(user_input, resolution, caption_option)
814
+
815
+ if video_path:
816
+ return video_path, message
817
+ else:
818
+ return None, message
819
 
820
  # Create the Gradio interface
821
# Build the Blocks UI: inputs on the left, generated video and status on the right.
# NOTE(review): the nesting of the instructions Markdown and of the click
# wiring is reconstructed from statement order — confirm against the
# original layout.
with gr.Blocks(title="AI Documentary Video Generator") as iface:
    gr.Markdown("# 🎬 AI Documentary Video Generator")
    gr.Markdown("Create professional documentary-style videos with AI narration and visuals.")

    with gr.Row():
        # ---- Left column: everything the user can configure ----
        with gr.Column():
            user_input = gr.Textbox(
                lines=3,
                label="Video Concept",
                placeholder="Enter your video concept here... (e.g., 'The history of space exploration', 'Climate change impacts on oceans')",
            )

            with gr.Row():
                resolution = gr.Radio(choices=["Full", "Short"], value="Full", label="Resolution")
                # Only "No" is offered while captions are not available.
                caption_option = gr.Radio(choices=["No"], value="No", label="Captions (Coming Soon)")

            music_file = gr.File(file_types=[".mp3"], label="Upload Background Music (MP3)")

            with gr.Accordion("Advanced Settings", open=False):
                voice = gr.Dropdown(
                    label="Choose Voice",
                    choices=list(VOICE_CHOICES.keys()),
                    value="Emma (Female)",
                )
                vclip_prob = gr.Slider(
                    minimum=0, maximum=100, step=1, value=25,
                    label="Video Clip Usage Probability (%)",
                )
                bg_vol = gr.Slider(
                    minimum=0.0, maximum=1.0, step=0.01, value=0.08,
                    label="Background Music Volume",
                )
                video_fps = gr.Slider(
                    minimum=10, maximum=60, step=1, value=30,
                    label="Video FPS",
                )
                video_preset = gr.Dropdown(
                    label="Export Preset",
                    choices=["ultrafast", "superfast", "veryfast", "faster", "fast", "medium", "slow"],
                    value="veryfast",
                )
                v_speed = gr.Slider(
                    minimum=0.5, maximum=1.5, step=0.05, value=1.2,
                    label="Voice Speed",
                )
                caption_size = gr.Slider(
                    minimum=20, maximum=100, step=1, value=45,
                    label="Caption Font Size",
                )

            generate_btn = gr.Button("🎬 Generate Video", variant="primary")

        # ---- Right column: results ----
        with gr.Column():
            output_video = gr.Video(label="Generated Video")
            status_message = gr.Textbox(label="Status", interactive=False)

    gr.Markdown("""
    ### 📝 Instructions:
    1. Enter a topic or concept for your documentary
    2. Choose resolution (Full for 16:9, Short for 9:16)
    3. Optionally upload background music
    4. Adjust advanced settings if needed
    5. Click Generate Video and wait for processing

    ### ⚠️ Note:
    - Video generation may take 2-5 minutes depending on length
    - Make sure API keys are configured in Space secrets
    """)

    # The input order must match the parameter order of
    # generate_video_with_options.
    generate_btn.click(
        fn=generate_video_with_options,
        inputs=[
            user_input,
            resolution,
            caption_option,
            music_file,
            voice,
            vclip_prob,
            bg_vol,
            video_fps,
            video_preset,
            v_speed,
            caption_size,
        ],
        outputs=[output_video, status_message],
    )
884
 
885
  # Launch the interface
886
  if __name__ == "__main__":