import os
from datetime import datetime
import random
import whisper
import shutil
import wave
import base64
from moviepy.editor import (VideoFileClip, AudioFileClip, TextClip,
                            concatenate_videoclips, CompositeVideoClip,
                            CompositeAudioClip, ImageClip)
import moviepy.audio.fx.all as afx
import moviepy.video.fx.all as vfx
import gradio as gr
from PIL import Image, ImageDraw, ImageFont
import numpy as np
from functools import lru_cache
import urllib.request
from google import genai
from google.genai import types

# CHANGED: Create local directories instead of Google Drive paths
os.makedirs('video_clips', exist_ok=True)
os.makedirs('background_music', exist_ok=True)
os.makedirs('voice_over', exist_ok=True)
os.makedirs('exports', exist_ok=True)

# CHANGED: Get API key from environment variable (secure method)
GOOGLE_API_KEY = os.getenv('GOOGLE_API_KEY', '')
if GOOGLE_API_KEY:
    os.environ['GOOGLE_API_KEY'] = GOOGLE_API_KEY

generation_cancelled = False
current_video_clip = None

AVAILABLE_VOICES = {
    "Puck": {"name": "Puck", "description": "Young adult female (US)"},
    "Charon": {"name": "Charon", "description": "Young adult male (US)"},
    "Kore": {"name": "Kore", "description": "Young adult female (US)"},
    "Fenrir": {"name": "Fenrir", "description": "Young adult male (US)"},
    "Aoede": {"name": "Aoede", "description": "Young adult female (US)"}
}


def wave_file(filename, pcm_data, channels=1, rate=24000, sample_width=2):
    with wave.open(filename, "wb") as wf:
        wf.setnchannels(channels)
        wf.setsampwidth(sample_width)
        wf.setframerate(rate)
        wf.writeframes(pcm_data)


def generate_tts_audio(text_input, voice_name="Puck"):
    global generation_cancelled
    try:
        if generation_cancelled:
            return None, "Generation cancelled"
        client = genai.Client()
        response = client.models.generate_content(
            model="gemini-2.5-flash-preview-tts",
            contents=text_input,
            config=types.GenerateContentConfig(
                response_modalities=["AUDIO"],
                speech_config=types.SpeechConfig(
                    voice_config=types.VoiceConfig(
                        prebuilt_voice_config=types.PrebuiltVoiceConfig(voice_name=voice_name)
                    )
                ),
            )
        )
        if generation_cancelled:
            return None, "Generation cancelled"
        audio_data = response.candidates[0].content.parts[0].inline_data.data
        if isinstance(audio_data, str):
            audio_data = base64.b64decode(audio_data)
        timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
        temp_audio_path = f'/tmp/tts_audio_{timestamp}.wav'
        wave_file(temp_audio_path, audio_data)
        return temp_audio_path, "TTS generated"
    except Exception as e:
        return None, f"Error: {str(e)}"


def split_text_into_lines(data):
    MaxChars, MaxDuration, MaxGap = 60, 2.5, 1.5
    subtitles, line, line_duration = [], [], 0
    for idx, word_data in enumerate(data):
        line.append(word_data)
        line_duration += word_data["end"] - word_data["start"]
        chars_exceeded = len(" ".join(item["word"] for item in line)) > MaxChars
        duration_exceeded = line_duration > MaxDuration
        sentence_ended = word_data["word"].rstrip().endswith(('.', '!', '?'))
        maxgap_exceeded = idx > 0 and word_data['start'] - data[idx - 1]['end'] > MaxGap
        if chars_exceeded or duration_exceeded or sentence_ended or maxgap_exceeded:
            if line:
                subtitles.append({
                    "word": " ".join(item["word"] for item in line),
                    "start": line[0]["start"],
                    "end": line[-1]["end"],
                    "textcontents": line
                })
            line, line_duration = [], 0
    if line:
        subtitles.append({
            "word": " ".join(item["word"] for item in line),
            "start": line[0]["start"],
            "end": line[-1]["end"],
            "textcontents": line
        })
    return subtitles
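
# Illustrative example of the grouping above (made-up timestamps, not from a
# real transcription): a sentence-ending word flushes the current subtitle
# line, and any trailing words become a final line.
#
#   words = [
#       {"word": "Hello", "start": 0.0, "end": 0.4},
#       {"word": "world.", "start": 0.5, "end": 0.9},
#       {"word": "Next", "start": 1.2, "end": 1.5},
#   ]
#   split_text_into_lines(words)
#   # -> [{"word": "Hello world.", "start": 0.0, "end": 0.9, ...},
#   #     {"word": "Next", "start": 1.2, "end": 1.5, ...}]
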
@lru_cache(maxsize=1000)
def get_cached_text_clip(text, font, fontsize, color):
    return TextClip(text, font=font, fontsize=fontsize, color=color)


def create_title_overlay(title_text, framesize, duration=4):
    if not title_text or not title_text.strip():
        return []
    frame_width, frame_height = framesize
    FONT_URL = "https://github.com/google/fonts/raw/main/ofl/poppins/Poppins-Bold.ttf"
    FONT_PATH = "/tmp/Poppins-Bold.ttf"
    if not os.path.exists(FONT_PATH):
        try:
            urllib.request.urlretrieve(FONT_URL, FONT_PATH)
        except Exception:
            FONT_PATH = None
    TOP_MARGIN = int(frame_height * 0.115)
    FONT_SIZE = int(frame_height * 0.042)
    STROKE_WIDTH = max(1, int(frame_height * 0.003))
    LINE_SPACING = max(4, int(frame_height * 0.008))

    def load_font(size):
        try:
            if FONT_PATH and os.path.exists(FONT_PATH):
                return ImageFont.truetype(FONT_PATH, size)
            return ImageFont.truetype("/usr/share/fonts/truetype/liberation/LiberationSans-Bold.ttf", size)
        except Exception:
            return ImageFont.load_default()

    font_obj = load_font(FONT_SIZE)
    base = Image.new("RGBA", (frame_width, frame_height), (0, 0, 0, 0))
    temp_img = Image.new("RGBA", (frame_width, frame_height), (0, 0, 0, 0))
    temp_draw = ImageDraw.Draw(temp_img)

    def measure_text(text, font):
        try:
            bbox = temp_draw.textbbox((0, 0), text, font=font, stroke_width=STROKE_WIDTH)
            return bbox[2] - bbox[0], bbox[3] - bbox[1]
        except Exception:
            return 100, 50

    def wrap_text(text, font, max_width):
        words = text.upper().split()
        lines, current = [], []
        for word in words:
            test_line = " ".join(current + [word])
            w, _ = measure_text(test_line, font)
            if w <= max_width:
                current.append(word)
            else:
                if current:
                    lines.append(" ".join(current))
                    current = [word]
                else:
                    lines.append(word)
                    current = []
        if current:
            lines.append(" ".join(current))
        return lines[:4]

    lines = wrap_text(title_text, font_obj, frame_width * 0.90)
    line_heights = [measure_text(line, font_obj)[1] for line in lines]
    y_start = TOP_MARGIN
    x_center = frame_width // 2
    draw = ImageDraw.Draw(base)
    y = y_start
    for i, line in enumerate(lines):
        w, h = measure_text(line, font_obj)
        x = x_center - w // 2
        draw.text((x + 2, y + 2), line, font=font_obj, fill=(0, 0, 0, 180))
        draw.text((x, y), line, font=font_obj, fill=(255, 255, 255, 255),
                  stroke_width=STROKE_WIDTH, stroke_fill=(0, 0, 0, 255))
        y += line_heights[i] + LINE_SPACING
    return [ImageClip(np.array(base), duration=duration)]
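
# Note: the title overlay is rendered with PIL rather than TextClip, presumably
# so it does not require ImageMagick and can use the downloaded Poppins font
# directly. Rough usage sketch (illustrative frame size for a 9:16 vertical
# video):
#
#   overlay_clips = create_title_overlay("My Travel Vlog", (1080, 1920), duration=4)
#   # -> [ImageClip] sized to the frame, layered later in a CompositeVideoClip
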
def create_caption(textJSON, framesize, font="Helvetica-Bold", fontsize=14, color='white'):
    full_duration = textJSON['end'] - textJSON['start']
    word_clips = []
    xy_textclips_positions = []
    frame_width, frame_height = framesize
    max_line_width = frame_width * 0.8
    lines, current_line, current_line_width = [], [], 0
    for wordJSON in textJSON['textcontents']:
        word_upper = wordJSON['word'].upper()
        temp_word = get_cached_text_clip(word_upper, font, fontsize, color)
        temp_space = get_cached_text_clip(" ", font, fontsize, color)
        word_width, word_height = temp_word.size
        space_width, _ = temp_space.size
        if current_line_width + word_width + space_width > max_line_width and current_line:
            lines.append({'words': current_line.copy(), 'width': current_line_width, 'height': word_height})
            current_line = [wordJSON]
            current_line_width = word_width + space_width
        else:
            current_line.append(wordJSON)
            current_line_width += word_width + space_width
    if current_line:
        word_upper = current_line[0]['word'].upper()
        temp_word = get_cached_text_clip(word_upper, font, fontsize, color)
        _, word_height = temp_word.size
        lines.append({'words': current_line, 'width': current_line_width, 'height': word_height})
    total_text_height = sum(line['height'] for line in lines) + (len(lines) - 1) * 3
    subtitle_y_position = int(frame_height * 0.65)
    current_y = subtitle_y_position
    if lines:
        shadow_padding = 25
        shadow_height_extra = 15
        total_subtitle_width = max(line['width'] for line in lines)
        bg_width = int(total_subtitle_width + shadow_padding * 2)
        bg_height = int(total_text_height + shadow_height_extra * 2)
        img = Image.new('RGBA', (bg_width, bg_height), (0, 0, 0, 0))
        draw = ImageDraw.Draw(img)
        draw.rounded_rectangle([(0, 0), (bg_width - 1, bg_height - 1)], radius=15, fill=(0, 0, 0, 128))
        img_array = np.array(img)
        shadow_bg = ImageClip(img_array, duration=full_duration).set_start(textJSON['start'])
        shadow_x = (frame_width - total_subtitle_width) / 2 - shadow_padding
        shadow_y = subtitle_y_position - shadow_height_extra
        shadow_bg = shadow_bg.set_position((shadow_x, shadow_y))
        word_clips.append(shadow_bg)
    for line in lines:
        line_words = line['words']
        word_dimensions = []
        for wordJSON in line_words:
            word_upper = wordJSON['word'].upper()
            temp_word = get_cached_text_clip(word_upper, font, fontsize, color)
            temp_space = get_cached_text_clip(" ", font, fontsize, color)
            word_width, word_height = temp_word.size
            space_width, _ = temp_space.size
            word_dimensions.append({
                'word_data': wordJSON,
                'word_width': word_width,
                'word_height': word_height,
                'space_width': space_width,
                'word_upper': word_upper
            })
        line_start_x = (frame_width - line['width']) / 2
        current_x = line_start_x
        for word_dim in word_dimensions:
            wordJSON = word_dim['word_data']
            word_width = word_dim['word_width']
            word_height = word_dim['word_height']
            space_width = word_dim['space_width']
            word_upper = word_dim['word_upper']
            shadow_text = get_cached_text_clip(word_upper, font, fontsize, 'black')
            shadow_text = shadow_text.set_start(textJSON['start']).set_duration(full_duration)
            shadow_text = shadow_text.set_position((current_x + 1, current_y + 1)).set_opacity(0.3)
            word_clips.append(shadow_text)
            word_clip = get_cached_text_clip(word_upper, font, fontsize, color)
            word_clip = word_clip.set_start(textJSON['start']).set_duration(full_duration)
            word_clip = word_clip.set_position((current_x, current_y))
            space_clip = get_cached_text_clip(" ", font, fontsize, color)
            space_clip = space_clip.set_start(textJSON['start']).set_duration(full_duration)
            space_clip = space_clip.set_position((current_x + word_width, current_y))
            xy_textclips_positions.append({
                "x_pos": current_x,
                "y_pos": current_y,
                "width": word_width,
                "height": word_height,
                "word": word_upper,
                "start": wordJSON['start'],
                "end": wordJSON['end'],
                "duration": wordJSON['end'] - wordJSON['start']
            })
            word_clips.append(word_clip)
            word_clips.append(space_clip)
            current_x += word_width + space_width
        current_y += line['height'] + 3
    for highlight_word in xy_textclips_positions:
        bg_width = int(highlight_word['width'] + 16)
        bg_height = int(highlight_word['height'] + 8)
        img = Image.new('RGBA', (bg_width, bg_height), (0, 0, 0, 0))
        draw = ImageDraw.Draw(img)
        draw.rounded_rectangle([(0, 0), (bg_width - 1, bg_height - 1)], radius=8, fill=(147, 0, 211, 180))
        img_array = np.array(img)
        bg_clip = ImageClip(img_array, duration=highlight_word['duration'])
        bg_clip = bg_clip.set_start(highlight_word['start'])
        bg_x = highlight_word['x_pos'] - 8
        bg_y = highlight_word['y_pos'] - 4
        bg_clip = bg_clip.set_position((bg_x, bg_y))
        shadow_highlight = get_cached_text_clip(highlight_word['word'], font, fontsize, 'black')
        shadow_highlight = shadow_highlight.set_start(highlight_word['start']).set_duration(highlight_word['duration'])
        shadow_highlight = shadow_highlight.set_position((highlight_word['x_pos'] + 1, highlight_word['y_pos'] + 1)).set_opacity(0.4)
        word_clip_highlight = get_cached_text_clip(highlight_word['word'], font, fontsize, 'white')
        word_clip_highlight = word_clip_highlight.set_start(highlight_word['start']).set_duration(highlight_word['duration'])
        word_clip_highlight = word_clip_highlight.set_position((highlight_word['x_pos'], highlight_word['y_pos']))
        word_clips.append(bg_clip)
        word_clips.append(shadow_highlight)
        word_clips.append(word_clip_highlight)
    return word_clips
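
# The caption builder returns a flat list of MoviePy clips: a rounded
# background panel, per-word drop shadows and text, and a purple highlight
# timed to each word's start/end. Rough usage sketch with one subtitle line in
# the shape produced by split_text_into_lines (illustrative values):
#
#   line = {"word": "Hello world.", "start": 0.0, "end": 0.9,
#           "textcontents": [{"word": "Hello", "start": 0.0, "end": 0.4},
#                            {"word": "world.", "start": 0.5, "end": 0.9}]}
#   clips = create_caption(line, framesize=(1080, 1920), fontsize=42)
#   # clips are layered over the video in the CompositeVideoClip built below
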
def get_random_subclip_and_slow(clip):
    subclip_durations = [2, 3, 4]
    subclip_duration = random.choice(subclip_durations)
    if clip.duration < subclip_duration:
        return clip.speedx(0.5)
    start_time = random.uniform(0, clip.duration - subclip_duration)
    subclip = clip.subclip(start_time, start_time + subclip_duration)
    return subclip.speedx(0.5)


def ensure_even_dimensions(clip):
    width, height = clip.size
    if width % 2 != 0:
        width -= 1
    if height % 2 != 0:
        height -= 1
    if (width, height) != clip.size:
        return clip.resize((width, height))
    return clip


def apply_transition_effect(clip1, clip2, transition_type, duration=0.5):
    if transition_type == "Smooth Blend":
        return clip1.crossfadeout(duration), clip2.crossfadein(duration)
    elif transition_type == "Ken Burns Zoom":
        def zoom_in(t):
            return 1 + (0.15 * min(t / clip1.duration, 1))
        clip1_zoom = clip1.resize(zoom_in)
        clip1_out = clip1_zoom.crossfadeout(duration)

        def zoom_out(t):
            return 1.15 - (0.15 * min(t / duration, 1))
        clip2_zoom = clip2.resize(zoom_out) if clip2.duration >= duration else clip2
        clip2_in = clip2_zoom.crossfadein(duration)
        return clip1_out, clip2_in
    elif transition_type == "Whip Pan":
        return clip1.fadeout(duration * 0.5), clip2.fadein(duration * 0.5)
    elif transition_type == "Dreamy Fade":
        return clip1.crossfadeout(duration * 1.2), clip2.crossfadein(duration * 1.2)
    elif transition_type == "Snap Cut":
        return clip1, clip2
    else:
        return clip1.crossfadeout(duration), clip2.crossfadein(duration)


def process_voiceover_to_subtitles(voice_over_path):
    global generation_cancelled
    try:
        if generation_cancelled:
            return [], ""
        model = whisper.load_model("tiny")
        result = model.transcribe(voice_over_path, word_timestamps=True, fp16=False)
        if generation_cancelled:
            return [], ""
        wordlevel_info = []
        for segment in result['segments']:
            if generation_cancelled:
                return [], ""
            if 'words' in segment:
                for word in segment['words']:
                    wordlevel_info.append({'word': word['word'].strip(),
                                           'start': word['start'],
                                           'end': word['end']})
        return split_text_into_lines(wordlevel_info), result['text']
    except Exception as e:
        if generation_cancelled:
            return [], ""
        raise e


def cleanup_resources():
    global current_video_clip
    try:
        if current_video_clip:
            current_video_clip.close()
            current_video_clip = None
    except Exception:
        pass


def cancel_generation():
    global generation_cancelled
    generation_cancelled = True
    cleanup_resources()
    return "Generation cancelled", None
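
# Note: libx264 with yuv420p (the export settings used below) rejects odd frame
# dimensions, hence ensure_even_dimensions() after every resize. Slowing
# subclips to 0.5x in get_random_subclip_and_slow() means a picked 2-4 s window
# contributes 4-8 s of footage toward the target duration.
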
def merge_videos_with_subtitles(text_input, voice_selection, audio_input, title_text,
                                duration_minutes, video_quality, transition_type,
                                progress=gr.Progress(track_tqdm=True)):
    global generation_cancelled, current_video_clip
    generation_cancelled = False
    current_video_clip = None
    progress(0, desc="Starting...")
    if generation_cancelled:
        return None, "Generation cancelled"
    timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")

    # CHANGED: Use local paths instead of Google Drive
    source_path = 'video_clips'
    if not os.path.isdir(source_path):
        return None, "Video clips folder not found"
    output_path = 'exports'
    os.makedirs(output_path, exist_ok=True)
    video_extensions = ('.mp4', '.avi', '.mkv', '.mov')
    all_files = [f for f in os.listdir(source_path) if f.lower().endswith(video_extensions)]
    if not all_files:
        return None, "No video files found"
    random.shuffle(all_files)
    if generation_cancelled:
        return None, "Generation cancelled"

    bg_music_path = None
    # CHANGED: Use local background_music folder
    bg_music_folder_path = 'background_music'
    if os.path.isdir(bg_music_folder_path):
        audio_extensions = ('.mp3', '.wav', '.m4a', '.aac')
        possible_files = [f for f in os.listdir(bg_music_folder_path)
                          if f.lower().endswith(audio_extensions) and not f.startswith('voiceover_')]
        if len(possible_files) >= 1:
            bg_music_path = os.path.join(bg_music_folder_path, possible_files[0])

    target_duration_seconds = 0
    voice_over_audio = None
    linelevel_subtitles = None
    voice_over_path = None

    if text_input and text_input.strip():
        progress(0.1, desc="Generating TTS...")
        voice_name = AVAILABLE_VOICES[voice_selection]["name"] if voice_selection in AVAILABLE_VOICES else "Puck"
        tts_path, tts_message = generate_tts_audio(text_input, voice_name)
        if generation_cancelled:
            return None, "Generation cancelled"
        if tts_path:
            # CHANGED: Use local voice_over folder
            voice_over_folder_path = 'voice_over'
            os.makedirs(voice_over_folder_path, exist_ok=True)
            voice_filename = f"tts_voiceover_{timestamp}.wav"
            saved_voice_path = os.path.join(voice_over_folder_path, voice_filename)
            shutil.copy2(tts_path, saved_voice_path)
            voice_over_path = saved_voice_path
        else:
            return None, f"TTS failed: {tts_message}"
    elif audio_input:
        if generation_cancelled:
            return None, "Generation cancelled"
        voice_over_folder_path = 'voice_over'
        os.makedirs(voice_over_folder_path, exist_ok=True)
        voice_filename = f"uploaded_voiceover_{timestamp}.mp3"
        saved_voice_path = os.path.join(voice_over_folder_path, voice_filename)
        shutil.copy2(audio_input, saved_voice_path)
        voice_over_path = saved_voice_path

    if voice_over_path:
        try:
            progress(0.2, desc="Processing audio...")
            if generation_cancelled:
                return None, "Generation cancelled"
            voice_over_audio = AudioFileClip(voice_over_path)
            target_duration_seconds = voice_over_audio.duration
            linelevel_subtitles, _ = process_voiceover_to_subtitles(voice_over_path)
            if generation_cancelled:
                voice_over_audio.close()
                return None, "Generation cancelled"
        except Exception as e:
            return None, f"Audio error: {str(e)}"
    else:
        if not bg_music_path:
            return None, "Need text/audio or background music"
        target_duration_seconds = duration_minutes * 60

    progress(0.3, desc="Preparing audio...")
    if generation_cancelled:
        if voice_over_audio:
            voice_over_audio.close()
        return None, "Generation cancelled"
    audio_tracks = []
    if voice_over_audio:
        audio_tracks.append(voice_over_audio)
    if bg_music_path:
        try:
            background_audio = AudioFileClip(bg_music_path)
            # CHANGED: Increased volume from 0.015 to 0.10 (louder background music)
            background_audio = background_audio.fx(afx.volumex, 0.10)
            background_audio = background_audio.fx(afx.audio_loop, duration=target_duration_seconds)
            audio_tracks.append(background_audio)
        except Exception as e:
            print(f"Background music error: {e}")
    final_audio = CompositeAudioClip(audio_tracks) if len(audio_tracks) > 1 else (audio_tracks[0] if audio_tracks else None)

    progress(0.4, desc="Setting up video...")
    if generation_cancelled:
        cleanup_resources()
        return None, "Generation cancelled"
    if video_quality == "High":
        target_height, bitrate, preset, crf = 1080, "8000k", "veryfast", "20"
    elif video_quality == "Standard":
        target_height, bitrate, preset, crf = 720, "4000k", "veryfast", "24"
    else:
        target_height, bitrate, preset, crf = 480, "1000k", "ultrafast", "28"

    progress(0.5, desc="Processing clips...")
    video_clips = []
    current_duration = 0
    file_index = 0
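    # The loop below draws clips in shuffled order, re-shuffling once the list
    # is exhausted, until enough (slowed) footage covers
    # target_duration_seconds. safety_counter caps iterations at 3x the file
    # count so unreadable files cannot spin the loop forever.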
    safety_counter = 0
    max_iterations = len(all_files) * 3
    while current_duration < target_duration_seconds and safety_counter < max_iterations:
        if generation_cancelled:
            for clip in video_clips:
                try:
                    clip.close()
                except Exception:
                    pass
            cleanup_resources()
            return None, "Generation cancelled"
        if file_index >= len(all_files):
            file_index = 0
            random.shuffle(all_files)
        video_file = all_files[file_index]
        file_index += 1
        safety_counter += 1
        try:
            full_clip = VideoFileClip(os.path.join(source_path, video_file))
            current_video_clip = full_clip
            if generation_cancelled:
                full_clip.close()
                cleanup_resources()
                return None, "Generation cancelled"
            if full_clip.h != target_height:
                aspect_ratio = full_clip.w / full_clip.h
                new_width = int(target_height * aspect_ratio)
                if new_width % 2 != 0:
                    new_width -= 1
                adjusted_height = target_height if target_height % 2 == 0 else target_height - 1
                full_clip = full_clip.resize((new_width, adjusted_height))
            else:
                full_clip = ensure_even_dimensions(full_clip)
            subclip = get_random_subclip_and_slow(full_clip)
            remaining_duration = target_duration_seconds - current_duration
            if subclip.duration > remaining_duration:
                subclip = subclip.subclip(0, remaining_duration)
            video_clips.append(ensure_even_dimensions(subclip))
            current_duration += subclip.duration
            progress(0.5 + (safety_counter * 0.1 / max_iterations), desc=f"Clip {len(video_clips)}")
        except Exception as e:
            print(f"Error: {e}")
            continue

    if generation_cancelled:
        for clip in video_clips:
            try:
                clip.close()
            except Exception:
                pass
        cleanup_resources()
        return None, "Generation cancelled"
    if not video_clips:
        return None, "No clips processed"

    # FIXED: Ensure exact duration match to prevent black screens
    total_video_duration = sum(clip.duration for clip in video_clips)
    duration_diff = total_video_duration - target_duration_seconds
    if abs(duration_diff) > 0.1:
        if duration_diff > 0:
            trim_amount = duration_diff
            new_last_clip = video_clips[-1].subclip(0, video_clips[-1].duration - trim_amount)
            video_clips[-1] = new_last_clip
        else:
            extend_amount = abs(duration_diff)
            new_last_clip = video_clips[-1].fx(vfx.loop, duration=video_clips[-1].duration + extend_amount)
            video_clips[-1] = new_last_clip

    progress(0.6, desc="Applying transitions...")
    transition_duration = {"Snap Cut": 0.1, "Whip Pan": 0.3, "Dreamy Fade": 0.8,
                           "Smooth Blend": 0.5, "Ken Burns Zoom": 0.5}.get(transition_type, 0.5)
    processed_clips = []
    for i in range(len(video_clips)):
        if i == 0:
            if len(video_clips) > 1:
                clip_out, _ = apply_transition_effect(video_clips[i], video_clips[i + 1], transition_type, transition_duration)
                processed_clips.append(clip_out)
            else:
                processed_clips.append(video_clips[i])
        elif i == len(video_clips) - 1:
            _, clip_in = apply_transition_effect(video_clips[i - 1], video_clips[i], transition_type, transition_duration)
            processed_clips.append(clip_in)
        else:
            _, clip_with_transition = apply_transition_effect(video_clips[i - 1], video_clips[i], transition_type, transition_duration)
            processed_clips.append(clip_with_transition)

    progress(0.7, desc="Concatenating...")
    if generation_cancelled:
        for c in processed_clips:
            try:
                c.close()
            except Exception:
                pass
        cleanup_resources()
        return None, "Generation cancelled"
    if transition_type == "Snap Cut":
        final_video_only = concatenate_videoclips(processed_clips, method="compose")
    else:
        final_video_only = concatenate_videoclips(processed_clips, method="compose", padding=-transition_duration)
    final_video_only = ensure_even_dimensions(final_video_only)
    current_video_clip = final_video_only
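
    # With method="compose" and a negative padding, each clip starts
    # transition_duration seconds before the previous one ends, so the
    # crossfadein/crossfadeout pairs from apply_transition_effect() overlap
    # instead of fading through black.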
    # FIXED: Loop video if shorter than audio to prevent black screen
    if final_audio and final_video_only.duration < final_audio.duration:
        final_video_only = final_video_only.fx(vfx.loop, duration=final_audio.duration)

    progress(0.8, desc="Adding overlays...")
    if generation_cancelled:
        try:
            final_video_only.close()
        except Exception:
            pass
        cleanup_resources()
        return None, "Generation cancelled"
    all_subtitle_clips = []
    if linelevel_subtitles:
        for line in linelevel_subtitles:
            if generation_cancelled:
                try:
                    final_video_only.close()
                except Exception:
                    pass
                cleanup_resources()
                return None, "Generation cancelled"
            try:
                subtitle_fontsize = min(42, final_video_only.size[1] // 25)
                all_subtitle_clips.extend(create_caption(line, final_video_only.size,
                                                         font="Helvetica-Bold",
                                                         fontsize=subtitle_fontsize,
                                                         color='white'))
            except Exception as e:
                print(f"Subtitle error: {e}")
                continue
    all_clips = [final_video_only.set_opacity(0.65)]
    if all_subtitle_clips:
        all_clips.extend(all_subtitle_clips)
    if title_text and title_text.strip():
        title_clips = create_title_overlay(title_text, final_video_only.size, duration=4)
        all_clips.extend(title_clips)
    final_video = CompositeVideoClip(all_clips)
    current_video_clip = final_video
    if final_audio:
        final_video = final_video.set_audio(final_audio)

    progress(0.9, desc="Exporting...")
    if generation_cancelled:
        try:
            final_video.close()
        except Exception:
            pass
        cleanup_resources()
        return None, "Generation cancelled"
    output_filename = f'video_{timestamp}.mp4'
    final_output_path = os.path.join(output_path, output_filename)
    try:
        final_video.write_videofile(
            final_output_path,
            codec="libx264",
            audio_codec="aac",
            fps=24,
            preset=preset,
            bitrate=bitrate,
            audio_bitrate="128k",
            threads=8,
            ffmpeg_params=["-crf", crf, "-pix_fmt", "yuv420p",
                           "-movflags", "+faststart", "-tune", "fastdecode"]
        )
    except Exception as e:
        if generation_cancelled:
            return None, "Generation cancelled"
        return None, f"Export error: {str(e)}"

    progress(1.0, desc="Done")
    if generation_cancelled:
        try:
            if os.path.exists(final_output_path):
                os.remove(final_output_path)
        except Exception:
            pass
        cleanup_resources()
        return None, "Generation cancelled"
    try:
        final_video.close()
        if voice_over_audio:
            voice_over_audio.close()
        current_video_clip = None
    except Exception:
        pass

    audio_source = ""
    if text_input and text_input.strip():
        audio_source = f"TTS ({AVAILABLE_VOICES[voice_selection]['name'] if voice_selection in AVAILABLE_VOICES else 'Puck'})"
    elif voice_over_path:
        audio_source = "Uploaded Audio"
    else:
        audio_source = "Background Music"
    summary = f"Complete\n{output_filename}\n{audio_source}\n{transition_type}\n{target_duration_seconds:.1f}s\n{len(linelevel_subtitles) if linelevel_subtitles else 0} subtitles"
    return final_output_path, summary
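
# Rough headless usage sketch (illustrative; bypasses the UI entirely — whether
# gr.Progress no-ops when called outside an event handler depends on the
# Gradio version):
#
#   path, status = merge_videos_with_subtitles(
#       text_input="Welcome to the channel!", voice_selection="Puck",
#       audio_input=None, title_text="My Title", duration_minutes=1,
#       video_quality="Preview", transition_type="Snap Cut")
#   print(status)
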
# CHANGED: Removed share=True and debug=True for production
with gr.Blocks(title="Video Generator", theme=gr.themes.Soft()) as interface:
    gr.Markdown("# 🎬 AI Video Generator")
    gr.Markdown("Upload video clips to `video_clips` folder and optionally background music to `background_music` folder.")
    with gr.Row():
        with gr.Column():
            text_input = gr.Textbox(label="Text for TTS", lines=4, placeholder="Enter text to convert to speech...")
            voice_dropdown = gr.Dropdown(
                choices=[(f"{v['name']} - {v['description']}", k) for k, v in AVAILABLE_VOICES.items()],
                value="Puck",
                label="Voice Selection"
            )
            audio_input = gr.Audio(type="filepath", label="Or Upload Audio File")
            title_input = gr.Textbox(label="Video Title (Optional)", lines=2, placeholder="Enter video title...")
            # step must be passed by keyword in current Gradio releases
            duration_slider = gr.Slider(minimum=0.5, maximum=10, value=2, step=0.5,
                                        label="Duration (minutes) - only used if no audio")
            quality_radio = gr.Radio(["High", "Standard", "Preview"], value="High", label="Video Quality")
            transition_radio = gr.Radio(
                ["Smooth Blend", "Ken Burns Zoom", "Whip Pan", "Dreamy Fade", "Snap Cut"],
                value="Smooth Blend",
                label="Transition Effect"
            )
            with gr.Row():
                submit_btn = gr.Button("🎥 Generate Video", variant="primary", size="lg")
                stop_btn = gr.Button("⏹️ Stop", variant="stop", size="lg")
        with gr.Column():
            video_output = gr.Video(label="Generated Video")
            summary_output = gr.Textbox(label="Status", lines=8)
    submit_btn.click(
        fn=merge_videos_with_subtitles,
        inputs=[text_input, voice_dropdown, audio_input, title_input, duration_slider, quality_radio, transition_radio],
        outputs=[video_output, summary_output]
    )
    stop_btn.click(fn=cancel_generation, outputs=[summary_output, video_output])

# CHANGED: Updated launch settings for Hugging Face
if __name__ == "__main__":
    interface.launch(
        server_name="0.0.0.0",
        server_port=7860,
        show_error=True
    )