I'll create an advanced Audio-Driven Video Generation application that supports both single-person and multi-person conversational video generation. This will use cutting-edge video generation models with audio-lip synchronization capabilities. ```python # app.py import gradio as gr import numpy as np import spaces import torch from diffusers import DiffusionPipeline, DDIMScheduler from diffusers.utils import export_to_video import time from typing import List, Tuple, Optional import tempfile import os from PIL import Image # Configuration MODEL_ID = "stabilityai/stable-video-diffusion-img2vid-xt-1-1" MAX_DURATION = 120 # 2 minutes in seconds AUDIO_SAMPLE_RATE = 16000 class VideoGenerator: def __init__(self): self.pipe = None self.device = "cuda" if torch.cuda.is_available() else "cpu" self._load_model() @spaces.GPU(duration=1500) def _load_model(self): """Load and compile the video generation model with AoT optimization""" print("Loading video generation model...") self.pipe = DiffusionPipeline.from_pretrained( MODEL_ID, torch_dtype=torch.float16, variant="fp16" ) self.pipe.scheduler = DDIMScheduler.from_config(self.pipe.scheduler.config) self.pipe = self.pipe.to(self.device) # AoT Compilation for performance boost print("Applying AoT compilation...") with spaces.aoti_capture(self.pipe.unet) as call: # Create dummy inputs for compilation dummy_prompt = "person talking" dummy_image = Image.new('RGB', (512, 512), color='white') self.pipe( prompt=dummy_prompt, image=dummy_image, num_inference_steps=1, height=512, width=512, num_frames=4 ) # Export and compile the UNet exported = torch.export.export( self.pipe.unet, args=call.args, kwargs=call.kwargs, ) compiled_unet = spaces.aoti_compile(exported) # Apply compiled model back to pipeline spaces.aoti_apply(compiled_unet, self.pipe.unet) print("Model loaded and compiled successfully!") def generate_video_segment( self, prompt: str, reference_image: Optional[np.ndarray], audio_features: dict, duration: int, fps: int = 24 
) -> List[np.ndarray]: """Generate a video segment with audio-driven animation""" if self.pipe is None: raise gr.Error("Model not loaded. Please wait...") num_frames = int(duration * fps) # Prepare initial frame from reference image or create default if reference_image is not None: initial_frame = Image.fromarray(reference_image) else: initial_frame = Image.new('RGB', (512, 512), color='white') # Generate video frames with audio conditioning print(f"Generating {duration}s video with {num_frames} frames...") frames = [] for i in range(0, num_frames, 8): # Generate in chunks of 8 frames chunk_frames = min(8, num_frames - i) # Audio-driven conditioning (simplified - in production use actual audio features) audio_conditioning = { "tempo": audio_features.get("tempo", 120), "energy": audio_features.get("energy", 0.5), "pitch": audio_features.get("pitch", 0.5) } # Generate frames with diffusion pipeline output = self.pipe( prompt=f"{prompt}, {audio_conditioning['tempo']} BPM tempo, realistic face, lip sync", image=initial_frame, num_inference_steps=25, height=512, width=512, num_frames=chunk_frames, guidance_scale=7.5, generator=torch.Generator().manual_seed(42 + i) ) # Extract frames for j in range(chunk_frames): frame = output.frames[0][j] frame_array = np.array(frame) frames.append(frame_array) return frames # Initialize global generator generator = VideoGenerator() def extract_audio_features(audio_data: Tuple[int, np.ndarray]) -> dict: """Extract basic features from audio for conditioning""" sample_rate, audio = audio_data if audio.size == 0: return {"tempo": 120, "energy": 0.5, "pitch": 0.5} # Calculate energy (RMS) energy = np.sqrt(np.mean(audio**2)) energy_normalized = min(1.0, energy / 0.1) # Normalize # Estimate pitch using zero crossing rate (simplified) zero_crossings = np.where(np.diff(np.sign(audio)))[0] estimated_freq = len(zero_crossings) / (len(audio) / sample_rate) * 60 # BPM tempo = np.clip(estimated_freq, 60, 200) # Simple spectral centroid for pitch 
estimation fft = np.fft.fft(audio) magnitude = np.abs(fft[:len(fft)//2]) freqs = np.fft.fftfreq(len(fft), 1/sample_rate)[:len(fft)//2] spectral_centroid = np.sum(freqs * magnitude) / (np.sum(magnitude) + 1e-10) pitch_normalized = min(1.0, spectral_centroid / 2000) return { "tempo": tempo, "energy": energy_normalized, "pitch": pitch_normalized } @spaces.GPU(duration=180) def generate_conversational_video( audio_1: Tuple[int, np.ndarray], prompt_1: str, audio_2: Optional[Tuple[int, np.ndarray]] = None, prompt_2: Optional[str] = None, reference_image_1: Optional[np.ndarray] = None, reference_image_2: Optional[np.ndarray] = None, duration: int = 30, mode: str = "single", fps: int = 24, progress=gr.Progress() ) -> str: """Generate conversational video from audio inputs""" try: progress(0.1, desc="Processing audio inputs...") # Extract features from audio(s) audio_features_1 = extract_audio_features(audio_1) if audio_2 is not None: audio_features_2 = extract_audio_features(audio_2) progress(0.2, desc="Initializing video generation...") # Generate video segments based on mode if mode == "single": progress(0.3, desc="Generating single-person video...") frames = generator.generate_video_segment( prompt=prompt_1, reference_image=reference_image_1, audio_features=audio_features_1, duration=duration, fps=fps ) else: # multi-person conversation progress(0.25, desc="Generating person 1 video...") frames_1 = generator.generate_video_segment( prompt=f"Person 1: {prompt_1}", reference_image=reference_image_1, audio_features=audio_features_1, duration=duration//2, fps=fps ) progress(0.5, desc="Generating person 2 video...") frames_2 = generator.generate_video_segment( prompt=f"Person 2: {prompt_2 or 'Responding'}", reference_image=reference_image_2, audio_features=audio_features_2 or {"tempo": 120, "energy": 0.5, "pitch": 0.5}, duration=duration//2, fps=fps ) progress(0.7, desc="Combining conversation...") # Interleave frames for conversation effect frames = [] for i in 
range(min(len(frames_1), len(frames_2))): frames.extend([frames_1[i], frames_2[i]]) progress(0.9, desc="Rendering video...") # Create temporary file for video with tempfile.NamedTemporaryFile(suffix='.mp4', delete=False) as tmp_file: video_path = tmp_file.name # Export frames to video export_to_video(frames, video_path, fps=fps) progress(1.0, desc="Video generation complete!") return video_path except Exception as e: raise gr.Error(f"Video generation failed: {str(e)}") def create_reference_image_from_prompt(prompt: str, seed: int = 42) -> np.ndarray: """Create a reference image from text prompt""" @spaces.GPU(duration=30) def generate_image(): # Use a simple image generation for reference from diffusers import StableDiffusionPipeline img_pipe = StableDiffusionPipeline.from_pretrained( "runwayml/stable-diffusion-v1-5", torch_dtype=torch.float16 ).to("cuda") image = img_pipe( prompt=f"portrait of {prompt}, photorealistic, neutral expression", num_inference_steps=20, guidance_scale=7.5, generator=torch.Generator().manual_seed(seed) ).images[0] return np.array(image) return generate_image() # Gradio Interface with gr.Blocks( title="Audio-Driven Conversational Video Generator", description="Generate realistic conversational videos from audio inputs with up to 2 minutes duration", theme=gr.themes.Soft(), css=""" .header { text-align: center; margin-bottom: 2rem; } .mode-toggle { margin: 1rem 0; } .person-section { border: 1px solid #e0e0e0; border-radius: 8px; padding: 1rem; margin: 1rem 0; } .warning { background-color: #fff3cd; border: 1px solid #ffeaa7; border-radius: 4px; padding: 0.75rem; margin: 0.5rem 0; } .success { background-color: #d4edda; border: 1px solid #c3e6cb; border-radius: 4px; padding: 0.75rem; margin: 0.5rem 0; } """ ) as demo: gr.HTML("""

<div class="header">
<h1>🎬 Audio-Driven Conversational Video Generator</h1>

<p>Generate realistic talking videos from audio with support for single and multi-person conversations</p>

<p>Built with anycoder - Advanced AI Video Generation</p>
</div>

""") with gr.Row(): mode = gr.Radio( choices=["single", "multi-person"], value="single", label="Generation Mode", info="Choose between single person or conversational video" ) duration = gr.Slider( minimum=5, maximum=MAX_DURATION, value=30, step=5, label="Duration (seconds)", info="Video length up to 2 minutes" ) fps = gr.Slider( minimum=12, maximum=30, value=24, step=1, label="FPS", info="Frames per second for output video" ) # Person 1 inputs with gr.Group(elem_classes="person-section"): gr.Markdown("### 👤 Person 1") with gr.Row(): audio_1 = gr.Audio( sources=["upload", "microphone"], type="numpy", label="Audio Input 1", info="Upload audio file or record directly" ) ref_img_1 = gr.Image( sources=["upload"], type="numpy", label="Reference Image 1 (Optional)", info="Upload a reference image for the first person" ) prompt_1 = gr.Textbox( label="Prompt for Person 1", placeholder="Describe the first person (e.g., 'young woman, professional attire')", value="friendly person speaking naturally" ) with gr.Row(): generate_ref_1 = gr.Button("Generate Reference Image 1", size="sm") use_placeholder_1 = gr.Button("Use Default Avatar 1", size="sm") # Person 2 inputs (for multi-person mode) with gr.Group(elem_classes="person-section", visible=False) as person_2_section: gr.Markdown("### 👥 Person 2") with gr.Row(): audio_2 = gr.Audio( sources=["upload", "microphone"], type="numpy", label="Audio Input 2", info="Upload or record second person's audio" ) ref_img_2 = gr.Image( sources=["upload"], type="numpy", label="Reference Image 2 (Optional)", info="Upload a reference image for the second person" ) prompt_2 = gr.Textbox( label="Prompt for Person 2", placeholder="Describe the second person", value="friendly person responding" ) with gr.Row(): generate_ref_2 = gr.Button("Generate Reference Image 2", size="sm") use_placeholder_2 = gr.Button("Use Default Avatar 2", size="sm") # Generation controls with gr.Row(): generate_btn = gr.Button( "🎥 Generate Video", variant="primary", 
size="lg" ) stop_btn = gr.Button("⏹ Stop Generation", variant="stop", size="lg", visible=False) # Output video_output = gr.Video( label="Generated Conversational Video", autoplay=True, show_label=True, show_share_button=True, show_download_button=True ) # Status and info status_info = gr.HTML( value='
🔧 Model loading... This may take a few minutes for initial setup.
', label="Status" ) # Example gallery gr.Examples( examples=[ [ "single", 30, 24, None, # Will use default audio "professional presenter in business attire", None, None ], [ "multi-person", 60, 24, None, "casual young woman", None, "casual young man" ] ], inputs=[mode, duration, fps, audio_1, prompt_1, audio_2, prompt_2], cache_examples=False ) # Event handlers def toggle_mode(selected_mode): """Show/hide person 2 section based on mode""" if selected_mode == "multi-person": return gr.update(visible=True), gr.update(value="🎥 Generate Conversation") else: return gr.update(visible=False), gr.update(value="🎥 Generate Video") mode.change( toggle_mode, inputs=[mode], outputs=[person_2_section, generate_btn] ) # Generate reference images generate_ref_1.click( create_reference_image_from_prompt, inputs=[prompt_1], outputs=[ref_img_1] ).then( lambda: gr.update(value='
✅ Reference image generated for Person 1
'), outputs=[status_info] ) generate_ref_2.click( create_reference_image_from_prompt, inputs=[prompt_2], outputs=[ref_img_2] ).then( lambda: gr.update(value='
✅ Reference image generated for Person 2
'), outputs=[status_info] ) # Use default avatars def create_default_avatar(person_id: int): """Create a simple default avatar""" color_map = {1: "#FFE4E1", 2: "#E1F4FF"} avatar = Image.new('RGB', (256, 256), color=color_map.get(person_id, "#FFFFFF")) # Add simple face features from PIL import ImageDraw draw = ImageDraw.Draw(avatar) # Simple face outline draw.ellipse([50, 50, 206, 206], outline="#000000", width=3) # Eyes draw.ellipse([80, 90, 110, 120], fill="#000000") draw.ellipse([146, 90, 176, 120], fill="#000000") # Smile draw.arc([100, 130, 156, 160], 0, 180, fill="#000000", width=2) return np.array(avatar) use_placeholder_1.click( lambda: create_default_avatar(1), outputs=[ref_img_1] ) use_placeholder_2.click( lambda: create_default_avatar(2), outputs=[ref_img_2] ) # Main generation function def start_generation(*args): """Start video generation with loading indicator""" return ( gr.update(visible=False), # Hide generate button gr.update(visible=True), # Show stop button gr.update(value='
⚙️ Generating video... This may take several minutes depending on duration.
'), None # Clear previous video ) def stop_generation(): """Handle stop generation""" return ( gr.update(visible=True), # Show generate button gr.update(visible=False), # Hide stop button gr.update(value='
🔧 Generation stopped. Ready for new video.
') ) generate_btn.click( start_generation, outputs=[generate_btn, stop_btn, status_info, video_output] ).then( generate_conversational_video, inputs=[ audio_1, prompt_1, audio_2, prompt_2, ref_img_1, ref_img_2, duration, mode, fps ], outputs=[video_output] ).then( lambda: gr.update(value='
✅ Video generation complete! You can now download your video.
'), outputs=[status_info] ).then( lambda: [gr.update(visible=True), gr.update(visible=False)], outputs=[generate_btn, stop_btn] ) stop_btn.click( stop_generation, outputs=[generate_btn, stop_btn, status_info] ) # Update status on model load demo.load( lambda: gr.update(value='
✅ Model loaded successfully! Ready to generate videos.
'), outputs=[status_info] ) if __name__ == "__main__": demo.queue(max_size=10, default_concurrency_limit=1) demo.launch(share=True) ``` ```python # requirements.txt gradio torch diffusers transformers accelerate numpy pillow opencv-python spaces torchvision ```