import base64
import json
import os
import subprocess
import wave

import gradio as gr
import requests

# Gemini TTS configuration. The key is read once at import time; without it
# every TTS request will fail with an HTTP error.
API_KEY = os.getenv("GEMINI_API_KEY")
if not API_KEY:
    print("GEMINI_API_KEY not found in environment; TTS requests will fail.")
API_URL = (
    "https://generativelanguage.googleapis.com/v1beta/models/"
    f"gemini-2.5-flash-preview-tts:generateContent?key={API_KEY}"
)


def _parse_sample_rate(mime_type, default=24000):
    """Extract the PCM sample rate from a mime type string.

    The API returns e.g. "audio/L16;codec=pcm;rate=24000". Parse each
    ';'-separated parameter instead of assuming "rate=" is the last one.
    Falls back to *default* when the parameter is absent or malformed.
    """
    for part in mime_type.split(";"):
        part = part.strip()
        if part.startswith("rate="):
            try:
                return int(part[len("rate="):])
            except ValueError:
                break
    return default


def generate_tts(text_to_speak, output_file="output.wav"):
    """Synthesize *text_to_speak* with the Gemini TTS API into a WAV file.

    Writes 16-bit mono PCM to *output_file* and returns that path.
    Raises requests.HTTPError on an API failure and KeyError if the
    response is missing the expected inline audio payload.
    """
    payload = {
        "contents": [{"parts": [{"text": text_to_speak}]}],
        "generationConfig": {
            "responseModalities": ["AUDIO"],
            "speechConfig": {
                "voiceConfig": {"prebuiltVoiceConfig": {"voiceName": "Kore"}}
            },
        },
        "model": "gemini-2.5-flash-preview-tts",
    }
    # json= lets requests serialize the body and set Content-Type for us.
    response = requests.post(API_URL, json=payload, timeout=120)
    # Fail loudly on HTTP errors instead of a cryptic KeyError below.
    response.raise_for_status()
    result = response.json()

    audio_part = result["candidates"][0]["content"]["parts"][0]
    base64_audio_data = audio_part["inlineData"]["data"]
    mime_type = audio_part["inlineData"]["mimeType"]
    sample_rate = _parse_sample_rate(mime_type)

    pcm_data = base64.b64decode(base64_audio_data)
    with wave.open(output_file, "wb") as wf:
        wf.setnchannels(1)  # mono
        wf.setsampwidth(2)  # 16-bit samples (2 bytes)
        wf.setframerate(sample_rate)
        wf.writeframes(pcm_data)
    return output_file


def preprocess_video(input_path, output_path="processed_face.mp4"):
    """Standardize input video so Wav2Lip can process it reliably.

    - Resizes to 640x640
    - Sets FPS to 25
    - Forces compatible codec/pixel format

    Returns *output_path*. Raises subprocess.CalledProcessError if
    ffmpeg exits non-zero.
    """
    cmd = [
        "ffmpeg", "-y",
        "-i", input_path,
        "-vf", "scale=640:640,fps=25",
        "-c:v", "libx264",
        "-pix_fmt", "yuv420p",
        output_path,
    ]
    subprocess.run(cmd, check=True)
    return output_path


def make_talking_video(video_file, text):
    """Full pipeline: text -> Gemini TTS audio -> Wav2Lip lip-synced video.

    Returns the path of the rendered video. Raises on any stage failure
    (HTTP error from TTS, non-zero ffmpeg or Wav2Lip exit).
    """
    # Step 1: Generate TTS audio
    audio_path = "output.wav"
    generate_tts(text, audio_path)

    # Step 2: Preprocess input video
    preprocessed_video = preprocess_video(video_file)

    # Step 3: Run Wav2Lip inference
    output_video = "result_voice.mp4"
    command = [
        "python", "-m", "Wav2Lip.inference",
        "--checkpoint_path", "Wav2Lip/checkpoints/wav2lip_gan.pth",
        "--face", preprocessed_video,
        "--audio", audio_path,
        "--outfile", output_video,
    ]
    subprocess.run(command, check=True)
    return output_video


with gr.Blocks() as demo:
    gr.Markdown("## 🎥 Text-to-Speech Avatar with Wav2Lip + Gemini TTS")
    with gr.Row():
        video_input = gr.Video(label="Upload 30s Video")
        text_input = gr.Textbox(
            label="Enter Prompt", placeholder="What should the avatar say?"
        )
    output_video = gr.Video(label="Result", autoplay=True)
    submit_btn = gr.Button("Generate Talking Video")
    submit_btn.click(
        make_talking_video,
        inputs=[video_input, text_input],
        outputs=output_video,
    )

if __name__ == "__main__":
    demo.launch()