# NOTE: "Spaces: Sleeping" banner captured from the Hugging Face Spaces page
# along with the source; kept here as a comment so the file remains valid Python.
import base64
import json
import os
import subprocess
import sys
import wave

import gradio as gr
import requests
# Gemini API key is read from the environment. The app still starts without it,
# but every TTS request will fail until GEMINI_API_KEY is set.
API_KEY = os.getenv("GEMINI_API_KEY")
if not API_KEY:
    print("Gemini API key not found: set the GEMINI_API_KEY environment variable.")

# REST endpoint for the Gemini 2.5 Flash preview TTS model.
API_URL = f"https://generativelanguage.googleapis.com/v1beta/models/gemini-2.5-flash-preview-tts:generateContent?key={API_KEY}"
def generate_tts(text_to_speak, output_file="output.wav"):
    """Synthesize speech with the Gemini TTS API and save it as a WAV file.

    Args:
        text_to_speak: Text to be spoken.
        output_file: Path of the WAV file to write (16-bit mono PCM).

    Returns:
        The path of the written WAV file.

    Raises:
        requests.HTTPError: If the API responds with a non-2xx status.
        KeyError: If the response lacks the expected audio payload.
    """
    payload = {
        "contents": [{"parts": [{"text": text_to_speak}]}],
        "generationConfig": {
            "responseModalities": ["AUDIO"],
            "speechConfig": {
                "voiceConfig": {"prebuiltVoiceConfig": {"voiceName": "Kore"}}
            },
        },
        "model": "gemini-2.5-flash-preview-tts",
    }
    # json= serializes the payload and sets the Content-Type header in one step;
    # the timeout keeps a stalled API call from hanging the Gradio worker forever.
    response = requests.post(API_URL, json=payload, timeout=120)
    # Fail loudly on HTTP errors instead of raising a confusing KeyError below.
    response.raise_for_status()
    result = response.json()

    audio_part = result["candidates"][0]["content"]["parts"][0]
    base64_audio_data = audio_part["inlineData"]["data"]
    mime_type = audio_part["inlineData"]["mimeType"]
    # mimeType looks like "audio/L16;codec=pcm;rate=24000" -- pull out the sample rate.
    sample_rate = int(mime_type.split("rate=")[-1])

    pcm_data = base64.b64decode(base64_audio_data)
    # The API returns raw 16-bit mono PCM; wrap it in a WAV container.
    with wave.open(output_file, "wb") as wf:
        wf.setnchannels(1)
        wf.setsampwidth(2)   # 2 bytes per sample == 16-bit PCM
        wf.setframerate(sample_rate)
        wf.writeframes(pcm_data)
    return output_file
def preprocess_video(input_path, output_path="processed_face.mp4"):
    """Normalize an input video so Wav2Lip can process it reliably.

    Produces a 640x640, 25 fps clip encoded with H.264 / yuv420p.

    Args:
        input_path: Path of the uploaded source video.
        output_path: Path where the standardized clip is written.

    Returns:
        The path of the processed video.
    """
    ffmpeg_args = ["ffmpeg", "-y", "-i", input_path]
    # Fixed frame size and frame rate expected by the Wav2Lip pipeline.
    ffmpeg_args += ["-vf", "scale=640:640,fps=25"]
    # Broadly compatible codec and pixel format.
    ffmpeg_args += ["-c:v", "libx264", "-pix_fmt", "yuv420p"]
    ffmpeg_args.append(output_path)
    subprocess.run(ffmpeg_args, check=True)
    return output_path
def make_talking_video(video_file, text):
    """Gradio handler: lip-sync an uploaded face video to synthesized speech.

    Pipeline:
        1. Synthesize the prompt text with Gemini TTS.
        2. Standardize the uploaded video for Wav2Lip.
        3. Run Wav2Lip inference to lip-sync the face to the audio.

    Args:
        video_file: Path of the uploaded face video (from gr.Video).
        text: Text the avatar should speak.

    Returns:
        Path of the rendered talking-head video.
    """
    # Step 1: Generate TTS audio.
    audio_path = "output.wav"
    generate_tts(text, audio_path)

    # Step 2: Preprocess the input video.
    preprocessed_video = preprocess_video(video_file)

    # Step 3: Run Wav2Lip inference. sys.executable guarantees the same
    # interpreter (and virtualenv) as the running app; a bare "python" on
    # PATH may resolve to a different installation without Wav2Lip's deps.
    output_video = "result_voice.mp4"
    command = [
        sys.executable, "-m", "Wav2Lip.inference",
        "--checkpoint_path", "Wav2Lip/checkpoints/wav2lip_gan.pth",
        "--face", preprocessed_video,
        "--audio", audio_path,
        "--outfile", output_video,
    ]
    subprocess.run(command, check=True)
    return output_video
# --- Gradio UI ---------------------------------------------------------------
with gr.Blocks() as demo:
    gr.Markdown("## 🎥 Text-to-Speech Avatar with Wav2Lip + Gemini TTS")

    with gr.Row():
        face_video = gr.Video(label="Upload 30s Video")
        prompt_box = gr.Textbox(
            label="Enter Prompt", placeholder="What should the avatar say?"
        )

    result_video = gr.Video(label="Result", autoplay=True)
    generate_btn = gr.Button("Generate Talking Video")

    # Wire the button to the end-to-end TTS + lip-sync pipeline.
    generate_btn.click(
        make_talking_video,
        inputs=[face_video, prompt_box],
        outputs=result_video,
    )

if __name__ == "__main__":
    demo.launch()