# Ali-Raza-167's picture
# Update app.py
# 44c9183 verified
import base64
import json
import os
import re
import subprocess
import wave

import gradio as gr
import requests
# Gemini API key comes from the environment; if it is missing we only warn
# here — the request in generate_tts() will fail later with an HTTP error.
API_KEY = os.getenv("GEMINI_API_KEY")
if not API_KEY:
    print("Gemini API key NOT found")
API_URL = f"https://generativelanguage.googleapis.com/v1beta/models/gemini-2.5-flash-preview-tts:generateContent?key={API_KEY}"
def generate_tts(text_to_speak, output_file="output.wav"):
    """Synthesize speech for *text_to_speak* via the Gemini TTS REST API.

    The API returns base64-encoded 16-bit mono PCM audio plus a MIME type
    carrying the sample rate (e.g. "audio/L16;codec=pcm;rate=24000").  The
    decoded PCM is wrapped in a WAV container at *output_file*.

    Args:
        text_to_speak: Text to convert to speech.
        output_file: Path of the WAV file to write.

    Returns:
        The path of the written WAV file.

    Raises:
        requests.HTTPError: if the API call returns a non-2xx status.
        ValueError: if the response does not contain the expected audio part.
    """
    payload = {
        "contents": [{"parts": [{"text": text_to_speak}]}],
        "generationConfig": {
            "responseModalities": ["AUDIO"],
            "speechConfig": {
                "voiceConfig": {"prebuiltVoiceConfig": {"voiceName": "Kore"}}
            },
        },
        "model": "gemini-2.5-flash-preview-tts",
    }
    # json= serializes the payload and sets Content-Type in one step.
    response = requests.post(API_URL, json=payload, timeout=120)
    # Fail with a clear HTTP error instead of an opaque KeyError below.
    response.raise_for_status()
    result = response.json()
    try:
        audio_part = result["candidates"][0]["content"]["parts"][0]
        inline = audio_part["inlineData"]
        base64_audio_data = inline["data"]
        mime_type = inline["mimeType"]
    except (KeyError, IndexError) as err:
        raise ValueError(f"Unexpected TTS response shape: {result!r}") from err
    # Parse the sample rate robustly; fall back to the API's usual 24 kHz
    # rather than crashing if the MIME string has no "rate=" field.
    rate_match = re.search(r"rate=(\d+)", mime_type)
    sample_rate = int(rate_match.group(1)) if rate_match else 24000
    pcm_data = base64.b64decode(base64_audio_data)
    with wave.open(output_file, "wb") as wf:
        wf.setnchannels(1)  # mono
        wf.setsampwidth(2)  # 16-bit samples
        wf.setframerate(sample_rate)
        wf.writeframes(pcm_data)
    return output_file
def preprocess_video(input_path, output_path="processed_face.mp4"):
    """Normalize an uploaded clip into a form Wav2Lip handles reliably.

    The clip is rescaled to 640x640, resampled to 25 fps, and re-encoded
    as H.264 with the yuv420p pixel format so downstream tooling sees a
    consistent input.

    Args:
        input_path: Path to the uploaded source video.
        output_path: Where the standardized video is written.

    Returns:
        The path of the standardized video.
    """
    ffmpeg_args = [
        "ffmpeg",
        "-y",  # overwrite output left over from a previous run
        "-i", input_path,
        "-vf", "scale=640:640,fps=25",
        "-c:v", "libx264",
        "-pix_fmt", "yuv420p",
        output_path,
    ]
    subprocess.run(ffmpeg_args, check=True)
    return output_path
def make_talking_video(video_file, text):
    """Render a lip-synced video of the uploaded face speaking *text*.

    Pipeline: Gemini TTS -> ffmpeg normalization -> Wav2Lip inference.

    Args:
        video_file: Path of the uploaded face video.
        text: The sentence(s) the avatar should speak.

    Returns:
        Path of the rendered "result_voice.mp4".
    """
    audio_path = "output.wav"
    result_path = "result_voice.mp4"

    # 1) Synthesize the speech track.
    generate_tts(text, audio_path)

    # 2) Standardize the face video so Wav2Lip can process it.
    face_video = preprocess_video(video_file)

    # 3) Lip-sync the face to the generated audio.
    wav2lip_cmd = [
        "python", "-m", "Wav2Lip.inference",
        "--checkpoint_path", "Wav2Lip/checkpoints/wav2lip_gan.pth",
        "--face", face_video,
        "--audio", audio_path,
        "--outfile", result_path,
    ]
    subprocess.run(wav2lip_cmd, check=True)
    return result_path
# --- Gradio UI -------------------------------------------------------------
with gr.Blocks() as demo:
    gr.Markdown("## 🎥 Text-to-Speech Avatar with Wav2Lip + Gemini TTS")
    with gr.Row():
        face_in = gr.Video(label="Upload 30s Video")
        prompt_in = gr.Textbox(
            label="Enter Prompt",
            placeholder="What should the avatar say?",
        )
    result_out = gr.Video(label="Result", autoplay=True)
    generate_btn = gr.Button("Generate Talking Video")
    generate_btn.click(
        make_talking_video,
        inputs=[face_in, prompt_in],
        outputs=result_out,
    )

if __name__ == "__main__":
    demo.launch()