# NOTE(review): the lines that preceded this import were Hugging Face Spaces
# page chrome (build status, git-blame hashes, line-number gutter) captured
# by extraction — removed so the file parses as Python.
import gradio as gr
import subprocess
import os
import requests
import base64
import wave
import json
# Gemini API configuration: the key is read from the environment so it is
# never hard-coded in the source. If it is missing, the app still starts but
# every TTS request will fail, so warn loudly at import time.
API_KEY = os.getenv("GEMINI_API_KEY")
if not API_KEY:
    print("Gemini API key not found — set the GEMINI_API_KEY environment variable")
API_URL = f"https://generativelanguage.googleapis.com/v1beta/models/gemini-2.5-flash-preview-tts:generateContent?key={API_KEY}"
def generate_tts(text_to_speak, output_file="output.wav"):
    """Synthesize speech for *text_to_speak* with the Gemini TTS API.

    The API returns base64-encoded 16-bit mono PCM; this writes it out as a
    standard WAV file.

    Args:
        text_to_speak: Text to convert to speech.
        output_file: Path of the WAV file to write.

    Returns:
        The path of the written WAV file (``output_file``).

    Raises:
        requests.HTTPError: if the API responds with an error status.
        KeyError/IndexError: if the response payload lacks audio data.
    """
    payload = {
        "contents": [{"parts": [{"text": text_to_speak}]}],
        "generationConfig": {
            "responseModalities": ["AUDIO"],
            "speechConfig": {
                "voiceConfig": {"prebuiltVoiceConfig": {"voiceName": "Kore"}}
            },
        },
        "model": "gemini-2.5-flash-preview-tts",
    }
    # json= serializes the payload and sets Content-Type: application/json for
    # us; the timeout prevents a hung request from freezing the Gradio worker.
    response = requests.post(API_URL, json=payload, timeout=120)
    response.raise_for_status()  # surface API errors instead of a cryptic KeyError below
    result = response.json()
    audio_part = result["candidates"][0]["content"]["parts"][0]
    base64_audio_data = audio_part["inlineData"]["data"]
    mime_type = audio_part["inlineData"]["mimeType"]
    # mimeType looks like "audio/L16;codec=pcm;rate=24000" — pull out the rate,
    # falling back to Gemini TTS's documented 24 kHz if it is absent/malformed.
    try:
        sample_rate = int(mime_type.split("rate=")[-1].split(";")[0])
    except ValueError:
        sample_rate = 24000
    pcm_data = base64.b64decode(base64_audio_data)
    with wave.open(output_file, "wb") as wf:
        wf.setnchannels(1)       # mono
        wf.setsampwidth(2)       # 16-bit samples (2 bytes)
        wf.setframerate(sample_rate)
        wf.writeframes(pcm_data)
    return output_file
def preprocess_video(input_path, output_path="processed_face.mp4"):
    """Normalize a face video so Wav2Lip can process it reliably.

    Re-encodes *input_path* to a 640x640, 25 fps H.264 stream with a
    yuv420p pixel format and writes it to *output_path*.

    Returns:
        The path of the re-encoded video (``output_path``).

    Raises:
        subprocess.CalledProcessError: if ffmpeg exits non-zero.
    """
    ffmpeg_args = ["ffmpeg", "-y", "-i", input_path]
    # Fixed frame size and rate expected by the Wav2Lip pipeline.
    ffmpeg_args += ["-vf", "scale=640:640,fps=25"]
    # Widely compatible codec / pixel format.
    ffmpeg_args += ["-c:v", "libx264", "-pix_fmt", "yuv420p"]
    ffmpeg_args.append(output_path)
    subprocess.run(ffmpeg_args, check=True)
    return output_path
def make_talking_video(video_file, text):
    """Produce a lip-synced talking-head video for *text* spoken over *video_file*.

    Pipeline: Gemini TTS -> ffmpeg normalization -> Wav2Lip inference.

    Returns:
        The path of the generated video ("result_voice.mp4").
    """
    # 1) Synthesize the speech track for the prompt.
    audio_path = "output.wav"
    generate_tts(text, audio_path)

    # 2) Normalize the uploaded face video for Wav2Lip.
    face_video = preprocess_video(video_file)

    # 3) Lip-sync the audio onto the face via the Wav2Lip CLI.
    output_video = "result_voice.mp4"
    wav2lip_cmd = [
        "python", "-m", "Wav2Lip.inference",
        "--checkpoint_path", "Wav2Lip/checkpoints/wav2lip_gan.pth",
        "--face", face_video,
        "--audio", audio_path,
        "--outfile", output_video,
    ]
    subprocess.run(wav2lip_cmd, check=True)
    return output_video
# Gradio UI — components render in declaration order inside the Blocks context.
with gr.Blocks() as demo:
    gr.Markdown("## 🎥 Text-to-Speech Avatar with Wav2Lip + Gemini TTS")
    # Inputs side by side: the source face video and the script to speak.
    with gr.Row():
        face_upload = gr.Video(label="Upload 30s Video")
        prompt_box = gr.Textbox(label="Enter Prompt", placeholder="What should the avatar say?")
    result_player = gr.Video(label="Result", autoplay=True)
    generate_button = gr.Button("Generate Talking Video")
    # Wire the button to the full TTS + lip-sync pipeline.
    generate_button.click(
        make_talking_video,
        inputs=[face_upload, prompt_box],
        outputs=result_player,
    )

if __name__ == "__main__":
    demo.launch()