File size: 2,971 Bytes
1bff274
568aa4d
 
 
 
 
 
 
d85a747
44c9183
d85a747
568aa4d
 
8cebf37
568aa4d
 
 
 
 
 
8cebf37
 
568aa4d
8cebf37
568aa4d
 
 
 
 
 
 
 
 
 
 
8cebf37
568aa4d
 
 
 
 
 
 
 
8cebf37
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
568aa4d
8cebf37
568aa4d
 
 
8cebf37
 
 
 
568aa4d
 
8cebf37
568aa4d
8cebf37
568aa4d
8cebf37
bec316f
568aa4d
bec316f
568aa4d
bec316f
 
568aa4d
 
bec316f
568aa4d
8cebf37
568aa4d
bec316f
568aa4d
1bff274
568aa4d
 
1bff274
568aa4d
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
import gradio as gr
import subprocess
import os
import requests
import base64
import wave
import json

# Gemini API key must be supplied via the environment. The TTS endpoint
# authenticates with a `key` query parameter appended to the URL.
API_KEY = os.getenv("GEMINI_API_KEY")
if not API_KEY:
    # Warn early so a missing key is obvious before the first request fails.
    print("Gemini API key not found")
API_URL = f"https://generativelanguage.googleapis.com/v1beta/models/gemini-2.5-flash-preview-tts:generateContent?key={API_KEY}"


def generate_tts(text_to_speak, output_file="output.wav"):
    """Synthesize ``text_to_speak`` to a mono 16-bit WAV file via Gemini TTS.

    Args:
        text_to_speak: The text the model should speak.
        output_file: Path of the WAV file to write.

    Returns:
        The path of the written WAV file (same as ``output_file``).

    Raises:
        requests.HTTPError: If the API returns an error status.
        requests.Timeout: If the API does not respond in time.
    """
    payload = {
        "contents": [{"parts": [{"text": text_to_speak}]}],
        "generationConfig": {
            "responseModalities": ["AUDIO"],
            "speechConfig": {
                "voiceConfig": {"prebuiltVoiceConfig": {"voiceName": "Kore"}}
            },
        },
        "model": "gemini-2.5-flash-preview-tts",
    }
    # `json=` serializes the payload and sets Content-Type in one step;
    # a timeout prevents the UI from hanging forever on a stalled request.
    response = requests.post(API_URL, json=payload, timeout=120)
    # Fail loudly on API errors instead of a confusing KeyError below.
    response.raise_for_status()
    result = response.json()

    audio_part = result["candidates"][0]["content"]["parts"][0]
    base64_audio_data = audio_part["inlineData"]["data"]
    mime_type = audio_part["inlineData"]["mimeType"]
    # mimeType looks like "audio/L16;codec=pcm;rate=24000" — pull out the rate.
    sample_rate = int(mime_type.split("rate=")[-1])
    pcm_data = base64.b64decode(base64_audio_data)

    # The API returns raw PCM with no container; wrap it in a WAV header.
    with wave.open(output_file, "wb") as wf:
        wf.setnchannels(1)   # mono
        wf.setsampwidth(2)   # 16-bit samples (L16)
        wf.setframerate(sample_rate)
        wf.writeframes(pcm_data)

    return output_file


def preprocess_video(input_path, output_path="processed_face.mp4"):
    """Normalize an input video into a form Wav2Lip handles reliably.

    The clip is rescaled to 640x640, resampled to 25 fps, and re-encoded
    with H.264 / yuv420p for broad codec compatibility.

    Args:
        input_path: Path of the uploaded source video.
        output_path: Where to write the standardized video.

    Returns:
        The path of the standardized video (``output_path``).

    Raises:
        subprocess.CalledProcessError: If ffmpeg exits with a non-zero status.
    """
    ffmpeg_args = [
        "ffmpeg",
        "-y",                       # overwrite any previous output
        "-i", input_path,
        "-vf", "scale=640:640,fps=25",
        "-c:v", "libx264",
        "-pix_fmt", "yuv420p",
        output_path,
    ]
    subprocess.run(ffmpeg_args, check=True)
    return output_path


def make_talking_video(video_file, text):
    """Produce a lip-synced talking-head clip from a face video and a prompt.

    Pipeline: Gemini TTS -> ffmpeg preprocessing -> Wav2Lip inference.

    Args:
        video_file: Path of the uploaded face video.
        text: The text the avatar should speak.

    Returns:
        Path of the rendered video ("result_voice.mp4").
    """
    # 1) Synthesize the speech track.
    tts_wav = "output.wav"
    generate_tts(text, tts_wav)

    # 2) Standardize the face video (size, fps, codec).
    face_video = preprocess_video(video_file)

    # 3) Drive Wav2Lip with the normalized face and synthesized audio.
    result_path = "result_voice.mp4"
    wav2lip_cmd = [
        "python", "-m", "Wav2Lip.inference",
        "--checkpoint_path", "Wav2Lip/checkpoints/wav2lip_gan.pth",
        "--face", face_video,
        "--audio", tts_wav,
        "--outfile", result_path,
    ]
    subprocess.run(wav2lip_cmd, check=True)

    return result_path


# Gradio UI: upload a face video + a prompt, get back a lip-synced clip.
with gr.Blocks() as demo:
    gr.Markdown("## 🎥 Text-to-Speech Avatar with Wav2Lip + Gemini TTS")

    with gr.Row():
        video_input = gr.Video(label="Upload 30s Video")
        text_input = gr.Textbox(label="Enter Prompt", placeholder="What should the avatar say?")

    # Result player; autoplay starts the clip as soon as rendering finishes.
    output_video = gr.Video(label="Result", autoplay=True)

    submit_btn = gr.Button("Generate Talking Video")
    # Wire the button to the full TTS -> preprocess -> Wav2Lip pipeline.
    submit_btn.click(make_talking_video, inputs=[video_input, text_input], outputs=output_video)

if __name__ == "__main__":
    demo.launch()