File size: 1,904 Bytes
952337e
 
 
a37c88f
 
bfd9324
a37c88f
 
8479037
a37c88f
8479037
a37c88f
 
952337e
a37c88f
 
 
 
 
 
 
 
952337e
a37c88f
 
 
952337e
a37c88f
952337e
c5790ed
bfd9324
c5790ed
952337e
5b78679
6b0172d
952337e
 
a37c88f
 
 
bfd9324
c5790ed
bfd9324
 
 
 
 
a37c88f
bfd9324
 
 
 
 
 
952337e
 
 
 
 
 
c5790ed
a37c88f
952337e
c5790ed
 
a37c88f
885a000
952337e
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
import os
import subprocess
import sys
import uuid

import requests
from PIL import Image

import gradio as gr

# Runtime-install fallbacks for wheels occasionally missing on Spaces.
# Use the *current* interpreter's pip (sys.executable -m pip) so the package
# is installed into the same environment we import from, and check=True so a
# failed install fails loudly instead of surfacing later as a second,
# confusing ImportError.
try:
    import librosa
except ImportError:
    subprocess.run([sys.executable, "-m", "pip", "install", "librosa"], check=True)
    import librosa

try:
    import soundfile as sf
except ImportError:
    subprocess.run([sys.executable, "-m", "pip", "install", "soundfile"], check=True)
    import soundfile as sf

# Download the Wav2Lip GAN checkpoint once at startup if it is not cached.
MODEL_URL = "https://huggingface.co/spaces/justest/wav2lip-v2/resolve/main/wav2lip_gan.pth"
if not os.path.exists("wav2lip_gan.pth"):
    # Stream to disk so the multi-hundred-MB checkpoint is never held fully
    # in memory, and raise on HTTP errors so a 404/500 HTML page is not
    # silently written out as the model file.
    with requests.get(MODEL_URL, stream=True, timeout=60) as r:
        r.raise_for_status()
        with open("wav2lip_gan.pth", "wb") as f:
            for chunk in r.iter_content(chunk_size=1 << 20):
                f.write(chunk)

def preprocess(image, audio_file):
    """Prepare Wav2Lip inputs: a resized JPEG face image and 16 kHz mono WAV.

    Args:
        image: PIL.Image uploaded by the user (any mode/size).
        audio_file: path to the uploaded audio file (any format librosa reads).

    Returns:
        (image_path, audio_out_path, output_path) — paths for the saved image,
        the resampled audio, and where inference should write the video.
    """
    uid = str(uuid.uuid4())
    image_path = f"{uid}_image.jpg"
    audio_out_path = f"{uid}_audio.wav"
    output_path = f"{uid}_output.mp4"

    # JPEG cannot store an alpha channel/palette; PNG/WebP uploads are often
    # RGBA or P mode and would make Pillow raise OSError on save — convert.
    if image.mode != "RGB":
        image = image.convert("RGB")

    # Scale to 256 px tall, preserving aspect ratio. max(1, ...) guards a
    # zero-width resize for pathologically narrow inputs.
    new_width = max(1, round(image.width * 256 / image.height))
    image = image.resize((new_width, 256), Image.Resampling.LANCZOS)
    image.save(image_path)

    # Wav2Lip expects 16 kHz mono; librosa downmixes and resamples on load.
    y, sr = librosa.load(audio_file, sr=16000, mono=True)
    sf.write(audio_out_path, y, 16000)

    return image_path, audio_out_path, output_path

def generate(image, audio_file):
    """Run Wav2Lip inference and return the path to the generated video.

    Args:
        image: PIL.Image of the face to animate.
        audio_file: path to the driving audio.

    Returns:
        Path to the generated MP4.

    Raises:
        subprocess.CalledProcessError: if the inference subprocess fails.
    """
    image_path, audio_path, output_path = preprocess(image, audio_file)

    command = [
        "python3", "inference.py",
        "--checkpoint_path", "wav2lip_gan.pth",
        "--face", image_path,
        "--audio", audio_path,
        "--outfile", output_path
    ]
    try:
        # check=True surfaces inference failures instead of silently
        # returning a path to a video that was never written.
        subprocess.run(command, check=True)
    finally:
        # The per-request image/audio intermediates are no longer needed;
        # remove them so repeated calls don't accumulate files on disk.
        for path in (image_path, audio_path):
            try:
                os.remove(path)
            except OSError:
                pass

    return output_path

# Build and launch the UI. live=True is deliberately NOT set: it would
# re-trigger the expensive Wav2Lip subprocess on every input change; the
# default Submit-button flow runs inference only on explicit request.
gr.Interface(
    fn=generate,
    inputs=[
        gr.Image(type="pil", label="Upload Image"),
        gr.Audio(type="filepath", label="Upload Audio")
    ],
    outputs=gr.Video(label="Generated Talking Video"),
    title="⚡ Wav2Lip (Optimized for Hugging Face CPU)",
    description="Upload an image and audio. This version uses librosa for resampling and is CPU-friendly.",
).launch()