mich123geb committed on
Commit a30d89d · verified · 1 Parent(s): 33bc7a2

Update app.py

Files changed (1)
  1. app.py +65 -45
app.py CHANGED
@@ -1,61 +1,81 @@
- import gradio as gr
  import os
- import subprocess
  import uuid
  from PIL import Image
  from pydub import AudioSegment

- # Auto-install scipy if needed
- try:
-     import scipy
- except ImportError:
-     os.system("pip install scipy")
-     import scipy

- # Download Wav2Lip model if not already downloaded
- if not os.path.exists("wav2lip_gan.pth"):
-     os.system("wget https://huggingface.co/spaces/ZALAME-HAFE/w2l-hf/resolve/main/wav2lip_gan.pth")

- def preprocess(image, audio_path):
-     uid = str(uuid.uuid4())
-     image_path = f"{uid}_image.jpg"
-     audio_out_path = f"{uid}_audio.wav"
-     output_path = f"{uid}_output.mp4"

-     # Resize image height to 256, keep aspect ratio
      image = image.resize((int(image.width * 256 / image.height), 256), Image.Resampling.LANCZOS)
-     image.save(image_path)

-     # Convert audio to 16kHz mono WAV
-     audio = AudioSegment.from_file(audio_path)
-     audio = audio.set_frame_rate(16000).set_channels(1)
-     audio.export(audio_out_path, format="wav")

-     return image_path, audio_out_path, output_path

- def generate(image, audio_file):
-     image_path, audio_path, output_path = preprocess(image, audio_file)

-     command = [
-         "python", "inference.py",
-         "--checkpoint_path", "wav2lip_gan.pth",
-         "--face", image_path,
-         "--audio", audio_path,
-         "--outfile", output_path
-     ]
-     subprocess.run(command)

-     return output_path

- # Gradio interface
- gr.Interface(
      fn=generate,
-     inputs=[
-         gr.Image(type="pil", label="Upload Image"),
-         gr.Audio(type="filepath", label="Upload Audio (any format)")
-     ],
-     outputs=gr.Video(label="Generated Talking Video"),
-     title="🗣️ Wav2Lip - Light & Fast",
-     description="Upload an image and audio to generate a lip-synced video. Optimized for Hugging Face CPU spaces using Pydub.",
-     live=True
- ).launch()

  import os
  import uuid
+ import subprocess
+ from pathlib import Path
+
+ import gradio as gr
  from PIL import Image
  from pydub import AudioSegment

+ # ──────────────────────────────────────────────
+ # 1. Download model checkpoint once
+ # ──────────────────────────────────────────────
+ MODEL_PATH = Path("wav2lip_gan.pth")
+ MODEL_URL = (
+     "https://huggingface.co/spaces/fffiloni/wav2lip/resolve/main/wav2lip_gan.pth"
+ )  # public mirror
+
+ if not MODEL_PATH.exists():
+     os.system(f"wget -q {MODEL_URL} -O {MODEL_PATH}")

+ # ──────────────────────────────────────────────
+ # 2. Helper: resize image + convert audio → 16 kHz mono WAV
+ # ──────────────────────────────────────────────
+ def preprocess(image, audio_file):
+     if image is None or audio_file is None:
+         raise ValueError("Both an image and an audio file are required.")

+     uid = uuid.uuid4().hex
+     img_path = f"{uid}.jpg"
+     wav_path = f"{uid}.wav"
+     out_path = f"{uid}_result.mp4"

+     # resize image to 256 px height (keeps aspect ratio)
      image = image.resize((int(image.width * 256 / image.height), 256), Image.Resampling.LANCZOS)
+     image.save(img_path)

+     # convert audio to 16 kHz mono WAV
+     seg = AudioSegment.from_file(audio_file)
+     seg = seg.set_frame_rate(16_000).set_channels(1)
+     seg.export(wav_path, format="wav")

+     return img_path, wav_path, out_path

+ # ──────────────────────────────────────────────
+ # 3. Main inference wrapper
+ # ──────────────────────────────────────────────
+ def generate(image, audio):
+     try:
+         img, wav, out_vid = preprocess(image, audio)
+     except Exception as e:
+         return f"❌ {e}"

+     subprocess.run(
+         [
+             "python", "inference.py",
+             "--checkpoint_path", str(MODEL_PATH),
+             "--face", img,
+             "--audio", wav,
+             "--outfile", out_vid,
+         ],
+         check=True,
+     )

+     return out_vid if Path(out_vid).exists() else "❌ Generation failed."

+ # ──────────────────────────────────────────────
+ # 4. Gradio UI
+ # ──────────────────────────────────────────────
+ demo = gr.Interface(
      fn=generate,
+     inputs=[gr.Image(type="pil", label="Image"),
+             gr.Audio(type="filepath", label="Audio (any format)")],
+     outputs=gr.Video(label="Talking-head MP4"),
+     title="🗣️ Wav2Lip CPU Demo",
+     description="Upload a single face image and an audio clip to create a lip-synced video (runs on free CPU tier).",
+     allow_flagging="never",
+     live=True,
+ )
+
+ if __name__ == "__main__":
+     demo.launch()
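
For reference, a minimal local smoke test of the new preprocess() helper might look like the sketch below. The file names face.jpg and speech.mp3 are placeholders, ffmpeg must be available for pydub to decode the audio, and importing app also triggers the one-time wav2lip_gan.pth download if the checkpoint is missing.

# Hypothetical smoke test for preprocess() from this commit's app.py.
# Assumes ffmpeg is on PATH (pydub shells out to it) and that
# face.jpg / speech.mp3 exist in the working directory.
from PIL import Image
from app import preprocess  # module import also fetches the checkpoint if absent

img = Image.open("face.jpg")
img_path, wav_path, out_path = preprocess(img, "speech.mp3")

print(img_path)   # <uuid>.jpg  - resized to a height of 256 px
print(wav_path)   # <uuid>.wav  - 16 kHz mono audio
print(out_path)   # <uuid>_result.mp4 - written later by inference.py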