Upload 4 files
Browse files- Dockerfile +5 -2
- README.md +18 -26
- app.py +75 -23
- requirements.txt +1 -0
Dockerfile
CHANGED
|
@@ -29,14 +29,17 @@ RUN pip install --no-cache-dir -r requirements.txt
|
|
| 29 |
# pull SadTalker weights once at build time (saves cold-start)
|
| 30 |
RUN git clone --depth 1 https://github.com/OpenTalker/SadTalker.git && \
|
| 31 |
cd SadTalker && \
|
| 32 |
-
git lfs install && git lfs pull
|
|
|
|
| 33 |
|
| 34 |
# ---------- Real-ESRGAN install from source ----------------------------------
|
| 35 |
RUN git clone https://github.com/xinntao/Real-ESRGAN.git && \
|
| 36 |
cd Real-ESRGAN && \
|
| 37 |
pip install basicsr==1.4.2 facexlib==0.2.5 gfpgan==1.3.8 && \
|
| 38 |
pip install -r requirements.txt && \
|
| 39 |
-
python setup.py develop
|
|
|
|
|
|
|
| 40 |
|
| 41 |
# ---------- app --------------------------------------------------------------
|
| 42 |
COPY app.py .
|
|
|
|
| 29 |
# pull SadTalker weights once at build time (saves cold-start)
|
| 30 |
RUN git clone --depth 1 https://github.com/OpenTalker/SadTalker.git && \
|
| 31 |
cd SadTalker && \
|
| 32 |
+
git lfs install && git lfs pull && \
|
| 33 |
+
pip install -e .
|
| 34 |
|
| 35 |
# ---------- Real-ESRGAN install from source ----------------------------------
|
| 36 |
RUN git clone https://github.com/xinntao/Real-ESRGAN.git && \
|
| 37 |
cd Real-ESRGAN && \
|
| 38 |
pip install basicsr==1.4.2 facexlib==0.2.5 gfpgan==1.3.8 && \
|
| 39 |
pip install -r requirements.txt && \
|
| 40 |
+
python setup.py develop && \
|
| 41 |
+
mkdir -p weights && \
|
| 42 |
+
wget -q -O weights/RealESRGAN_x2plus.pth https://github.com/xinntao/Real-ESRGAN/releases/download/v0.2.0/RealESRGAN_x2plus.pth
|
| 43 |
|
| 44 |
# ---------- app --------------------------------------------------------------
|
| 45 |
COPY app.py .
|
README.md
CHANGED
|
@@ -1,30 +1,22 @@
|
|
| 1 |
-
---
|
| 2 |
-
title: Vertical SadTalker
|
| 3 |
-
emoji: 🎤
|
| 4 |
-
colorFrom: blue
|
| 5 |
-
colorTo: pink
|
| 6 |
-
sdk: gradio
|
| 7 |
-
sdk_version:
|
| 8 |
-
app_file: app.py
|
| 9 |
-
pinned: false
|
| 10 |
-
---
|
| 11 |
|
| 12 |
-
# Vertical Talking-Head Generator (SadTalker)
|
| 13 |
|
| 14 |
-
|
| 15 |
-
into a 576 × 1024 (9∶16) lip‑synced MP4, perfect for TikTok/Shorts.
|
| 16 |
|
| 17 |
-
|
| 18 |
-
|
| 19 |
-
|
| 20 |
-
|
| 21 |
-
|
| 22 |
-
* Runs on ZeroGPU (H‑200) with fp16 + xformers
|
| 23 |
-
* Optional multi‑GPU via DeepSpeed
|
| 24 |
|
| 25 |
-
|
| 26 |
-
```python
|
| 27 |
-
from gradio_client import Client
|
| 28 |
-
cli = Client("your‑org/vertical‑sadtalker")
|
| 29 |
-
out = cli.predict(image="reporter.png", audio=None, text="Hello world!")
|
| 30 |
-
cli.download(out, "clip.mp4")
|
|
|
|
| 1 |
+
---
|
| 2 |
+
title: Vertical SadTalker
|
| 3 |
+
emoji: 🎤
|
| 4 |
+
colorFrom: blue
|
| 5 |
+
colorTo: pink
|
| 6 |
+
sdk: gradio
|
| 7 |
+
sdk_version: "4.26.0"
|
| 8 |
+
app_file: app.py
|
| 9 |
+
pinned: false
|
| 10 |
+
---
|
| 11 |
|
| 12 |
+
# Vertical Talking-Head Generator (SadTalker + Real-ESRGAN)
|
| 13 |
|
| 14 |
+
This Space turns a portrait + audio **or** portrait + text into a 576 × 1024 (9∶16) lip‑synced video, perfect for TikTok/Shorts.
|
|
|
|
| 15 |
|
| 16 |
+
**Pipeline**
|
| 17 |
+
1. XTTS‑v2 (Coqui‑TTS) → WAV (if you supply text)
|
| 18 |
+
2. SadTalker generates 512² talking‑head video
|
| 19 |
+
3. Real‑ESRGAN upscales frames to 1024²
|
| 20 |
+
4. FFmpeg scales and pads to 576 × 1024 and re‑encodes as H.264 MP4
|
|
|
|
|
|
|
| 21 |
|
| 22 |
+
Designed for **ZeroGPU** H‑200 slices; launch with `deepspeed --num_gpus 2 app.py` if you spin it up on a dual‑GPU runtime.
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
app.py
CHANGED
|
@@ -1,60 +1,112 @@
|
|
| 1 |
-
import os, subprocess, tempfile, uuid, torch, gradio as gr
|
| 2 |
from pathlib import Path
|
| 3 |
from PIL import Image
|
| 4 |
from TTS.api import TTS
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 5 |
from sadtalker.inference import SadTalker
|
| 6 |
|
| 7 |
-
|
| 8 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 9 |
sadtalker = SadTalker(device=DEVICE, half=True) # fp16
|
| 10 |
|
| 11 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 12 |
tmp = Path(tempfile.mkdtemp())
|
| 13 |
-
img_path = tmp /
|
| 14 |
-
image.convert(
|
| 15 |
|
| 16 |
# handle audio or text
|
| 17 |
if audio is None and text.strip():
|
| 18 |
-
wav_path = tmp /
|
| 19 |
tts.tts_to_file(text=text, file_path=str(wav_path))
|
| 20 |
elif audio is not None:
|
| 21 |
wav_path = Path(audio)
|
| 22 |
else:
|
| 23 |
-
raise gr.Error(
|
| 24 |
|
| 25 |
-
|
|
|
|
| 26 |
sadtalker.infer(
|
| 27 |
source_image=str(img_path),
|
| 28 |
driven_audio=str(wav_path),
|
| 29 |
result_dir=str(tmp),
|
| 30 |
size=512,
|
| 31 |
-
preprocess=
|
| 32 |
still=False,
|
| 33 |
enhancer=None,
|
| 34 |
write_video=True,
|
| 35 |
output_path=str(vid_path)
|
| 36 |
)
|
| 37 |
|
| 38 |
-
#
|
| 39 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
| 40 |
cmd = [
|
| 41 |
-
|
| 42 |
-
|
| 43 |
-
|
| 44 |
]
|
| 45 |
subprocess.run(cmd, check=True)
|
| 46 |
-
return str(
|
| 47 |
|
| 48 |
demo = gr.Interface(
|
| 49 |
fn=generate,
|
| 50 |
inputs=[
|
| 51 |
-
gr.Image(type=
|
| 52 |
-
gr.Audio(type=
|
| 53 |
-
gr.Textbox(lines=2, placeholder=
|
| 54 |
],
|
| 55 |
-
outputs=gr.Video(label=
|
| 56 |
-
title=
|
| 57 |
)
|
| 58 |
|
| 59 |
-
if __name__ ==
|
| 60 |
-
demo.queue(concurrency_count=
|
|
|
|
| 1 |
+
import os, sys, subprocess, tempfile, uuid, torch, gradio as gr, cv2, numpy as np
|
| 2 |
from pathlib import Path
|
| 3 |
from PIL import Image
|
| 4 |
from TTS.api import TTS
|
| 5 |
+
|
| 6 |
+
# Make the SadTalker checkout importable when it is not installed as a package.
SAD_PATH = '/workspace/SadTalker'
if SAD_PATH not in sys.path:
    # Only register the checkout when it actually exists on disk.
    if Path(SAD_PATH).exists():
        sys.path.append(SAD_PATH)
|
| 10 |
from sadtalker.inference import SadTalker
|
| 11 |
|
| 12 |
+
from realesrgan import RealESRGANer
|
| 13 |
+
from basicsr.archs.rrdbnet_arch import RRDBNet
|
| 14 |
+
|
| 15 |
+
# Use the GPU when one is visible, otherwise fall back to CPU.
DEVICE = 'cuda' if torch.cuda.is_available() else 'cpu'
# Enable fp16 only on CUDA.
# NOTE(review): half-precision inference is generally not usable on CPU in
# PyTorch, so the previous unconditional half=True is assumed to break the
# CPU fallback -- confirm against the SadTalker / Real-ESRGAN versions in use.
USE_HALF = DEVICE == 'cuda'

# TTS
# NOTE(review): verify this model id against the Coqui-TTS model list; XTTS-v2
# may be published under a different (multilingual) id.
tts = TTS(model_name='tts_models/en/xtts_v2', progress_bar=False).to(DEVICE)

# SadTalker
sadtalker = SadTalker(device=DEVICE, half=USE_HALF)  # fp16 on GPU only

# Real-ESRGAN upsampler (x2, so 512 -> 1024)
model = RRDBNet(num_in_ch=3, num_out_ch=3, num_feat=64, num_block=23, num_grow_ch=32, scale=2)
upsampler = RealESRGANer(
    scale=2,
    model_path='/workspace/Real-ESRGAN/weights/RealESRGAN_x2plus.pth',
    model=model,
    tile=0,
    half=USE_HALF,
    gpu_id=0 if DEVICE == 'cuda' else None,
)
|
| 33 |
+
|
| 34 |
+
def _upscale_video(in_path: Path, out_path: Path):
    """Upscale every frame of a video x2 with Real-ESRGAN.

    Frames are read from ``in_path`` with OpenCV, passed through the
    module-level ``upsampler`` and written to ``out_path`` with the ``mp4v``
    codec. Only frames are written, so the output carries no audio track.

    Args:
        in_path: video to read.
        out_path: destination of the upscaled video.

    Raises:
        RuntimeError: if ``in_path`` yields no frames, or if the output
            writer cannot be opened.
    """
    cap = cv2.VideoCapture(str(in_path))
    writer = None
    try:
        # Some containers report 0 fps; fall back to 25 in that case.
        fps = cap.get(cv2.CAP_PROP_FPS) or 25
        fourcc = cv2.VideoWriter_fourcc(*'mp4v')

        while True:
            ret, frame = cap.read()
            if not ret:
                break
            up_frame, _ = upsampler.enhance(frame, outscale=2)
            if writer is None:
                # The output size is only known after the first upscale.
                new_h, new_w = up_frame.shape[:2]
                writer = cv2.VideoWriter(str(out_path), fourcc, fps, (new_w, new_h))
                if not writer.isOpened():
                    raise RuntimeError(f'Could not open video writer for {out_path}.')
            writer.write(up_frame)

        if writer is None:
            raise RuntimeError('Empty video input for upscaling.')
    finally:
        # Release both handles even when upscaling fails midway.
        if writer is not None:
            writer.release()
        cap.release()
|
| 57 |
+
|
| 58 |
+
def generate(image: 'Image.Image', audio=None, text: str = ''):
    """Build a 576x1024 talking-head MP4 from a portrait plus audio or text.

    Steps: save the portrait, pick the driving audio (the uploaded file, or
    speech synthesised from ``text`` when no audio is given), run SadTalker,
    upscale the clip with ``_upscale_video``, then have FFmpeg scale/pad it to
    576x1024 and mux the driving audio back in.

    Args:
        image: portrait as a PIL image.
        audio: path to an audio file, or None to fall back to ``text``.
        text: text to synthesise when ``audio`` is None.

    Returns:
        Path of the final MP4, as a string.

    Raises:
        gr.Error: if the portrait is missing, or neither audio nor text is
            provided.
        subprocess.CalledProcessError: if FFmpeg exits with a non-zero status.
    """
    if image is None:
        raise gr.Error('Provide a portrait image!')

    # The directory is left on disk on purpose: the returned file lives in it.
    tmp = Path(tempfile.mkdtemp())
    img_path = tmp / 'input.png'
    image.convert('RGB').save(img_path)

    # handle audio or text (uploaded audio wins over text)
    if audio is not None:
        wav_path = Path(audio)
    elif text and text.strip():
        wav_path = tmp / 'speech.wav'
        tts.tts_to_file(text=text, file_path=str(wav_path))
    else:
        raise gr.Error('Provide either audio or text!')

    # SadTalker inference
    vid_path = tmp / f'{uuid.uuid4().hex}_base.mp4'
    sadtalker.infer(
        source_image=str(img_path),
        driven_audio=str(wav_path),
        result_dir=str(tmp),
        size=512,
        preprocess='crop',
        still=False,
        enhancer=None,
        write_video=True,
        output_path=str(vid_path),
    )

    # Upscale video with Real-ESRGAN (x2 => 1024x1024)
    upscaled_vid = tmp / f'{uuid.uuid4().hex}_sr.mp4'
    _upscale_video(vid_path, upscaled_vid)

    # Scale & pad to 576x1024 portrait. The upscaled clip is rewritten frame by
    # frame through OpenCV and therefore has no audio, so the driving audio is
    # added as a second input and mapped explicitly.
    final_vid = tmp / 'vertical.mp4'
    cmd = [
        'ffmpeg', '-y',
        '-i', str(upscaled_vid),
        '-i', str(wav_path),
        '-map', '0:v:0', '-map', '1:a:0',
        '-vf', 'scale=576:-1,pad=576:1024:0:(1024-ih)/2:black,fps=30',
        '-c:v', 'libx264', '-crf', '18', '-pix_fmt', 'yuv420p',
        '-c:a', 'aac',
        '-shortest',  # stop at the shorter of the video and audio streams
        str(final_vid),
    ]
    subprocess.run(cmd, check=True)
    return str(final_vid)
|
| 99 |
|
| 100 |
# Gradio UI: the three inputs are passed to `generate` in this order.
demo = gr.Interface(
    fn=generate,
    inputs=[
        gr.Image(type='pil', label='Portrait 512×512'),
        # No `optional=True`: Gradio 4.x components reject that keyword, and an
        # input left empty already arrives as None.
        gr.Audio(type='filepath', label='Voice (wav/mp3)'),
        gr.Textbox(lines=2, placeholder='…or paste text', label='Text'),
    ],
    outputs=gr.Video(label='576×1024 MP4'),
    title='ZeroGPU SadTalker 9:16',
)
|
| 110 |
|
| 111 |
+
if __name__ == '__main__':
    # Gradio 4.x no longer accepts `concurrency_count` on queue();
    # `default_concurrency_limit=1` keeps one job running at a time, with at
    # most 4 requests waiting in the queue.
    demo.queue(default_concurrency_limit=1, max_size=4).launch()
|
requirements.txt
CHANGED
|
@@ -9,3 +9,4 @@ facenet-pytorch
|
|
| 9 |
scipy
|
| 10 |
pydub
|
| 11 |
ninja
|
|
|
|
|
|
| 9 |
scipy
|
| 10 |
pydub
|
| 11 |
ninja
|
| 12 |
+
numpy
|