Upload 4 files
Browse files- Dockerfile +2 -3
- README.md +15 -8
- app.py +45 -78
- requirements.txt +1 -1
Dockerfile
CHANGED
|
@@ -1,3 +1,4 @@
|
|
|
|
|
| 1 |
# ---------- base image -------------------------------------------------------
|
| 2 |
FROM nvidia/cuda:12.1.1-cudnn8-runtime-ubuntu22.04
|
| 3 |
|
|
@@ -37,9 +38,7 @@ RUN git clone https://github.com/xinntao/Real-ESRGAN.git && \
|
|
| 37 |
cd Real-ESRGAN && \
|
| 38 |
pip install basicsr==1.4.2 facexlib==0.2.5 gfpgan==1.3.8 && \
|
| 39 |
pip install -r requirements.txt && \
|
| 40 |
-
python setup.py develop
|
| 41 |
-
mkdir -p weights && \
|
| 42 |
-
wget -q -O weights/RealESRGAN_x2plus.pth https://github.com/xinntao/Real-ESRGAN/releases/download/v0.2.0/RealESRGAN_x2plus.pth
|
| 43 |
|
| 44 |
# ---------- app --------------------------------------------------------------
|
| 45 |
COPY app.py .
|
|
|
|
| 1 |
+
|
| 2 |
# ---------- base image -------------------------------------------------------
|
| 3 |
FROM nvidia/cuda:12.1.1-cudnn8-runtime-ubuntu22.04
|
| 4 |
|
|
|
|
| 38 |
cd Real-ESRGAN && \
|
| 39 |
pip install basicsr==1.4.2 facexlib==0.2.5 gfpgan==1.3.8 && \
|
| 40 |
pip install -r requirements.txt && \
|
| 41 |
+
python setup.py develop
|
|
|
|
|
|
|
| 42 |
|
| 43 |
# ---------- app --------------------------------------------------------------
|
| 44 |
COPY app.py .
|
README.md
CHANGED
|
@@ -1,3 +1,4 @@
|
|
|
|
|
| 1 |
---
|
| 2 |
title: Vertical SadTalker
|
| 3 |
emoji: 🎤
|
|
@@ -9,14 +10,20 @@ app_file: app.py
|
|
| 9 |
pinned: false
|
| 10 |
---
|
| 11 |
|
| 12 |
-
# Vertical Talking
|
|
|
|
|
|
|
| 13 |
|
| 14 |
-
|
|
|
|
|
|
|
|
|
|
| 15 |
|
| 16 |
-
|
| 17 |
-
1. XTTS‑v2 (Coqui‑TTS) → WAV (if you supply text)
|
| 18 |
-
2. SadTalker generates 512² talking‑head video
|
| 19 |
-
3. Real‑ESRGAN upscales frames to 1024²
|
| 20 |
-
4. FFmpeg crops/pads to 576 × 1024 and re‑encodes as MP4
|
| 21 |
|
| 22 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
|
| 2 |
---
|
| 3 |
title: Vertical SadTalker
|
| 4 |
emoji: 🎤
|
|
|
|
| 10 |
pinned: false
|
| 11 |
---
|
| 12 |
|
| 13 |
+
# Vertical Talking‑Head Generator (SadTalker)
|
| 14 |
+
|
| 15 |
+
Turns a portrait + audio **or** portrait + text into a 576 × 1024 MP4 ready for TikTok/Shorts.
|
| 16 |
|
| 17 |
+
* XTTS‑v2 TTS when only text is provided
|
| 18 |
+
* SadTalker for lip‑sync & motion
|
| 19 |
+
* Optional Real‑ESRGAN upscaling (512 → 1024)
|
| 20 |
+
* CUDA 12, fp16, xformers; works on one H‑200 or two T4s with `deepspeed --num_gpus 2`
|
| 21 |
|
| 22 |
+
## Simple API
|
|
|
|
|
|
|
|
|
|
|
|
|
| 23 |
|
| 24 |
+
```python
|
| 25 |
+
from gradio_client import Client
|
| 26 |
+
cli = Client("your-hf-org/vertical-sadtalker")
|
| 27 |
+
vid = cli.predict(image="face.png", text="Breaking news!")
|
| 28 |
+
cli.download(vid, "clip.mp4")
|
| 29 |
+
```
|
app.py
CHANGED
|
@@ -1,112 +1,79 @@
|
|
| 1 |
-
|
|
|
|
| 2 |
from pathlib import Path
|
| 3 |
from PIL import Image
|
| 4 |
-
from TTS.api import TTS
|
| 5 |
-
|
| 6 |
-
# add SadTalker to path if not installed as pkg
|
| 7 |
-
SAD_PATH = '/workspace/SadTalker'
|
| 8 |
-
if SAD_PATH not in sys.path and Path(SAD_PATH).exists():
|
| 9 |
-
sys.path.append(SAD_PATH)
|
| 10 |
-
from sadtalker.inference import SadTalker
|
| 11 |
-
|
| 12 |
-
from realesrgan import RealESRGANer
|
| 13 |
-
from basicsr.archs.rrdbnet_arch import RRDBNet
|
| 14 |
|
| 15 |
-
|
|
|
|
| 16 |
|
| 17 |
-
|
| 18 |
-
|
| 19 |
|
| 20 |
-
|
|
|
|
| 21 |
sadtalker = SadTalker(device=DEVICE, half=True) # fp16
|
| 22 |
|
| 23 |
-
|
| 24 |
-
model = RRDBNet(num_in_ch=3, num_out_ch=3, num_feat=64, num_block=23, num_grow_ch=32, scale=2)
|
| 25 |
-
upsampler = RealESRGANer(
|
| 26 |
-
scale=2,
|
| 27 |
-
model_path='/workspace/Real-ESRGAN/weights/RealESRGAN_x2plus.pth',
|
| 28 |
-
model=model,
|
| 29 |
-
tile=0,
|
| 30 |
-
half=True,
|
| 31 |
-
gpu_id=0 if DEVICE == 'cuda' else None
|
| 32 |
-
)
|
| 33 |
-
|
| 34 |
-
def _upscale_video(in_path: Path, out_path: Path):
|
| 35 |
-
cap = cv2.VideoCapture(str(in_path))
|
| 36 |
-
fps = cap.get(cv2.CAP_PROP_FPS) or 25
|
| 37 |
-
fourcc = cv2.VideoWriter_fourcc(*'mp4v')
|
| 38 |
-
width = int(cap.get(cv2.CAP_PROP_FRAME_WIDTH))
|
| 39 |
-
height = int(cap.get(cv2.CAP_PROP_FRAME_HEIGHT))
|
| 40 |
-
# first frame upscale to get new dims
|
| 41 |
-
ret, frame = cap.read()
|
| 42 |
-
if not ret:
|
| 43 |
-
cap.release()
|
| 44 |
-
raise RuntimeError('Empty video input for upscaling.')
|
| 45 |
-
up_frame, _ = upsampler.enhance(frame, outscale=2)
|
| 46 |
-
new_h, new_w = up_frame.shape[:2]
|
| 47 |
-
writer = cv2.VideoWriter(str(out_path), fourcc, fps, (new_w, new_h))
|
| 48 |
-
writer.write(up_frame)
|
| 49 |
-
while True:
|
| 50 |
-
ret, frame = cap.read()
|
| 51 |
-
if not ret:
|
| 52 |
-
break
|
| 53 |
-
up_frame, _ = upsampler.enhance(frame, outscale=2)
|
| 54 |
-
writer.write(up_frame)
|
| 55 |
-
writer.release()
|
| 56 |
-
cap.release()
|
| 57 |
-
|
| 58 |
-
def generate(image: Image, audio=None, text: str = ''):
|
| 59 |
tmp = Path(tempfile.mkdtemp())
|
| 60 |
-
img_path = tmp /
|
| 61 |
-
image.convert(
|
| 62 |
|
| 63 |
-
# handle audio or text
|
| 64 |
if audio is None and text.strip():
|
| 65 |
-
wav_path = tmp /
|
| 66 |
tts.tts_to_file(text=text, file_path=str(wav_path))
|
| 67 |
elif audio is not None:
|
| 68 |
wav_path = Path(audio)
|
| 69 |
else:
|
| 70 |
-
raise gr.Error(
|
| 71 |
|
| 72 |
-
# SadTalker
|
| 73 |
-
vid_path = tmp / f
|
| 74 |
sadtalker.infer(
|
| 75 |
source_image=str(img_path),
|
| 76 |
driven_audio=str(wav_path),
|
| 77 |
result_dir=str(tmp),
|
| 78 |
size=512,
|
| 79 |
-
preprocess=
|
| 80 |
still=False,
|
| 81 |
enhancer=None,
|
| 82 |
write_video=True,
|
| 83 |
output_path=str(vid_path)
|
| 84 |
)
|
| 85 |
|
| 86 |
-
#
|
| 87 |
-
|
| 88 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 89 |
|
| 90 |
-
|
| 91 |
-
final_vid = tmp / 'vertical.mp4'
|
| 92 |
-
cmd = [
|
| 93 |
-
'ffmpeg', '-y', '-i', str(upscaled_vid),
|
| 94 |
-
'-vf', 'scale=576:-1,pad=576:1024:0:(1024-ih)/2:black,fps=30',
|
| 95 |
-
'-c:v', 'libx264', '-crf', '18', '-pix_fmt', 'yuv420p', str(final_vid)
|
| 96 |
-
]
|
| 97 |
-
subprocess.run(cmd, check=True)
|
| 98 |
-
return str(final_vid)
|
| 99 |
|
| 100 |
demo = gr.Interface(
|
| 101 |
fn=generate,
|
| 102 |
inputs=[
|
| 103 |
-
gr.Image(type=
|
| 104 |
-
gr.Audio(type=
|
| 105 |
-
gr.Textbox(lines=2, placeholder=
|
| 106 |
],
|
| 107 |
-
outputs=gr.Video(label=
|
| 108 |
-
title=
|
| 109 |
)
|
| 110 |
|
| 111 |
-
if __name__ ==
|
| 112 |
-
demo.queue(concurrency_count=
|
|
|
|
| 1 |
+
|
| 2 |
+
# --- runtime setup: make SadTalker importable, build model singletons -------
import os
import subprocess
import sys
import tempfile
import uuid

import gradio as gr
import torch
from pathlib import Path
from PIL import Image

# ensure the SadTalker repo checkout is importable as a package
sys.path.append('/workspace/SadTalker')

from TTS.api import TTS
from sadtalker.inference import SadTalker

# pick GPU when available, otherwise fall back to CPU
DEVICE = "cuda" if torch.cuda.is_available() else "cpu"

# load both models once at import time so requests reuse them
tts = TTS(model_name="tts_models/en/xtts_v2", progress_bar=False).to(DEVICE)
sadtalker = SadTalker(device=DEVICE, half=True)  # fp16
|
| 15 |
|
| 16 |
+
def generate(image: Image.Image, audio=None, text: str = ""):
    """Turn a portrait plus (audio | text) into a 576x1024 vertical MP4.

    Args:
        image: portrait photo as a PIL image (SadTalker crops/resizes to 512).
        audio: optional filepath to a driving voice clip (wav/mp3).
        text:  optional text, synthesized with XTTS-v2 when no audio is given.

    Returns:
        str path to the final H.264 MP4 inside a fresh temp directory.

    Raises:
        gr.Error: when neither audio nor text is provided.
    """
    tmp = Path(tempfile.mkdtemp())
    img_path = tmp / "input.png"
    image.convert("RGB").save(img_path)

    # 1) resolve driving audio: synthesize from text when no file is given
    if audio is None and text.strip():
        wav_path = tmp / "speech.wav"
        tts.tts_to_file(text=text, file_path=str(wav_path))
    elif audio is not None:
        wav_path = Path(audio)
    else:
        raise gr.Error("Provide either audio or text!")

    # 2) SadTalker lip-sync video at 512x512
    vid_path = tmp / f"{uuid.uuid4().hex}.mp4"
    sadtalker.infer(
        source_image=str(img_path),
        driven_audio=str(wav_path),
        result_dir=str(tmp),
        size=512,
        preprocess="crop",
        still=False,
        enhancer=None,
        write_video=True,
        output_path=str(vid_path)
    )

    # 3) optional Real-ESRGAN upscale (best effort).
    # BUGFIX: inference_realesrgan_video.py's -o flag names an output
    # *directory*, not a file; the previous code passed a file path and then
    # fed a file that was never created to ffmpeg, so the upscale silently
    # never applied. Write into tmp and pick up the suffixed result instead.
    # -s 2 keeps the README's promised 512 -> 1024 (the x4plus default is 4x).
    upscaled_path = vid_path  # fallback: use the raw SadTalker output
    try:
        subprocess.run([
            sys.executable, "/workspace/Real-ESRGAN/inference_realesrgan_video.py",
            "-n", "RealESRGAN_x4plus",
            "-i", str(vid_path),
            "-o", str(tmp),   # output directory
            "-s", "2",        # 2x outscale
            "--suffix", "up",
            "--fp32"
        ], check=True)
        # NOTE(review): output name assumed to be <stem>_<suffix>.mp4 —
        # confirm against the Real-ESRGAN script; fallback covers a mismatch.
        candidate = tmp / f"{vid_path.stem}_up.mp4"
        if candidate.exists():
            upscaled_path = candidate
    except Exception:
        pass  # upscaling is optional; keep the un-upscaled video

    # 4) letterbox onto a 576x1024 vertical canvas and re-encode as H.264
    out_vid = tmp / "vertical.mp4"
    subprocess.run([
        "ffmpeg", "-y", "-i", str(upscaled_path),
        "-vf", "scale=576:-1,pad=576:1024:0:(1024-ih)/2:black,fps=30",
        "-c:v", "libx264", "-crf", "18", "-pix_fmt", "yuv420p", str(out_vid)
    ], check=True)

    return str(out_vid)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 66 |
|
| 67 |
# Gradio UI: portrait + (voice | text) in, vertical MP4 out.
demo = gr.Interface(
    fn=generate,
    inputs=[
        gr.Image(type="pil", label="Portrait 512×512"),
        # BUGFIX: gr.Audio(optional=True) raises TypeError under the pinned
        # gradio==4.26.0 — the `optional` kwarg was removed in Gradio 4.x;
        # inputs may simply be left empty (fn receives None).
        gr.Audio(type="filepath", label="Voice (wav/mp3)"),
        gr.Textbox(lines=2, placeholder="…or paste text", label="Text")
    ],
    outputs=gr.Video(label="576×1024 MP4"),
    title="ZeroGPU SadTalker 9:16"
)

if __name__ == "__main__":
    # BUGFIX: queue(concurrency_count=…) was removed in Gradio 4.x in favor
    # of default_concurrency_limit; the old kwarg raises TypeError at launch.
    demo.queue(default_concurrency_limit=2, max_size=8).launch()
|
requirements.txt
CHANGED
|
@@ -1,3 +1,4 @@
|
|
|
|
|
| 1 |
gradio==4.26.0
|
| 2 |
huggingface_hub>=0.23.1
|
| 3 |
transformers>=4.43.0
|
|
@@ -9,4 +10,3 @@ facenet-pytorch
|
|
| 9 |
scipy
|
| 10 |
pydub
|
| 11 |
ninja
|
| 12 |
-
numpy
|
|
|
|
| 1 |
+
|
| 2 |
gradio==4.26.0
|
| 3 |
huggingface_hub>=0.23.1
|
| 4 |
transformers>=4.43.0
|
|
|
|
| 10 |
scipy
|
| 11 |
pydub
|
| 12 |
ninja
|
|
|