Kyo-Kai committed on
Commit 3d0f1dd · verified · 1 Parent(s): 127bff5

Upload 4 files

Files changed (4)
  1. Dockerfile +5 -2
  2. README.md +18 -26
  3. app.py +75 -23
  4. requirements.txt +1 -0
Dockerfile CHANGED
@@ -29,14 +29,17 @@ RUN pip install --no-cache-dir -r requirements.txt
 # pull SadTalker weights once at build time (saves cold-start)
 RUN git clone --depth 1 https://github.com/OpenTalker/SadTalker.git && \
     cd SadTalker && \
-    git lfs install && git lfs pull          # grabs checkpoints (~700 MB)
+    git lfs install && git lfs pull && \
+    pip install -e .

 # ---------- Real-ESRGAN install from source ----------------------------------
 RUN git clone https://github.com/xinntao/Real-ESRGAN.git && \
     cd Real-ESRGAN && \
     pip install basicsr==1.4.2 facexlib==0.2.5 gfpgan==1.3.8 && \
     pip install -r requirements.txt && \
-    python setup.py develop
+    python setup.py develop && \
+    mkdir -p weights && \
+    wget -q -O weights/RealESRGAN_x2plus.pth https://github.com/xinntao/Real-ESRGAN/releases/download/v0.2.0/RealESRGAN_x2plus.pth

 # ---------- app --------------------------------------------------------------
 COPY app.py .
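The new build steps bake the SadTalker checkpoints and the RealESRGAN_x2plus weights into the image so the Space skips those downloads on cold start. A minimal build-time sanity check along these lines could catch a truncated weight download before the image ships; it is illustrative only (not part of this commit) and assumes the `/workspace/Real-ESRGAN/weights/...` path that app.py points at:

```python
# check_weights.py -- hypothetical build-time check, not included in this commit.
# Confirms the checkpoint fetched by the Dockerfile loads into the x2 RRDBNet
# that app.py constructs, so a bad download fails the build instead of the Space.
import torch
from basicsr.archs.rrdbnet_arch import RRDBNet

WEIGHTS = '/workspace/Real-ESRGAN/weights/RealESRGAN_x2plus.pth'  # path assumed from app.py

model = RRDBNet(num_in_ch=3, num_out_ch=3, num_feat=64,
                num_block=23, num_grow_ch=32, scale=2)
state = torch.load(WEIGHTS, map_location='cpu')
# Real-ESRGAN release checkpoints usually nest the weights under 'params_ema' / 'params'
state = state.get('params_ema', state.get('params', state))
model.load_state_dict(state, strict=True)
print('RealESRGAN_x2plus.pth loads cleanly')
```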
README.md CHANGED
@@ -1,30 +1,22 @@
----
-title: Vertical SadTalker
-emoji: 🎤
-colorFrom: blue
-colorTo: pink
-sdk: gradio
-sdk_version: 5.34.2
-app_file: app.py
-pinned: false
----
+---
+title: Vertical SadTalker
+emoji: 🎤
+colorFrom: blue
+colorTo: pink
+sdk: gradio
+sdk_version: "4.26.0"
+app_file: app.py
+pinned: false
+---

-# Vertical Talking-Head Generator (SadTalker)
+# Vertical Talking-Head Generator (SadTalker + Real-ESRGAN)

-Hugging Face Space that turns a portrait + audio **or** portrait + text
-into a 576 × 1024 (9∶16) lip‑synced MP4, perfect for TikTok/Shorts.
+This Space turns a portrait + audio **or** portrait + text into a 576 × 1024 (9∶16) lip‑synced video, perfect for TikTok/Shorts.

-## Features
-* Accepts portrait PNG/JPEG (≈512²)
-* Accepts WAV/MP3 **or** raw text (text goes through XTTSv2 TTS)
-* Uses SadTalker (Apache 2) for motion & lip‑sync
-* Auto‑upscales, crops & pads to 576×1024
-* Runs on ZeroGPU (H‑200) with fp16 + xformers
-* Optional multi‑GPU via DeepSpeed
+**Pipeline**
+1. XTTS‑v2 (Coqui‑TTS) synthesizes a WAV (if you supply text)
+2. SadTalker generates a 512² talking‑head video
+3. Real‑ESRGAN upscales frames to 1024²
+4. FFmpeg crops/pads to 576×1024 and re‑encodes as MP4

-## API example
-```python
-from gradio_client import Client
-cli = Client("your‑org/vertical‑sadtalker")
-out = cli.predict(image="reporter.png", audio=None, text="Hello world!")
-cli.download(out, "clip.mp4")
+Designed for **ZeroGPU** H‑200 slices; add `deepspeed --num_gpus 2 app.py` if you spin it up on a dual‑GPU runtime.
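The API example removed from the previous README still outlines the shape of a programmatic call. A rough client-side sketch, assuming a placeholder Space ID of `your-org/vertical-sadtalker` and the default `/predict` endpoint that `gr.Interface` exposes (the exact argument encoding, e.g. a file helper for the image, depends on your gradio_client version):

```python
# Illustrative client call; mirrors generate(image, audio, text) in app.py.
# "your-org/vertical-sadtalker" is a placeholder Space ID.
from gradio_client import Client

cli = Client("your-org/vertical-sadtalker")
out_path = cli.predict(
    "reporter.png",   # portrait (newer clients may require gradio_client's file helper)
    None,             # audio -- omit when driving the clip from text
    "Hello world!",   # text routed through XTTS-v2
    api_name="/predict",
)
print("result downloaded to:", out_path)  # a local file path in recent client versions
```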
 
 
 
 
 
app.py CHANGED
@@ -1,60 +1,112 @@
-import os, subprocess, tempfile, uuid, torch, gradio as gr
+import os, sys, subprocess, tempfile, uuid, torch, gradio as gr, cv2, numpy as np
 from pathlib import Path
 from PIL import Image
 from TTS.api import TTS
+
+# add SadTalker to path if not installed as pkg
+SAD_PATH = '/workspace/SadTalker'
+if SAD_PATH not in sys.path and Path(SAD_PATH).exists():
+    sys.path.append(SAD_PATH)
 from sadtalker.inference import SadTalker

-DEVICE = "cuda" if torch.cuda.is_available() else "cpu"
-tts = TTS(model_name="tts_models/en/xtts_v2", progress_bar=False).to(DEVICE)
+from realesrgan import RealESRGANer
+from basicsr.archs.rrdbnet_arch import RRDBNet
+
+DEVICE = 'cuda' if torch.cuda.is_available() else 'cpu'
+
+# TTS
+tts = TTS(model_name='tts_models/en/xtts_v2', progress_bar=False).to(DEVICE)
+
+# SadTalker
 sadtalker = SadTalker(device=DEVICE, half=True)          # fp16

-def generate(image: Image, audio=None, text: str = ""):
+# Real‑ESRGAN upsampler (×2 so 512→1024)
+model = RRDBNet(num_in_ch=3, num_out_ch=3, num_feat=64, num_block=23, num_grow_ch=32, scale=2)
+upsampler = RealESRGANer(
+    scale=2,
+    model_path='/workspace/Real-ESRGAN/weights/RealESRGAN_x2plus.pth',
+    model=model,
+    tile=0,
+    half=True,
+    gpu_id=0 if DEVICE == 'cuda' else None
+)
+
+def _upscale_video(in_path: Path, out_path: Path):
+    cap = cv2.VideoCapture(str(in_path))
+    fps = cap.get(cv2.CAP_PROP_FPS) or 25
+    fourcc = cv2.VideoWriter_fourcc(*'mp4v')
+    width = int(cap.get(cv2.CAP_PROP_FRAME_WIDTH))
+    height = int(cap.get(cv2.CAP_PROP_FRAME_HEIGHT))
+    # first frame upscale to get new dims
+    ret, frame = cap.read()
+    if not ret:
+        cap.release()
+        raise RuntimeError('Empty video input for upscaling.')
+    up_frame, _ = upsampler.enhance(frame, outscale=2)
+    new_h, new_w = up_frame.shape[:2]
+    writer = cv2.VideoWriter(str(out_path), fourcc, fps, (new_w, new_h))
+    writer.write(up_frame)
+    while True:
+        ret, frame = cap.read()
+        if not ret:
+            break
+        up_frame, _ = upsampler.enhance(frame, outscale=2)
+        writer.write(up_frame)
+    writer.release()
+    cap.release()
+
+def generate(image: Image, audio=None, text: str = ''):
     tmp = Path(tempfile.mkdtemp())
-    img_path = tmp / "input.png"
-    image.convert("RGB").save(img_path)
+    img_path = tmp / 'input.png'
+    image.convert('RGB').save(img_path)

     # handle audio or text
     if audio is None and text.strip():
-        wav_path = tmp / "speech.wav"
+        wav_path = tmp / 'speech.wav'
         tts.tts_to_file(text=text, file_path=str(wav_path))
     elif audio is not None:
         wav_path = Path(audio)
     else:
-        raise gr.Error("Provide either audio or text!")
+        raise gr.Error('Provide either audio or text!')

-    vid_path = tmp / f"{uuid.uuid4().hex}.mp4"
+    # SadTalker inference
+    vid_path = tmp / f'{uuid.uuid4().hex}_base.mp4'
     sadtalker.infer(
         source_image=str(img_path),
         driven_audio=str(wav_path),
         result_dir=str(tmp),
         size=512,
-        preprocess="crop",
+        preprocess='crop',
         still=False,
         enhancer=None,
         write_video=True,
         output_path=str(vid_path)
     )

-    # upscale & pad to 576x1024
-    out_vid = tmp / "vertical.mp4"
+    # Upscale video with Real-ESRGAN (×2 => 1024×1024)
+    upscaled_vid = tmp / f'{uuid.uuid4().hex}_sr.mp4'
+    _upscale_video(vid_path, upscaled_vid)
+
+    # scale & pad to 576×1024 portrait
+    final_vid = tmp / 'vertical.mp4'
     cmd = [
-        "ffmpeg", "-y", "-i", str(vid_path),
-        "-vf", "scale=576:-1,pad=576:1024:0:(1024-ih)/2:black,fps=30",
-        "-c:v", "libx264", "-crf", "18", "-pix_fmt", "yuv420p", str(out_vid)
+        'ffmpeg', '-y', '-i', str(upscaled_vid),
+        '-vf', 'scale=576:-1,pad=576:1024:0:(1024-ih)/2:black,fps=30',
+        '-c:v', 'libx264', '-crf', '18', '-pix_fmt', 'yuv420p', str(final_vid)
     ]
     subprocess.run(cmd, check=True)
-    return str(out_vid)
+    return str(final_vid)

 demo = gr.Interface(
     fn=generate,
     inputs=[
-        gr.Image(type="pil", label="Portrait 512×512"),
-        gr.Audio(type="filepath", optional=True, label="Voice (wav/mp3)"),
-        gr.Textbox(lines=2, placeholder="…or paste text", label="Text")
+        gr.Image(type='pil', label='Portrait 512×512'),
+        gr.Audio(type='filepath', optional=True, label='Voice (wav/mp3)'),
+        gr.Textbox(lines=2, placeholder='…or paste text', label='Text')
     ],
-    outputs=gr.Video(label="576×1024 MP4"),
-    title="ZeroGPU SadTalker 9:16"
+    outputs=gr.Video(label='576×1024 MP4'),
+    title='ZeroGPU SadTalker 9:16'
     )

-if __name__ == "__main__":
-    demo.queue(concurrency_count=2, max_size=8).launch()
+if __name__ == '__main__':
+    demo.queue(concurrency_count=1, max_size=4).launch()
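One detail of the new pipeline worth noting: `_upscale_video` rewrites frames through `cv2.VideoWriter`, which carries no audio, and the final FFmpeg pass only reads that upscaled file, so the speech track is not muxed back into `vertical.mp4`. If the published clip needs sound, a follow-up step along these lines (a sketch, not part of this commit, assuming the SadTalker base MP4 still carries the driven audio) would copy it over:

```python
# Hypothetical extra step after the existing ffmpeg pad/encode in generate():
# mux the audio stream from the SadTalker base video into the silent vertical clip.
import subprocess
from pathlib import Path

def mux_audio(video_only: Path, audio_source: Path, out_path: Path) -> Path:
    subprocess.run([
        'ffmpeg', '-y',
        '-i', str(video_only),    # 576x1024 clip written by the existing pipeline
        '-i', str(audio_source),  # SadTalker output, assumed to carry the speech
        '-map', '0:v:0', '-map', '1:a:0',
        '-c:v', 'copy', '-c:a', 'aac', '-shortest',
        str(out_path)
    ], check=True)
    return out_path
```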
requirements.txt CHANGED
@@ -9,3 +9,4 @@ facenet-pytorch
 scipy
 pydub
 ninja
+numpy