Kyo-Kai committed on
Commit
87e27bd
·
verified ·
1 Parent(s): 3d0f1dd

Upload 4 files

Browse files
Files changed (4) hide show
  1. Dockerfile +2 -3
  2. README.md +15 -8
  3. app.py +45 -78
  4. requirements.txt +1 -1
Dockerfile CHANGED
@@ -1,3 +1,4 @@
 
1
  # ---------- base image -------------------------------------------------------
2
  FROM nvidia/cuda:12.1.1-cudnn8-runtime-ubuntu22.04
3
 
@@ -37,9 +38,7 @@ RUN git clone https://github.com/xinntao/Real-ESRGAN.git && \
37
  cd Real-ESRGAN && \
38
  pip install basicsr==1.4.2 facexlib==0.2.5 gfpgan==1.3.8 && \
39
  pip install -r requirements.txt && \
40
- python setup.py develop && \
41
- mkdir -p weights && \
42
- wget -q -O weights/RealESRGAN_x2plus.pth https://github.com/xinntao/Real-ESRGAN/releases/download/v0.2.0/RealESRGAN_x2plus.pth
43
 
44
  # ---------- app --------------------------------------------------------------
45
  COPY app.py .
 
1
+
2
  # ---------- base image -------------------------------------------------------
3
  FROM nvidia/cuda:12.1.1-cudnn8-runtime-ubuntu22.04
4
 
 
38
  cd Real-ESRGAN && \
39
  pip install basicsr==1.4.2 facexlib==0.2.5 gfpgan==1.3.8 && \
40
  pip install -r requirements.txt && \
41
+ python setup.py develop
 
 
42
 
43
  # ---------- app --------------------------------------------------------------
44
  COPY app.py .
README.md CHANGED
@@ -1,3 +1,4 @@
 
1
  ---
2
  title: Vertical SadTalker
3
  emoji: 🎤
@@ -9,14 +10,20 @@ app_file: app.py
9
  pinned: false
10
  ---
11
 
12
- # Vertical Talking-Head Generator (SadTalker + Real-ESRGAN)
 
 
13
 
14
- This Space turns a portrait + audio **or** portrait + text into a 576 × 1024 (9∶16) lip‑synced video, perfect for TikTok/Shorts.
 
 
 
15
 
16
- **Pipeline**
17
- 1. XTTS‑v2 (Coqui‑TTS) → WAV (if you supply text)
18
- 2. SadTalker generates 512² talking‑head video
19
- 3. Real‑ESRGAN upscales frames to 1024²
20
- 4. FFmpeg crops/pads to 576 × 1024 and re‑encodes as MP4
21
 
22
- Designed for **ZeroGPU** H‑200 slices; add `deepspeed --num_gpus 2 app.py` if you spin it on a dual‑GPU runtime.
 
 
 
 
 
 
1
+
2
  ---
3
  title: Vertical SadTalker
4
  emoji: 🎤
 
10
  pinned: false
11
  ---
12
 
13
+ # Vertical Talking-Head Generator (SadTalker)
14
+
15
+ Turns a portrait + audio **or** portrait + text into a 576 × 1024 MP4 ready for TikTok/Shorts.
16
 
17
+ * XTTS‑v2 TTS when only text is provided
18
+ * SadTalker for lip‑sync & motion
19
+ * Optional Real‑ESRGAN upscaling (512 → 1024)
20
+ * CUDA 12, fp16, xformers; works on one H‑200 or two T4s with `deepspeed --num_gpus 2`
21
 
22
+ ## Simple API
 
 
 
 
23
 
24
+ ```python
25
+ from gradio_client import Client
26
+ cli = Client("your-hf-org/vertical-sadtalker")
27
+ vid = cli.predict(image="face.png", text="Breaking news!")
28
+ cli.download(vid, "clip.mp4")
29
+ ```
app.py CHANGED
@@ -1,112 +1,79 @@
1
- import os, sys, subprocess, tempfile, uuid, torch, gradio as gr, cv2, numpy as np
 
2
  from pathlib import Path
3
  from PIL import Image
4
- from TTS.api import TTS
5
-
6
- # add SadTalker to path if not installed as pkg
7
- SAD_PATH = '/workspace/SadTalker'
8
- if SAD_PATH not in sys.path and Path(SAD_PATH).exists():
9
- sys.path.append(SAD_PATH)
10
- from sadtalker.inference import SadTalker
11
-
12
- from realesrgan import RealESRGANer
13
- from basicsr.archs.rrdbnet_arch import RRDBNet
14
 
15
- DEVICE = 'cuda' if torch.cuda.is_available() else 'cpu'
 
16
 
17
- # TTS
18
- tts = TTS(model_name='tts_models/en/xtts_v2', progress_bar=False).to(DEVICE)
19
 
20
- # SadTalker
 
21
  sadtalker = SadTalker(device=DEVICE, half=True) # fp16
22
 
23
- # Real‑ESRGAN upsampler (×2 so 512→1024)
24
- model = RRDBNet(num_in_ch=3, num_out_ch=3, num_feat=64, num_block=23, num_grow_ch=32, scale=2)
25
- upsampler = RealESRGANer(
26
- scale=2,
27
- model_path='/workspace/Real-ESRGAN/weights/RealESRGAN_x2plus.pth',
28
- model=model,
29
- tile=0,
30
- half=True,
31
- gpu_id=0 if DEVICE == 'cuda' else None
32
- )
33
-
34
- def _upscale_video(in_path: Path, out_path: Path):
35
- cap = cv2.VideoCapture(str(in_path))
36
- fps = cap.get(cv2.CAP_PROP_FPS) or 25
37
- fourcc = cv2.VideoWriter_fourcc(*'mp4v')
38
- width = int(cap.get(cv2.CAP_PROP_FRAME_WIDTH))
39
- height = int(cap.get(cv2.CAP_PROP_FRAME_HEIGHT))
40
- # first frame upscale to get new dims
41
- ret, frame = cap.read()
42
- if not ret:
43
- cap.release()
44
- raise RuntimeError('Empty video input for upscaling.')
45
- up_frame, _ = upsampler.enhance(frame, outscale=2)
46
- new_h, new_w = up_frame.shape[:2]
47
- writer = cv2.VideoWriter(str(out_path), fourcc, fps, (new_w, new_h))
48
- writer.write(up_frame)
49
- while True:
50
- ret, frame = cap.read()
51
- if not ret:
52
- break
53
- up_frame, _ = upsampler.enhance(frame, outscale=2)
54
- writer.write(up_frame)
55
- writer.release()
56
- cap.release()
57
-
58
- def generate(image: Image, audio=None, text: str = ''):
59
  tmp = Path(tempfile.mkdtemp())
60
- img_path = tmp / 'input.png'
61
- image.convert('RGB').save(img_path)
62
 
63
- # handle audio or text
64
  if audio is None and text.strip():
65
- wav_path = tmp / 'speech.wav'
66
  tts.tts_to_file(text=text, file_path=str(wav_path))
67
  elif audio is not None:
68
  wav_path = Path(audio)
69
  else:
70
- raise gr.Error('Provide either audio or text!')
71
 
72
- # SadTalker inference
73
- vid_path = tmp / f'{uuid.uuid4().hex}_base.mp4'
74
  sadtalker.infer(
75
  source_image=str(img_path),
76
  driven_audio=str(wav_path),
77
  result_dir=str(tmp),
78
  size=512,
79
- preprocess='crop',
80
  still=False,
81
  enhancer=None,
82
  write_video=True,
83
  output_path=str(vid_path)
84
  )
85
 
86
- # Upscale video with Real-ESRGAN (×2 => 1024×1024)
87
- upscaled_vid = tmp / f'{uuid.uuid4().hex}_sr.mp4'
88
- _upscale_video(vid_path, upscaled_vid)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
89
 
90
- # scale & pad to 576×1024 portrait
91
- final_vid = tmp / 'vertical.mp4'
92
- cmd = [
93
- 'ffmpeg', '-y', '-i', str(upscaled_vid),
94
- '-vf', 'scale=576:-1,pad=576:1024:0:(1024-ih)/2:black,fps=30',
95
- '-c:v', 'libx264', '-crf', '18', '-pix_fmt', 'yuv420p', str(final_vid)
96
- ]
97
- subprocess.run(cmd, check=True)
98
- return str(final_vid)
99
 
100
  demo = gr.Interface(
101
  fn=generate,
102
  inputs=[
103
- gr.Image(type='pil', label='Portrait 512×512'),
104
- gr.Audio(type='filepath', optional=True, label='Voice (wav/mp3)'),
105
- gr.Textbox(lines=2, placeholder='…or paste text', label='Text')
106
  ],
107
- outputs=gr.Video(label='576×1024 MP4'),
108
- title='ZeroGPU SadTalker 9:16'
109
  )
110
 
111
- if __name__ == '__main__':
112
- demo.queue(concurrency_count=1, max_size=4).launch()
 
1
+
2
+ import sys, os, subprocess, tempfile, uuid, torch, gradio as gr
3
  from pathlib import Path
4
  from PIL import Image
 
 
 
 
 
 
 
 
 
 
5
 
6
+ # ensure SadTalker repo is importable
7
+ sys.path.append('/workspace/SadTalker')
8
 
9
+ from TTS.api import TTS
10
+ from sadtalker.inference import SadTalker
11
 
12
+ DEVICE = "cuda" if torch.cuda.is_available() else "cpu"
13
+ tts = TTS(model_name="tts_models/en/xtts_v2", progress_bar=False).to(DEVICE)
14
  sadtalker = SadTalker(device=DEVICE, half=True) # fp16
15
 
16
+ def generate(image: Image, audio=None, text: str = ""):
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
17
  tmp = Path(tempfile.mkdtemp())
18
+ img_path = tmp / "input.png"
19
+ image.convert("RGB").save(img_path)
20
 
21
+ # 1️⃣ handle audio or text -> wav
22
  if audio is None and text.strip():
23
+ wav_path = tmp / "speech.wav"
24
  tts.tts_to_file(text=text, file_path=str(wav_path))
25
  elif audio is not None:
26
  wav_path = Path(audio)
27
  else:
28
+ raise gr.Error("Provide either audio or text!")
29
 
30
+ # 2️⃣ run SadTalker
31
+ vid_path = tmp / f"{uuid.uuid4().hex}.mp4"
32
  sadtalker.infer(
33
  source_image=str(img_path),
34
  driven_audio=str(wav_path),
35
  result_dir=str(tmp),
36
  size=512,
37
+ preprocess="crop",
38
  still=False,
39
  enhancer=None,
40
  write_video=True,
41
  output_path=str(vid_path)
42
  )
43
 
44
+ # 3️⃣ (optional) upscale to ~1024 height with Real-ESRGAN
45
+ upscaled_path = tmp / "up.mp4"
46
+ try:
47
+ subprocess.run([
48
+ "python", "/workspace/Real-ESRGAN/inference_realesrgan_video.py",
49
+ "-n", "RealESRGAN_x4plus",
50
+ "-i", str(vid_path),
51
+ "-o", str(upscaled_path),
52
+ "--fp32"
53
+ ], check=True)
54
+ except Exception:
55
+ upscaled_path = vid_path # fallback
56
+
57
+ # 4️⃣ pad/crop to 576×1024 vertical
58
+ out_vid = tmp / "vertical.mp4"
59
+ subprocess.run([
60
+ "ffmpeg", "-y", "-i", str(upscaled_path),
61
+ "-vf", "scale=576:-1,pad=576:1024:0:(1024-ih)/2:black,fps=30",
62
+ "-c:v", "libx264", "-crf", "18", "-pix_fmt", "yuv420p", str(out_vid)
63
+ ], check=True)
64
 
65
+ return str(out_vid)
 
 
 
 
 
 
 
 
66
 
67
  demo = gr.Interface(
68
  fn=generate,
69
  inputs=[
70
+ gr.Image(type="pil", label="Portrait 512×512"),
71
+ gr.Audio(type="filepath", optional=True, label="Voice (wav/mp3)"),
72
+ gr.Textbox(lines=2, placeholder="…or paste text", label="Text")
73
  ],
74
+ outputs=gr.Video(label="576×1024 MP4"),
75
+ title="ZeroGPU SadTalker 9:16"
76
  )
77
 
78
+ if __name__ == "__main__":
79
+ demo.queue(concurrency_count=2, max_size=8).launch()
requirements.txt CHANGED
@@ -1,3 +1,4 @@
 
1
  gradio==4.26.0
2
  huggingface_hub>=0.23.1
3
  transformers>=4.43.0
@@ -9,4 +10,3 @@ facenet-pytorch
9
  scipy
10
  pydub
11
  ninja
12
- numpy
 
1
+
2
  gradio==4.26.0
3
  huggingface_hub>=0.23.1
4
  transformers>=4.43.0
 
10
  scipy
11
  pydub
12
  ninja