Kyo-Kai committed on
Commit 3d0f1dd · verified · 1 Parent(s): 127bff5

Upload 4 files

Files changed (4)
  1. Dockerfile +5 -2
  2. README.md +18 -26
  3. app.py +75 -23
  4. requirements.txt +1 -0
Dockerfile CHANGED
@@ -29,14 +29,17 @@ RUN pip install --no-cache-dir -r requirements.txt
 # pull SadTalker weights once at build time (saves cold-start)
 RUN git clone --depth 1 https://github.com/OpenTalker/SadTalker.git && \
     cd SadTalker && \
-    git lfs install && git lfs pull          # grabs checkpoints (~700 MB)
+    git lfs install && git lfs pull && \
+    pip install -e .

 # ---------- Real-ESRGAN install from source ----------------------------------
 RUN git clone https://github.com/xinntao/Real-ESRGAN.git && \
     cd Real-ESRGAN && \
     pip install basicsr==1.4.2 facexlib==0.2.5 gfpgan==1.3.8 && \
     pip install -r requirements.txt && \
-    python setup.py develop
+    python setup.py develop && \
+    mkdir -p weights && \
+    wget -q -O weights/RealESRGAN_x2plus.pth https://github.com/xinntao/Real-ESRGAN/releases/download/v0.2.0/RealESRGAN_x2plus.pth

 # ---------- app --------------------------------------------------------------
 COPY app.py .
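The new build steps bake the SadTalker checkpoints and the RealESRGAN_x2plus weights into the image so the Space skips those downloads on cold start. A minimal build-time sanity check along these lines could catch a truncated weight download before the image ships; it is illustrative only (not part of this commit) and assumes the `/workspace/Real-ESRGAN/weights/...` path that app.py points at:

```python
# check_weights.py -- hypothetical build-time check, not included in this commit.
# Confirms the checkpoint fetched by the Dockerfile loads into the x2 RRDBNet
# that app.py constructs, so a bad download fails the build instead of the Space.
import torch
from basicsr.archs.rrdbnet_arch import RRDBNet

WEIGHTS = '/workspace/Real-ESRGAN/weights/RealESRGAN_x2plus.pth'  # path assumed from app.py

model = RRDBNet(num_in_ch=3, num_out_ch=3, num_feat=64,
                num_block=23, num_grow_ch=32, scale=2)
state = torch.load(WEIGHTS, map_location='cpu')
# Real-ESRGAN release checkpoints usually nest the weights under 'params_ema' / 'params'
state = state.get('params_ema', state.get('params', state))
model.load_state_dict(state, strict=True)
print('RealESRGAN_x2plus.pth loads cleanly')
```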
README.md CHANGED
@@ -1,30 +1,22 @@
----
-title: Vertical SadTalker
-emoji: 🎤
-colorFrom: blue
-colorTo: pink
-sdk: gradio
-sdk_version: 5.34.2
-app_file: app.py
-pinned: false
----
+---
+title: Vertical SadTalker
+emoji: 🎤
+colorFrom: blue
+colorTo: pink
+sdk: gradio
+sdk_version: "4.26.0"
+app_file: app.py
+pinned: false
+---

-# Vertical Talking-Head Generator (SadTalker)
+# Vertical Talking-Head Generator (SadTalker + Real-ESRGAN)

-Hugging Face Space that turns a portrait + audio **or** portrait + text
-into a 576 × 1024 (9∶16) lip‑synced MP4, perfect for TikTok/Shorts.
+This Space turns a portrait + audio **or** portrait + text into a 576 × 1024 (9∶16) lip‑synced video, perfect for TikTok/Shorts.

-## Features
-* Accepts portrait PNG/JPEG (≈512²)
-* Accepts WAV/MP3 **or** raw text (text goes through XTTSv2 TTS)
-* Uses SadTalker (Apache 2) for motion & lip‑sync
-* Auto‑upscales, crops & pads to 576×1024
-* Runs on ZeroGPU (H‑200) with fp16 + xformers
-* Optional multi‑GPU via DeepSpeed
+**Pipeline**
+1. XTTS‑v2 (Coqui‑TTS) synthesizes a WAV (if you supply text)
+2. SadTalker generates a 512² talking‑head video
+3. Real‑ESRGAN upscales frames to 1024²
+4. FFmpeg crops/pads to 576×1024 and re‑encodes as MP4

-## API example
-```python
-from gradio_client import Client
-cli = Client("your‑org/vertical‑sadtalker")
-out = cli.predict(image="reporter.png", audio=None, text="Hello world!")
-cli.download(out, "clip.mp4")
+Designed for **ZeroGPU** H‑200 slices; add `deepspeed --num_gpus 2 app.py` if you spin it up on a dual‑GPU runtime.
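The API example removed from the previous README still outlines the shape of a programmatic call. A rough client-side sketch, assuming a placeholder Space ID of `your-org/vertical-sadtalker` and the default `/predict` endpoint that `gr.Interface` exposes (the exact argument encoding, e.g. a file helper for the image, depends on your gradio_client version):

```python
# Illustrative client call; mirrors generate(image, audio, text) in app.py.
# "your-org/vertical-sadtalker" is a placeholder Space ID.
from gradio_client import Client

cli = Client("your-org/vertical-sadtalker")
out_path = cli.predict(
    "reporter.png",   # portrait (newer clients may require gradio_client's file helper)
    None,             # audio -- omit when driving the clip from text
    "Hello world!",   # text routed through XTTS-v2
    api_name="/predict",
)
print("result downloaded to:", out_path)  # a local file path in recent client versions
```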
 
 
 
 
 
app.py CHANGED
@@ -1,60 +1,112 @@
-import os, subprocess, tempfile, uuid, torch, gradio as gr
+import os, sys, subprocess, tempfile, uuid, torch, gradio as gr, cv2, numpy as np
 from pathlib import Path
 from PIL import Image
 from TTS.api import TTS
+
+# add SadTalker to path if not installed as pkg
+SAD_PATH = '/workspace/SadTalker'
+if SAD_PATH not in sys.path and Path(SAD_PATH).exists():
+    sys.path.append(SAD_PATH)
 from sadtalker.inference import SadTalker

-DEVICE = "cuda" if torch.cuda.is_available() else "cpu"
-tts = TTS(model_name="tts_models/en/xtts_v2", progress_bar=False).to(DEVICE)
+from realesrgan import RealESRGANer
+from basicsr.archs.rrdbnet_arch import RRDBNet
+
+DEVICE = 'cuda' if torch.cuda.is_available() else 'cpu'
+
+# TTS
+tts = TTS(model_name='tts_models/en/xtts_v2', progress_bar=False).to(DEVICE)
+
+# SadTalker
 sadtalker = SadTalker(device=DEVICE, half=True)          # fp16

-def generate(image: Image, audio=None, text: str = ""):
+# Real‑ESRGAN upsampler (×2 so 512→1024)
+model = RRDBNet(num_in_ch=3, num_out_ch=3, num_feat=64, num_block=23, num_grow_ch=32, scale=2)
+upsampler = RealESRGANer(
+    scale=2,
+    model_path='/workspace/Real-ESRGAN/weights/RealESRGAN_x2plus.pth',
+    model=model,
+    tile=0,
+    half=True,
+    gpu_id=0 if DEVICE == 'cuda' else None
+)
+
+def _upscale_video(in_path: Path, out_path: Path):
+    cap = cv2.VideoCapture(str(in_path))
+    fps = cap.get(cv2.CAP_PROP_FPS) or 25
+    fourcc = cv2.VideoWriter_fourcc(*'mp4v')
+    width = int(cap.get(cv2.CAP_PROP_FRAME_WIDTH))
+    height = int(cap.get(cv2.CAP_PROP_FRAME_HEIGHT))
+    # first frame upscale to get new dims
+    ret, frame = cap.read()
+    if not ret:
+        cap.release()
+        raise RuntimeError('Empty video input for upscaling.')
+    up_frame, _ = upsampler.enhance(frame, outscale=2)
+    new_h, new_w = up_frame.shape[:2]
+    writer = cv2.VideoWriter(str(out_path), fourcc, fps, (new_w, new_h))
+    writer.write(up_frame)
+    while True:
+        ret, frame = cap.read()
+        if not ret:
+            break
+        up_frame, _ = upsampler.enhance(frame, outscale=2)
+        writer.write(up_frame)
+    writer.release()
+    cap.release()
+
+def generate(image: Image, audio=None, text: str = ''):
     tmp = Path(tempfile.mkdtemp())
-    img_path = tmp / "input.png"
-    image.convert("RGB").save(img_path)
+    img_path = tmp / 'input.png'
+    image.convert('RGB').save(img_path)

     # handle audio or text
     if audio is None and text.strip():
-        wav_path = tmp / "speech.wav"
+        wav_path = tmp / 'speech.wav'
         tts.tts_to_file(text=text, file_path=str(wav_path))
     elif audio is not None:
         wav_path = Path(audio)
     else:
-        raise gr.Error("Provide either audio or text!")
+        raise gr.Error('Provide either audio or text!')

-    vid_path = tmp / f"{uuid.uuid4().hex}.mp4"
+    # SadTalker inference
+    vid_path = tmp / f'{uuid.uuid4().hex}_base.mp4'
     sadtalker.infer(
         source_image=str(img_path),
         driven_audio=str(wav_path),
         result_dir=str(tmp),
         size=512,
-        preprocess="crop",
+        preprocess='crop',
         still=False,
         enhancer=None,
         write_video=True,
         output_path=str(vid_path)
     )

-    # upscale & pad to 576x1024
-    out_vid = tmp / "vertical.mp4"
+    # Upscale video with Real-ESRGAN (×2 => 1024×1024)
+    upscaled_vid = tmp / f'{uuid.uuid4().hex}_sr.mp4'
+    _upscale_video(vid_path, upscaled_vid)
+
+    # scale & pad to 576×1024 portrait
+    final_vid = tmp / 'vertical.mp4'
     cmd = [
-        "ffmpeg", "-y", "-i", str(vid_path),
-        "-vf", "scale=576:-1,pad=576:1024:0:(1024-ih)/2:black,fps=30",
-        "-c:v", "libx264", "-crf", "18", "-pix_fmt", "yuv420p", str(out_vid)
+        'ffmpeg', '-y', '-i', str(upscaled_vid),
+        '-vf', 'scale=576:-1,pad=576:1024:0:(1024-ih)/2:black,fps=30',
+        '-c:v', 'libx264', '-crf', '18', '-pix_fmt', 'yuv420p', str(final_vid)
     ]
     subprocess.run(cmd, check=True)
-    return str(out_vid)
+    return str(final_vid)

 demo = gr.Interface(
     fn=generate,
     inputs=[
-        gr.Image(type="pil", label="Portrait 512×512"),
-        gr.Audio(type="filepath", optional=True, label="Voice (wav/mp3)"),
-        gr.Textbox(lines=2, placeholder="…or paste text", label="Text")
+        gr.Image(type='pil', label='Portrait 512×512'),
+        gr.Audio(type='filepath', optional=True, label='Voice (wav/mp3)'),
+        gr.Textbox(lines=2, placeholder='…or paste text', label='Text')
     ],
-    outputs=gr.Video(label="576×1024 MP4"),
-    title="ZeroGPU SadTalker 9:16"
+    outputs=gr.Video(label='576×1024 MP4'),
+    title='ZeroGPU SadTalker 9:16'
     )

-if __name__ == "__main__":
-    demo.queue(concurrency_count=2, max_size=8).launch()
+if __name__ == '__main__':
+    demo.queue(concurrency_count=1, max_size=4).launch()
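One detail of the new pipeline worth noting: `_upscale_video` rewrites frames through `cv2.VideoWriter`, which carries no audio, and the final FFmpeg pass only reads that upscaled file, so the speech track is not muxed back into `vertical.mp4`. If the published clip needs sound, a follow-up step along these lines (a sketch, not part of this commit, assuming the SadTalker base MP4 still carries the driven audio) would copy it over:

```python
# Hypothetical extra step after the existing ffmpeg pad/encode in generate():
# mux the audio stream from the SadTalker base video into the silent vertical clip.
import subprocess
from pathlib import Path

def mux_audio(video_only: Path, audio_source: Path, out_path: Path) -> Path:
    subprocess.run([
        'ffmpeg', '-y',
        '-i', str(video_only),    # 576x1024 clip written by the existing pipeline
        '-i', str(audio_source),  # SadTalker output, assumed to carry the speech
        '-map', '0:v:0', '-map', '1:a:0',
        '-c:v', 'copy', '-c:a', 'aac', '-shortest',
        str(out_path)
    ], check=True)
    return out_path
```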
requirements.txt CHANGED
@@ -9,3 +9,4 @@ facenet-pytorch
 scipy
 pydub
 ninja
+numpy