Madras1 committed on
Commit 1e135d7 · verified · 1 Parent(s): 0ab5505

Upload 3 files

Files changed (3)
  1. README.md +49 -13
  2. app.py +154 -0
  3. requirements.txt +15 -0
README.md CHANGED
@@ -1,13 +1,49 @@
- ---
- title: Sadtalker Api
- emoji: 🚀
- colorFrom: green
- colorTo: blue
- sdk: gradio
- sdk_version: 6.2.0
- app_file: app.py
- pinned: false
- license: mit
- ---
-
- Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
+ ---
+ title: SadTalker API
+ emoji: 🎭
+ colorFrom: purple
+ colorTo: pink
+ sdk: gradio
+ sdk_version: 4.44.0
+ app_file: app.py
+ pinned: false
+ ---
+
+ # SadTalker API
+
+ Talking head generation API using SadTalker in CPU mode.
+
+ ## Features
+ - Generates video from a face image + audio
+ - Runs on CPU (no GPU required)
+ - Returns a base64-encoded video
+
+ ## Usage
+
+ ### Via UI
+ Upload an image and an audio file, then click Generate.
+
+ ### Via API
+ ```python
+ import requests
+ import base64
+
+ # Read and base64-encode the input files
+ with open("face.png", "rb") as f:
+     image_b64 = base64.b64encode(f.read()).decode()
+
+ with open("audio.mp3", "rb") as f:
+     audio_b64 = base64.b64encode(f.read()).decode()
+
+ # Call the API
+ response = requests.post(
+     "https://your-space.hf.space/api/predict",
+     json={"data": [image_b64, audio_b64]}
+ )
+
+ video_b64 = response.json()["data"][0]
+ ```
+
+ ## Notes
+ - The first run downloads ~2 GB of model weights
+ - Each generation takes 1-2 minutes on CPU
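
The README example above stops once `video_b64` has been extracted. A short end-to-end variant of the same call that also writes the result to disk (same placeholder Space URL and `{"data": [...]}` payload as above; the `b64` helper and output file name are illustrative):

```python
import base64
import requests

def b64(path: str) -> str:
    """Read a local file and return its base64-encoded contents."""
    with open(path, "rb") as f:
        return base64.b64encode(f.read()).decode()

# Same request as in the README example; replace the URL with your Space.
response = requests.post(
    "https://your-space.hf.space/api/predict",
    json={"data": [b64("face.png"), b64("audio.mp3")]},
)

# Decode the returned base64 string and save it as a playable MP4.
with open("result.mp4", "wb") as f:
    f.write(base64.b64decode(response.json()["data"][0]))
```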
app.py ADDED
@@ -0,0 +1,154 @@
+ import gradio as gr
+ import subprocess
+ import tempfile
+ import base64
+ import os
+ import shutil
+
+ # Clone SadTalker on first run
+ SADTALKER_DIR = "/home/user/SadTalker"
+
+ def setup_sadtalker():
+     """Clone and set up SadTalker if not already done"""
+     if not os.path.exists(SADTALKER_DIR):
+         print("Cloning SadTalker...")
+         subprocess.run([
+             "git", "clone", "--depth", "1",
+             "https://github.com/OpenTalker/SadTalker.git",
+             SADTALKER_DIR
+         ], check=True)
+
+         # Download checkpoints
+         print("Downloading checkpoints...")
+         os.makedirs(f"{SADTALKER_DIR}/checkpoints", exist_ok=True)
+
+         # Download from HuggingFace
+         subprocess.run([
+             "pip", "install", "huggingface_hub"
+         ], check=True)
+
+         from huggingface_hub import snapshot_download
+         snapshot_download(
+             repo_id="vinthony/SadTalker",
+             local_dir=f"{SADTALKER_DIR}/checkpoints",
+             local_dir_use_symlinks=False
+         )
+
+     return True
+
+ def generate_video(image_path: str, audio_path: str) -> str:
+     """
+     Generate a talking head video from an image and audio.
+     Returns: the generated video as a base64-encoded string.
+     """
+     setup_sadtalker()
+
+     with tempfile.TemporaryDirectory() as tmpdir:
+         output_dir = os.path.join(tmpdir, "output")
+         os.makedirs(output_dir, exist_ok=True)
+
+         # Run SadTalker inference
+         cmd = [
+             "python", f"{SADTALKER_DIR}/inference.py",
+             "--driven_audio", audio_path,
+             "--source_image", image_path,
+             "--result_dir", output_dir,
+             "--still",  # Less movement, faster
+             "--preprocess", "crop",
+             "--cpu"  # Force CPU mode
+         ]
+
+         print(f"Running: {' '.join(cmd)}")
+         result = subprocess.run(cmd, capture_output=True, text=True, cwd=SADTALKER_DIR)
+
+         if result.returncode != 0:
+             print(f"Error: {result.stderr}")
+             raise Exception(f"SadTalker failed: {result.stderr}")
+
+         # Find the generated video and return it as base64
+         for root, dirs, files in os.walk(output_dir):
+             for f in files:
+                 if f.endswith(".mp4"):
+                     video_path = os.path.join(root, f)
+                     with open(video_path, "rb") as vf:
+                         return base64.b64encode(vf.read()).decode("utf-8")
+
+         raise Exception("No video generated")
+
+ def api_generate(image_base64: str, audio_base64: str) -> dict:
+     """API endpoint for generating video"""
+     try:
+         with tempfile.TemporaryDirectory() as tmpdir:
+             # Save image
+             image_path = os.path.join(tmpdir, "input.png")
+             with open(image_path, "wb") as f:
+                 f.write(base64.b64decode(image_base64))
+
+             # Save audio
+             audio_path = os.path.join(tmpdir, "input.mp3")
+             with open(audio_path, "wb") as f:
+                 f.write(base64.b64decode(audio_base64))
+
+             # Generate video
+             video_base64 = generate_video(image_path, audio_path)
+
+             return {"success": True, "video_base64": video_base64}
+
+     except Exception as e:
+         return {"success": False, "error": str(e)}
+
+ # Gradio interface for testing
+ def gradio_generate(image, audio):
+     """Gradio interface wrapper"""
+     if image is None or audio is None:
+         return None
+
+     with tempfile.TemporaryDirectory() as tmpdir:
+         # Save uploaded files
+         image_path = os.path.join(tmpdir, "input.png")
+         audio_path = os.path.join(tmpdir, "input.mp3")
+
+         # Handle image (could be numpy array or path)
+         if isinstance(image, str):
+             shutil.copy(image, image_path)
+         else:
+             from PIL import Image
+             Image.fromarray(image).save(image_path)
+
+         # Handle audio
+         shutil.copy(audio, audio_path)
+
+         # Generate
+         video_base64 = generate_video(image_path, audio_path)
+
+     # Decode to a file that outlives the temporary directory so Gradio
+     # can still read it after the directory is cleaned up
+     fd, output_path = tempfile.mkstemp(suffix=".mp4")
+     with os.fdopen(fd, "wb") as f:
+         f.write(base64.b64decode(video_base64))
+
+     return output_path
+
+ # Create Gradio app with API
+ with gr.Blocks() as demo:
+     gr.Markdown("# SadTalker API 🎭")
+     gr.Markdown("Generate talking head videos from image + audio")
+
+     with gr.Row():
+         with gr.Column():
+             image_input = gr.Image(label="Face Image", type="filepath")
+             audio_input = gr.Audio(label="Audio", type="filepath")
+             generate_btn = gr.Button("Generate", variant="primary")
+
+         with gr.Column():
+             video_output = gr.Video(label="Result")
+
+     generate_btn.click(
+         fn=gradio_generate,
+         inputs=[image_input, audio_input],
+         outputs=video_output
+     )
+
+ # Launch with API enabled
+ if __name__ == "__main__":
+     demo.launch(server_name="0.0.0.0", server_port=7860)
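
`api_generate` is defined above but never wired into the Blocks UI, so the quickest end-to-end check is to call it directly in the Space's Python environment. A minimal smoke-test sketch, assuming `face.png` and `audio.mp3` (placeholder names) sit next to `app.py`:

```python
import base64

from app import api_generate  # the module added in this commit

# Encode the local test inputs exactly as a remote caller would.
with open("face.png", "rb") as f:
    image_b64 = base64.b64encode(f.read()).decode()
with open("audio.mp3", "rb") as f:
    audio_b64 = base64.b64encode(f.read()).decode()

result = api_generate(image_b64, audio_b64)

if result["success"]:
    # Decode the returned video and write it to disk for inspection.
    with open("smoke_test.mp4", "wb") as f:
        f.write(base64.b64decode(result["video_base64"]))
    print("Wrote smoke_test.mp4")
else:
    print("Generation failed:", result["error"])
```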
requirements.txt ADDED
@@ -0,0 +1,15 @@
+ # PyTorch CPU wheels; the extra index must be on its own line in requirements.txt
+ --extra-index-url https://download.pytorch.org/whl/cpu
+ gradio>=4.0.0
+ torch
+ torchvision
+ torchaudio
+ numpy
+ scipy
+ opencv-python-headless
+ imageio
+ imageio-ffmpeg
+ pydub
+ gfpgan
+ face_alignment
+ dlib-bin
+ huggingface_hub
+ Pillow