HiMind committed on
Commit
f1576aa
·
verified ·
1 Parent(s): a1dd1a0

Upload 3 files

Browse files
Files changed (3) hide show
  1. PackedAvatar.py +1425 -0
  2. README.md +594 -3
  3. requirements.txt +26 -0
PackedAvatar.py ADDED
@@ -0,0 +1,1425 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from __future__ import annotations
2
+
3
+ import argparse
4
+ import hashlib
5
+ import importlib.util
6
+ import io
7
+ import json
8
+ import os
9
+ import platform
10
+ import shutil
11
+ import subprocess
12
+ import sys
13
+ import tempfile
14
+ import time
15
+ import uuid
16
+ import zipfile
17
+ from dataclasses import dataclass
18
+ from pathlib import Path
19
+ from typing import Any, Dict, List, Optional, Tuple
20
+
21
+ import cv2
22
+ import numpy as np
23
+ import torch
24
+ import zstandard as zstd
25
+ from PIL import Image
26
+ from pydub import AudioSegment
27
+ from scipy.io import loadmat, savemat
28
+
29
+
30
+ # ============================================================
31
+ # GENERAL HELPERS
32
+ # ============================================================
33
+
34
# Style labels treated as "realistic" avatars. Values are compared against
# the output of normalize_style_name(), i.e. stripped and lowercased.
REAL_STYLE_ALIASES = {"real", "realistic", "photo", "photoreal", "liveaction"}
35
+
36
+
37
def ensure_dir(path: Path) -> None:
    """Create ``path`` and any missing parents; do nothing if it already exists."""
    path.mkdir(parents=True, exist_ok=True)
39
+
40
+
41
def utc_now_iso() -> str:
    """Return the current UTC time as an ISO-8601 string with second precision."""
    from datetime import datetime, timezone

    return datetime.now(timezone.utc).isoformat(timespec="seconds")
45
+
46
+
47
def sha256_bytes(data: bytes) -> str:
    """Return the hexadecimal SHA-256 digest of ``data``."""
    return hashlib.sha256(data).hexdigest()
49
+
50
+
51
def sha256_file(path: Path, chunk_size: int = 1024 * 1024) -> str:
    """Return the hexadecimal SHA-256 digest of the file at ``path``.

    The file is read in pieces of ``chunk_size`` bytes so it is never held in
    memory as a whole.
    """
    h = hashlib.sha256()
    with path.open("rb") as f:
        # iter(callable, sentinel) stops as soon as read() returns b"" (EOF).
        for chunk in iter(lambda: f.read(chunk_size), b""):
            h.update(chunk)
    return h.hexdigest()
57
+
58
+
59
def tensor_to_bytes(obj: Any) -> bytes:
    """Return the raw bytes of ``obj``.

    ``bytes``/``bytearray`` inputs are returned as an immutable ``bytes`` copy.
    Tensors are detached, moved to CPU, made contiguous and serialized with
    numpy's ``tobytes()``.

    Raises:
        TypeError: if ``obj`` is neither bytes-like nor a tensor.
    """
    if isinstance(obj, (bytes, bytearray)):
        return bytes(obj)
    if torch.is_tensor(obj):
        return obj.detach().cpu().contiguous().numpy().tobytes()
    raise TypeError(f"Expected bytes or tensor, got {type(obj)!r}")
65
+
66
+
67
def bytes_to_tensor(data: bytes) -> torch.Tensor:
    """Return a 1-D ``torch.uint8`` tensor holding a private copy of ``data``.

    The bytes are viewed through numpy and copied once, so the result owns
    writable memory. Empty input yields an empty tensor.
    """
    try:
        # .copy() detaches the array from the (possibly read-only) source
        # buffer, which torch.from_numpy requires to avoid a warning.
        arr = np.frombuffer(data, dtype=np.uint8).copy()
    except (TypeError, ValueError):
        # Not a buffer-protocol object (e.g. a list of ints): fall back to
        # element-wise conversion so such callers keep working.
        return torch.tensor(list(data), dtype=torch.uint8)
    return torch.from_numpy(arr)
72
+
73
+
74
def decode_png_or_zstd_image(blob: bytes) -> Image.Image:
    """Decode a preview blob that may be a raw PNG or zstd-compressed PNG bytes.

    zstd decompression is attempted first; if it fails for any reason the blob
    is handed to PIL unchanged. The result is always converted to RGB.
    """
    try:
        raw = zstd.ZstdDecompressor().decompress(blob)
    except Exception:
        # Deliberate best-effort: anything that is not a valid zstd frame is
        # assumed to already be an image file.
        raw = blob
    return Image.open(io.BytesIO(raw)).convert("RGB")
81
+
82
+
83
def pil_to_numpy_rgb(img: Image.Image) -> np.ndarray:
    """Convert a PIL image to RGB and return it as a ``uint8`` numpy array."""
    return np.asarray(img.convert("RGB"), dtype=np.uint8)
85
+
86
+
87
def normalize_style_name(style: Optional[str]) -> str:
    """Return ``style`` stripped and lowercased; ``None`` or empty becomes ``""``."""
    return (style or "").strip().lower()
89
+
90
+
91
def normalize_gender_name(gender: Optional[str]) -> str:
    """Return ``gender`` stripped and lowercased; ``None`` or empty becomes ``""``."""
    return (gender or "").strip().lower()
93
+
94
+
95
def safe_load_bundle(path_or_bundle: Any) -> Optional[Dict[str, Any]]:
    """Resolve a conditioning input to an in-memory bundle.

    Args:
        path_or_bundle: ``None``, an already-loaded dict, or a path to a
            ``.pt``/``.pth`` file (loaded with ``torch.load``) or a ``.mat``
            file (loaded with ``scipy.io.loadmat``).

    Returns:
        ``None`` for ``None`` input, the dict itself for dict input, otherwise
        whatever the loader returns for the file.

    Raises:
        TypeError: for any other type, or a path with an unsupported suffix.
    """
    if path_or_bundle is None:
        return None
    if isinstance(path_or_bundle, dict):
        return path_or_bundle
    if isinstance(path_or_bundle, (str, os.PathLike)):
        p = Path(path_or_bundle)
        ext = p.suffix.lower()
        if ext in {".pt", ".pth"}:
            # SECURITY: weights_only=False lets torch.load unpickle arbitrary
            # objects, which can execute code. Only pass trusted files here.
            return torch.load(str(p), map_location="cpu", weights_only=False)
        if ext == ".mat":
            return loadmat(str(p))
    raise TypeError("Conditioning input must be None, a dict, or a .pt/.pth/.mat path")
108
# NOTE(review): this module-level function takes ``self`` and has the same
# body as SadTalkerRunner._resolve_checkpoint; it looks like a stray copy.
# It is kept so that nothing importing it breaks -- confirm and remove.
def _resolve_checkpoint(self):
    """Return the first known SadTalker checkpoint found in ``self.checkpoint_path``.

    ``self.checkpoint_path`` is treated as a directory. Candidates are tried in
    order: 512 safetensors, 256 safetensors, 512 pth, 256 pth.

    Raises:
        FileNotFoundError: if none of the candidate files exists.
    """
    candidates = [
        "SadTalker_V0.0.2_512.safetensors",
        "SadTalker_V0.0.2_256.safetensors",
        "SadTalker_V0.0.2_512.pth",
        "SadTalker_V0.0.2_256.pth",
    ]

    for name in candidates:
        p = Path(self.checkpoint_path) / name
        if p.exists():
            return str(p)

    raise FileNotFoundError(
        f"No SadTalker checkpoint found in {self.checkpoint_path}"
    )
124
+
125
def composite_alpha_to_rgb(image_path: Path, bg_rgb=(255, 255, 255)) -> Path:
    """Flatten an image onto a solid background and save it as an RGB PNG.

    The image is converted to RGBA, alpha-composited over an opaque ``bg_rgb``
    background and written next to the input as ``<stem>_rgb.png``.

    Returns:
        Path of the newly written PNG.
    """
    with Image.open(image_path) as im:
        im = im.convert("RGBA")
        bg = Image.new("RGBA", im.size, (*bg_rgb, 255))
        out = Image.alpha_composite(bg, im).convert("RGB")

    out_path = image_path.with_name(f"{image_path.stem}_rgb.png")
    out.save(out_path)
    return out_path
135
+
136
+
137
def prepare_image_for_sadtalker(image_path: Path, remove_background_result: Optional[Path] = None) -> Path:
    """Return a path to an image without transparency.

    When ``remove_background_result`` is given, that image is always flattened
    to RGB. Otherwise ``image_path`` is flattened only if it carries alpha
    (mode ``RGBA``/``LA`` or a ``transparency`` entry in its info); images
    without alpha are returned unchanged.
    """
    if remove_background_result is None:
        with Image.open(image_path) as im:
            if im.mode in {"RGBA", "LA"} or ("transparency" in im.info):
                return composite_alpha_to_rgb(image_path)
        return image_path
    return composite_alpha_to_rgb(remove_background_result)
144
+
145
+
146
+ # ============================================================
147
+ # ARCHIVE EXTRACTION
148
+ # ============================================================
149
+
150
@dataclass
class MountedArchive:
    """Record describing a zip payload that has been extracted to disk."""

    # Name stored in (or used for) the marker file.
    name: str
    # SHA-256 of the zip payload the directory was extracted from.
    zip_sha256: str
    # Directory the archive was extracted into.
    target_dir: Path
    # JSON marker file recording that the extraction completed.
    marker_path: Path
156
+
157
+
158
def extract_zip_bytes_to_dir(zip_bytes: bytes, dest_dir: Path) -> None:
    """Extract an in-memory zip archive into ``dest_dir``, creating it if needed."""
    ensure_dir(dest_dir)
    with zipfile.ZipFile(io.BytesIO(zip_bytes), "r") as zf:
        zf.extractall(dest_dir)
162
+
163
+
164
def mount_zip_payload(zip_bytes: bytes, zip_sha256: str, target_dir: Path, marker_name: str) -> MountedArchive:
    """Extract a zip payload into ``target_dir`` unless it is already mounted.

    A JSON marker file ``target_dir / marker_name`` records the hash of the
    payload that was extracted. If a marker with the same ``zip_sha256`` and
    ``"mounted": true`` exists, extraction is skipped. Otherwise the directory
    is emptied, the payload is extracted and a fresh marker is written.

    Returns:
        A ``MountedArchive`` describing the mounted directory.
    """
    ensure_dir(target_dir)
    marker_path = target_dir / marker_name

    if marker_path.exists():
        try:
            existing = json.loads(marker_path.read_text(encoding="utf-8"))
            if existing.get("zip_sha256") == zip_sha256 and existing.get("mounted") is True:
                return MountedArchive(
                    name=existing.get("name", marker_name),
                    zip_sha256=zip_sha256,
                    target_dir=target_dir,
                    marker_path=marker_path,
                )
        except (OSError, ValueError, AttributeError):
            # Unreadable, non-JSON or non-dict marker: treat as "not mounted".
            pass

    # Clear any stale contents before extracting. The old marker is removed
    # too: if extraction fails part-way, no marker may claim that the
    # (now deleted) previous payload is still mounted.
    for child in list(target_dir.iterdir()):
        if child.is_dir():
            shutil.rmtree(child, ignore_errors=True)
        else:
            try:
                child.unlink()
            except OSError:
                # Best-effort cleanup; extraction below may still succeed.
                pass

    extract_zip_bytes_to_dir(zip_bytes, target_dir)
    # The marker is written last so that it only exists after a complete
    # extraction.
    marker_path.write_text(
        json.dumps(
            {
                "mounted": True,
                "zip_sha256": zip_sha256,
                "name": marker_name,
                "created_at": utc_now_iso(),
            },
            indent=2,
        ),
        encoding="utf-8",
    )
    return MountedArchive(
        name=marker_name,
        zip_sha256=zip_sha256,
        target_dir=target_dir,
        marker_path=marker_path,
    )
212
+
213
+
214
+ # ============================================================
215
+ # AVATAR BANK RUNTIME
216
+ # ============================================================
217
+
218
class AvatarBankRuntime:
    """In-memory view of an avatar bank payload.

    The payload is a dict with three optional sub-dicts keyed by avatar id:
    ``index`` (metadata such as gender/style), ``embeddings`` (3DMM
    coefficients and crop data) and ``previews`` (encoded preview images).
    """

    def __init__(self, payload: Dict[str, Any], defaults: Optional[Dict[str, Any]] = None):
        """Store the payload sections; missing or ``None`` sections become empty dicts."""
        self.index: Dict[str, Dict[str, Any]] = payload.get("index", {}) or {}
        self.embeddings: Dict[str, Dict[str, Any]] = payload.get("embeddings", {}) or {}
        self.previews: Dict[str, Any] = payload.get("previews", {}) or {}
        self.defaults = defaults or {}

    @classmethod
    def load(cls, path: Path, defaults: Optional[Dict[str, Any]] = None) -> "AvatarBankRuntime":
        """Load a bank from a ``torch.save``-d file.

        Raises:
            ValueError: if the file does not contain a dict.
        """
        # SECURITY: weights_only=False unpickles arbitrary objects; only load
        # bank files from trusted sources.
        payload = torch.load(str(path), map_location="cpu", weights_only=False)
        if not isinstance(payload, dict):
            raise ValueError(f"Avatar bank file did not contain a dictionary: {path}")
        return cls(payload, defaults=defaults)

    def available_ids(self) -> List[str]:
        """Return the avatar ids present in the index, in index order."""
        return list(self.index.keys())

    def _preview_to_numpy(self, avatar_id: str) -> Optional[np.ndarray]:
        """Decode the stored preview for ``avatar_id`` to an RGB array, or ``None``."""
        blob = self.previews.get(avatar_id)
        if blob is None:
            return None
        try:
            img = decode_png_or_zstd_image(blob)
            return pil_to_numpy_rgb(img)
        except Exception:
            # Previews are optional; an undecodable one is treated as missing.
            return None

    def _style_is_real(self, style: Optional[str]) -> bool:
        """Return True if ``style`` normalizes to one of ``REAL_STYLE_ALIASES``."""
        return normalize_style_name(style) in REAL_STYLE_ALIASES

    def resolve_default_avatar_id(self) -> str:
        """Pick the avatar id to use when the caller did not choose one.

        Preference order: the configured ``default_avatar`` (if present in the
        index), first real-style male, first real-style avatar, first male,
        first id with a non-``None`` embedding, first index entry.

        Raises:
            RuntimeError: if the index is empty.
        """
        if not self.index:
            raise RuntimeError("Avatar bank is empty.")

        default_avatar_id = self.defaults.get("default_avatar")
        if default_avatar_id and default_avatar_id in self.index:
            return default_avatar_id

        # Prefer first real male.
        for avatar_id, meta in self.index.items():
            if normalize_gender_name(meta.get("gender")) == "male" and self._style_is_real(meta.get("style")):
                return avatar_id

        # Then any real-style avatar.
        for avatar_id, meta in self.index.items():
            if self._style_is_real(meta.get("style")):
                return avatar_id

        # Then any male avatar.
        for avatar_id, meta in self.index.items():
            if normalize_gender_name(meta.get("gender")) == "male":
                return avatar_id

        # Then any complete avatar.
        # NOTE(review): this iterates embeddings, so it can return an id that
        # is not in the index -- confirm that is intended.
        for avatar_id, emb in self.embeddings.items():
            if emb is not None:
                return avatar_id

        # Finally first available entry.
        return next(iter(self.index.keys()))

    def build_avatar_condition(self, avatar_id: str) -> Dict[str, Any]:
        """Assemble the conditioning dict for ``avatar_id``.

        ``coeff_3dmm`` is taken from ``motion_3dmm`` and falls back to
        ``full_3dmm``. ``crop_preview`` comes from the embedding if present,
        otherwise from the decoded stored preview (which may be ``None``).
        Tensor values are detached and moved to CPU.

        Raises:
            KeyError: if ``avatar_id`` has no embeddings entry.
            ValueError: if neither ``motion_3dmm`` nor ``full_3dmm`` is set.
        """
        if avatar_id not in self.embeddings:
            raise KeyError(f"Avatar not found: {avatar_id}")

        meta = self.index.get(avatar_id, {}) or {}
        emb = self.embeddings[avatar_id] or {}

        coeff = emb.get("motion_3dmm")
        if coeff is None:
            coeff = emb.get("full_3dmm")
        if coeff is None:
            raise ValueError(f"Avatar '{avatar_id}' is missing motion_3dmm/full_3dmm")

        crop_preview = emb.get("crop_preview")
        if crop_preview is None:
            crop_preview = self._preview_to_numpy(avatar_id)
        elif torch.is_tensor(crop_preview):
            crop_preview = crop_preview.detach().cpu()
        elif not isinstance(crop_preview, np.ndarray):
            # Lists and other array-likes are normalized to numpy.
            crop_preview = np.asarray(crop_preview)

        out = {
            "avatar_id": avatar_id,
            "gender": meta.get("gender"),
            "style": meta.get("style"),
            "coeff_3dmm": coeff.detach().cpu() if torch.is_tensor(coeff) else coeff,
            "motion_3dmm": emb.get("motion_3dmm"),
            "full_3dmm": emb.get("full_3dmm"),
            "crop_info": emb.get("crop_info"),
            "crop_preview": crop_preview,
        }
        if torch.is_tensor(out["motion_3dmm"]):
            out["motion_3dmm"] = out["motion_3dmm"].detach().cpu()
        if torch.is_tensor(out["full_3dmm"]):
            out["full_3dmm"] = out["full_3dmm"].detach().cpu()
        return out
318
+
319
+
320
+ # ============================================================
321
+ # BRIA RMBG BACKGROUND REMOVER (BEST-EFFORT)
322
+ # ============================================================
323
+
324
+ class BriaBackgroundRemover:
325
+ """
326
+ Best-effort loader for the packed briaaiRMBG-2.0 directory.
327
+
328
+ It searches for a likely inference script and tries callable or CLI-based
329
+ execution patterns. If the local folder layout differs, the search list
330
+ below is the only part that usually needs adjustment.
331
+ """
332
+
333
+ def __init__(self, root: Path):
334
+ self.root = root
335
+ self.entrypoint = self._discover_entrypoint()
336
+
337
+ def _discover_entrypoint(self) -> Optional[Path]:
338
+ if not self.root.exists():
339
+ return None
340
+
341
+ preferred = [
342
+ "inference.py",
343
+ "predict.py",
344
+ "app.py",
345
+ "main.py",
346
+ "run.py",
347
+ ]
348
+ for name in preferred:
349
+ hits = list(self.root.rglob(name))
350
+ if hits:
351
+ return hits[0]
352
+
353
+ # Fall back to any Python file with a likely folder name.
354
+ for p in self.root.rglob("*.py"):
355
+ lower = str(p).lower()
356
+ if "bria" in lower or "rmbg" in lower or "background" in lower:
357
+ return p
358
+ return None
359
+
360
+ def _import_module_from_path(self, py_file: Path):
361
+ module_name = f"packed_bria_{sha256_bytes(str(py_file).encode('utf-8'))[:12]}"
362
+ spec = importlib.util.spec_from_file_location(module_name, str(py_file))
363
+ if spec is None or spec.loader is None:
364
+ raise RuntimeError(f"Could not import module from {py_file}")
365
+ module = importlib.util.module_from_spec(spec)
366
+ spec.loader.exec_module(module)
367
+ return module
368
+
369
+ def _call_module_callable(self, module, image_path: Path, output_path: Path) -> bool:
370
+ candidates = [
371
+ "remove_background",
372
+ "predict_image",
373
+ "predict",
374
+ "run",
375
+ "inference",
376
+ "main",
377
+ ]
378
+ callables = [getattr(module, name, None) for name in candidates]
379
+ callables = [fn for fn in callables if callable(fn)]
380
+ for fn in callables:
381
+ attempts = [
382
+ (str(image_path), str(output_path)),
383
+ (str(image_path),),
384
+ (Image.open(image_path),),
385
+ (),
386
+ ]
387
+ for args in attempts:
388
+ try:
389
+ result = fn(*args)
390
+ if isinstance(result, (str, os.PathLike)):
391
+ result_path = Path(result)
392
+ if result_path.exists():
393
+ shutil.copy2(result_path, output_path)
394
+ return True
395
+ elif isinstance(result, Image.Image):
396
+ result.save(output_path)
397
+ return True
398
+ elif torch.is_tensor(result):
399
+ arr = result.detach().cpu().numpy()
400
+ if arr.ndim == 3 and arr.shape[-1] in (3, 4):
401
+ img = Image.fromarray(arr.astype(np.uint8))
402
+ img.save(output_path)
403
+ return True
404
+ elif result is None and output_path.exists():
405
+ return True
406
+ except Exception:
407
+ continue
408
+ return False
409
+
410
+ def _call_cli_with_patterns(self, image_path: Path, output_path: Path) -> bool:
411
+ if self.entrypoint is None:
412
+ return False
413
+
414
+ cmd_patterns = [
415
+ [str(self.entrypoint), str(image_path), str(output_path)],
416
+ [str(self.entrypoint), "--input", str(image_path), "--output", str(output_path)],
417
+ [str(self.entrypoint), "--image", str(image_path), "--output", str(output_path)],
418
+ [str(self.entrypoint), "--input_path", str(image_path), "--output_path", str(output_path)],
419
+ [str(self.entrypoint), "-i", str(image_path), "-o", str(output_path)],
420
+ ]
421
+
422
+ for args in cmd_patterns:
423
+ try:
424
+ proc = subprocess.run(
425
+ [sys.executable, *args],
426
+ cwd=str(self.root),
427
+ stdout=subprocess.DEVNULL,
428
+ stderr=subprocess.DEVNULL,
429
+ check=False,
430
+ )
431
+ if proc.returncode == 0 and output_path.exists():
432
+ return True
433
+ except Exception:
434
+ continue
435
+ return False
436
+
437
+ def remove_background(self, image_path: Path, output_dir: Path) -> Path:
438
+ if self.entrypoint is None:
439
+ raise RuntimeError(
440
+ f"No usable background-removal entrypoint found under {self.root}."
441
+ )
442
+
443
+ ensure_dir(output_dir)
444
+ output_path = output_dir / f"{image_path.stem}_rmbg.png"
445
+
446
+ try:
447
+ module = self._import_module_from_path(self.entrypoint)
448
+ if self._call_module_callable(module, image_path, output_path):
449
+ return output_path
450
+ except Exception:
451
+ pass
452
+
453
+ if self._call_cli_with_patterns(image_path, output_path):
454
+ return output_path
455
+
456
+ raise RuntimeError(
457
+ f"Could not execute background removal with entrypoint {self.entrypoint}. "
458
+ f"You may need to adjust the call patterns in BriaBackgroundRemover."
459
+ )
460
+
461
+
462
+ # ============================================================
463
+ # SADTALKER CORE RUNTIME
464
+ # ============================================================
465
+
466
+ class SadTalkerRunner:
467
def __init__(self, checkpoint_path: str, config_path: str, device: str = "cpu"):
    """Store the runtime configuration and import the SadTalker modules.

    Args:
        checkpoint_path: directory searched for SadTalker checkpoint files
            (see ``_resolve_checkpoint``).
        config_path: passed through to SadTalker's ``init_path``.
        device: device string handed to the SadTalker model constructors.
    """
    self.checkpoint_path = checkpoint_path
    self.config_path = config_path
    self.device = device
    # Must be set before _load_modules(), which reads it as its guard.
    self._mods_loaded = False
    self._load_modules()
473
+
474
def _load_modules(self):
    """Import the SadTalker components once and keep references on ``self``.

    The imports are done here rather than at module import time, so the
    ``SadTalker`` package only has to be importable when a runner is created.
    Subsequent calls are no-ops.
    """
    if self._mods_loaded:
        return

    from SadTalker.src.facerender.pirender_animate import AnimateFromCoeff_PIRender
    from SadTalker.src.utils.preprocess import CropAndExtract
    from SadTalker.src.test_audio2coeff import Audio2Coeff
    from SadTalker.src.facerender.animate import AnimateFromCoeff
    from SadTalker.src.generate_batch import get_data
    from SadTalker.src.generate_facerender_batch import get_facerender_data
    from SadTalker.src.utils.init_path import init_path

    self.AnimateFromCoeff_PIRender = AnimateFromCoeff_PIRender
    self.CropAndExtract = CropAndExtract
    self.Audio2Coeff = Audio2Coeff
    self.AnimateFromCoeff = AnimateFromCoeff
    self.get_data = get_data
    self.get_facerender_data = get_facerender_data
    self.init_path = init_path
    self._mods_loaded = True
494
+
495
@staticmethod
def _mp3_to_wav(mp3_filename: str, wav_filename: str, frame_rate: int):
    """Re-encode an audio file as WAV at ``frame_rate`` Hz.

    The source is opened with ``AudioSegment.from_file``, so its format is
    detected by pydub rather than assumed from the function name.
    """
    mp3_file = AudioSegment.from_file(file=mp3_filename)
    mp3_file.set_frame_rate(frame_rate).export(wav_filename, format="wav")
499
+
500
def _to_numpy(self, x):
    """Convert ``x`` to a numpy array.

    ``None`` stays ``None``, arrays are returned unchanged, tensors are
    detached and moved to CPU first, anything else goes through
    ``np.asarray``.
    """
    if x is None:
        return None
    if isinstance(x, np.ndarray):
        return x
    if torch.is_tensor(x):
        return x.detach().cpu().numpy()
    return np.asarray(x)
508
+
509
def _save_png_from_bundle(self, bundle, out_path):
    """Write the first usable image found in ``bundle`` to ``out_path`` as RGB.

    Keys are tried in order: ``crop_preview``, ``aligned_face``, ``image``,
    ``png``. A value is usable if, as an array, it is 2-D (grayscale) or 3-D
    with 1, 3 or 4 channels in the last axis. Non-``uint8`` data is clipped to
    ``[0, 255]`` and cast.

    Returns:
        ``out_path``.

    Raises:
        ValueError: if no key holds an image-like array.
    """
    for key in ("crop_preview", "aligned_face", "image", "png"):
        if key not in bundle or bundle[key] is None:
            continue
        arr = self._to_numpy(bundle[key])
        if arr.ndim == 3 and arr.shape[-1] == 1:
            # PIL has no (H, W, 1) layout; drop the channel axis so the data
            # is handled as plain grayscale.
            arr = arr[..., 0]
        is_gray = arr.ndim == 2
        is_color = arr.ndim == 3 and arr.shape[-1] in (3, 4)
        if not (is_gray or is_color):
            continue
        if arr.dtype != np.uint8:
            # NOTE(review): floats are assumed to be in 0..255 already, not
            # 0..1 -- confirm against the producers of these bundles.
            arr = np.clip(arr, 0, 255).astype(np.uint8)
        # The mode (L / RGB / RGBA) is inferred from the array shape;
        # convert() then normalizes every case to RGB.
        img = Image.fromarray(arr).convert("RGB")
        img.save(out_path)
        return out_path
    raise ValueError(
        "Avatar conditioning bundle needs at least one image-like field such as crop_preview or aligned_face."
    )
525
+
526
def _save_mat_from_avatar_bundle(self, bundle, out_path):
    """Write an avatar bundle's 3DMM coefficients to a ``.mat`` file.

    The saved ``coeff_3dmm`` is the first non-``None`` value among the bundle
    keys ``coeff_3dmm``, ``motion_3dmm`` and ``full_3dmm``. ``full_3dmm`` is
    additionally saved under its own name when present.

    Returns:
        ``out_path``.

    Raises:
        ValueError: if none of the three keys holds a value.
    """
    coeff_3dmm = None
    for key in ("coeff_3dmm", "motion_3dmm", "full_3dmm"):
        coeff_3dmm = bundle.get(key, None)
        if coeff_3dmm is not None:
            break
    if coeff_3dmm is None:
        raise ValueError("Avatar bundle must contain coeff_3dmm, motion_3dmm, or full_3dmm.")

    mat_dict = {"coeff_3dmm": self._to_numpy(coeff_3dmm)}
    full_3dmm = bundle.get("full_3dmm", None)
    if full_3dmm is not None:
        mat_dict["full_3dmm"] = self._to_numpy(full_3dmm)

    savemat(out_path, mat_dict)
    return out_path
542
+
543
def _save_mat_from_motion_bundle(self, bundle, out_path):
    """Write a motion bundle's 3DMM coefficients to a ``.mat`` file.

    The saved ``coeff_3dmm`` is the first non-``None`` value among
    ``motion_3dmm``, ``coeff_3dmm``, ``full_3dmm_seq`` and ``full_3dmm``.
    ``full_3dmm`` is saved too: taken from the bundle's ``full_3dmm`` if set,
    otherwise derived from ``full_3dmm_seq`` (its first element when the
    sequence has three or more dimensions, else the sequence itself).

    Returns:
        ``out_path``.

    Raises:
        ValueError: if none of the four keys holds a value.
    """
    motion = None
    for key in ("motion_3dmm", "coeff_3dmm", "full_3dmm_seq", "full_3dmm"):
        motion = bundle.get(key, None)
        if motion is not None:
            break

    if motion is None:
        raise ValueError(
            "Motion bundle must contain motion_3dmm, coeff_3dmm, full_3dmm_seq, or full_3dmm."
        )

    mat_dict = {"coeff_3dmm": self._to_numpy(motion)}

    if "full_3dmm" in bundle and bundle["full_3dmm"] is not None:
        mat_dict["full_3dmm"] = self._to_numpy(bundle["full_3dmm"])
    elif "full_3dmm_seq" in bundle and bundle["full_3dmm_seq"] is not None:
        seq = self._to_numpy(bundle["full_3dmm_seq"])
        # With >= 3 dims the leading axis is indexed and only its first
        # element is kept as the single full_3dmm entry.
        mat_dict["full_3dmm"] = seq[0] if seq.ndim >= 3 else seq

    savemat(out_path, mat_dict)
    return out_path
570
+
571
+
572
def _bundle_from_preprocess_output(
    self,
    coeff_path,
    crop_pic_path,
    crop_info,
):
    """Build a conditioning bundle from SadTalker preprocessing results.

    Args:
        coeff_path: path of the ``.mat`` coefficient file, or ``None``.
        crop_pic_path: path of the cropped image, or ``None``.
        crop_info: crop metadata, stored as-is when not ``None``.

    Returns:
        A dict holding the ``.mat`` contents (minus ``__``-prefixed header
        entries), the given paths/crop info, an in-memory ``crop_preview``
        when the crop image can be read, and ``coeff_3dmm`` / ``motion_3dmm``
        / ``full_3dmm`` aliases filled in from each other where missing.
    """
    bundle = {}

    # Load whatever the SadTalker preprocessing wrote to disk.
    if coeff_path is not None and os.path.isfile(coeff_path):
        try:
            raw = loadmat(coeff_path)
            for key, value in raw.items():
                if not key.startswith("__"):
                    bundle[key] = value
        except Exception:
            # Best-effort: an unreadable .mat leaves only the path entries.
            pass

    # Preserve the paths used to generate the bundle.
    if coeff_path is not None:
        bundle["coeff_path"] = str(coeff_path)
    if crop_pic_path is not None:
        bundle["crop_pic_path"] = str(crop_pic_path)
    if crop_info is not None:
        bundle["crop_info"] = crop_info

    # Keep a usable preview in memory.
    try:
        if crop_pic_path is not None and os.path.isfile(crop_pic_path):
            with Image.open(crop_pic_path) as im:
                bundle["crop_preview"] = pil_to_numpy_rgb(im)
    except Exception:
        # Best-effort: the preview is optional.
        pass

    # Normalize common aliases so downstream code can rely on them.
    if "coeff_3dmm" in bundle and "motion_3dmm" not in bundle:
        bundle["motion_3dmm"] = bundle["coeff_3dmm"]
    if "motion_3dmm" in bundle and "coeff_3dmm" not in bundle:
        bundle["coeff_3dmm"] = bundle["motion_3dmm"]

    if "full_3dmm" not in bundle:
        if "full_3dmm_seq" in bundle:
            seq = bundle["full_3dmm_seq"]
            try:
                # For >= 3 dims keep only the first element of the sequence.
                if hasattr(seq, "ndim") and seq.ndim >= 3:
                    bundle["full_3dmm"] = seq[0]
                else:
                    bundle["full_3dmm"] = seq
            except Exception:
                bundle["full_3dmm"] = seq
        elif "motion_3dmm" in bundle:
            bundle["full_3dmm"] = bundle["motion_3dmm"]

    return bundle
629
+
630
def extract_embeddings(
    self,
    input_path,
    crop_or_resize: str = "crop",
    pic_size: int = 256,
    save_dir: Optional[str] = None,
):
    """
    Public preprocessing helper.

    Accepts either a source image or a reference video, runs the packed
    SadTalker preprocessing, and returns the extracted conditioning bundle.

    Args:
        input_path: image or video file. Files with a known video suffix are
            processed as reference videos, everything else as a source image.
        crop_or_resize: preprocess mode passed to SadTalker.
        pic_size: model size; also passed to the image preprocessing call.
        save_dir: working directory; a temporary one is created when ``None``.

    Raises:
        FileNotFoundError: if ``input_path`` does not exist.
    """
    # Validate the input before the (expensive) model construction below.
    input_path = Path(input_path)
    if not input_path.exists():
        raise FileNotFoundError(str(input_path))

    self._load_modules()
    self._ensure_models(size=pic_size, preprocess=crop_or_resize, facerender="facevid2vid")

    if save_dir is None:
        save_dir = tempfile.mkdtemp(prefix="packedavatar_embeddings_")
    else:
        ensure_dir(Path(save_dir))

    work_dir = Path(save_dir)
    input_ext = input_path.suffix.lower()
    video_exts = {".mp4", ".mov", ".avi", ".mkv", ".webm", ".flv", ".wmv", ".m4v", ".gif"}

    if input_ext in video_exts:
        frame_dir = work_dir / f"{input_path.stem}_frames"
        ensure_dir(frame_dir)
        coeff_path, crop_pic_path, crop_info = self.preprocess_model.generate(
            str(input_path),
            str(frame_dir),
            crop_or_resize,
            source_image_flag=False,
        )
    else:
        staged = work_dir / input_path.name
        # When save_dir is the input's own directory the staged path is the
        # input itself; copying would raise shutil.SameFileError.
        if staged.resolve() != input_path.resolve():
            shutil.copy2(input_path, staged)

        first_frame_dir = work_dir / "first_frame_dir"
        ensure_dir(first_frame_dir)
        coeff_path, crop_pic_path, crop_info = self.preprocess_model.generate(
            str(staged),
            str(first_frame_dir),
            crop_or_resize,
            True,
            pic_size,
        )

    return self._bundle_from_preprocess_output(coeff_path, crop_pic_path, crop_info)
683
+
684
def ExtractEmbeddings(
    self,
    input_path,
    crop_or_resize: str = "crop",
    pic_size: int = 256,
    save_dir: Optional[str] = None,
):
    """CamelCase alias that forwards all arguments to ``extract_embeddings``."""
    return self.extract_embeddings(
        input_path=input_path,
        crop_or_resize=crop_or_resize,
        pic_size=pic_size,
        save_dir=save_dir,
    )
697
+
698
def _materialize_avatar_condition(self, avatar_condition, save_dir):
    """Turn an avatar condition into on-disk files.

    ``avatar_condition`` is resolved with ``safe_load_bundle``. Existing
    ``coeff_path`` / ``crop_pic_path`` files referenced by the bundle are
    reused; otherwise ``avatar_condition.mat`` / ``avatar_condition.png`` are
    written into ``save_dir`` from the bundle's data.

    Returns:
        ``(coeff_path, crop_pic_path, crop_info)``, or ``(None, None, None)``
        when the condition resolves to ``None``.
    """
    bundle = safe_load_bundle(avatar_condition)
    if bundle is None:
        return None, None, None

    coeff_path = bundle.get("coeff_path", None)
    crop_pic_path = bundle.get("crop_pic_path", None)
    crop_info = bundle.get("crop_info", None)

    if coeff_path is None or not os.path.isfile(coeff_path):
        coeff_path = os.path.join(save_dir, "avatar_condition.mat")
        self._save_mat_from_avatar_bundle(bundle, coeff_path)

    if crop_pic_path is None or not os.path.isfile(crop_pic_path):
        crop_pic_path = os.path.join(save_dir, "avatar_condition.png")
        self._save_png_from_bundle(bundle, crop_pic_path)

    return coeff_path, crop_pic_path, crop_info
716
+
717
def _materialize_motion_condition(self, motion_condition, save_dir):
    """Turn a motion condition into a coefficient ``.mat`` path.

    ``motion_condition`` is resolved with ``safe_load_bundle``. An existing
    ``coeff_path`` file referenced by the bundle is returned as-is; otherwise
    ``motion_condition.mat`` is written into ``save_dir``.

    Returns:
        The coefficient file path, or ``None`` when the condition resolves to
        ``None``.
    """
    bundle = safe_load_bundle(motion_condition)
    if bundle is None:
        return None

    coeff_path = bundle.get("coeff_path", None)
    if coeff_path is not None and os.path.isfile(coeff_path):
        return coeff_path

    coeff_path = os.path.join(save_dir, "motion_condition.mat")
    self._save_mat_from_motion_bundle(bundle, coeff_path)
    return coeff_path
729
+
730
def _resolve_checkpoint(self):
    """Return the first known SadTalker checkpoint found in ``self.checkpoint_path``.

    ``self.checkpoint_path`` is treated as a directory. Candidates are tried in
    order: 512 safetensors, 256 safetensors, 512 pth, 256 pth.

    Raises:
        FileNotFoundError: if none of the candidate files exists.
    """
    candidates = [
        "SadTalker_V0.0.2_512.safetensors",
        "SadTalker_V0.0.2_256.safetensors",
        "SadTalker_V0.0.2_512.pth",
        "SadTalker_V0.0.2_256.pth",
    ]

    for name in candidates:
        p = Path(self.checkpoint_path) / name
        if p.exists():
            return str(p)

    raise FileNotFoundError(
        f"No SadTalker checkpoint found in {self.checkpoint_path}"
    )
746
+
747
def _ensure_models(self, size: int, preprocess: str, facerender: str):
    """Resolve model paths and (re)build the SadTalker model objects.

    Sets ``sadtalker_paths``, ``audio_to_coeff``, ``preprocess_model`` and
    ``animate_from_coeff`` on ``self``. The models are constructed on every
    call.

    Args:
        size: model size passed to ``init_path``.
        preprocess: preprocess mode passed to ``init_path``.
        facerender: ``"facevid2vid"`` selects ``AnimateFromCoeff`` (except on
            the ``"mps"`` device); any other value selects the PIRender
            animator.
    """
    self.sadtalker_paths = self.init_path(
        self.checkpoint_path,
        self.config_path,
        size,
        False,
        preprocess,
    )

    # override whatever init_path guessed
    self.sadtalker_paths["checkpoint"] = self._resolve_checkpoint()

    print("\n[PackedAvatar] Using checkpoint:")
    print(self.sadtalker_paths["checkpoint"])

    self.audio_to_coeff = self.Audio2Coeff(
        self.sadtalker_paths,
        self.device,
    )

    self.preprocess_model = self.CropAndExtract(
        self.sadtalker_paths,
        self.device,
    )

    if facerender == "facevid2vid" and self.device != "mps":
        self.animate_from_coeff = self.AnimateFromCoeff(
            self.sadtalker_paths,
            self.device,
        )
    else:
        # PIRender is used for every other facerender value and always on mps.
        self.animate_from_coeff = self.AnimateFromCoeff_PIRender(
            self.sadtalker_paths,
            self.device,
        )
782
+
783
+ def generate(
784
+ self,
785
+ source_image=None,
786
+ driven_audio=None,
787
+ preprocess="crop",
788
+ still_mode=False,
789
+ use_enhancer=False,
790
+ batch_size=1,
791
+ size=256,
792
+ pose_style=0,
793
+ facerender="facevid2vid",
794
+ exp_scale=1.0,
795
+ use_ref_video=False,
796
+ ref_video=None,
797
+ ref_info=None,
798
+ use_idle_mode=False,
799
+ length_of_audio=0,
800
+ use_blink=True,
801
+ result_dir="./results/",
802
+ avatar_condition=None,
803
+ motion_condition=None,
804
+ ):
805
+ self._load_modules()
806
+ self._ensure_models(size=size, preprocess=preprocess, facerender=facerender)
807
+
808
+ time_tag = str(uuid.uuid4())
809
+ save_dir = os.path.join(result_dir, time_tag)
810
+ os.makedirs(save_dir, exist_ok=True)
811
+
812
+ input_dir = os.path.join(save_dir, "input")
813
+ os.makedirs(input_dir, exist_ok=True)
814
+
815
+ # -----------------------------
816
+ # Audio handling
817
+ # -----------------------------
818
+ if driven_audio is not None and os.path.isfile(driven_audio):
819
+ audio_name = os.path.basename(driven_audio)
820
+ audio_path = os.path.join(input_dir, audio_name)
821
+
822
+ if audio_name.lower().endswith(".mp3"):
823
+ wav_path = os.path.splitext(audio_path)[0] + ".wav"
824
+ self._mp3_to_wav(driven_audio, wav_path, 16000)
825
+ audio_path = wav_path
826
+ else:
827
+ shutil.copy2(driven_audio, audio_path)
828
+
829
+ elif use_idle_mode:
830
+ audio_path = os.path.join(input_dir, f"idlemode_{str(length_of_audio)}.wav")
831
+ one_sec_segment = AudioSegment.silent(duration=1000 * length_of_audio)
832
+ one_sec_segment.export(audio_path, format="wav")
833
+ else:
834
+ assert use_ref_video is True and ref_info == "all", (
835
+ "Either driven_audio, use_idle_mode, or use_ref_video/ref_info='all' must be provided."
836
+ )
837
+
838
+ if use_ref_video and ref_info == "all" and ref_video is not None:
839
+ ref_video_videoname = os.path.basename(ref_video)
840
+ audio_path = os.path.join(save_dir, ref_video_videoname + ".wav")
841
+ cmd = f'ffmpeg -y -hide_banner -loglevel error -i "{ref_video}" "{audio_path}"'
842
+ os.system(cmd)
843
+
844
+ # -----------------------------
845
+ # Avatar / source conditioning
846
+ # -----------------------------
847
+ if avatar_condition is not None:
848
+ first_coeff_path, crop_pic_path, crop_info = self._materialize_avatar_condition(
849
+ avatar_condition, save_dir
850
+ )
851
+ if first_coeff_path is None:
852
+ raise AttributeError("Invalid avatar_condition bundle.")
853
+ pic_path = crop_pic_path
854
+ else:
855
+ if source_image is None:
856
+ raise ValueError("source_image is required when avatar_condition is not provided.")
857
+
858
+ pic_path = os.path.join(input_dir, os.path.basename(source_image))
859
+ shutil.copy2(source_image, pic_path)
860
+
861
+ first_frame_dir = os.path.join(save_dir, "first_frame_dir")
862
+ os.makedirs(first_frame_dir, exist_ok=True)
863
+
864
+ first_coeff_path, crop_pic_path, crop_info = self.preprocess_model.generate(
865
+ pic_path,
866
+ first_frame_dir,
867
+ preprocess,
868
+ True,
869
+ size,
870
+ )
871
+
872
+ if first_coeff_path is None:
873
+ raise AttributeError("No face is detected")
874
+
875
+ # -----------------------------
876
+ # Motion conditioning / reference video
877
+ # -----------------------------
878
+ if motion_condition is not None:
879
+ ref_video_coeff_path = self._materialize_motion_condition(motion_condition, save_dir)
880
+ ref_pose_coeff_path = ref_video_coeff_path
881
+ ref_eyeblink_coeff_path = ref_video_coeff_path
882
+ elif use_ref_video and ref_video is not None:
883
+ ref_video_videoname = os.path.splitext(os.path.split(ref_video)[-1])[0]
884
+ ref_video_frame_dir = os.path.join(save_dir, ref_video_videoname)
885
+ os.makedirs(ref_video_frame_dir, exist_ok=True)
886
+
887
+ print("3DMM Extraction for the reference video providing pose")
888
+ ref_video_coeff_path, _, _ = self.preprocess_model.generate(
889
+ ref_video,
890
+ ref_video_frame_dir,
891
+ preprocess,
892
+ source_image_flag=False,
893
+ )
894
+
895
+ if use_ref_video:
896
+ if ref_info == "pose":
897
+ ref_pose_coeff_path = ref_video_coeff_path
898
+ ref_eyeblink_coeff_path = None
899
+ elif ref_info == "blink":
900
+ ref_pose_coeff_path = None
901
+ ref_eyeblink_coeff_path = ref_video_coeff_path
902
+ elif ref_info == "pose+blink":
903
+ ref_pose_coeff_path = ref_video_coeff_path
904
+ ref_eyeblink_coeff_path = ref_video_coeff_path
905
+ elif ref_info == "all":
906
+ ref_pose_coeff_path = None
907
+ ref_eyeblink_coeff_path = None
908
+ else:
909
+ raise ValueError("error in ref_info")
910
+ else:
911
+ ref_pose_coeff_path = None
912
+ ref_eyeblink_coeff_path = None
913
+ else:
914
+ ref_video_coeff_path = None
915
+ ref_pose_coeff_path = None
916
+ ref_eyeblink_coeff_path = None
917
+
918
+ # -----------------------------
919
+ # Audio -> coeff
920
+ # -----------------------------
921
+ if use_ref_video and ref_info == "all" and ref_video_coeff_path is not None:
922
+ coeff_path = ref_video_coeff_path
923
+ else:
924
+ batch = self.get_data(
925
+ first_coeff_path,
926
+ audio_path,
927
+ self.device,
928
+ ref_eyeblink_coeff_path=ref_eyeblink_coeff_path,
929
+ still=still_mode,
930
+ idlemode=use_idle_mode,
931
+ length_of_audio=length_of_audio,
932
+ use_blink=use_blink,
933
+ )
934
+ coeff_path = self.audio_to_coeff.generate(
935
+ batch,
936
+ save_dir,
937
+ pose_style,
938
+ ref_pose_coeff_path,
939
+ )
940
+
941
+ # -----------------------------
942
+ # coeff -> video
943
+ # -----------------------------
944
+ data = self.get_facerender_data(
945
+ coeff_path,
946
+ crop_pic_path,
947
+ first_coeff_path,
948
+ audio_path,
949
+ batch_size,
950
+ still_mode=still_mode,
951
+ preprocess=preprocess,
952
+ size=size,
953
+ expression_scale=exp_scale,
954
+ facemodel=facerender,
955
+ )
956
+
957
+ return_path = self.animate_from_coeff.generate(
958
+ data,
959
+ save_dir,
960
+ crop_pic_path if avatar_condition is not None else pic_path,
961
+ crop_info,
962
+ enhancer="gfpgan" if use_enhancer else None,
963
+ preprocess=preprocess,
964
+ img_size=size,
965
+ )
966
+
967
+ video_name = data.get("video_name", "output")
968
+ print(f"The generated video is named {video_name} in {save_dir}")
969
+
970
+ del self.preprocess_model
971
+ del self.audio_to_coeff
972
+ del self.animate_from_coeff
973
+
974
+ if torch.cuda.is_available():
975
+ torch.cuda.empty_cache()
976
+ torch.cuda.synchronize()
977
+
978
+ import gc
979
+
980
+ gc.collect()
981
+
982
+ return return_path, audio_path, save_dir
983
+
984
+
985
+ # ============================================================
986
+ # PACKED AVATAR ORCHESTRATOR
987
+ # ============================================================
988
+
989
+ class PackedAvatar:
990
def __init__(
    self,
    packed_pt_path: Optional[str] = None,
    cache_dir: Optional[str] = None,
    device: Optional[str] = None,
):
    """Load the packed bundle, extract it into the cache, and wire up helpers.

    Args:
        packed_pt_path: Path to the packed ``.pt`` bundle. When falsy, falls
            back to ``checkpoints/PackedAvatar.pt`` next to this file.
        cache_dir: Directory that receives the extracted assets. Defaults to
            ``<system temp dir>/PackedAvatarCache``.
        device: Torch device string. When falsy, CUDA is preferred, then MPS
            (macOS only, and only if the backend reports itself available),
            then CPU.

    Raises:
        FileNotFoundError: If the bundle file does not exist.
    """
    default_bundle = Path(__file__).resolve().parent / "checkpoints" / "PackedAvatar.pt"
    self.packed_pt_path = Path(packed_pt_path or default_bundle)
    if not self.packed_pt_path.exists():
        raise FileNotFoundError(f"Packed bundle not found: {self.packed_pt_path}")

    if device:
        self.device = device
    elif torch.cuda.is_available():
        self.device = "cuda"
    else:
        # Being on macOS does not imply a usable MPS backend (Intel Macs,
        # CPU-only torch builds), so ask torch before selecting it.
        mps_backend = getattr(torch.backends, "mps", None)
        mps_usable = (
            platform.system() == "Darwin"
            and mps_backend is not None
            and mps_backend.is_available()
        )
        self.device = "mps" if mps_usable else "cpu"

    self.cache_dir = Path(cache_dir) if cache_dir else Path(tempfile.gettempdir()) / "PackedAvatarCache"
    ensure_dir(self.cache_dir)

    self.bundle = self._load_bundle(self.packed_pt_path)
    self.manifest = self.bundle.get("manifest", {}) or {}

    # Extraction must happen before anything reads from the extracted tree.
    self._extract_and_mount()
    self._mount_python_path()

    self.avatar_bank = self._load_avatar_bank()
    self.bria_root = self.extracted_root / "checkpoints" / "briaaiRMBG-2.0"
    self.background_remover = BriaBackgroundRemover(self.bria_root)
    # Runners are cached per (size, preprocess, facerender) key.
    self._runner_cache: Dict[Tuple[int, str, str], SadTalkerRunner] = {}
1017
+
1018
+ @staticmethod
1019
+ def _load_bundle(path: Path) -> Dict[str, Any]:
1020
+ bundle = torch.load(str(path), map_location="cpu", weights_only=False)
1021
+ if not isinstance(bundle, dict):
1022
+ raise ValueError("PackedAvatar.pt did not contain a dictionary bundle.")
1023
+ return bundle
1024
+
1025
+ def _asset_bytes(self, key: str) -> bytes:
1026
+ asset = self.bundle.get("assets", {}).get(key)
1027
+ if asset is None:
1028
+ raise KeyError(f"Missing asset in bundle: {key}")
1029
+ return tensor_to_bytes(asset)
1030
+
1031
def _bundle_id(self) -> str:
    """Derive a 16-character cache id from the two archive hashes in the manifest."""
    archives = self.manifest.get("archives", {})
    checkpoints_digest = archives.get("checkpoints_zip", {}).get("sha256", "")
    sadtalker_digest = archives.get("sadtalker_zip", {}).get("sha256", "")
    # Missing hashes fall back to empty strings, giving the seed ":".
    fingerprint = f"{checkpoints_digest}:{sadtalker_digest}".encode("utf-8")
    return sha256_bytes(fingerprint)[:16]
1036
+
1037
def _extract_and_mount(self) -> None:
    """Extract the bundled archives into the cache, reusing a valid extraction.

    Sets ``runtime_root``, ``extracted_root``, ``checkpoints_dir`` and
    ``sadtalker_dir``. An existing extraction is reused only when the
    ``mount.json`` marker matches the manifest hashes AND both extracted
    folders are still present. Otherwise the extraction folder is wiped and
    rebuilt.

    Raises:
        KeyError: If an archive asset is missing from the bundle.
        RuntimeError: If an expected folder is absent after extraction.
    """
    bundle_id = self._bundle_id()
    runtime_root = self.cache_dir / f"packedavatar_{bundle_id}"
    self.runtime_root = runtime_root
    self.extracted_root = runtime_root / "extracted"
    ensure_dir(self.extracted_root)

    self.checkpoints_dir = self.extracted_root / "checkpoints"
    self.sadtalker_dir = self.extracted_root / "SadTalker"

    marker = runtime_root / "mount.json"
    expected = {
        "bundle_id": bundle_id,
        "checkpoints_sha256": self.manifest.get("archives", {}).get("checkpoints_zip", {}).get("sha256"),
        "sadtalker_sha256": self.manifest.get("archives", {}).get("sadtalker_zip", {}).get("sha256"),
    }

    if marker.exists():
        try:
            existing = json.loads(marker.read_text(encoding="utf-8"))
        except (OSError, ValueError):
            # Unreadable or corrupt marker: treat the cache as stale.
            existing = None
        # A matching marker is not enough: the folders may have been removed
        # (e.g. by a temp-dir cleaner), so confirm they are still there.
        if existing == expected and self.checkpoints_dir.is_dir() and self.sadtalker_dir.is_dir():
            return

    # Drop the old marker first so an interrupted re-extraction is never
    # mistaken for a complete one on the next start.
    try:
        marker.unlink()
    except OSError:
        pass

    # Reset stale extraction if the bundle changed.
    if self.extracted_root.exists():
        for child in list(self.extracted_root.iterdir()):
            if child.is_dir():
                shutil.rmtree(child, ignore_errors=True)
            else:
                try:
                    child.unlink()
                except OSError:
                    # Best effort: a leftover file is overwritten or ignored.
                    pass

    checkpoints_zip = self._asset_bytes("checkpoints_zip")
    sadtalker_zip = self._asset_bytes("sadtalker_zip")

    # Extract both archives into the same extracted root.
    extract_zip_bytes_to_dir(checkpoints_zip, self.extracted_root)
    extract_zip_bytes_to_dir(sadtalker_zip, self.extracted_root)

    if not self.checkpoints_dir.exists():
        raise RuntimeError(f"checkpoints folder missing after extraction: {self.checkpoints_dir}")
    if not self.sadtalker_dir.exists():
        raise RuntimeError(f"SadTalker folder missing after extraction: {self.sadtalker_dir}")

    # Only record success once the extracted tree has been validated.
    marker.write_text(json.dumps(expected, indent=2), encoding="utf-8")
1088
+
1089
+ def _mount_python_path(self) -> None:
1090
+ extracted = str(self.extracted_root)
1091
+ if extracted not in sys.path:
1092
+ sys.path.insert(0, extracted)
1093
+
1094
def _load_avatar_bank(self) -> AvatarBankRuntime:
    """Load ``AvatarBank.pt`` from the extracted checkpoints folder.

    The manifest's ``defaults`` section supplies the default avatar id and the
    real-style aliases. Missing entries fall back to ``""`` and
    ``REAL_STYLE_ALIASES`` respectively.

    Raises:
        FileNotFoundError: If ``AvatarBank.pt`` is not in the checkpoints folder.
    """
    bank_file = self.checkpoints_dir / "AvatarBank.pt"
    if not bank_file.exists():
        raise FileNotFoundError(f"AvatarBank.pt not found inside packed checkpoints: {bank_file}")

    default_avatar = self.manifest.get("defaults", {}).get("default_avatar", "")
    style_aliases = self.manifest.get("defaults", {}).get("real_style_aliases", list(REAL_STYLE_ALIASES))
    return AvatarBankRuntime.load(
        bank_file,
        defaults={
            "default_avatar": default_avatar,
            "real_style_aliases": style_aliases,
        },
    )
1103
+
1104
def _get_runner(self, size: int, preprocess: str, facerender: str) -> SadTalkerRunner:
    """Return the cached runner for this configuration, creating it on first use.

    The cache key is ``(int(size), preprocess, facerender)``. The runner itself
    is constructed only from the checkpoint folder, config folder and device.
    """
    cache_key = (int(size), preprocess, facerender)
    cached = self._runner_cache.get(cache_key)
    if cached is not None:
        return cached

    fresh = SadTalkerRunner(
        checkpoint_path=str(self.checkpoints_dir),
        config_path=str(self.sadtalker_dir / "src" / "config"),
        device=self.device,
    )
    self._runner_cache[cache_key] = fresh
    return fresh
1115
+
1116
+
1117
def extract_embeddings(
    self,
    input_path: str,
    crop_or_resize: str = "crop",
    pic_size: int = 256,
    save_dir: Optional[str] = None,
) -> Dict[str, Any]:
    """
    Build a conditioning bundle from a source image or a reference video.

    The work is delegated to a ``facevid2vid`` runner obtained for the given
    size and preprocess mode. Its result is returned unchanged and is the same
    kind of bundle the runtime consumes for avatar and motion conditioning.
    """
    runner = self._get_runner(
        size=pic_size,
        preprocess=crop_or_resize,
        facerender="facevid2vid",
    )
    request = {
        "input_path": input_path,
        "crop_or_resize": crop_or_resize,
        "pic_size": pic_size,
        "save_dir": save_dir,
    }
    return runner.extract_embeddings(**request)
1137
+
1138
def ExtractEmbeddings(
    self,
    input_path: str,
    crop_or_resize: str = "crop",
    pic_size: int = 256,
    save_dir: Optional[str] = None,
) -> Dict[str, Any]:
    """Camel-case alias that forwards every argument to ``extract_embeddings``."""
    forwarded = dict(
        input_path=input_path,
        crop_or_resize=crop_or_resize,
        pic_size=pic_size,
        save_dir=save_dir,
    )
    return self.extract_embeddings(**forwarded)
1151
+
1152
+ def _resolve_avatar_condition_from_bank(self, avatar_id: Optional[str]) -> Dict[str, Any]:
1153
+ if avatar_id is None:
1154
+ avatar_id = self.avatar_bank.resolve_default_avatar_id()
1155
+ return self.avatar_bank.build_avatar_condition(avatar_id)
1156
+
1157
def _normalize_avatar_condition(self, avatar_condition: Any) -> Optional[Dict[str, Any]]:
    """Load an avatar condition and make sure it exposes a ``coeff_3dmm`` entry.

    Returns ``None`` when ``safe_load_bundle`` yields nothing. When the bundle
    lacks ``coeff_3dmm``, the first non-``None`` value among ``motion_3dmm``
    and ``full_3dmm`` (in that order) is stored under that key. The loaded
    bundle is modified in place.
    """
    loaded = safe_load_bundle(avatar_condition)
    if loaded is None:
        return None
    if "coeff_3dmm" in loaded:
        return loaded

    for fallback_key in ("motion_3dmm", "full_3dmm"):
        if fallback_key in loaded and loaded[fallback_key] is not None:
            loaded["coeff_3dmm"] = loaded[fallback_key]
            break
    return loaded
1167
+
1168
def _remove_background_if_requested(
    self,
    source_image: Optional[str],
    remove_background: bool,
    work_dir: Path,
) -> Optional[Path]:
    """Stage the source image in ``work_dir`` and optionally strip its background.

    Returns ``None`` when no source image is given. Otherwise returns whatever
    ``prepare_image_for_sadtalker`` produces for the staged copy, with the
    background-removed result passed along when removal is requested.

    Raises:
        FileNotFoundError: If ``source_image`` does not exist.
        RuntimeError: If background removal was requested and any step of it
            fails. The failure is not swallowed.
    """
    if source_image is None:
        return None

    original = Path(source_image)
    if not original.exists():
        raise FileNotFoundError(str(original))

    ensure_dir(work_dir)
    staged_copy = work_dir / original.name
    shutil.copy2(original, staged_copy)

    if remove_background:
        # Uses the Bria remover built from the packed checkpoints folder.
        try:
            cutout = self.background_remover.remove_background(staged_copy, work_dir)
            return prepare_image_for_sadtalker(staged_copy, cutout)
        except Exception as e:
            raise RuntimeError(
                f"remove_background=True was requested, but Bria RMBG execution failed: {e}"
            ) from e

    return prepare_image_for_sadtalker(staged_copy)
1196
+
1197
+ def _run_wav2lip_gan(
1198
+ self,
1199
+ face_video: str,
1200
+ audio_path: str,
1201
+ save_dir: str,
1202
+ wav2lip_repo: Optional[str] = None,
1203
+ ) -> str:
1204
+ wav2lip_checkpoint = self.checkpoints_dir / "wav2lip_gan.pth"
1205
+ if not wav2lip_checkpoint.is_file():
1206
+ raise FileNotFoundError(
1207
+ f"Could not find bundled Wav2Lip GAN checkpoint at: {wav2lip_checkpoint}"
1208
+ )
1209
+
1210
+ candidate_repos = []
1211
+ if wav2lip_repo:
1212
+ candidate_repos.append(Path(wav2lip_repo))
1213
+
1214
+ # Prefer packed locations first.
1215
+ candidate_repos.extend([
1216
+ self.checkpoints_dir / "Wav2Lip",
1217
+ self.sadtalker_dir / "Wav2Lip",
1218
+ Path(__file__).resolve().parent / "Wav2Lip",
1219
+ ])
1220
+
1221
+ repo = None
1222
+ for candidate in candidate_repos:
1223
+ if candidate is None:
1224
+ continue
1225
+ inference_py = candidate / "inference.py"
1226
+ if inference_py.is_file():
1227
+ repo = candidate
1228
+ break
1229
+
1230
+ # No error just because wav2lip_repo was not passed.
1231
+ # If we cannot find runnable Wav2Lip code anywhere, fall back gracefully.
1232
+ if repo is None:
1233
+ print(
1234
+ "[PackedAvatar] Wav2Lip inference code was not found; "
1235
+ "skipping Wav2Lip post-processing and returning the SadTalker video."
1236
+ )
1237
+ return face_video
1238
+
1239
+ inference_py = repo / "inference.py"
1240
+
1241
+ out_video = os.path.join(save_dir, f"{Path(face_video).stem}_wav2lip_gan.mp4")
1242
+ cmd = [
1243
+ sys.executable,
1244
+ str(inference_py),
1245
+ "--checkpoint_path",
1246
+ str(wav2lip_checkpoint),
1247
+ "--face",
1248
+ str(face_video),
1249
+ "--audio",
1250
+ str(audio_path),
1251
+ "--outfile",
1252
+ str(out_video),
1253
+ ]
1254
+ subprocess.run(cmd, cwd=str(repo), check=True)
1255
+ return out_video
1256
+
1257
def generate(
    self,
    source_image: Optional[str] = None,
    driven_audio: Optional[str] = None,
    preprocess: str = "crop",
    still_mode: bool = False,
    use_enhancer: bool = False,
    batch_size: int = 1,
    size: int = 256,
    pose_style: int = 0,
    facerender: str = "facevid2vid",
    exp_scale: float = 1.0,
    use_ref_video: bool = False,
    ref_video: Optional[str] = None,
    ref_info: Optional[str] = None,
    use_idle_mode: bool = False,
    length_of_audio: int = 0,
    use_blink: bool = True,
    result_dir: str = "./results/",
    avatar_id: Optional[str] = None,
    avatar_condition: Optional[Any] = None,
    motion_condition: Optional[Any] = None,
    remove_background: bool = False,
    use_wav2lip: bool = False,
    wav2lip_repo: Optional[str] = None,
) -> str:
    """Generate a talking-head video and return the path of the final video.

    Identity is resolved in this order:

    1. ``avatar_condition`` if it loads to a bundle. ``source_image`` is then
       ignored.
    2. ``source_image`` if given. It is staged in ``<runtime_root>/source_work``
       and, when ``remove_background`` is true, background-removed first.
    3. The packed AvatarBank entry for ``avatar_id`` (the bank default when
       ``avatar_id`` is ``None``).

    ``remove_background`` only affects case 2. All remaining arguments are
    passed through to the runner's ``generate``. When ``use_wav2lip`` is true
    the runner's video is post-processed by ``_run_wav2lip_gan``.

    Returns:
        Path of the runner's video, or of the Wav2Lip output when that step ran.
    """
    runner = self._get_runner(size=size, preprocess=preprocess, facerender=facerender)
    ensure_dir(Path(result_dir))

    resolved_avatar_condition = self._normalize_avatar_condition(avatar_condition)
    source_image_for_runner: Optional[str] = source_image

    if resolved_avatar_condition is not None:
        # An explicit avatar_condition supersedes source_image-driven conditioning.
        source_image_for_runner = None
    elif source_image_for_runner is None:
        # Neither a condition nor an image: fall back to the packed AvatarBank.
        resolved_avatar_condition = self._resolve_avatar_condition_from_bank(avatar_id)
    else:
        # The image goes down the SadTalker path; optionally background-remove it.
        source_work_dir = self.runtime_root / "source_work"
        ensure_dir(source_work_dir)
        prepared = self._remove_background_if_requested(
            source_image_for_runner, remove_background, source_work_dir
        )
        if prepared is not None:
            source_image_for_runner = str(prepared)

    return_path, audio_path, save_dir = runner.generate(
        source_image=source_image_for_runner,
        driven_audio=driven_audio,
        preprocess=preprocess,
        still_mode=still_mode,
        use_enhancer=use_enhancer,
        batch_size=batch_size,
        size=size,
        pose_style=pose_style,
        facerender=facerender,
        exp_scale=exp_scale,
        use_ref_video=use_ref_video,
        ref_video=ref_video,
        ref_info=ref_info,
        use_idle_mode=use_idle_mode,
        length_of_audio=length_of_audio,
        use_blink=use_blink,
        result_dir=result_dir,
        avatar_condition=resolved_avatar_condition,
        motion_condition=motion_condition,
    )

    if use_wav2lip:
        return_path = self._run_wav2lip_gan(
            face_video=return_path,
            audio_path=audio_path,
            save_dir=save_dir,
            wav2lip_repo=wav2lip_repo,
        )

    return return_path
1340
+
1341
+
1342
+ PackedAvatarModel = PackedAvatar  # Alternate public name bound to the same class object.
1343
+
1344
+
1345
+ # ============================================================
1346
+ # CLI
1347
+ # ============================================================
1348
+
1349
def build_parser() -> argparse.ArgumentParser:
    """Build the command-line parser for running the packed avatar bundle.

    ``--use-wav2lip`` and ``--use-blink`` both default to enabled. Their
    ``--no-wav2lip`` / ``--no-blink`` counterparts switch them off.
    """
    p = argparse.ArgumentParser(description="Run the packed avatar bundle.")
    p.add_argument("--packed-pt", type=Path, default=Path(__file__).resolve().parent / "PackedAvatar.pt")
    p.add_argument("--cache-dir", type=Path, default=None)
    p.add_argument("--device", type=str, default=None)
    p.add_argument("--source-image", type=Path, default=None)
    p.add_argument("--driven-audio", type=Path, default="speech.wav")
    p.add_argument("--avatar-id", type=str, default=None)
    p.add_argument("--avatar-condition", type=Path, default=None)
    p.add_argument("--motion-condition", type=Path, default=None)
    p.add_argument("--remove-background", action="store_true")
    p.add_argument("--use-wav2lip", action="store_true", default=True)
    # Without this flag the default-on option above could never be disabled.
    p.add_argument(
        "--no-wav2lip",
        action="store_false",
        dest="use_wav2lip",
        help="Skip Wav2Lip post-processing.",
    )
    p.add_argument("--wav2lip-repo", type=Path, default=None)
    p.add_argument("--result-dir", type=Path, default=Path("./results"))
    p.add_argument("--preprocess", type=str, default="crop")
    p.add_argument("--size", type=int, default=256)
    p.add_argument("--facerender", type=str, default="facevid2vid")
    p.add_argument("--still-mode", action="store_true")
    p.add_argument("--use-enhancer", action="store_true")
    p.add_argument("--batch-size", type=int, default=1)
    p.add_argument("--pose-style", type=int, default=0)
    p.add_argument("--exp-scale", type=float, default=1.0)
    p.add_argument("--use-ref-video", action="store_true")
    p.add_argument("--ref-video", type=Path, default=None)
    p.add_argument("--ref-info", type=str, default=None)
    p.add_argument("--use-idle-mode", action="store_true")
    p.add_argument("--length-of-audio", type=int, default=0)
    p.add_argument("--use-blink", action="store_true", default=True)
    p.add_argument("--no-blink", action="store_false", dest="use_blink")
    p.add_argument("--manual-audio", action="store_true", help="Alias for driven-audio handling; kept for clarity.")
    return p
1380
+
1381
+
1382
def main() -> None:
    """CLI entry point: parse arguments, run one generation, print the result path."""
    args = build_parser().parse_args()

    def as_optional_str(value):
        # Path-like options are handed to the model as strings; unset stays None.
        return str(value) if value else None

    model = PackedAvatar(
        packed_pt_path=str(args.packed_pt),
        cache_dir=as_optional_str(args.cache_dir),
        device=args.device,
    )

    generation_options = {
        "source_image": as_optional_str(args.source_image),
        "driven_audio": as_optional_str(args.driven_audio),
        "preprocess": args.preprocess,
        "still_mode": args.still_mode,
        "use_enhancer": args.use_enhancer,
        "batch_size": args.batch_size,
        "size": args.size,
        "pose_style": args.pose_style,
        "facerender": args.facerender,
        "exp_scale": args.exp_scale,
        "use_ref_video": args.use_ref_video,
        "ref_video": as_optional_str(args.ref_video),
        "ref_info": args.ref_info,
        "use_idle_mode": args.use_idle_mode,
        "length_of_audio": args.length_of_audio,
        "use_blink": args.use_blink,
        "result_dir": str(args.result_dir),
        "avatar_id": args.avatar_id,
        "avatar_condition": as_optional_str(args.avatar_condition),
        "motion_condition": as_optional_str(args.motion_condition),
        "remove_background": args.remove_background,
        "use_wav2lip": args.use_wav2lip,
        "wav2lip_repo": as_optional_str(args.wav2lip_repo),
    }
    print(model.generate(**generation_options))
1422
+
1423
+
1424
+ if __name__ == "__main__":
1425
+ main()
README.md CHANGED
@@ -1,3 +1,594 @@
1
- ---
2
- license: apache-2.0
3
- ---
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ ---
2
+ license: apache-2.0
3
+
4
+ language:
5
+ - en
6
+
7
+ tags:
8
+ - talking-head
9
+ - face-animation
10
+ - avatar
11
+ - image-to-video
12
+ - audio-to-video
13
+ - motion-transfer
14
+ - lip-sync
15
+ - face-synthesis
16
+ - video-generation
17
+ - generative-ai
18
+ - multimodal
19
+ - pytorch
20
+ - sad-talker
21
+ - wav2lip
22
+ - rmbg
23
+ - packed-model
24
+ ---
25
+
26
+ # PackedAvatar
27
+
28
+ PackedAvatar is a **self-contained talking-head generation runtime** that bundles the SadTalker-based avatar pipeline into a single `.pt` artifact.
29
+
30
+ It supports generating animated talking avatars from:
31
+
32
+ * a single image + audio
33
+ * a prebuilt AvatarBank identity
34
+ * explicit avatar conditioning bundles
35
+ * motion transfer bundles
36
+ * reference-video driving
37
+ * optional Wav2Lip post-processing
38
+
39
+ All core runtime assets are packaged inside `PackedAvatar.pt`.
40
+
41
+ Core model assets are bundled, but a few auxiliary helper weights may still be downloaded on the first run if they are not already cached locally.
42
+
43
+ ---
44
+
45
+ # What is included
46
+
47
+ `PackedAvatar.pt` contains:
48
+
49
+ * SadTalker source code snapshot
50
+ * SadTalker checkpoints
51
+ * AvatarBank identity system
52
+ * Bria RMBG 2.0 background removal assets
53
+ * Wav2Lip GAN checkpoint
54
+ * BFM / face model assets
55
+ * configuration files
56
+ * runtime manifests and hashes
57
+ * cached avatar metadata
58
+
59
+ This is a **runtime artifact**, not a training checkpoint.
60
+
61
+ ---
62
+
63
+ # Repository contents
64
+
65
+ * `PackedAvatar.pt` — full bundled runtime
66
+ * `PackedAvatar.py` — loader + inference engine
67
+ * `requirements.txt` — dependencies
68
+ * `README.md` — usage guide
69
+
70
+ ---
71
+
72
+ # Features
73
+
74
+ * Single-file deployment (`.pt`) for the main runtime
75
+ * Full SadTalker pipeline bundled
76
+ * AvatarBank identity system
77
+ * Image / avatar / motion / video conditioning
78
+ * Optional background removal (Bria RMBG), enabled with `remove_background=True`
79
+ * Optional Wav2Lip GAN post-processing
80
+ * Runs on CPU, CUDA, or Apple MPS (selected automatically; see "Device selection")
81
+ * Automatic caching and extraction system
82
+ * CLI + Python API support
83
+
84
+ ---
85
+
86
+ # Requirements
87
+
88
+ * Python 3.10+
89
+ * PyTorch
90
+ * FFmpeg (for reference-video audio extraction)
91
+ * Dependencies listed in `requirements.txt`
92
+
93
+ GPU is recommended; CPU is supported.
94
+
95
+ ---
96
+
97
+ # Quick start
98
+
99
+ ## 1) Install dependencies
100
+
101
+ ```bash
102
+ pip install -r requirements.txt
103
+ ```
104
+
105
+ ## 2) Place the bundle
106
+
107
+ ```text
108
+ PackedAvatar.pt
109
+ ```
110
+
111
+ ## 3) Basic generation
112
+
113
+ ```python
114
+ from PackedAvatar import PackedAvatar
115
+
116
+ model = PackedAvatar("PackedAvatar.pt")
117
+
118
+ video = model.generate(
119
+ source_image="person.jpg",
120
+ driven_audio="speech.wav"
121
+ )
122
+
123
+ print(video)
124
+ ```
125
+
126
+ ---
127
+
128
+ # AvatarBank usage
129
+
130
+ Generate directly from a prebuilt identity:
131
+
132
+ ```python
133
+ video = model.generate(
134
+ avatar_id="Rebecca",
135
+ driven_audio="speech.wav"
136
+ )
137
+ ```
138
+
139
+ No source image is required for this path.
140
+
141
+ If `avatar_id` is omitted, the runtime selects a default avatar from the packed bank.
142
+
143
+ ---
144
+
145
+ # Prepacked AvatarBank table
146
+
147
+ The following avatars are prepacked in the bank.
148
+
149
+ ## Female
150
+
151
+ | Style | Names |
152
+ | ----- | --------------------------------------------------------------------------------------------------------------------------------- |
153
+ | anime | Alison, Amber, Andrea, Angela, Christine, Cynthia, Heidi, Jennifer, Karla, Kristen, Laura, Nancy, Patricia, Rebecca, Sandra, Tara |
154
+ | cyber | Amanda, Brenda, Christina, Janet, Jill, Julie, Lisa, Mallory, Mandy, Martha, Melissa, Michelle, Regina |
155
+ | drawn | Alyssa, Danielle, Joan, Kaitlyn, Kimberly, Marie, Samantha, Veronica |
156
+ | paint | Alejandra, Barbara, Briana, Brittany, Emily, Jacqueline, Jodi, Mary, Rhonda, Savannah, Tammy, Victoria, Yolanda |
157
+ | real | Amy, Ann, Ashley, Colleen, Heather, Holly, Jordan, Kristin, Kristine, Mariah, Pamela, Sara, Sharon |
158
+
159
+ ## Male
160
+
161
+ | Style | Names |
162
+ | ----- | ----------------------------------------------------------------- |
163
+ | anime | Brad, Brian, David, Gregory, John, Jose, Lawrence, Robert |
164
+ | cyber | Daniel, Hayden, James, Jeremy, Paul, Ryan, Sean |
165
+ | drawn | Bobby, George, Gregg, Kevin, Matthew, Ricky, Thomas |
166
+ | paint | Jacob, Justin, Michael, Nicholas, Steven, William, Zachary |
167
+ | real | Aaron, Andrew, Benjamin, Christopher, Derek, Frank, Jesse, Joseph |
168
+
169
+ There are **100 avatars total** in the bank.
170
+
171
+ ---
172
+
173
+ # Default avatar
174
+
175
+ If no avatar is explicitly selected, the runtime resolves a default in this order:
176
+
177
+ 1. `defaults.default_avatar` from the manifest, if present and valid
178
+ 2. first real-style male avatar
179
+ 3. any real-style avatar
180
+ 4. any male avatar
181
+ 5. any avatar with embeddings
182
+ 6. first available avatar entry
183
+
184
+ ---
185
+
186
+ # Source image mode
187
+
188
+ ```python
189
+ video = model.generate(
190
+ source_image="portrait.png",
191
+ driven_audio="speech.wav"
192
+ )
193
+ ```
194
+
195
+ Pipeline:
196
+
197
+ ```text
198
+ image → face detection → crop → 3DMM extraction → animation
199
+ ```
200
+
201
+ ---
202
+
203
+ # Background removal (Bria RMBG)
204
+
205
+ ```python
206
+ video = model.generate(
207
+ source_image="portrait.png",
208
+ driven_audio="speech.wav",
209
+ remove_background=True
210
+ )
211
+ ```
212
+
213
+ Pipeline:
214
+
215
+ ```text
216
+ image → Bria RMBG → foreground → SadTalker → video
217
+ ```
218
+
219
+ ---
220
+
221
+ # Explicit avatar conditioning
222
+
223
+ `avatar_condition` may be:
224
+
225
+ * a Python `dict`
226
+ * a `.pt` / `.pth` file
227
+ * a `.mat` file
228
+
229
+ When `avatar_condition` is provided, it supersedes `source_image`-driven conditioning.
230
+
231
+ ```python
232
+ video = model.generate(
233
+ avatar_condition="my_avatar_condition.pt",
234
+ driven_audio="speech.wav"
235
+ )
236
+ ```
237
+
238
+ A valid avatar bundle can include fields such as:
239
+
240
+ * `coeff_3dmm`
241
+ * `motion_3dmm`
242
+ * `full_3dmm`
243
+ * `crop_preview`
244
+ * `crop_info`
245
+
246
+ ---
247
+
248
+ # Motion conditioning
249
+
250
+ ```python
251
+ video = model.generate(
252
+ source_image="portrait.png",
253
+ driven_audio="speech.wav",
254
+ motion_condition="motion.pt"
255
+ )
256
+ ```
257
+
258
+ Supported motion inputs include:
259
+
260
+ * `motion_3dmm`
261
+ * `coeff_3dmm`
262
+ * `full_3dmm_seq`
263
+ * `full_3dmm`
264
+
265
+ ---
266
+
267
+ # Reference-video driving
268
+
269
+ ```python
270
+ video = model.generate(
271
+ source_image="portrait.png",
272
+ driven_audio="speech.wav",
273
+ use_ref_video=True,
274
+ ref_video="reference.mp4",
275
+ ref_info="pose"
276
+ )
277
+ ```
278
+
279
+ Supported `ref_info` values:
280
+
281
+ * `pose`
282
+ * `blink`
283
+ * `pose+blink`
284
+ * `all`
285
+
286
+ When `ref_info="all"`, the runtime uses the reference video coefficients directly.
287
+
288
+ ---
289
+
290
+ # Wav2Lip GAN (optional)
291
+
292
+ ```python
293
+ video = model.generate(
294
+ source_image="portrait.png",
295
+ driven_audio="speech.wav",
296
+ use_wav2lip=True,
297
+ wav2lip_repo="/path/to/Wav2Lip"
298
+ )
299
+ ```
300
+
301
+ Post-processes the SadTalker output for improved lip sync.
302
+
303
+ The bundled checkpoint `checkpoints/wav2lip_gan.pth` is used automatically.
304
+
305
+ If no runnable Wav2Lip inference code is found, the runtime falls back to the SadTalker video instead of crashing.
306
+
307
+ ---
308
+
309
+ # Idle mode
310
+
311
+ Generate with silent audio instead of an input file:
312
+
313
+ ```python
314
+ video = model.generate(
315
+ avatar_id="Aaron",
316
+ use_idle_mode=True,
317
+ length_of_audio=4
318
+ )
319
+ ```
320
+
321
+ ---
322
+
323
+ # Still mode
324
+
325
+ Reduces head movement:
326
+
327
+ ```python
328
+ still_mode=True
329
+ ```
330
+
331
+ ---
332
+
333
+ # Expression control
334
+
335
+ ```python
336
+ exp_scale=1.2
337
+ ```
338
+
339
+ * higher values → more expressive motion
340
+ * lower values → more neutral motion
341
+
342
+ ---
343
+
344
+ # Face render backend
345
+
346
+ ```python
347
+ facerender="facevid2vid"
348
+ ```
349
+
350
+ ---
351
+
352
+ # Device selection
353
+
354
+ Automatically chooses:
355
+
356
+ * CUDA when available
357
+ * Apple Silicon MPS on macOS when available
358
+ * CPU fallback otherwise
359
+
360
+ Override:
361
+
362
+ ```python
363
+ PackedAvatar(device="cuda")
364
+ ```
365
+
366
+ ---
367
+
368
+ # Python API (full example)
369
+
370
+ ```python
371
+ from PackedAvatar import PackedAvatar
372
+
373
+ model = PackedAvatar(
374
+ packed_pt_path="PackedAvatar.pt",
375
+ device="cuda",
376
+ cache_dir="./cache"
377
+ )
378
+
379
+ video = model.generate(
380
+ source_image="speaker.png",
381
+ driven_audio="speech.wav",
382
+ remove_background=True,
383
+ use_wav2lip=True,
384
+ size=512,
385
+ exp_scale=1.2,
386
+ pose_style=1,
387
+ still_mode=False
388
+ )
389
+
390
+ print(video)
391
+ ```
392
+
393
+ ---
394
+
395
+ # Preprocessing helpers
396
+
397
+ The runtime exposes an embedding extraction helper for image or video conditioning:
398
+
399
+ ```python
400
+ bundle = model.extract_embeddings(
401
+ input_path="test_image.png",
402
+ crop_or_resize="crop",
403
+ pic_size=256
404
+ )
405
+ ```
406
+
407
+ Camel-case alias:
408
+
409
+ ```python
410
+ bundle = model.ExtractEmbeddings("test_image.png")
411
+ ```
412
+
413
+ The returned bundle can be saved and reused as `avatar_condition` or `motion_condition`.
414
+
415
+ ---
416
+
417
+ # CLI usage
418
+
419
+ ## Basic
420
+
421
+ ```bash
422
+ python PackedAvatar.py \
423
+ --source-image person.jpg \
424
+ --driven-audio speech.wav
425
+ ```
426
+
427
+ ## AvatarBank
428
+
429
+ ```bash
430
+ python PackedAvatar.py \
431
+ --avatar-id Rebecca \
432
+ --driven-audio speech.wav
433
+ ```
434
+
435
+ ## Background removal
436
+
437
+ ```bash
438
+ python PackedAvatar.py \
439
+ --source-image portrait.png \
440
+ --driven-audio speech.wav \
441
+ --remove-background
442
+ ```
443
+
444
+ ## Wav2Lip
445
+
446
+ ```bash
447
+ python PackedAvatar.py \
448
+ --source-image portrait.png \
449
+ --driven-audio speech.wav \
450
+ --use-wav2lip \
451
+ --wav2lip-repo /path/to/Wav2Lip
452
+ ```
453
+
454
+ ## Reference video driving
455
+
456
+ ```bash
457
+ python PackedAvatar.py \
458
+ --source-image portrait.png \
459
+ --driven-audio speech.wav \
460
+ --use-ref-video \
461
+ --ref-video reference.mp4 \
462
+ --ref-info pose+blink
463
+ ```
464
+
465
+ ## Idle mode
466
+
467
+ ```bash
468
+ python PackedAvatar.py \
469
+ --avatar-id Aaron \
470
+ --use-idle-mode \
471
+ --length-of-audio 5
472
+ ```
473
+
474
+ ## Explicit avatar conditioning bundle
475
+
476
+ ```bash
477
+ python PackedAvatar.py \
478
+ --avatar-condition avatar_condition.pt \
479
+ --driven-audio speech.wav
480
+ ```
481
+
482
+ ## Motion conditioning bundle
483
+
484
+ ```bash
485
+ python PackedAvatar.py \
486
+ --motion-condition motion_condition.pt \
487
+ --driven-audio speech.wav
488
+ ```
489
+
490
+ ---
491
+
492
+ # How it works
493
+
494
+ PackedAvatar runs a full multimodal pipeline.
495
+
496
+ ## 1. Asset extraction
497
+
498
+ * extracts SadTalker and its checkpoints from the packed `.pt` bundle
499
+ * verifies SHA256 hashes
500
+ * builds the runtime cache
501
+
502
+ ## 2. Avatar resolution
503
+
504
+ Priority:
505
+
506
+ ```text
507
+ avatar_condition
508
+ → source_image-driven SadTalker path
509
+ → avatar_id / default AvatarBank resolution
510
+ ```
511
+
512
+ If `avatar_condition` is provided, it supersedes `source_image` conditioning.
513
+
514
+ ## 3. Preprocessing
515
+
516
+ * face detection
517
+ * cropping
518
+ * 3DMM extraction
519
+
520
+ ## 4. Motion generation
521
+
522
+ * audio → facial coefficients
523
+ * or motion transfer injection
524
+
525
+ ## 5. Rendering
526
+
527
+ * SadTalker animation via the FaceVid2Vid or PIRender face renderer
528
+ * frame synthesis
529
+
530
+ ## 6. Optional post-processing
531
+
532
+ * Wav2Lip GAN lip-sync enhancement
533
+
534
+ ---
535
+
536
+ # First run vs later runs
537
+
538
+ ### First run
539
+
540
+ * extract bundle
541
+ * build cache
542
+ * initialize models
543
+ * download a couple of auxiliary face-analysis weights if they are not already cached locally
544
+
545
+ ### Later runs
546
+
547
+ * reuse cache
548
+ * skip the auxiliary downloads when the files are already present
549
+ * faster startup
550
+
551
+ ---
552
+
553
+ # Performance notes
554
+
555
+ * GPU is strongly recommended for 512 resolution
556
+ * CPU is supported but slower
557
+ * Wav2Lip increases runtime cost
558
+ * RMBG adds preprocessing overhead
559
+
560
+ ---
561
+
562
+ # Why PackedAvatar?
563
+
564
+ Compared to a standard SadTalker setup:
565
+
566
+ * single `.pt` deployment artifact
567
+ * no model downloads for the main runtime
568
+ * no external repos required for core use
569
+ * built-in AvatarBank system
570
+ * built-in background removal
571
+ * optional lip-sync enhancement
572
+ * fully offline execution after first-run helper caching
573
+ * reproducible runtime via bundle hashing
574
+
575
+ ---
576
+
577
+ # Notes
578
+
579
+ * This repo is inference-only
580
+ * Bundles are treated as trusted artifacts
581
+ * Cache is auto-invalidated when the bundle changes
582
+ * All runtime dependencies are resolved internally
583
+
584
+ ---
585
+
586
+ # Credits
587
+
588
+ Built on top of:
589
+
590
+ * SadTalker
591
+ * FaceVid2Vid / PIRender
592
+ * Wav2Lip GAN
593
+ * Bria RMBG
594
+
requirements.txt ADDED
@@ -0,0 +1,26 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ torch
2
+ torchvision
3
+ torchaudio
4
+ numpy
5
+ face_alignment
6
+ imageio
7
+ imageio-ffmpeg
8
+ librosa
9
+ numba
10
+ resampy
11
+ pydub
12
+ scipy
13
+ kornia
14
+ tqdm
15
+ yacs
16
+ pyyaml
17
+ joblib
18
+ scikit-image
19
+ git+https://github.com/XPixelGroup/BasicSR
20
+ git+https://github.com/TencentARC/GFPGAN
21
+ facexlib
22
+ dlib-bin
23
+ av
24
+ safetensors
25
+ TTS
26
+ zstandard