banao-tech committed on
Commit
8b65f54
·
verified ·
1 Parent(s): 85c5a8d

More robust App.py

Browse files
Files changed (1) hide show
  1. app.py +40 -132
app.py CHANGED
@@ -1,5 +1,4 @@
1
  import os
2
- import sys
3
  import subprocess
4
  from pathlib import Path
5
  from datetime import datetime
@@ -7,68 +6,39 @@ from datetime import datetime
7
  import gradio as gr
8
  from huggingface_hub import snapshot_download
9
 
10
-
11
- # -----------------------------
12
- # Paths
13
- # -----------------------------
14
  ROOT = Path(__file__).parent.resolve()
15
  REPO_DIR = ROOT / "LatentSync"
16
- CHECKPOINTS_DIR = REPO_DIR / "checkpoints"
17
  TEMP_DIR = REPO_DIR / "temp"
 
 
 
18
 
19
- # LatentSync 1.5 checkpoint repo (fits T4 16GB)
20
  HF_CKPT_REPO = "ByteDance/LatentSync-1.5"
21
 
22
- # For LatentSync 1.5, config is typically stage2.yaml (256 resolution)
23
- # (LatentSync has multiple configs; stage2_512.yaml is for 1.6 / 512 training)
24
- CONFIG_REL_PATH = Path("configs/unet/stage2.yaml")
25
- CKPT_REL_PATH = Path("checkpoints/latentsync_unet.pt")
26
-
27
-
28
  def run(cmd, cwd=None):
29
- print("Running:", " ".join(map(str, cmd)))
30
  subprocess.check_call(cmd, cwd=cwd)
31
 
32
-
33
- def ensure_repo():
34
  if not REPO_DIR.exists():
35
  run(["git", "clone", "--depth", "1", "https://github.com/bytedance/LatentSync.git", str(REPO_DIR)])
36
 
37
-
38
- def ensure_checkpoints():
39
- CHECKPOINTS_DIR.mkdir(parents=True, exist_ok=True)
40
-
41
- # Download checkpoint + whisper tiny into LatentSync/checkpoints
42
- # HF repo tree includes `latentsync_unet.pt` and `whisper/...`
43
  snapshot_download(
44
  repo_id=HF_CKPT_REPO,
45
- local_dir=str(CHECKPOINTS_DIR),
46
  local_dir_use_symlinks=False,
47
- allow_patterns=[
48
- "latentsync_unet.pt",
49
- "whisper/*",
50
- ],
51
  )
52
 
53
- ckpt = CHECKPOINTS_DIR / "latentsync_unet.pt"
54
- whisper_tiny = CHECKPOINTS_DIR / "whisper" / "tiny.pt"
55
- if not ckpt.exists():
56
- raise FileNotFoundError(f"Missing checkpoint: {ckpt}")
57
- if not whisper_tiny.exists():
58
- raise FileNotFoundError(f"Missing whisper tiny: {whisper_tiny}")
59
-
60
-
61
- def make_still_video(image_path: str, audio_path: str, fps: int = 25) -> str:
62
  TEMP_DIR.mkdir(parents=True, exist_ok=True)
63
- out_path = TEMP_DIR / f"still_{datetime.now().strftime('%Y%m%d_%H%M%S')}.mp4"
64
 
65
- # Create a video by looping the image and cutting to audio length.
66
- # Also scale/crop to a square size (256) to match stage2.yaml typical setting.
67
- # If you switch to 1.6 later, you'd scale/crop to 512 and use stage2_512.yaml.
68
  cmd = [
69
  "ffmpeg", "-y",
70
- "-loop", "1",
71
- "-i", image_path,
72
  "-i", audio_path,
73
  "-shortest",
74
  "-r", str(fps),
@@ -81,104 +51,42 @@ def make_still_video(image_path: str, audio_path: str, fps: int = 25) -> str:
81
  run(cmd)
82
  return str(out_path)
83
 
 
 
84
 
85
- def latentsync_infer(video_path: str, audio_path: str, inference_steps: int, guidance_scale: float, seed: int) -> str:
86
- # Import LatentSync inference code
87
- sys.path.insert(0, str(REPO_DIR))
88
- os.chdir(str(REPO_DIR))
89
 
90
- from omegaconf import OmegaConf
91
- from scripts.inference import main
92
- import argparse
93
 
94
- config_path = (REPO_DIR / CONFIG_REL_PATH).resolve()
95
- ckpt_path = (REPO_DIR / CKPT_REL_PATH).resolve()
96
 
97
- if not config_path.exists():
98
- raise FileNotFoundError(f"Config not found: {config_path}")
99
- if not ckpt_path.exists():
100
- raise FileNotFoundError(f"Checkpoint not found: {ckpt_path}")
101
-
102
- TEMP_DIR.mkdir(parents=True, exist_ok=True)
103
- out_path = TEMP_DIR / f"result_{datetime.now().strftime('%Y%m%d_%H%M%S')}.mp4"
104
-
105
- config = OmegaConf.load(str(config_path))
106
- config["run"].update(
107
- {
108
- "guidance_scale": float(guidance_scale),
109
- "inference_steps": int(inference_steps),
110
- }
111
- )
112
 
113
- parser = argparse.ArgumentParser()
114
- parser.add_argument("--inference_ckpt_path", type=str, required=True)
115
- parser.add_argument("--video_path", type=str, required=True)
116
- parser.add_argument("--audio_path", type=str, required=True)
117
- parser.add_argument("--video_out_path", type=str, required=True)
118
- parser.add_argument("--inference_steps", type=int, default=20)
119
- parser.add_argument("--guidance_scale", type=float, default=1.5)
120
- parser.add_argument("--temp_dir", type=str, default="temp")
121
- parser.add_argument("--seed", type=int, default=1247)
122
- parser.add_argument("--enable_deepcache", action="store_true")
123
-
124
- args = parser.parse_args(
125
- [
126
- "--inference_ckpt_path",
127
- str(ckpt_path),
128
- "--video_path",
129
- str(Path(video_path).resolve()),
130
- "--audio_path",
131
- str(Path(audio_path).resolve()),
132
- "--video_out_path",
133
- str(out_path.resolve()),
134
- "--inference_steps",
135
- str(inference_steps),
136
- "--guidance_scale",
137
- str(guidance_scale),
138
- "--seed",
139
- str(seed),
140
- "--temp_dir",
141
- "temp",
142
- "--enable_deepcache",
143
- ]
144
- )
145
 
146
- main(config=config, args=args)
147
  return str(out_path)
148
 
149
-
150
- def generate(avatar_img, audio_wav, inference_steps, guidance_scale, seed):
151
- ensure_repo()
152
- ensure_checkpoints()
153
-
154
- # avatar_img is a filepath (type="filepath")
155
- # audio_wav is a filepath (type="filepath")
156
- still_video = make_still_video(avatar_img, audio_wav, fps=25)
157
- result = latentsync_infer(still_video, audio_wav, inference_steps, guidance_scale, seed)
158
- return result
159
-
160
-
161
- with gr.Blocks(title="LatentSync (avatar.jpg + audio.wav → lip-sync mp4)") as demo:
162
- gr.Markdown(
163
- """
164
- # LatentSync (HF Space)
165
- Upload **avatar.jpg** + **audio.wav** → get lip-sync **mp4**.
166
- (Uses **LatentSync 1.5** to fit **T4 16GB VRAM**.)
167
- """
168
- )
169
-
170
- with gr.Row():
171
- avatar = gr.Image(label="Avatar Image (jpg/png)", type="filepath")
172
- audio = gr.Audio(label="Audio (wav)", type="filepath")
173
-
174
  with gr.Row():
175
- guidance = gr.Slider(1.0, 3.0, value=1.5, step=0.1, label="Guidance Scale")
176
- steps = gr.Slider(10, 50, value=20, step=1, label="Inference Steps")
177
- seed = gr.Number(value=1247, precision=0, label="Seed")
178
-
179
- btn = gr.Button("Generate Lip-Sync Video")
180
- out = gr.Video(label="Output MP4")
181
-
182
- btn.click(fn=generate, inputs=[avatar, audio, steps, guidance, seed], outputs=out)
183
 
184
  demo.launch()
 
import os
import subprocess
from pathlib import Path
from datetime import datetime

import gradio as gr
from huggingface_hub import snapshot_download

# NOTE(review): `os` appears unreferenced after this commit (os.chdir was
# removed) — kept, since only part of the file may be visible here.

# Filesystem layout: everything lives inside the cloned LatentSync repo.
ROOT = Path(__file__).parent.resolve()
REPO_DIR = ROOT / "LatentSync"       # git-clone target for the upstream repo
TEMP_DIR = REPO_DIR / "temp"         # intermediate still-image videos
INPUT_DIR = REPO_DIR / "inputs"      # created in setup(); currently unwritten
OUTPUT_DIR = REPO_DIR / "outputs"    # final lip-synced MP4s
CKPT_DIR = REPO_DIR / "checkpoints"  # snapshot_download destination

# LatentSync 1.5 checkpoint repo (fits a 16 GB T4, per the pre-commit comment).
HF_CKPT_REPO = "ByteDance/LatentSync-1.5"
 
def run(cmd, cwd=None):
    """Echo *cmd* to stdout and execute it.

    Args:
        cmd: Command and arguments as a sequence; elements are stringified
            for display only — the sequence itself is passed to subprocess.
        cwd: Optional working directory for the child process.

    Raises:
        subprocess.CalledProcessError: if the command exits nonzero.
    """
    printable = " ".join(str(part) for part in cmd)
    print(printable)
    subprocess.check_call(cmd, cwd=cwd)
21
 
22
def setup():
    """Clone the LatentSync repo and fetch model weights; safe to call repeatedly.

    Side effects: clones the GitHub repo into REPO_DIR on first call, downloads
    the checkpoint snapshot into CKPT_DIR, and creates the working directories.
    """
    # Clone only once; subsequent calls find the directory and skip this.
    if not REPO_DIR.exists():
        run([
            "git", "clone", "--depth", "1",
            "https://github.com/bytedance/LatentSync.git",
            str(REPO_DIR),
        ])

    CKPT_DIR.mkdir(parents=True, exist_ok=True)

    # Downloads the whole checkpoint repo (no allow_patterns). snapshot_download
    # reuses already-downloaded files, so repeat calls are cheap.
    # NOTE(review): local_dir_use_symlinks is deprecated in newer
    # huggingface_hub releases (ignored with a warning) — kept for
    # compatibility with whatever version the Space has pinned.
    snapshot_download(
        repo_id=HF_CKPT_REPO,
        local_dir=str(CKPT_DIR),
        local_dir_use_symlinks=False,
    )

    # Working directories used by make_still_video() / generate().
    for directory in (INPUT_DIR, OUTPUT_DIR, TEMP_DIR):
        directory.mkdir(parents=True, exist_ok=True)
 
36
 
37
def make_still_video(img_path: str, audio_path: str, fps: int = 25) -> str:
    """Render an MP4 that loops a still image for the duration of the audio.

    Args:
        img_path: Path to the source image (jpg/png).
        audio_path: Path to the audio track (wav).
        fps: Output frame rate.

    Returns:
        Path of the generated MP4 as a string (timestamped, inside TEMP_DIR).

    Raises:
        subprocess.CalledProcessError: if ffmpeg fails.
    """
    out_path = TEMP_DIR / f"still_{datetime.now().strftime('%Y%m%d_%H%M%S')}.mp4"

    # -loop 1 repeats the single input image; -shortest trims the video to the
    # audio length.
    # NOTE(review): the middle of this argument list (filter/codec options)
    # was elided in the source dump; the pre-commit version documented scaling
    # to a 256 square for stage2.yaml, so that intent is reconstructed here —
    # confirm against the deployed file. yuv420p is required for broad player
    # compatibility with libx264.
    cmd = [
        "ffmpeg", "-y",
        "-loop", "1", "-i", img_path,
        "-i", audio_path,
        "-shortest",
        "-r", str(fps),
        "-vf", "scale=256:256",
        "-c:v", "libx264",
        "-pix_fmt", "yuv420p",
        "-c:a", "aac",
        str(out_path),
    ]
    run(cmd)
    return str(out_path)
53
 
54
def generate(avatar_img, audio_wav):
    """Gradio callback: turn an avatar image + audio file into a lip-synced MP4.

    Args:
        avatar_img: Filepath of the uploaded image (gr.Image type="filepath").
        audio_wav: Filepath of the uploaded audio (gr.Audio type="filepath").

    Returns:
        Path of the result MP4 as a string (inside OUTPUT_DIR).

    Raises:
        FileNotFoundError: if the expected inference entry point is missing.
        subprocess.CalledProcessError: if ffmpeg or inference fails.
    """
    setup()

    img_path = str(Path(avatar_img).resolve())
    wav_path = str(Path(audio_wav).resolve())

    # Pre-render a still video from image + audio.
    # NOTE(review): this result is currently unused below — predict.py is fed
    # the raw image. Kept so a video-based inference script can be swapped in
    # without reworking this function.
    still_video = make_still_video(img_path, wav_path, fps=25)

    out_path = OUTPUT_DIR / f"result_{datetime.now().strftime('%Y%m%d_%H%M%S')}.mp4"

    # Fail fast with a clear message instead of an opaque subprocess error:
    # the original in-code NOTE admits LatentSync builds ship different entry
    # points (gradio_app.py / predict.py / scripts/inference.py).
    script = REPO_DIR / "predict.py"
    if not script.exists():
        raise FileNotFoundError(
            f"Inference entry point not found: {script}. "
            "This LatentSync checkout may use a different script "
            "(e.g. scripts/inference.py); a still video was prepared at "
            f"{still_video} for a video-based fallback."
        )

    cmd = [
        "python", "predict.py",
        "--image_path", img_path,
        "--audio_path", wav_path,
        "--output_path", str(out_path),
    ]
    run(cmd, cwd=str(REPO_DIR))

    return str(out_path)
82
 
83
# Minimal UI: one row with the two inputs, then a button and a video output.
with gr.Blocks() as demo:
    # NOTE(review): this title string looks garbled in the source dump
    # ("audio.wav mp4" — an arrow glyph may have been lost in extraction);
    # left byte-identical here, confirm against the deployed file.
    gr.Markdown("# LatentSync (avatar.jpg + audio.wav mp4)")
    with gr.Row():
        avatar = gr.Image(type="filepath", label="avatar.jpg/png")
        audio = gr.Audio(type="filepath", label="audio.wav", format="wav")
    btn = gr.Button("Generate")
    out = gr.Video(label="Output")
    # generate() blocks until inference finishes; Gradio then serves the file.
    btn.click(generate, inputs=[avatar, audio], outputs=out)

demo.launch()