alexnasa committed on
Commit
1a20a46
·
verified ·
1 Parent(s): 295978e

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +435 -435
app.py CHANGED
@@ -1,435 +1,435 @@
1
- import spaces
2
- import gradio as gr
3
- import sys
4
- import os
5
- import subprocess
6
- import uuid
7
- import shutil
8
-
9
-
10
-
11
- from huggingface_hub import snapshot_download, list_repo_files, hf_hub_download
12
- import importlib, site
13
-
14
-
15
- # Re-discover all .pth/.egg-link files
16
- for sitedir in site.getsitepackages():
17
- site.addsitedir(sitedir)
18
-
19
- # Clear caches so importlib will pick up new modules
20
- importlib.invalidate_caches()
21
-
22
- def sh(cmd): subprocess.check_call(cmd, shell=True)
23
-
24
- flash_attention_installed = False
25
-
26
- try:
27
- flash_attention_wheel = hf_hub_download(
28
- repo_id="alexnasa/flash-attn-3",
29
- repo_type="model",
30
- filename="128/flash_attn_3-3.0.0b1-cp39-abi3-linux_x86_64.whl",
31
- )
32
-
33
- sh(f"pip install {flash_attention_wheel}")
34
- print("Attempting to download and install FlashAttention wheel...")
35
- # sh("pip install flash-attn")
36
- sh("pip install --no-build-isolation transformer_engine-2.5.0+f05f12c9-cp310-cp310-linux_x86_64.whl")
37
-
38
- # tell Python to re-scan site-packages now that the egg-link exists
39
- import importlib, site; site.addsitedir(site.getsitepackages()[0]); importlib.invalidate_caches()
40
-
41
- flash_attention_installed = True
42
-
43
- except Exception as e:
44
- print(f"⚠️ Could not install FlashAttention: {e}")
45
- print("Continuing without FlashAttention...")
46
-
47
- try:
48
- te_wheel = hf_hub_download(
49
- repo_id="alexnasa/transformer_engine_wheels",
50
- repo_type="model",
51
- filename="transformer_engine-2.5.0+f05f12c9-cp310-cp310-linux_x86_64.whl",
52
- )
53
-
54
- sh(f"pip install {te_wheel}")
55
- print("Attempting to download and install Transformer Engine wheel...")
56
-
57
- # tell Python to re-scan site-packages now that the egg-link exists
58
- import importlib, site; site.addsitedir(site.getsitepackages()[0]); importlib.invalidate_caches()
59
-
60
- except Exception as e:
61
- print(f"⚠️ Could not install Transformer Engine : {e}")
62
- print("Continuing without Transformer Engine ...")
63
-
64
- import torch
65
- print(f"Torch version: {torch.__version__}")
66
- print(f"FlashAttention available: {flash_attention_installed}")
67
-
68
- import tempfile
69
- from pathlib import Path
70
- from torch._inductor.runtime.runtime_utils import cache_dir as _inductor_cache_dir
71
- from huggingface_hub import HfApi
72
-
73
-
74
- snapshot_download(repo_id="bytedance-research/HuMo", local_dir="./weights/HuMo")
75
- snapshot_download(repo_id="Wan-AI/Wan2.1-T2V-1.3B", local_dir="./weights/Wan2.1-T2V-1.3B")
76
- snapshot_download(repo_id="openai/whisper-large-v3", local_dir="./weights/whisper-large-v3")
77
-
78
- os.environ["PROCESSED_RESULTS"] = f"{os.getcwd()}/proprocess_results"
79
-
80
- path_to_insert = "humo"
81
- if path_to_insert not in sys.path:
82
- sys.path.insert(0, path_to_insert)
83
-
84
- from common.config import load_config, create_object
85
-
86
- config = load_config(
87
- "./humo/configs/inference/generate.yaml",
88
- [
89
- "dit.sp_size=1",
90
- "generation.frames=97",
91
- "generation.scale_t=5.5",
92
- "generation.scale_a=5.0",
93
- "generation.mode=TIA",
94
- "generation.height=480",
95
- "generation.width=832",
96
- ],
97
- )
98
- runner = create_object(config)
99
-
100
-
101
- os.environ.setdefault("TORCHINDUCTOR_CACHE_DIR", f"{os.getcwd()}/torchinductor_space") # or another writable path
102
-
103
- def restore_inductor_cache_from_hub(repo_id: str, filename: str = "torch_compile_cache.zip",
104
- path_in_repo: str = "inductor_cache", repo_type: str = "model",
105
- hf_token: str | None = None):
106
- cache_root = Path(_inductor_cache_dir()).resolve()
107
- cache_root.mkdir(parents=True, exist_ok=True)
108
- zip_path = hf_hub_download(repo_id=repo_id, filename=f"{path_in_repo}/{filename}",
109
- repo_type=repo_type, token=hf_token)
110
- shutil.unpack_archive(zip_path, extract_dir=str(cache_root))
111
- print(f"✓ Restored cache into {cache_root}")
112
-
113
-
114
- # restore_inductor_cache_from_hub("alexnasa/humo-compiled")
115
-
116
-
117
- def get_duration(prompt_text, steps, image_file, audio_file_path, tea_cache_l1_thresh, max_duration, session_id):
118
-
119
- return calculate_required_time(steps, max_duration)
120
-
121
- def calculate_required_time(steps, max_duration):
122
-
123
- warmup_s = 60
124
-
125
- max_duration_duration_mapping = {
126
- 1: 8,
127
- 2: 8,
128
- 3: 11,
129
- 4: 20,
130
- 5: 30,
131
- }
132
- each_step_s = max_duration_duration_mapping[max_duration]
133
- duration_s = (each_step_s * steps) + warmup_s
134
-
135
- print(f'estimated duration:{duration_s}')
136
-
137
- return int(duration_s)
138
-
139
- def get_required_time_string(steps, max_duration):
140
-
141
- duration_s = calculate_required_time(steps, max_duration)
142
- duration_m = duration_s / 60
143
-
144
- return f"<center>⌚ Zero GPU Required: ~{duration_s}.0s ({duration_m:.1f} mins)</center>"
145
-
146
- def update_required_time(steps, max_duration):
147
-
148
- return get_required_time_string(steps, max_duration)
149
-
150
-
151
- def generate_scene(prompt_text, steps, image_paths, audio_file_path, tea_cache_l1_thresh, max_duration = 2, session_id = None):
152
-
153
- print(image_paths)
154
- prompt_text_check = (prompt_text or "").strip()
155
- if not prompt_text_check:
156
- raise gr.Error("Please enter a prompt.")
157
-
158
- if not audio_file_path and not image_paths:
159
- raise gr.Error("Please provide a reference image or a lipsync audio.")
160
-
161
- return run_pipeline(prompt_text, steps, image_paths, audio_file_path, tea_cache_l1_thresh, max_duration, session_id)
162
-
163
-
164
-
165
- def upload_inductor_cache_to_hub(
166
- repo_id: str,
167
- path_in_repo: str = "inductor_cache",
168
- repo_type: str = "model", # or "dataset" if you prefer
169
- hf_token: str | None = None,
170
- ):
171
- """
172
- Zips the current TorchInductor cache and uploads it to the given repo path.
173
- Assumes the model was already run once with torch.compile() so the cache exists.
174
- """
175
-
176
- cache_dir = Path(_inductor_cache_dir()).resolve()
177
- if not cache_dir.exists():
178
- raise FileNotFoundError(f"TorchInductor cache not found at {cache_dir}. "
179
- "Run a compiled model once to populate it.")
180
-
181
- # Create a zip archive of the entire cache directory
182
- with tempfile.TemporaryDirectory() as tmpdir:
183
- archive_base = Path(tmpdir) / "torch_compile_cache"
184
- archive_path = shutil.make_archive(str(archive_base), "zip", root_dir=str(cache_dir))
185
- archive_path = Path(archive_path)
186
-
187
- # Upload to Hub
188
- api = HfApi(token=hf_token)
189
- api.create_repo(repo_id=repo_id, repo_type=repo_type, exist_ok=True)
190
- # Put each artifact under path_in_repo, including a tiny metadata stamp for traceability
191
- # Upload the zip
192
- dest_path = f"{path_in_repo}/{archive_path.name}"
193
- api.upload_file(
194
- path_or_fileobj=str(archive_path),
195
- path_in_repo=dest_path,
196
- repo_id=repo_id,
197
- repo_type=repo_type,
198
- )
199
- # Upload a small metadata file (optional but handy)
200
- meta_txt = (
201
- f"pytorch={torch.__version__}\n"
202
- f"inductor_cache_dir={cache_dir}\n"
203
- f"cuda_available={torch.cuda.is_available()}\n"
204
- f"cuda_device={torch.cuda.get_device_name(0) if torch.cuda.is_available() else 'cpu'}\n"
205
- )
206
- api.upload_file(
207
- path_or_fileobj=meta_txt.encode(),
208
- path_in_repo=f"{path_in_repo}/INDUCTOR_CACHE_METADATA.txt",
209
- repo_id=repo_id,
210
- repo_type=repo_type,
211
- )
212
-
213
- print("✔ Uploaded TorchInductor cache to the Hub.")
214
-
215
-
216
- @spaces.GPU(duration=get_duration)
217
- def run_pipeline(prompt_text, steps, image_paths, audio_file_path, tea_cache_l1_thresh = 0.0, max_duration = 2, session_id = None):
218
-
219
- if session_id is None:
220
- session_id = uuid.uuid4().hex
221
-
222
- inference_mode = "TIA"
223
-
224
- # Validate inputs
225
- prompt_text = (prompt_text or "").strip()
226
- if not prompt_text:
227
- raise gr.Error("Please enter a prompt.")
228
-
229
- if not audio_file_path and not image_paths:
230
- raise gr.Error("Please provide a reference image or a lipsync audio.")
231
-
232
- if not audio_file_path:
233
- inference_mode = "TI"
234
- audio_path = None
235
- else:
236
- audio_path = audio_file_path if isinstance(audio_file_path, str) else getattr(audio_file_path, "name", str(audio_file_path))
237
-
238
- if not image_paths:
239
- inference_mode = "TA"
240
- img_paths = None
241
- else:
242
- img_paths = [image_data[0] for image_data in image_paths]
243
-
244
-
245
- # Prepare output
246
- output_dir = os.path.join(os.environ["PROCESSED_RESULTS"], session_id)
247
- os.makedirs(output_dir, exist_ok=True)
248
-
249
- # Random filename
250
- filename = f"gen_{uuid.uuid4().hex[:10]}"
251
- width, height = 832, 480
252
-
253
- duration_frame_mapping = {
254
- 1:25,
255
- 2:45,
256
- 3:70,
257
- 4:97,
258
- 5:129
259
- }
260
-
261
- # Run inference
262
- runner.inference_loop(
263
- prompt_text,
264
- img_paths,
265
- audio_path,
266
- output_dir,
267
- filename,
268
- inference_mode,
269
- width,
270
- height,
271
- steps,
272
- frames = int(duration_frame_mapping[max_duration]),
273
- tea_cache_l1_thresh = tea_cache_l1_thresh,
274
- )
275
-
276
- # Return resulting video path
277
- video_path = os.path.join(output_dir, f"{filename}.mp4")
278
- if os.path.exists(video_path):
279
-
280
- # upload_inductor_cache_to_hub("alexnasa/humo-compiled")
281
-
282
- return video_path
283
- else:
284
- candidates = [os.path.join(output_dir, f) for f in os.listdir(output_dir) if f.endswith(".mp4")]
285
- if candidates:
286
- return max(candidates, key=lambda p: os.path.getmtime(p))
287
- return None
288
-
289
- css = """
290
- #col-container {
291
- margin: 0 auto;
292
- width: 100%;
293
- max-width: 720px;
294
- }
295
- """
296
-
297
- def cleanup(request: gr.Request):
298
-
299
- sid = request.session_hash
300
- if sid:
301
- d1 = os.path.join(os.environ["PROCESSED_RESULTS"], sid)
302
- shutil.rmtree(d1, ignore_errors=True)
303
-
304
- def start_session(request: gr.Request):
305
-
306
- return request.session_hash
307
-
308
- with gr.Blocks(css=css) as demo:
309
-
310
- session_state = gr.State()
311
- demo.load(start_session, outputs=[session_state])
312
-
313
- with gr.Sidebar(width=400):
314
-
315
-
316
- gr.HTML(
317
- """
318
- <div style="text-align: center;">
319
- <p style="font-size:16px; display: inline; margin: 0;">
320
- <strong>HuMo</strong> – Human-Centric Video Generation via Collaborative Multi-Modal Conditioning
321
- </p>
322
- <a href="https://github.com/Phantom-video/HuMo" style="display: inline-block; vertical-align: middle; margin-left: 0.5em;">
323
- [Github]
324
- </a>
325
- </div>
326
- """
327
- )
328
-
329
- gr.Markdown("**REFERENCE IMAGES**")
330
-
331
- img_input = gr.Gallery(
332
- show_label=False,
333
- label="",
334
- interactive=True,
335
- rows=1, columns=3, object_fit="contain", height="280",
336
- file_types=['image']
337
- )
338
-
339
- gr.Markdown("**LIPSYNC AUDIO**")
340
-
341
- audio_input = gr.Audio(
342
- sources=["upload"],
343
- show_label=False,
344
- type="filepath",
345
- )
346
-
347
- gr.Markdown("**SETTINGS**")
348
-
349
- default_steps = 10
350
- default_max_duration = 2
351
-
352
- max_duration = gr.Slider(minimum=2, maximum=5, value=default_max_duration, step=1, label="Max Duration")
353
- steps_input = gr.Slider(minimum=5, maximum=50, value=default_steps, step=5, label="Diffusion Steps")
354
- tea_cache_l1_thresh = gr.Slider(minimum=0.0, maximum=1.0, value=0.0, step=0.01, label="Cache", visible=False)
355
-
356
-
357
-
358
- with gr.Column(elem_id="col-container"):
359
-
360
- gr.HTML(
361
- """
362
- <div style="text-align: center;">
363
- <strong>HF Space by:</strong>
364
- <a href="https://twitter.com/alexandernasa/" style="display: inline-block; vertical-align: middle; margin-left: 0.5em;">
365
- <img src="https://img.shields.io/twitter/url/https/twitter.com/cloudposse.svg?style=social&label=Follow Me" alt="GitHub Repo">
366
- </a>
367
- </div>
368
- """
369
- )
370
-
371
- video_output = gr.Video(show_label=False)
372
-
373
- gr.Markdown("<center><h2>PROMPT</h2></center>")
374
-
375
- prompt_tb = gr.Textbox(
376
- show_label=False,
377
- lines=5,
378
- placeholder="Describe the scene and the person talking....",
379
- )
380
-
381
- gr.Markdown("")
382
- time_required = gr.Markdown(get_required_time_string(default_steps, default_max_duration))
383
- run_btn = gr.Button("🎬 Action", variant="primary")
384
-
385
- gr.Examples(
386
- examples=[
387
-
388
- [
389
- "A handheld tracking shot follows a female warrior walking through a cave. Her determined eyes are locked straight ahead. She speaks with intensity.",
390
- 5,
391
- ["./examples/naomi.png"],
392
- "./examples/dream.mp3",
393
- ],
394
-
395
- [
396
- "A reddish-brown haired and bearded man sits pensively against swirling blue-and-white brushstrokes, dressed in a blue coat and dark waistcoat. The artistic backdrop and his thoughtful pose evoke a Post-Impressionist style in a studio-like setting.",
397
- 10,
398
- ["./examples/vangogh.jpg"],
399
- "./examples/art.wav",
400
- ],
401
-
402
- [
403
- "A handheld tracking shot follows a female through a science lab. Her determined eyes are locked straight ahead. The clip is in black and white and patchy as she is explaining something to someone standing opposite her",
404
- 10,
405
- ["./examples/naomi.png"],
406
- "./examples/science.wav",
407
- ],
408
-
409
- [
410
- "A woman with long, wavy dark hair looking at a person sitting opposite her whilst holding a book, wearing a leather jacket, long-sleeved jacket with a semi purple color one seen on a photo. Warm, window-like light bathes her figure, highlighting the outfit's elegant design and her graceful movements.",
411
- 50,
412
- ["./examples/amber.png", "./examples/jacket.png"],
413
- "./examples/fictional.mp3",
414
- ],
415
-
416
- ],
417
- inputs=[prompt_tb, steps_input, img_input, audio_input],
418
- outputs=[video_output],
419
- fn=run_pipeline,
420
- cache_examples=True,
421
- )
422
- max_duration.change(update_required_time, [steps_input, max_duration], time_required)
423
- steps_input.change(update_required_time, [steps_input, max_duration], time_required)
424
-
425
- run_btn.click(
426
- fn=generate_scene,
427
- inputs=[prompt_tb, steps_input, img_input, audio_input, tea_cache_l1_thresh, max_duration, session_state],
428
- outputs=[video_output],
429
- )
430
-
431
-
432
- if __name__ == "__main__":
433
- demo.unload(cleanup)
434
- demo.queue()
435
- demo.launch(ssr_mode=False)
 
1
+ import spaces
2
+ import gradio as gr
3
+ import sys
4
+ import os
5
+ import subprocess
6
+ import uuid
7
+ import shutil
8
+
9
+
10
+
11
+ from huggingface_hub import snapshot_download, list_repo_files, hf_hub_download
12
+ import importlib, site
13
+
14
+
15
# Register every site-packages directory again so that any .pth / .egg-link
# files found there are processed, then drop importlib's finder caches so
# modules that appeared on disk after interpreter start-up become importable.
for sitedir in site.getsitepackages():
    site.addsitedir(sitedir)
importlib.invalidate_caches()
21
+
22
def sh(cmd):
    """Run ``cmd`` through the system shell.

    Raises ``subprocess.CalledProcessError`` when the command exits with a
    non-zero status. Returns ``None``.

    NOTE(review): ``cmd`` is interpreted by the shell as-is; callers that
    interpolate file paths into it should make sure those paths need no quoting.
    """
    subprocess.check_call(cmd, shell=True)
23
+
24
# Best-effort installation of a prebuilt FlashAttention wheel at start-up.
# The flag below is flipped to True only if every step in the try-body succeeds.
flash_attention_installed = False

try:
    flash_attention_wheel = hf_hub_download(
        repo_id="alexnasa/flash-attn-3",
        repo_type="model",
        filename="128/flash_attn_3-3.0.0b1-cp39-abi3-linux_x86_64.whl",
    )

    sh(f"pip install {flash_attention_wheel}")
    print("Attempting to download and install FlashAttention wheel...")
    # (alternative kept for reference: sh("pip install flash-attn"))

    # NOTE(review): this installs a Transformer Engine wheel by a path relative
    # to the working directory, from inside the FlashAttention step. If that
    # file is absent the command fails and the flag stays False even though the
    # FlashAttention wheel above was installed — confirm this is intended.
    sh("pip install --no-build-isolation transformer_engine-2.5.0+f05f12c9-cp310-cp310-linux_x86_64.whl")

    # Make the freshly installed package importable in this running process.
    import importlib
    import site
    site.addsitedir(site.getsitepackages()[0])
    importlib.invalidate_caches()

    flash_attention_installed = True

except Exception as err:
    # Deliberately non-fatal: the app continues without FlashAttention.
    print(f"⚠️ Could not install FlashAttention: {err}")
    print("Continuing without FlashAttention...")
46
+
47
# Best-effort installation of a prebuilt Transformer Engine wheel from the Hub.
try:
    te_wheel = hf_hub_download(
        repo_id="alexnasa/transformer_engine_wheels",
        repo_type="model",
        filename="transformer_engine-2.5.0+f05f12c9-cp310-cp310-linux_x86_64.whl",
    )

    sh(f"pip install {te_wheel}")
    print("Attempting to download and install Transformer Engine wheel...")

    # Make the freshly installed package importable in this running process.
    import importlib
    import site
    site.addsitedir(site.getsitepackages()[0])
    importlib.invalidate_caches()

except Exception as err:
    # Deliberately non-fatal: the app continues without Transformer Engine.
    print(f"⚠️ Could not install Transformer Engine : {err}")
    print("Continuing without Transformer Engine ...")

# torch is imported only after the optional wheels have been handled.
import torch

print(f"Torch version: {torch.__version__}")
print(f"FlashAttention available: {flash_attention_installed}")
67
+
68
+ import tempfile
69
+ from pathlib import Path
70
+ from torch._inductor.runtime.runtime_utils import cache_dir as _inductor_cache_dir
71
+ from huggingface_hub import HfApi
72
+
73
+
74
# Download the three model repositories into ./weights/<name>.
for _repo_id, _local_dir in (
    ("bytedance-research/HuMo", "./weights/HuMo"),
    ("Wan-AI/Wan2.1-T2V-1.3B", "./weights/Wan2.1-T2V-1.3B"),
    ("openai/whisper-large-v3", "./weights/whisper-large-v3"),
):
    snapshot_download(repo_id=_repo_id, local_dir=_local_dir)

# Root folder under which each session's generated videos are written.
os.environ["PROCESSED_RESULTS"] = f"{os.getcwd()}/proprocess_results"

# Put the bundled "humo" directory on sys.path so `common.config` resolves.
path_to_insert = "humo"
if path_to_insert not in sys.path:
    sys.path.insert(0, path_to_insert)

from common.config import load_config, create_object

# Load the inference config with command-line style overrides, then build the
# runner object used by run_pipeline.
config = load_config(
    "./humo/configs/inference/generate.yaml",
    [
        "dit.sp_size=1",
        "generation.frames=97",
        "generation.scale_t=5.5",
        "generation.scale_a=5.0",
        "generation.mode=TIA",
        "generation.height=480",
        "generation.width=832",
    ],
)
runner = create_object(config)

# Default the TorchInductor cache to a directory under the working directory
# unless the environment already specifies one (must be a writable path).
os.environ.setdefault("TORCHINDUCTOR_CACHE_DIR", f"{os.getcwd()}/torchinductor_space")
102
+
103
def restore_inductor_cache_from_hub(repo_id: str, filename: str = "torch_compile_cache.zip",
                                    path_in_repo: str = "inductor_cache", repo_type: str = "model",
                                    hf_token: str | None = None):
    """Download an archived TorchInductor cache from the Hub and unpack it locally.

    Args:
        repo_id: Hub repository that holds the archive.
        filename: Archive file name inside ``path_in_repo``.
        path_in_repo: Folder inside the repository containing the archive.
        repo_type: Hub repository type passed to ``hf_hub_download``.
        hf_token: Optional access token passed to ``hf_hub_download``.

    The archive is extracted into the directory reported by
    ``_inductor_cache_dir()``, which is created if it does not exist.
    """
    target_dir = Path(_inductor_cache_dir()).resolve()
    target_dir.mkdir(parents=True, exist_ok=True)

    remote_file = f"{path_in_repo}/{filename}"
    archive = hf_hub_download(
        repo_id=repo_id,
        filename=remote_file,
        repo_type=repo_type,
        token=hf_token,
    )

    shutil.unpack_archive(archive, extract_dir=str(target_dir))
    print(f"✓ Restored cache into {target_dir}")
112
+
113
+
114
+ # restore_inductor_cache_from_hub("alexnasa/humo-compiled")
115
+
116
+
117
def get_duration(prompt_text, steps, image_file, audio_file_path, tea_cache_l1_thresh=0.0, max_duration=2, session_id=None):
    """Return the GPU time budget, in seconds, for one ``run_pipeline`` call.

    This function is passed as ``duration=`` to the ``spaces.GPU`` decorator on
    ``run_pipeline``, so it has to accept every argument list that
    ``run_pipeline`` accepts. ``run_pipeline`` gives defaults to its last three
    parameters, and the examples gallery invokes it with only the first four
    inputs; the same defaults are therefore mirrored here so that a
    four-argument call does not fail with ``TypeError``.

    Only ``steps`` and ``max_duration`` influence the estimate; the remaining
    parameters exist purely to match the decorated function's signature.
    """
    return calculate_required_time(steps, max_duration)
120
+
121
def calculate_required_time(steps, max_duration):
    """Estimate the GPU seconds needed for a generation.

    Args:
        steps: Number of diffusion steps.
        max_duration: Duration setting; must be one of 1-5.

    Returns:
        ``int``: a fixed 60 s warm-up plus ``steps`` times the per-step cost
        looked up for ``max_duration``.

    Raises:
        KeyError: if ``max_duration`` is not one of the known settings.
    """
    # Per-step cost in seconds for each duration setting.
    seconds_per_step = {1: 8, 2: 8, 3: 11, 4: 20, 5: 30}[max_duration]
    total_s = 60 + seconds_per_step * steps

    print(f'estimated duration:{total_s}')

    return int(total_s)
138
+
139
def get_required_time_string(steps, max_duration):
    """Build the centered HTML snippet that shows the estimated ZeroGPU time.

    The estimate comes from ``calculate_required_time`` and is rendered both in
    seconds and in minutes (one decimal place).
    """
    seconds = calculate_required_time(steps, max_duration)
    minutes = seconds / 60
    return f"<center>⌚ Zero GPU Required: ~{seconds}.0s ({minutes:.1f} mins)</center>"
145
+
146
def update_required_time(steps, max_duration):
    """Slider change handler: return the refreshed required-time HTML snippet."""
    refreshed = get_required_time_string(steps, max_duration)
    return refreshed
149
+
150
+
151
def generate_scene(prompt_text, steps, image_paths, audio_file_path, tea_cache_l1_thresh, max_duration = 2, session_id = None):
    """Validate the form inputs, then delegate to ``run_pipeline``.

    Raises ``gr.Error`` when the prompt is blank or when neither a reference
    image nor an audio file was supplied. Otherwise returns whatever
    ``run_pipeline`` returns for the same arguments.
    """
    print(image_paths)

    # A prompt consisting only of whitespace counts as missing.
    if not (prompt_text or "").strip():
        raise gr.Error("Please enter a prompt.")

    # At least one conditioning input (image or audio) is required.
    if not (audio_file_path or image_paths):
        raise gr.Error("Please provide a reference image or a lipsync audio.")

    return run_pipeline(
        prompt_text,
        steps,
        image_paths,
        audio_file_path,
        tea_cache_l1_thresh,
        max_duration,
        session_id,
    )
162
+
163
+
164
+
165
def upload_inductor_cache_to_hub(
    repo_id: str,
    path_in_repo: str = "inductor_cache",
    repo_type: str = "model",  # or "dataset" if you prefer
    hf_token: str | None = None,
):
    """Zip the local TorchInductor cache and upload it to a Hub repository.

    Args:
        repo_id: Target repository; created if it does not exist yet.
        path_in_repo: Folder inside the repository that receives the files.
        repo_type: Hub repository type.
        hf_token: Optional access token used to build the ``HfApi`` client.

    Raises:
        FileNotFoundError: if the cache directory reported by
            ``_inductor_cache_dir()`` does not exist (i.e. no compiled model
            has populated it yet).

    Two files are uploaded under ``path_in_repo``: the zip archive and a small
    ``INDUCTOR_CACHE_METADATA.txt`` describing the environment.
    """
    inductor_dir = Path(_inductor_cache_dir()).resolve()
    if not inductor_dir.exists():
        raise FileNotFoundError(f"TorchInductor cache not found at {inductor_dir}. "
                                "Run a compiled model once to populate it.")

    # The archive lives in a temporary directory, so every upload has to happen
    # before the `with` block exits and removes it.
    with tempfile.TemporaryDirectory() as scratch:
        zip_file = Path(
            shutil.make_archive(
                str(Path(scratch) / "torch_compile_cache"),
                "zip",
                root_dir=str(inductor_dir),
            )
        )

        hub = HfApi(token=hf_token)
        hub.create_repo(repo_id=repo_id, repo_type=repo_type, exist_ok=True)

        # 1) the zipped cache itself
        hub.upload_file(
            path_or_fileobj=str(zip_file),
            path_in_repo=f"{path_in_repo}/{zip_file.name}",
            repo_id=repo_id,
            repo_type=repo_type,
        )

        # 2) a small metadata stamp for traceability
        device_name = torch.cuda.get_device_name(0) if torch.cuda.is_available() else 'cpu'
        metadata = (
            f"pytorch={torch.__version__}\n"
            f"inductor_cache_dir={inductor_dir}\n"
            f"cuda_available={torch.cuda.is_available()}\n"
            f"cuda_device={device_name}\n"
        )
        hub.upload_file(
            path_or_fileobj=metadata.encode(),
            path_in_repo=f"{path_in_repo}/INDUCTOR_CACHE_METADATA.txt",
            repo_id=repo_id,
            repo_type=repo_type,
        )

    print("✔ Uploaded TorchInductor cache to the Hub.")
214
+
215
+
216
@spaces.GPU(duration=get_duration)
def run_pipeline(prompt_text, steps, image_paths, audio_file_path, tea_cache_l1_thresh = 0.0, max_duration = 2, session_id = None):
    """Generate one video with the global ``runner`` and return its file path.

    Args:
        prompt_text: Text prompt; surrounding whitespace is stripped.
        steps: Diffusion step count forwarded to the runner.
        image_paths: Gallery items; the first element of each item is used as
            the image path. May be empty/None.
        audio_file_path: Path string, or an object whose ``name`` attribute is
            the path. May be empty/None.
        tea_cache_l1_thresh: Forwarded unchanged to the runner.
        max_duration: Duration setting 1-5, mapped to a frame count.
        session_id: Sub-folder name for the outputs; a random id is generated
            when None.

    Returns:
        Path of the generated ``.mp4``; if the expected file is missing, the
        most recently modified ``.mp4`` in the output folder; otherwise None.

    Raises:
        gr.Error: if the prompt is blank, or neither image nor audio is given.
        KeyError: if ``max_duration`` is not one of the known settings.
    """
    if session_id is None:
        session_id = uuid.uuid4().hex

    # ---- input validation ----
    prompt_text = (prompt_text or "").strip()
    if not prompt_text:
        raise gr.Error("Please enter a prompt.")

    if not (audio_file_path or image_paths):
        raise gr.Error("Please provide a reference image or a lipsync audio.")

    # ---- choose the mode: "TIA" by default, "TI" without audio, "TA" without images ----
    inference_mode = "TIA"

    if audio_file_path:
        if isinstance(audio_file_path, str):
            audio_path = audio_file_path
        else:
            audio_path = getattr(audio_file_path, "name", str(audio_file_path))
    else:
        inference_mode = "TI"
        audio_path = None

    if image_paths:
        img_paths = [item[0] for item in image_paths]
    else:
        inference_mode = "TA"
        img_paths = None

    # ---- output location: <PROCESSED_RESULTS>/<session_id>/gen_<random>.mp4 ----
    output_dir = os.path.join(os.environ["PROCESSED_RESULTS"], session_id)
    os.makedirs(output_dir, exist_ok=True)

    filename = f"gen_{uuid.uuid4().hex[:10]}"
    width, height = 832, 480

    # Frame count for each duration setting.
    frames_for_duration = {1: 25, 2: 45, 3: 70, 4: 97, 5: 129}

    # ---- inference ----
    runner.inference_loop(
        prompt_text,
        img_paths,
        audio_path,
        output_dir,
        filename,
        inference_mode,
        width,
        height,
        steps,
        frames = int(frames_for_duration[max_duration]),
        tea_cache_l1_thresh = tea_cache_l1_thresh,
    )

    # ---- locate the result ----
    video_path = os.path.join(output_dir, f"{filename}.mp4")
    if os.path.exists(video_path):
        # upload_inductor_cache_to_hub("alexnasa/humo-compiled")
        return video_path

    # Fall back to the newest .mp4 the runner left in the output folder.
    mp4_files = [
        os.path.join(output_dir, entry)
        for entry in os.listdir(output_dir)
        if entry.endswith(".mp4")
    ]
    if mp4_files:
        return max(mp4_files, key=os.path.getmtime)
    return None
288
+
289
# Stylesheet handed to gr.Blocks: centers the main column and caps its width.
css = """
#col-container {
    margin: 0 auto;
    width: 100%;
    max-width: 720px;
}
"""
296
+
297
def cleanup(request: gr.Request):
    """Delete the output folder that belongs to the request's session.

    Does nothing when the request carries no session hash. Removal errors
    (including a folder that does not exist) are ignored.
    """
    session = request.session_hash
    if not session:
        return

    session_dir = os.path.join(os.environ["PROCESSED_RESULTS"], session)
    shutil.rmtree(session_dir, ignore_errors=True)
303
+
304
def start_session(request: gr.Request):
    """Page-load handler: return the request's session hash for storage in UI state."""
    session = request.session_hash
    return session
307
+
308
with gr.Blocks(css=css) as demo:

    # Session id captured on page load; passed to generate_scene so outputs
    # land in a per-session folder.
    session_state = gr.State()
    demo.load(start_session, outputs=[session_state])

    # ------------------------------------------------------------------
    # Sidebar: reference images, lipsync audio and generation settings
    # ------------------------------------------------------------------
    with gr.Sidebar(width=400):

        gr.HTML(
            """
            <div style="text-align: center;">
                <p style="font-size:16px; display: inline; margin: 0;">
                    <strong>HuMo</strong> – Human-Centric Video Generation via Collaborative Multi-Modal Conditioning
                </p>
                <a href="https://github.com/Phantom-video/HuMo" style="display: inline-block; vertical-align: middle; margin-left: 0.5em;">
                    [Github]
                </a>
            </div>
            """
        )

        gr.Markdown("**REFERENCE IMAGES**")

        img_input = gr.Gallery(
            show_label=False,
            label="",
            interactive=True,
            rows=1, columns=3, object_fit="contain", height="280",
            file_types=['image']
        )

        gr.Markdown("**LIPSYNC AUDIO**")

        audio_input = gr.Audio(
            sources=["upload"],
            show_label=False,
            type="filepath",
        )

        gr.Markdown("**SETTINGS**")

        # Initial slider values; also used for the first required-time label.
        default_steps = 10
        default_max_duration = 2

        max_duration = gr.Slider(minimum=2, maximum=5, value=default_max_duration, step=1, label="Max Duration")
        steps_input = gr.Slider(minimum=5, maximum=50, value=default_steps, step=5, label="Diffusion Steps")
        # Hidden control: its value is still sent to generate_scene.
        tea_cache_l1_thresh = gr.Slider(minimum=0.0, maximum=1.0, value=0.0, step=0.01, label="Cache", visible=False)

    # ------------------------------------------------------------------
    # Main column: output video, prompt, run button and examples
    # ------------------------------------------------------------------
    with gr.Column(elem_id="col-container"):

        gr.HTML(
            """
            <div style="text-align: center;">
                <strong>HF Space by:</strong>
                <a href="https://twitter.com/alexandernasa/" style="display: inline-block; vertical-align: middle; margin-left: 0.5em;">
                    <img src="https://img.shields.io/twitter/url/https/twitter.com/cloudposse.svg?style=social&label=Follow Me" alt="GitHub Repo">
                </a>
            </div>
            """
        )

        video_output = gr.Video(show_label=False)

        gr.Markdown("<center><h2>PROMPT</h2></center>")

        prompt_tb = gr.Textbox(
            show_label=False,
            lines=5,
            placeholder="Describe the scene and the person talking....",
        )

        gr.Markdown("")
        time_required = gr.Markdown(get_required_time_string(default_steps, default_max_duration))
        run_btn = gr.Button("🎬 Action", variant="primary")

        # Each example row supplies: prompt, diffusion steps, reference images, audio.
        # The examples call run_pipeline directly with just these four inputs.
        gr.Examples(
            examples=[

                [
                    "A handheld tracking shot follows a female warrior walking through a cave. Her determined eyes are locked straight ahead. She speaks with intensity.",
                    5,
                    ["./examples/naomi.png"],
                    "./examples/dream.mp3",
                ],

                [
                    "A reddish-brown haired and bearded man sits pensively against swirling blue-and-white brushstrokes, dressed in a blue coat and dark waistcoat. The artistic backdrop and his thoughtful pose evoke a Post-Impressionist style in a studio-like setting.",
                    10,
                    ["./examples/vangogh.jpg"],
                    "./examples/art.wav",
                ],

                [
                    "A handheld tracking shot follows a female through a science lab. Her determined eyes are locked straight ahead. The clip is in black and white and patchy as she is explaining something to someone standing opposite her",
                    10,
                    ["./examples/naomi.png"],
                    "./examples/science.wav",
                ],

                [
                    "A woman with long, wavy dark hair looking at a person sitting opposite her whilst holding a book, wearing a leather jacket, long-sleeved jacket with a semi purple color one seen on a photo. Warm, window-like light bathes her figure, highlighting the outfit's elegant design and her graceful movements.",
                    50,
                    ["./examples/amber.png", "./examples/jacket.png"],
                    "./examples/fictional.wav",
                ],

            ],
            inputs=[prompt_tb, steps_input, img_input, audio_input],
            outputs=[video_output],
            fn=run_pipeline,
            cache_examples=True,
        )

    # Keep the required-time label in sync with both sliders.
    max_duration.change(update_required_time, [steps_input, max_duration], time_required)
    steps_input.change(update_required_time, [steps_input, max_duration], time_required)

    # The button goes through generate_scene, which validates before running.
    run_btn.click(
        fn=generate_scene,
        inputs=[prompt_tb, steps_input, img_input, audio_input, tea_cache_l1_thresh, max_duration, session_state],
        outputs=[video_output],
    )
430
+
431
+
432
if __name__ == "__main__":
    # Remove a session's output folder when its browser tab goes away.
    demo.unload(cleanup)
    # Enable the request queue, then start the server with SSR disabled.
    demo.queue().launch(ssr_mode=False)