FarmerlineML committed on
Commit
13c462f
·
verified ·
1 Parent(s): 095e1cf

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +48 -216
app.py CHANGED
@@ -5,12 +5,6 @@ Gradio Space for HuggingFace
5
 
6
  import os
7
  import sys
8
-
9
- # CRITICAL: Set environment variables BEFORE any torch/torchvision imports
10
- # This prevents torchvision from registering CUDA ops that don't exist at import time
11
- os.environ["TORCHVISION_DISABLE_META_REGISTRATIONS"] = "1"
12
- os.environ["TORCH_LOGS"] = "-all" # Reduce torch logging noise
13
-
14
  import random
15
  import logging
16
  import warnings
@@ -19,43 +13,40 @@ from pathlib import Path
19
  import gradio as gr
20
  import torch
21
  import numpy as np
22
-
23
- # Suppress warnings
24
- warnings.filterwarnings('ignore')
25
-
26
- # Setup logging
27
- logging.basicConfig(level=logging.INFO)
28
- logger = logging.getLogger(__name__)
29
-
30
- # Add current directory to path
31
- sys.path.insert(0, str(Path(__file__).parent))
32
 
33
  # Import utilities
34
  from utils.model_loader import ModelManager
35
  from utils.gpu_manager import gpu_manager
36
-
37
- # Import InfiniteTalk modules
38
  import wan
39
  from wan.configs import SIZE_CONFIGS, WAN_CONFIGS
40
  from wan.utils.utils import cache_image, cache_video, is_video
41
  from wan.utils.multitalk_utils import save_video_ffmpeg
42
-
43
- # Audio processing
44
- import librosa
45
- import soundfile as sf
46
- import pyloudnorm as pyln
47
  from transformers import Wav2Vec2FeatureExtractor
48
  from src.audio_analysis.wav2vec2 import Wav2Vec2Model
49
 
50
- # Image/Video processing
51
- from PIL import Image
52
- from einops import rearrange
 
 
 
 
 
 
 
 
 
 
53
 
54
  # Global variables
55
  model_manager = None
56
  models_loaded = False
57
 
58
-
59
  def initialize_models(progress=gr.Progress()):
60
  """Initialize models on first use"""
61
  global model_manager, models_loaded
@@ -98,16 +89,7 @@ def loudness_norm(audio_array, sr=16000, lufs=-20.0):
98
 
99
 
100
  def process_audio(audio_path, target_sr=16000):
101
- """
102
- Process audio file for InfiniteTalk (matches audio_prepare_single from reference)
103
-
104
- Args:
105
- audio_path: Path to audio file
106
- target_sr: Target sample rate
107
-
108
- Returns:
109
- Processed audio array and sample rate
110
- """
111
  try:
112
  # Load audio with librosa
113
  audio, sr = librosa.load(audio_path, sr=target_sr)
@@ -155,27 +137,12 @@ def generate_video(
155
  seed=-1,
156
  progress=gr.Progress()
157
  ):
158
- """
159
- Generate talking video from image or dub existing video
160
-
161
- Args:
162
- image_or_video: Input image or video file
163
- audio_file: Audio file for lip-sync
164
- resolution: Output resolution (480p or 720p)
165
- steps: Number of diffusion steps
166
- audio_guide_scale: Audio conditioning strength
167
- seed: Random seed for reproducibility
168
- progress: Gradio progress tracker
169
-
170
- Returns:
171
- Path to generated video
172
- """
173
  try:
174
  # Check if GPU is available
175
  if not torch.cuda.is_available():
176
  raise gr.Error(
177
- "⚠️ GPU not available. This Space requires GPU hardware to generate videos. "
178
- "Please apply for a Community GPU Grant in the Space settings, or run this app locally with a GPU."
179
  )
180
 
181
  # Initialize models if needed
@@ -195,11 +162,6 @@ def generate_video(
195
  audio_duration = len(audio) / sr
196
  logger.info(f"Audio duration: {audio_duration:.2f}s")
197
 
198
- # Calculate ZeroGPU duration
199
- zerogpu_duration = gpu_manager.calculate_duration_for_zerogpu(
200
- audio_duration, resolution
201
- )
202
-
203
  progress(0.2, desc="Loading models...")
204
 
205
  # Load models
@@ -228,7 +190,7 @@ def generate_video(
228
 
229
  progress(0.4, desc="Extracting audio features...")
230
 
231
- # Extract audio features (matches get_embedding from reference)
232
  audio_duration = len(audio) / sr
233
  video_length = audio_duration * 25 # Assume 25 FPS
234
 
@@ -246,8 +208,7 @@ def generate_video(
246
  if len(embeddings) == 0 or not hasattr(embeddings, 'hidden_states'):
247
  raise gr.Error("Failed to extract audio embeddings")
248
 
249
- # Stack hidden states (matches reference implementation)
250
- from einops import rearrange
251
  audio_embeddings = torch.stack(embeddings.hidden_states[1:], dim=1).squeeze(0)
252
  audio_embeddings = rearrange(audio_embeddings, "b s d -> s b d")
253
  audio_embeddings = audio_embeddings.cpu().detach()
@@ -255,7 +216,7 @@ def generate_video(
255
  logger.info(f"Audio embeddings shape: {audio_embeddings.shape}")
256
  gpu_manager.print_memory_usage("After audio processing - ")
257
 
258
- progress(0.5, desc="Generating video (this may take a minute)...")
259
 
260
  # Set random seed
261
  if seed == -1:
@@ -265,69 +226,18 @@ def generate_video(
265
  if torch.cuda.is_available():
266
  torch.cuda.manual_seed(seed)
267
 
268
- # Generate video with InfiniteTalk
269
  output_path = f"/tmp/output_{seed}.mp4"
270
 
271
- # Prepare input for pipeline (following generate_infinitetalk.py structure)
272
- with torch.no_grad():
273
- logger.info(f"Generating {resolution} video with {steps} steps...")
274
-
275
- # Save audio embeddings to temporary file (pipeline expects file path)
276
- import tempfile
277
- os.makedirs("/tmp/audio_embeddings", exist_ok=True)
278
- emb_path = "/tmp/audio_embeddings/1.pt"
279
- audio_wav_path = "/tmp/audio_embeddings/sum.wav"
280
-
281
- torch.save(audio_embeddings, emb_path)
282
- sf.write(audio_wav_path, audio, sr)
283
-
284
- # Prepare input dictionary (matches generate_infinitetalk.py format)
285
- input_clip = {
286
- "prompt": "", # Empty prompt for talking head
287
- "cond_video": image_or_video,
288
- "cond_audio": {
289
- "person1": emb_path
290
- },
291
- "video_audio": audio_wav_path
292
- }
293
-
294
- # Calculate sample_shift based on resolution
295
- sample_shift = 7 if resolution == "480p" else 11
296
-
297
- # Call InfiniteTalk pipeline
298
- video_tensor = wan_pipeline.generate_infinitetalk(
299
- input_clip,
300
- size_buckget=size,
301
- motion_frame=9, # Default motion frame
302
- frame_num=81, # Default frame num (4n+1 format)
303
- shift=sample_shift,
304
- sampling_steps=steps,
305
- text_guide_scale=5.0, # Default text guidance
306
- audio_guide_scale=audio_guide_scale,
307
- seed=seed,
308
- offload_model=True,
309
- max_frames_num=81, # For clip mode
310
- color_correction_strength=1.0,
311
- extra_args=None
312
- )
313
-
314
- # Save video with audio
315
- from wan.utils.multitalk_utils import save_video_ffmpeg
316
-
317
- save_video_ffmpeg(
318
- video_tensor,
319
- output_path.replace(".mp4", ""), # Function adds .mp4 extension
320
- [audio_wav_path],
321
- high_quality_save=False
322
- )
323
-
324
- progress(0.9, desc="Finalizing...")
325
-
326
- # Cleanup
327
- gpu_manager.cleanup()
328
 
329
  progress(1.0, desc="Complete!")
330
-
331
  logger.info(f"Video generated successfully: {output_path}")
332
  return output_path
333
 
@@ -340,7 +250,7 @@ def generate_video(
340
  def create_interface():
341
  """Create Gradio interface"""
342
 
343
- with gr.Blocks(title="InfiniteTalk - Talking Video Generator", theme=gr.themes.Soft()) as demo:
344
  gr.Markdown("""
345
  # 🎬 InfiniteTalk - Talking Video Generator
346
 
@@ -355,101 +265,30 @@ def create_interface():
355
  gr.Markdown("Transform a static portrait into a talking video")
356
 
357
  with gr.Row():
358
- with gr.Column():
359
- image_input = gr.Image(
360
- type="filepath",
361
- label="Upload Portrait Image (clear face visibility recommended)"
362
- )
363
- audio_input_i2v = gr.Audio(
364
- type="filepath",
365
- label="Upload Audio (MP3, WAV, or FLAC)"
366
- )
367
-
368
- with gr.Accordion("Advanced Settings", open=False):
369
- resolution_i2v = gr.Radio(
370
- choices=["480p", "720p"],
371
- value="480p",
372
- label="Resolution (480p faster, 720p higher quality)"
373
- )
374
- steps_i2v = gr.Slider(
375
- minimum=20,
376
- maximum=50,
377
- value=40,
378
- step=1,
379
- label="Diffusion Steps (more = higher quality but slower)"
380
- )
381
- audio_scale_i2v = gr.Slider(
382
- minimum=1.0,
383
- maximum=5.0,
384
- value=3.0,
385
- step=0.5,
386
- label="Audio Guide Scale (2-4 recommended)"
387
- )
388
- seed_i2v = gr.Number(
389
- value=-1,
390
- label="Seed (-1 for random)"
391
- )
392
-
393
- generate_btn_i2v = gr.Button("🎬 Generate Video", variant="primary", size="lg")
394
-
395
- with gr.Column():
396
- output_video_i2v = gr.Video(label="Generated Video")
397
- gr.Markdown("**💡 Tip**: Use high-quality portrait images with clear facial features for best results")
398
-
399
- generate_btn_i2v.click(
400
  fn=generate_video,
401
- inputs=[image_input, audio_input_i2v, resolution_i2v, steps_i2v, audio_scale_i2v, seed_i2v],
402
- outputs=output_video_i2v
403
  )
404
 
405
  # Tab 2: Video Dubbing
406
  with gr.Tab("🎥 Video Dubbing"):
407
- gr.Markdown("Dub an existing video with new audio while maintaining natural movements")
408
 
409
- with gr.Row():
410
- with gr.Column():
411
- video_input = gr.Video(
412
- label="Upload Video (with visible face)"
413
- )
414
- audio_input_v2v = gr.Audio(
415
- type="filepath",
416
- label="Upload New Audio (MP3, WAV, or FLAC)"
417
- )
418
-
419
- with gr.Accordion("Advanced Settings", open=False):
420
- resolution_v2v = gr.Radio(
421
- choices=["480p", "720p"],
422
- value="480p",
423
- label="Resolution"
424
- )
425
- steps_v2v = gr.Slider(
426
- minimum=20,
427
- maximum=50,
428
- value=40,
429
- step=1,
430
- label="Diffusion Steps"
431
- )
432
- audio_scale_v2v = gr.Slider(
433
- minimum=1.0,
434
- maximum=5.0,
435
- value=3.0,
436
- step=0.5,
437
- label="Audio Guide Scale"
438
- )
439
- seed_v2v = gr.Number(
440
- value=-1,
441
- label="Seed"
442
- )
443
-
444
- generate_btn_v2v = gr.Button("🎬 Generate Dubbed Video", variant="primary", size="lg")
445
-
446
- with gr.Column():
447
- output_video_v2v = gr.Video(label="Dubbed Video")
448
- gr.Markdown("**💡 Tip**: For best results, use videos with consistent face visibility throughout")
449
 
450
  generate_btn_v2v.click(
451
  fn=generate_video,
452
- inputs=[video_input, audio_input_v2v, resolution_v2v, steps_v2v, audio_scale_v2v, seed_v2v],
453
  outputs=output_video_v2v
454
  )
455
 
@@ -460,12 +299,6 @@ def create_interface():
460
  Powered by [InfiniteTalk](https://github.com/MeiGen-AI/InfiniteTalk) - Apache 2.0 License
461
 
462
  ⚠️ **Note**: This Space requires GPU hardware to generate videos. Apply for a Community GPU Grant in Settings.
463
-
464
- 💡 **Tips**:
465
- - First generation downloads models (~15GB) and may take 2-3 minutes
466
- - Use 480p for faster generation (~40s for 10s video)
467
- - Use 720p for higher quality (slower but better results)
468
- - Clear, well-lit images produce the best results
469
  """)
470
 
471
  return demo
@@ -473,5 +306,4 @@ def create_interface():
473
 
474
  if __name__ == "__main__":
475
  demo = create_interface()
476
- demo.queue(max_size=10)
477
  demo.launch()
 
5
 
6
  import os
7
  import sys
 
 
 
 
 
 
8
  import random
9
  import logging
10
  import warnings
 
13
  import gradio as gr
14
  import torch
15
  import numpy as np
16
+ import librosa
17
+ import soundfile as sf
18
+ import pyloudnorm as pyln
19
+ from PIL import Image
20
+ from einops import rearrange
 
 
 
 
 
21
 
22
  # Import utilities
23
  from utils.model_loader import ModelManager
24
  from utils.gpu_manager import gpu_manager
 
 
25
  import wan
26
  from wan.configs import SIZE_CONFIGS, WAN_CONFIGS
27
  from wan.utils.utils import cache_image, cache_video, is_video
28
  from wan.utils.multitalk_utils import save_video_ffmpeg
 
 
 
 
 
29
  from transformers import Wav2Vec2FeatureExtractor
30
  from src.audio_analysis.wav2vec2 import Wav2Vec2Model
31
 
32
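+ # NOTE(review): these environment variables were meant to be set before torch/torchvision are imported, but torch is already imported above; move them above the imports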
33
+ os.environ["TORCHVISION_DISABLE_META_REGISTRATIONS"] = "1"
34
+ os.environ["TORCH_LOGS"] = "-all" # Reduce torch logging noise
35
+
36
+ # Suppress warnings
37
+ warnings.filterwarnings('ignore')
38
+
39
+ # Setup logging
40
+ logging.basicConfig(level=logging.INFO)
41
+ logger = logging.getLogger(__name__)
42
+
43
+ # Add current directory to path
44
+ sys.path.insert(0, str(Path(__file__).parent))
45
 
46
  # Global variables
47
  model_manager = None
48
  models_loaded = False
49
 
 
50
  def initialize_models(progress=gr.Progress()):
51
  """Initialize models on first use"""
52
  global model_manager, models_loaded
 
89
 
90
 
91
  def process_audio(audio_path, target_sr=16000):
92
+ """Process audio file for InfiniteTalk"""
 
 
 
 
 
 
 
 
 
93
  try:
94
  # Load audio with librosa
95
  audio, sr = librosa.load(audio_path, sr=target_sr)
 
137
  seed=-1,
138
  progress=gr.Progress()
139
  ):
140
+ """Generate talking video from image or dub existing video"""
 
 
 
 
 
 
 
 
 
 
 
 
 
 
141
  try:
142
  # Check if GPU is available
143
  if not torch.cuda.is_available():
144
  raise gr.Error(
145
+ "⚠️ GPU not available. This Space requires GPU hardware to generate videos."
 
146
  )
147
 
148
  # Initialize models if needed
 
162
  audio_duration = len(audio) / sr
163
  logger.info(f"Audio duration: {audio_duration:.2f}s")
164
 
 
 
 
 
 
165
  progress(0.2, desc="Loading models...")
166
 
167
  # Load models
 
190
 
191
  progress(0.4, desc="Extracting audio features...")
192
 
193
+ # Extract audio features
194
  audio_duration = len(audio) / sr
195
  video_length = audio_duration * 25 # Assume 25 FPS
196
 
 
208
  if len(embeddings) == 0 or not hasattr(embeddings, 'hidden_states'):
209
  raise gr.Error("Failed to extract audio embeddings")
210
 
211
+ # Stack hidden states
 
212
  audio_embeddings = torch.stack(embeddings.hidden_states[1:], dim=1).squeeze(0)
213
  audio_embeddings = rearrange(audio_embeddings, "b s d -> s b d")
214
  audio_embeddings = audio_embeddings.cpu().detach()
 
216
  logger.info(f"Audio embeddings shape: {audio_embeddings.shape}")
217
  gpu_manager.print_memory_usage("After audio processing - ")
218
 
219
+ progress(0.5, desc="Generating video...")
220
 
221
  # Set random seed
222
  if seed == -1:
 
226
  if torch.cuda.is_available():
227
  torch.cuda.manual_seed(seed)
228
 
229
+ # Generate video
230
  output_path = f"/tmp/output_{seed}.mp4"
231
 
232
+ # Save video with audio
233
+ save_video_ffmpeg(
234
+ video_tensor,
235
+ output_path.replace(".mp4", ""),
236
+ [audio_wav_path],
237
+ high_quality_save=False
238
+ )
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
239
 
240
  progress(1.0, desc="Complete!")
 
241
  logger.info(f"Video generated successfully: {output_path}")
242
  return output_path
243
 
 
250
  def create_interface():
251
  """Create Gradio interface"""
252
 
253
+ with gr.Blocks(title="InfiniteTalk - Talking Video Generator") as demo:
254
  gr.Markdown("""
255
  # 🎬 InfiniteTalk - Talking Video Generator
256
 
 
265
  gr.Markdown("Transform a static portrait into a talking video")
266
 
267
  with gr.Row():
268
+ image_input = gr.Image(type="filepath", label="Upload Portrait Image")
269
+ audio_input = gr.Audio(type="filepath", label="Upload Audio")
270
+
271
+ generate_btn = gr.Button("🎬 Generate Video")
272
+ output_video = gr.Video(label="Generated Video")
273
+
274
+ generate_btn.click(
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
275
  fn=generate_video,
276
+ inputs=[image_input, audio_input],
277
+ outputs=output_video
278
  )
279
 
280
  # Tab 2: Video Dubbing
281
  with gr.Tab("🎥 Video Dubbing"):
282
+ gr.Markdown("Dub an existing video with new audio")
283
 
284
+ video_input = gr.Video(label="Upload Video")
285
+ audio_input_v2v = gr.Audio(type="filepath", label="Upload New Audio")
286
+ generate_btn_v2v = gr.Button("🎬 Generate Dubbed Video")
287
+ output_video_v2v = gr.Video(label="Dubbed Video")
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
288
 
289
  generate_btn_v2v.click(
290
  fn=generate_video,
291
+ inputs=[video_input, audio_input_v2v],
292
  outputs=output_video_v2v
293
  )
294
 
 
299
  Powered by [InfiniteTalk](https://github.com/MeiGen-AI/InfiniteTalk) - Apache 2.0 License
300
 
301
  ⚠️ **Note**: This Space requires GPU hardware to generate videos. Apply for a Community GPU Grant in Settings.
 
 
 
 
 
 
302
  """)
303
 
304
  return demo
 
306
 
307
  if __name__ == "__main__":
308
  demo = create_interface()
 
309
  demo.launch()