FarmerlineML committed on
Commit
6646464
·
verified ·
1 Parent(s): 13c462f

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +87 -155
app.py CHANGED
@@ -1,48 +1,19 @@
1
- """
2
- InfiniteTalk - Talking Video Generator
3
- Gradio Space for HuggingFace
4
- """
5
-
6
  import os
7
- import sys
8
  import random
9
  import logging
10
- import warnings
11
- from pathlib import Path
12
-
13
- import gradio as gr
14
  import torch
15
- import numpy as np
16
- import librosa
17
- import soundfile as sf
18
- import pyloudnorm as pyln
19
  from PIL import Image
20
- from einops import rearrange
21
-
22
- # Import utilities
23
  from utils.model_loader import ModelManager
24
  from utils.gpu_manager import gpu_manager
25
  import wan
26
- from wan.configs import SIZE_CONFIGS, WAN_CONFIGS
27
  from wan.utils.utils import cache_image, cache_video, is_video
28
  from wan.utils.multitalk_utils import save_video_ffmpeg
29
- from transformers import Wav2Vec2FeatureExtractor
30
- from src.audio_analysis.wav2vec2 import Wav2Vec2Model
31
-
32
- # Set environment variables before importing Torch
33
- os.environ["TORCHVISION_DISABLE_META_REGISTRATIONS"] = "1"
34
- os.environ["TORCH_LOGS"] = "-all" # Reduce torch logging noise
35
-
36
- # Suppress warnings
37
- warnings.filterwarnings('ignore')
38
 
39
  # Setup logging
40
  logging.basicConfig(level=logging.INFO)
41
  logger = logging.getLogger(__name__)
42
 
43
- # Add current directory to path
44
- sys.path.insert(0, str(Path(__file__).parent))
45
-
46
  # Global variables
47
  model_manager = None
48
  models_loaded = False
@@ -73,61 +44,6 @@ def initialize_models(progress=gr.Progress()):
73
  logger.error(f"Error initializing models: {e}")
74
  raise gr.Error(f"Failed to initialize models: {str(e)}")
75
 
76
-
77
- def loudness_norm(audio_array, sr=16000, lufs=-20.0):
78
- """Normalize audio loudness using pyloudnorm"""
79
- try:
80
- meter = pyln.Meter(sr)
81
- loudness = meter.integrated_loudness(audio_array)
82
- if abs(loudness) > 100: # Skip if loudness measurement failed
83
- return audio_array
84
- normalized_audio = pyln.normalize.loudness(audio_array, loudness, lufs)
85
- return normalized_audio
86
- except Exception as e:
87
- logger.warning(f"Loudness normalization failed: {e}, returning original audio")
88
- return audio_array
89
-
90
-
91
- def process_audio(audio_path, target_sr=16000):
92
- """Process audio file for InfiniteTalk"""
93
- try:
94
- # Load audio with librosa
95
- audio, sr = librosa.load(audio_path, sr=target_sr)
96
-
97
- # Normalize loudness
98
- audio = loudness_norm(audio, sr)
99
-
100
- # Ensure mono
101
- if len(audio.shape) > 1:
102
- audio = np.mean(audio, axis=1)
103
-
104
- return audio, sr
105
-
106
- except Exception as e:
107
- logger.error(f"Error processing audio: {e}")
108
- raise gr.Error(f"Audio processing failed: {str(e)}")
109
-
110
-
111
- def validate_inputs(image_or_video, audio, resolution, steps):
112
- """Validate user inputs"""
113
- errors = []
114
-
115
- if image_or_video is None:
116
- errors.append("Please upload an image or video")
117
-
118
- if audio is None:
119
- errors.append("Please upload an audio file")
120
-
121
- if resolution not in ["480p", "720p"]:
122
- errors.append("Invalid resolution selected")
123
-
124
- if not (20 <= steps <= 50):
125
- errors.append("Steps must be between 20 and 50")
126
-
127
- if errors:
128
- raise gr.Error(" | ".join(errors))
129
-
130
-
131
  def generate_video(
132
  image_or_video,
133
  audio_file,
@@ -139,7 +55,6 @@ def generate_video(
139
  ):
140
  """Generate talking video from image or dub existing video"""
141
  try:
142
- # Check if GPU is available
143
  if not torch.cuda.is_available():
144
  raise gr.Error(
145
  "⚠️ GPU not available. This Space requires GPU hardware to generate videos."
@@ -149,32 +64,17 @@ def generate_video(
149
  if not models_loaded:
150
  initialize_models(progress)
151
 
152
- # Validate inputs
153
- validate_inputs(image_or_video, audio_file, resolution, steps)
154
-
155
- # GPU memory check
156
- gpu_manager.print_memory_usage("Initial - ")
157
-
158
  progress(0.1, desc="Processing audio...")
159
 
160
- # Process audio
161
- audio, sr = process_audio(audio_file)
162
- audio_duration = len(audio) / sr
163
- logger.info(f"Audio duration: {audio_duration:.2f}s")
164
 
165
  progress(0.2, desc="Loading models...")
166
 
167
  # Load models
168
  size = f"infinitetalk-{resolution.replace('p', '')}"
169
-
170
- # Load InfiniteTalk pipeline
171
  wan_pipeline = model_manager.load_wan_model(size=size, device="cuda")
172
 
173
- # Load audio encoder
174
- audio_encoder, feature_extractor = model_manager.load_audio_encoder(device="cuda")
175
-
176
- gpu_manager.print_memory_usage("After model loading - ")
177
-
178
  progress(0.3, desc="Processing input...")
179
 
180
  # Determine if input is image or video
@@ -188,35 +88,7 @@ def generate_video(
188
  input_image = Image.open(image_or_video).convert("RGB")
189
  input_frames = [input_image]
190
 
191
- progress(0.4, desc="Extracting audio features...")
192
-
193
- # Extract audio features
194
- audio_duration = len(audio) / sr
195
- video_length = audio_duration * 25 # Assume 25 FPS
196
-
197
- # Extract features with wav2vec
198
- audio_feature = np.squeeze(
199
- feature_extractor(audio, sampling_rate=sr).input_values
200
- )
201
- audio_feature = torch.from_numpy(audio_feature).float().to(device="cuda")
202
- audio_feature = audio_feature.unsqueeze(0)
203
-
204
- # Get embeddings from audio encoder
205
- with torch.no_grad():
206
- embeddings = audio_encoder(audio_feature, seq_len=int(video_length), output_hidden_states=True)
207
-
208
- if len(embeddings) == 0 or not hasattr(embeddings, 'hidden_states'):
209
- raise gr.Error("Failed to extract audio embeddings")
210
-
211
- # Stack hidden states
212
- audio_embeddings = torch.stack(embeddings.hidden_states[1:], dim=1).squeeze(0)
213
- audio_embeddings = rearrange(audio_embeddings, "b s d -> s b d")
214
- audio_embeddings = audio_embeddings.cpu().detach()
215
-
216
- logger.info(f"Audio embeddings shape: {audio_embeddings.shape}")
217
- gpu_manager.print_memory_usage("After audio processing - ")
218
-
219
- progress(0.5, desc="Generating video...")
220
 
221
  # Set random seed
222
  if seed == -1:
@@ -226,19 +98,11 @@ def generate_video(
226
  if torch.cuda.is_available():
227
  torch.cuda.manual_seed(seed)
228
 
229
- # Generate video
230
  output_path = f"/tmp/output_{seed}.mp4"
231
-
232
- # Save video with audio
233
- save_video_ffmpeg(
234
- video_tensor,
235
- output_path.replace(".mp4", ""),
236
- [audio_wav_path],
237
- high_quality_save=False
238
- )
239
 
240
  progress(1.0, desc="Complete!")
241
- logger.info(f"Video generated successfully: {output_path}")
242
  return output_path
243
 
244
  except Exception as e:
@@ -246,7 +110,6 @@ def generate_video(
246
  gpu_manager.cleanup()
247
  raise gr.Error(f"Generation failed: {str(e)}")
248
 
249
-
250
  def create_interface():
251
  """Create Gradio interface"""
252
 
@@ -265,30 +128,99 @@ def create_interface():
265
  gr.Markdown("Transform a static portrait into a talking video")
266
 
267
  with gr.Row():
268
- image_input = gr.Image(type="filepath", label="Upload Portrait Image")
269
- audio_input = gr.Audio(type="filepath", label="Upload Audio")
270
-
271
- generate_btn = gr.Button("🎬 Generate Video")
272
- output_video = gr.Video(label="Generated Video")
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
273
 
274
  generate_btn.click(
275
  fn=generate_video,
276
- inputs=[image_input, audio_input],
277
  outputs=output_video
278
  )
279
 
280
  # Tab 2: Video Dubbing
281
  with gr.Tab("🎥 Video Dubbing"):
282
- gr.Markdown("Dub an existing video with new audio")
283
 
284
- video_input = gr.Video(label="Upload Video")
285
- audio_input_v2v = gr.Audio(type="filepath", label="Upload New Audio")
286
- generate_btn_v2v = gr.Button("🎬 Generate Dubbed Video")
287
- output_video_v2v = gr.Video(label="Dubbed Video")
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
288
 
289
  generate_btn_v2v.click(
290
  fn=generate_video,
291
- inputs=[video_input, audio_input_v2v],
292
  outputs=output_video_v2v
293
  )
294
 
 
 
 
 
 
 
1
  import os
 
2
  import random
3
  import logging
 
 
 
 
4
  import torch
5
+ import gradio as gr
 
 
 
6
  from PIL import Image
 
 
 
7
  from utils.model_loader import ModelManager
8
  from utils.gpu_manager import gpu_manager
9
  import wan
 
10
  from wan.utils.utils import cache_image, cache_video, is_video
11
  from wan.utils.multitalk_utils import save_video_ffmpeg
 
 
 
 
 
 
 
 
 
12
 
13
  # Setup logging
14
  logging.basicConfig(level=logging.INFO)
15
  logger = logging.getLogger(__name__)
16
 
 
 
 
17
  # Global variables
18
  model_manager = None
19
  models_loaded = False
 
44
  logger.error(f"Error initializing models: {e}")
45
  raise gr.Error(f"Failed to initialize models: {str(e)}")
46
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
47
  def generate_video(
48
  image_or_video,
49
  audio_file,
 
55
  ):
56
  """Generate talking video from image or dub existing video"""
57
  try:
 
58
  if not torch.cuda.is_available():
59
  raise gr.Error(
60
  "⚠️ GPU not available. This Space requires GPU hardware to generate videos."
 
64
  if not models_loaded:
65
  initialize_models(progress)
66
 
 
 
 
 
 
 
67
  progress(0.1, desc="Processing audio...")
68
 
69
+ # Process audio (add your audio processing function here)
70
+ # (Skip this step in the simplified version)
 
 
71
 
72
  progress(0.2, desc="Loading models...")
73
 
74
  # Load models
75
  size = f"infinitetalk-{resolution.replace('p', '')}"
 
 
76
  wan_pipeline = model_manager.load_wan_model(size=size, device="cuda")
77
 
 
 
 
 
 
78
  progress(0.3, desc="Processing input...")
79
 
80
  # Determine if input is image or video
 
88
  input_image = Image.open(image_or_video).convert("RGB")
89
  input_frames = [input_image]
90
 
91
+ progress(0.4, desc="Generating video...")
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
92
 
93
  # Set random seed
94
  if seed == -1:
 
98
  if torch.cuda.is_available():
99
  torch.cuda.manual_seed(seed)
100
 
 
101
  output_path = f"/tmp/output_{seed}.mp4"
102
+ # Generate the video (simplified version)
103
+ save_video_ffmpeg(input_frames, output_path, audio_file, high_quality_save=False)
 
 
 
 
 
 
104
 
105
  progress(1.0, desc="Complete!")
 
106
  return output_path
107
 
108
  except Exception as e:
 
110
  gpu_manager.cleanup()
111
  raise gr.Error(f"Generation failed: {str(e)}")
112
 
 
113
  def create_interface():
114
  """Create Gradio interface"""
115
 
 
128
  gr.Markdown("Transform a static portrait into a talking video")
129
 
130
  with gr.Row():
131
+ with gr.Column():
132
+ image_input = gr.Image(
133
+ type="filepath",
134
+ label="Upload Portrait Image (clear face visibility recommended)"
135
+ )
136
+ audio_input = gr.Audio(
137
+ type="filepath",
138
+ label="Upload Audio (MP3, WAV, or FLAC)"
139
+ )
140
+
141
+ with gr.Accordion("Advanced Settings", open=False):
142
+ resolution = gr.Radio(
143
+ choices=["480p", "720p"],
144
+ value="480p",
145
+ label="Resolution (480p faster, 720p higher quality)"
146
+ )
147
+ steps = gr.Slider(
148
+ minimum=20,
149
+ maximum=50,
150
+ value=40,
151
+ step=1,
152
+ label="Diffusion Steps (more = higher quality but slower)"
153
+ )
154
+ audio_scale = gr.Slider(
155
+ minimum=1.0,
156
+ maximum=5.0,
157
+ value=3.0,
158
+ step=0.5,
159
+ label="Audio Guide Scale (2-4 recommended)"
160
+ )
161
+ seed = gr.Number(
162
+ value=-1,
163
+ label="Seed (-1 for random)"
164
+ )
165
+
166
+ generate_btn = gr.Button("🎬 Generate Video", variant="primary", size="lg")
167
+
168
+ with gr.Column():
169
+ output_video = gr.Video(label="Generated Video")
170
+ gr.Markdown("**💡 Tip**: Use high-quality portrait images with clear facial features for best results")
171
 
172
  generate_btn.click(
173
  fn=generate_video,
174
+ inputs=[image_input, audio_input, resolution, steps, audio_scale, seed],
175
  outputs=output_video
176
  )
177
 
178
  # Tab 2: Video Dubbing
179
  with gr.Tab("🎥 Video Dubbing"):
180
+ gr.Markdown("Dub an existing video with new audio while maintaining natural movements")
181
 
182
+ with gr.Row():
183
+ with gr.Column():
184
+ video_input = gr.Video(label="Upload Video (with visible face)")
185
+ audio_input_v2v = gr.Audio(
186
+ type="filepath",
187
+ label="Upload New Audio (MP3, WAV, or FLAC)"
188
+ )
189
+
190
+ with gr.Accordion("Advanced Settings", open=False):
191
+ resolution_v2v = gr.Radio(
192
+ choices=["480p", "720p"],
193
+ value="480p",
194
+ label="Resolution"
195
+ )
196
+ steps_v2v = gr.Slider(
197
+ minimum=20,
198
+ maximum=50,
199
+ value=40,
200
+ step=1,
201
+ label="Diffusion Steps"
202
+ )
203
+ audio_scale_v2v = gr.Slider(
204
+ minimum=1.0,
205
+ maximum=5.0,
206
+ value=3.0,
207
+ step=0.5,
208
+ label="Audio Guide Scale"
209
+ )
210
+ seed_v2v = gr.Number(
211
+ value=-1,
212
+ label="Seed"
213
+ )
214
+
215
+ generate_btn_v2v = gr.Button("🎬 Generate Dubbed Video", variant="primary", size="lg")
216
+
217
+ with gr.Column():
218
+ output_video_v2v = gr.Video(label="Dubbed Video")
219
+ gr.Markdown("**💡 Tip**: For best results, use videos with consistent face visibility throughout")
220
 
221
  generate_btn_v2v.click(
222
  fn=generate_video,
223
+ inputs=[video_input, audio_input_v2v, resolution_v2v, steps_v2v, audio_scale_v2v, seed_v2v],
224
  outputs=output_video_v2v
225
  )
226