FarmerlineML committed on
Commit
e04d126
·
verified ·
1 Parent(s): 0965356

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +108 -71
app.py CHANGED
@@ -1,25 +1,56 @@
1
  import os
2
  import random
3
  import logging
 
 
4
  import torch
5
  import gradio as gr
6
  from PIL import Image
 
7
  from utils.model_loader import ModelManager
8
  from utils.gpu_manager import gpu_manager
 
9
  import wan
10
  from wan.utils.utils import cache_image, cache_video, is_video
11
  from wan.utils.multitalk_utils import save_video_ffmpeg
12
 
13
- # Setup logging
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
14
  logging.basicConfig(level=logging.INFO)
15
  logger = logging.getLogger(__name__)
16
 
17
- # Global variables
18
- model_manager = None
 
 
 
19
  models_loaded = False
20
 
 
21
  def initialize_models(progress=gr.Progress()):
22
- """Initialize models on first use"""
23
  global model_manager, models_loaded
24
 
25
  if models_loaded:
@@ -29,9 +60,9 @@ def initialize_models(progress=gr.Progress()):
29
  progress(0.1, desc="Initializing model manager...")
30
  model_manager = ModelManager()
31
 
32
- progress(0.3, desc="Downloading models (first time only - may take 2-3 minutes)...")
33
 
34
- # Download models (lazy loading - they'll be loaded on first inference)
35
  model_manager.get_wan_model_path()
36
  model_manager.get_infinitetalk_weights_path()
37
  model_manager.get_wav2vec_model_path()
@@ -41,9 +72,22 @@ def initialize_models(progress=gr.Progress()):
41
  logger.info("Models initialized successfully")
42
 
43
  except Exception as e:
44
- logger.error(f"Error initializing models: {e}")
45
  raise gr.Error(f"Failed to initialize models: {str(e)}")
46
 
 
 
 
 
 
 
 
 
 
 
 
 
 
47
  def generate_video(
48
  image_or_video,
49
  audio_file,
@@ -51,76 +95,74 @@ def generate_video(
51
  steps=40,
52
  audio_guide_scale=3.0,
53
  seed=-1,
54
- progress=gr.Progress()
55
  ):
56
- """Generate talking video from image or dub existing video"""
 
 
 
 
 
57
  try:
58
  if not torch.cuda.is_available():
59
- raise gr.Error(
60
- "⚠️ GPU not available. This Space requires GPU hardware to generate videos."
61
- )
62
 
63
- # Initialize models if needed
64
  if not models_loaded:
65
  initialize_models(progress)
66
 
67
  progress(0.1, desc="Processing audio...")
68
 
69
- # Process audio (add your audio processing function here)
70
- # (Skip this step in the simplified version)
71
-
72
  progress(0.2, desc="Loading models...")
73
-
74
- # Load models
75
  size = f"infinitetalk-{resolution.replace('p', '')}"
76
- wan_pipeline = model_manager.load_wan_model(size=size, device="cuda")
77
 
78
  progress(0.3, desc="Processing input...")
79
 
80
- # Determine if input is image or video
81
- is_input_video = is_video(image_or_video)
82
-
83
- if is_input_video:
84
- logger.info("Processing video dubbing...")
85
  input_frames = cache_video(image_or_video)
86
  else:
87
- logger.info("Processing image-to-video...")
88
  input_image = Image.open(image_or_video).convert("RGB")
89
  input_frames = [input_image]
90
 
91
  progress(0.4, desc="Generating video...")
92
 
93
- # Set random seed
94
- if seed == -1:
95
- seed = random.randint(0, 99999999)
96
-
97
- torch.manual_seed(seed)
98
- if torch.cuda.is_available():
99
- torch.cuda.manual_seed(seed)
100
-
101
  output_path = f"/tmp/output_{seed}.mp4"
102
- # Generate the video (simplified version)
103
- save_video_ffmpeg(input_frames, output_path, audio_file, high_quality_save=False)
 
 
 
 
 
 
104
 
105
  progress(1.0, desc="Complete!")
106
  return output_path
107
 
108
  except Exception as e:
109
- logger.error(f"Error generating video: {e}")
110
  gpu_manager.cleanup()
111
  raise gr.Error(f"Generation failed: {str(e)}")
112
 
113
- def create_interface():
114
- """Create Gradio interface"""
115
 
 
 
116
  with gr.Blocks(title="InfiniteTalk - Talking Video Generator") as demo:
117
- gr.Markdown("""
118
- # 🎬 InfiniteTalk - Talking Video Generator
 
119
 
120
- Generate realistic talking head videos with accurate lip-sync from images or dub existing videos with new audio!
121
 
122
- **Note**: First generation may take 2-3 minutes while models download. Subsequent generations are much faster (~40s for 10s video).
123
- """)
 
124
 
125
  with gr.Tabs():
126
  # Tab 1: Image-to-Video
@@ -131,48 +173,45 @@ def create_interface():
131
  with gr.Column():
132
  image_input = gr.Image(
133
  type="filepath",
134
- label="Upload Portrait Image (clear face visibility recommended)"
135
  )
136
  audio_input = gr.Audio(
137
  type="filepath",
138
- label="Upload Audio (MP3, WAV, or FLAC)"
139
  )
140
 
141
  with gr.Accordion("Advanced Settings", open=False):
142
  resolution = gr.Radio(
143
  choices=["480p", "720p"],
144
  value="480p",
145
- label="Resolution (480p faster, 720p higher quality)"
146
  )
147
  steps = gr.Slider(
148
  minimum=20,
149
  maximum=50,
150
  value=40,
151
  step=1,
152
- label="Diffusion Steps (more = higher quality but slower)"
153
  )
154
  audio_scale = gr.Slider(
155
  minimum=1.0,
156
  maximum=5.0,
157
  value=3.0,
158
  step=0.5,
159
- label="Audio Guide Scale (2-4 recommended)"
160
- )
161
- seed = gr.Number(
162
- value=-1,
163
- label="Seed (-1 for random)"
164
  )
 
165
 
166
  generate_btn = gr.Button("🎬 Generate Video", variant="primary", size="lg")
167
 
168
  with gr.Column():
169
  output_video = gr.Video(label="Generated Video")
170
- gr.Markdown("**💡 Tip**: Use high-quality portrait images with clear facial features for best results")
171
 
172
  generate_btn.click(
173
  fn=generate_video,
174
  inputs=[image_input, audio_input, resolution, steps, audio_scale, seed],
175
- outputs=output_video
176
  )
177
 
178
  # Tab 2: Video Dubbing
@@ -184,54 +223,52 @@ def create_interface():
184
  video_input = gr.Video(label="Upload Video (with visible face)")
185
  audio_input_v2v = gr.Audio(
186
  type="filepath",
187
- label="Upload New Audio (MP3, WAV, or FLAC)"
188
  )
189
 
190
  with gr.Accordion("Advanced Settings", open=False):
191
  resolution_v2v = gr.Radio(
192
  choices=["480p", "720p"],
193
  value="480p",
194
- label="Resolution"
195
  )
196
  steps_v2v = gr.Slider(
197
  minimum=20,
198
  maximum=50,
199
  value=40,
200
  step=1,
201
- label="Diffusion Steps"
202
  )
203
  audio_scale_v2v = gr.Slider(
204
  minimum=1.0,
205
  maximum=5.0,
206
  value=3.0,
207
  step=0.5,
208
- label="Audio Guide Scale"
209
- )
210
- seed_v2v = gr.Number(
211
- value=-1,
212
- label="Seed"
213
  )
 
214
 
215
  generate_btn_v2v = gr.Button("🎬 Generate Dubbed Video", variant="primary", size="lg")
216
 
217
  with gr.Column():
218
  output_video_v2v = gr.Video(label="Dubbed Video")
219
- gr.Markdown("**💡 Tip**: For best results, use videos with consistent face visibility throughout")
220
 
221
  generate_btn_v2v.click(
222
  fn=generate_video,
223
  inputs=[video_input, audio_input_v2v, resolution_v2v, steps_v2v, audio_scale_v2v, seed_v2v],
224
- outputs=output_video_v2v
225
  )
226
 
227
- # Footer
228
- gr.Markdown("""
229
- ---
230
- ### About
231
- Powered by [InfiniteTalk](https://github.com/MeiGen-AI/InfiniteTalk) - Apache 2.0 License
232
 
233
- ⚠️ **Note**: This Space requires GPU hardware to generate videos. Apply for a Community GPU Grant in Settings.
234
- """)
 
235
 
236
  return demo
237
 
 
1
  import os
2
  import random
3
  import logging
4
+ from typing import Any
5
+
6
  import torch
7
  import gradio as gr
8
  from PIL import Image
9
+
10
  from utils.model_loader import ModelManager
11
  from utils.gpu_manager import gpu_manager
12
+
13
  import wan
14
  from wan.utils.utils import cache_image, cache_video, is_video
15
  from wan.utils.multitalk_utils import save_video_ffmpeg
16
 
17
+
18
# ---------------------------------------------------------------
# HOTFIX: Gradio /api_info crash
# ---------------------------------------------------------------
# gradio_client's schema walker assumes every JSON Schema node is a
# dict, but the JSON Schema spec also allows bare booleans; those
# raise "TypeError: argument of type 'bool' is not iterable" when
# the /api_info endpoint is rendered. Wrap the converter so boolean
# schema nodes short-circuit to "Any".
try:
    import gradio_client.utils as gcu

    _original_converter = gcu._json_schema_to_python_type

    def _safe_json_schema_to_python_type(schema, defs=None):
        """Return 'Any' for boolean schema nodes; defer otherwise."""
        if isinstance(schema, bool):
            return "Any"
        return _original_converter(schema, defs)

    gcu._json_schema_to_python_type = _safe_json_schema_to_python_type
except Exception as e:
    # Best-effort patch: if gradio_client is absent or its internals
    # changed, report and continue rather than block startup.
    print("gradio_client patch skipped:", e)
36
+
37
+
38
+ # =========================
39
+ # Logging
40
+ # =========================
41
  logging.basicConfig(level=logging.INFO)
42
  logger = logging.getLogger(__name__)
43
 
44
+
45
+ # =========================
46
+ # Globals
47
+ # =========================
48
+ model_manager: ModelManager | None = None
49
  models_loaded = False
50
 
51
+
52
  def initialize_models(progress=gr.Progress()):
53
+ """Download/prepare model assets on first use."""
54
  global model_manager, models_loaded
55
 
56
  if models_loaded:
 
60
  progress(0.1, desc="Initializing model manager...")
61
  model_manager = ModelManager()
62
 
63
+ progress(0.3, desc="Downloading models (first time only)...")
64
 
65
+ # Pre-download assets (actual heavy loading happens on first inference)
66
  model_manager.get_wan_model_path()
67
  model_manager.get_infinitetalk_weights_path()
68
  model_manager.get_wav2vec_model_path()
 
72
  logger.info("Models initialized successfully")
73
 
74
  except Exception as e:
75
+ logger.exception("Error initializing models")
76
  raise gr.Error(f"Failed to initialize models: {str(e)}")
77
 
78
+
79
+ def _set_seed(seed: int) -> int:
80
+ """Set deterministic seeds and return the final seed used."""
81
+ if seed == -1:
82
+ seed = random.randint(0, 99_999_999)
83
+
84
+ torch.manual_seed(seed)
85
+ if torch.cuda.is_available():
86
+ torch.cuda.manual_seed(seed)
87
+
88
+ return seed
89
+
90
+
91
  def generate_video(
92
  image_or_video,
93
  audio_file,
 
95
  steps=40,
96
  audio_guide_scale=3.0,
97
  seed=-1,
98
+ progress=gr.Progress(),
99
  ):
100
+ """
101
+ Generate a talking video from an image OR dub an existing video.
102
+
103
+ Note: This is a simplified pipeline example. Your real pipeline may use
104
+ wan_pipeline + diffusion steps etc. This version just stitches frames + audio.
105
+ """
106
  try:
107
  if not torch.cuda.is_available():
108
+ raise gr.Error("⚠️ GPU not available. This Space requires GPU hardware to generate videos.")
 
 
109
 
110
+ # Ensure models are prepared
111
  if not models_loaded:
112
  initialize_models(progress)
113
 
114
  progress(0.1, desc="Processing audio...")
115
 
 
 
 
116
  progress(0.2, desc="Loading models...")
117
+ # Load models (kept for parity with your structure)
 
118
  size = f"infinitetalk-{resolution.replace('p', '')}"
119
+ wan_pipeline = model_manager.load_wan_model(size=size, device="cuda") # noqa: F841
120
 
121
  progress(0.3, desc="Processing input...")
122
 
123
+ # Decide whether the input is a video or image
124
+ if is_video(image_or_video):
125
+ logger.info("Processing video dubbing input...")
 
 
126
  input_frames = cache_video(image_or_video)
127
  else:
128
+ logger.info("Processing image-to-video input...")
129
  input_image = Image.open(image_or_video).convert("RGB")
130
  input_frames = [input_image]
131
 
132
  progress(0.4, desc="Generating video...")
133
 
134
+ seed = _set_seed(int(seed))
 
 
 
 
 
 
 
135
  output_path = f"/tmp/output_{seed}.mp4"
136
+
137
+ # Simplified output save (frames + audio)
138
+ save_video_ffmpeg(
139
+ input_frames,
140
+ output_path,
141
+ audio_file,
142
+ high_quality_save=False,
143
+ )
144
 
145
  progress(1.0, desc="Complete!")
146
  return output_path
147
 
148
  except Exception as e:
149
+ logger.exception("Error generating video")
150
  gpu_manager.cleanup()
151
  raise gr.Error(f"Generation failed: {str(e)}")
152
 
 
 
153
 
154
+ def create_interface():
155
+ """Create Gradio UI."""
156
  with gr.Blocks(title="InfiniteTalk - Talking Video Generator") as demo:
157
+ gr.Markdown(
158
+ """
159
+ # 🎬 InfiniteTalk - Talking Video Generator
160
 
161
+ Generate realistic talking head videos with accurate lip-sync from images or dub existing videos with new audio!
162
 
163
+ **Note**: First generation may take a few minutes while models download. Subsequent generations are faster.
164
+ """
165
+ )
166
 
167
  with gr.Tabs():
168
  # Tab 1: Image-to-Video
 
173
  with gr.Column():
174
  image_input = gr.Image(
175
  type="filepath",
176
+ label="Upload Portrait Image (clear face visibility recommended)",
177
  )
178
  audio_input = gr.Audio(
179
  type="filepath",
180
+ label="Upload Audio (MP3, WAV, or FLAC)",
181
  )
182
 
183
  with gr.Accordion("Advanced Settings", open=False):
184
  resolution = gr.Radio(
185
  choices=["480p", "720p"],
186
  value="480p",
187
+ label="Resolution (480p faster, 720p higher quality)",
188
  )
189
  steps = gr.Slider(
190
  minimum=20,
191
  maximum=50,
192
  value=40,
193
  step=1,
194
+ label="Diffusion Steps (more = higher quality but slower)",
195
  )
196
  audio_scale = gr.Slider(
197
  minimum=1.0,
198
  maximum=5.0,
199
  value=3.0,
200
  step=0.5,
201
+ label="Audio Guide Scale (2-4 recommended)",
 
 
 
 
202
  )
203
+ seed = gr.Number(value=-1, label="Seed (-1 for random)")
204
 
205
  generate_btn = gr.Button("🎬 Generate Video", variant="primary", size="lg")
206
 
207
  with gr.Column():
208
  output_video = gr.Video(label="Generated Video")
209
+ gr.Markdown("**💡 Tip**: Use a high-quality portrait image with clear facial features.")
210
 
211
  generate_btn.click(
212
  fn=generate_video,
213
  inputs=[image_input, audio_input, resolution, steps, audio_scale, seed],
214
+ outputs=output_video,
215
  )
216
 
217
  # Tab 2: Video Dubbing
 
223
  video_input = gr.Video(label="Upload Video (with visible face)")
224
  audio_input_v2v = gr.Audio(
225
  type="filepath",
226
+ label="Upload New Audio (MP3, WAV, or FLAC)",
227
  )
228
 
229
  with gr.Accordion("Advanced Settings", open=False):
230
  resolution_v2v = gr.Radio(
231
  choices=["480p", "720p"],
232
  value="480p",
233
+ label="Resolution",
234
  )
235
  steps_v2v = gr.Slider(
236
  minimum=20,
237
  maximum=50,
238
  value=40,
239
  step=1,
240
+ label="Diffusion Steps",
241
  )
242
  audio_scale_v2v = gr.Slider(
243
  minimum=1.0,
244
  maximum=5.0,
245
  value=3.0,
246
  step=0.5,
247
+ label="Audio Guide Scale",
 
 
 
 
248
  )
249
+ seed_v2v = gr.Number(value=-1, label="Seed")
250
 
251
  generate_btn_v2v = gr.Button("🎬 Generate Dubbed Video", variant="primary", size="lg")
252
 
253
  with gr.Column():
254
  output_video_v2v = gr.Video(label="Dubbed Video")
255
+ gr.Markdown("**💡 Tip**: Use a video with consistent face visibility.")
256
 
257
  generate_btn_v2v.click(
258
  fn=generate_video,
259
  inputs=[video_input, audio_input_v2v, resolution_v2v, steps_v2v, audio_scale_v2v, seed_v2v],
260
+ outputs=output_video_v2v,
261
  )
262
 
263
+ gr.Markdown(
264
+ """
265
+ ---
266
+ ### About
267
+ Powered by InfiniteTalk (Apache 2.0)
268
 
269
+ ⚠️ **Note**: This Space requires GPU hardware to generate videos.
270
+ """
271
+ )
272
 
273
  return demo
274