Spaces:

ShalomKing
/

infinitetalk

Running

App Files Community

ShalomKing committed on 14 days ago

Commit

bc5110c

verified ·

1 Parent(s): 38572a2

Upload app.py with huggingface_hub

Browse files

Files changed (1) hide show

app.py +91 -54

app.py CHANGED Viewed

@@ -78,9 +78,23 @@ def initialize_models(progress=gr.Progress()):
         raise gr.Error(f"Failed to initialize models: {str(e)}")
 def process_audio(audio_path, target_sr=16000):
     """
-    Process audio file for InfiniteTalk
     Args:
         audio_path: Path to audio file
@@ -90,18 +104,11 @@ def process_audio(audio_path, target_sr=16000):
         Processed audio array and sample rate
     """
     try:
-        # Load audio
-        audio, sr = librosa.load(audio_path, sr=None)
-        # Resample if needed
-        if sr != target_sr:
-            audio = librosa.resample(audio, orig_sr=sr, target_sr=target_sr)
-            sr = target_sr
         # Normalize loudness
-        meter = pyln.Meter(sr)
-        loudness = meter.integrated_loudness(audio)
-        audio = pyln.normalize.loudness(audio, loudness, -20.0)
         # Ensure mono
         if len(audio.shape) > 1:
@@ -187,8 +194,8 @@ def generate_video(
         # Load models
         size = f"infinitetalk-{resolution.replace('p', '')}"
-        # Load Wan model
-        wan_model = model_manager.load_wan_model(size=size, device="cuda")
         # Load audio encoder
         audio_encoder, feature_extractor = model_manager.load_audio_encoder(device="cuda")
@@ -210,18 +217,31 @@ def generate_video(
         progress(0.4, desc="Extracting audio features...")
-        # Extract audio features
-        audio_features = feature_extractor(
-            audio,
-            sampling_rate=sr,
-            return_tensors="pt"
-        ).input_values
-        audio_features = audio_features.to("cuda")
         with torch.no_grad():
-            audio_embeddings = audio_encoder(audio_features).last_hidden_state
         gpu_manager.print_memory_usage("After audio processing - ")
         progress(0.5, desc="Generating video (this may take a minute)...")
@@ -234,44 +254,61 @@ def generate_video(
         if torch.cuda.is_available():
             torch.cuda.manual_seed(seed)
-        # Generate video
-        # This is a placeholder for the actual inference logic
-        # The actual implementation would call wan_model.generate() with proper parameters
         output_path = f"/tmp/output_{seed}.mp4"
-        # Simplified inference call (replace with actual InfiniteTalk logic)
         with torch.no_grad():
-            # Parameters
-            generation_args = {
-                "input_frames": input_frames,
-                "audio_embeddings": audio_embeddings,
-                "num_steps": steps,
-                "audio_guide_scale": audio_guide_scale,
-                "size": size,
-                "seed": seed,
-            }
-            # Call model inference (placeholder)
-            # output_frames = wan_model.generate(**generation_args)
-            # For now, just create a dummy output to test the pipeline
-            # In production, this would be replaced with actual video generation
             logger.info(f"Generating {resolution} video with {steps} steps...")
-            # Placeholder: copy input as output for testing
-            import shutil
-            if is_input_video:
-                shutil.copy(image_or_video, output_path)
-            else:
-                # Create a short video from the image
-                # This is just for testing - replace with actual generation
-                logger.warning("Placeholder: actual video generation not implemented yet")
-                raise gr.Error(
-                    "Video generation logic needs to be integrated. "
-                    "This is a template - please integrate the actual InfiniteTalk "
-                    "inference code from generate_infinitetalk.py"
-                )
         progress(0.9, desc="Finalizing...")

         raise gr.Error(f"Failed to initialize models: {str(e)}")
+def loudness_norm(audio_array, sr=16000, lufs=-20.0):
+    """Normalize audio loudness using pyloudnorm"""
+    try:
+        meter = pyln.Meter(sr)
+        loudness = meter.integrated_loudness(audio_array)
+        if abs(loudness) > 100:  # Skip if loudness measurement failed
+            return audio_array
+        normalized_audio = pyln.normalize.loudness(audio_array, loudness, lufs)
+        return normalized_audio
+    except Exception as e:
+        logger.warning(f"Loudness normalization failed: {e}, returning original audio")
+        return audio_array
 def process_audio(audio_path, target_sr=16000):
     """
+    Process audio file for InfiniteTalk (matches audio_prepare_single from reference)
     Args:
         audio_path: Path to audio file
         Processed audio array and sample rate
     """
     try:
+        # Load audio with librosa
+        audio, sr = librosa.load(audio_path, sr=target_sr)
         # Normalize loudness
+        audio = loudness_norm(audio, sr)
         # Ensure mono
         if len(audio.shape) > 1:
         # Load models
         size = f"infinitetalk-{resolution.replace('p', '')}"
+        # Load InfiniteTalk pipeline
+        wan_pipeline = model_manager.load_wan_model(size=size, device="cuda")
         # Load audio encoder
         audio_encoder, feature_extractor = model_manager.load_audio_encoder(device="cuda")
         progress(0.4, desc="Extracting audio features...")
+        # Extract audio features (matches get_embedding from reference)
+        audio_duration = len(audio) / sr
+        video_length = audio_duration * 25  # Assume 25 FPS
+        # Extract features with wav2vec
+        audio_feature = np.squeeze(
+            feature_extractor(audio, sampling_rate=sr).input_values
+        )
+        audio_feature = torch.from_numpy(audio_feature).float().to(device="cuda")
+        audio_feature = audio_feature.unsqueeze(0)
+        # Get embeddings from audio encoder
         with torch.no_grad():
+            embeddings = audio_encoder(audio_feature, seq_len=int(video_length), output_hidden_states=True)
+        if len(embeddings) == 0 or not hasattr(embeddings, 'hidden_states'):
+            raise gr.Error("Failed to extract audio embeddings")
+        # Stack hidden states (matches reference implementation)
+        from einops import rearrange
+        audio_embeddings = torch.stack(embeddings.hidden_states[1:], dim=1).squeeze(0)
+        audio_embeddings = rearrange(audio_embeddings, "b s d -> s b d")
+        audio_embeddings = audio_embeddings.cpu().detach()
+        logger.info(f"Audio embeddings shape: {audio_embeddings.shape}")
         gpu_manager.print_memory_usage("After audio processing - ")
         progress(0.5, desc="Generating video (this may take a minute)...")
         if torch.cuda.is_available():
             torch.cuda.manual_seed(seed)
+        # Generate video with InfiniteTalk
         output_path = f"/tmp/output_{seed}.mp4"
+        # Prepare input for pipeline (following generate_infinitetalk.py structure)
         with torch.no_grad():
             logger.info(f"Generating {resolution} video with {steps} steps...")
+            # Save audio embeddings to temporary file (pipeline expects file path)
+            import tempfile
+            os.makedirs("/tmp/audio_embeddings", exist_ok=True)
+            emb_path = "/tmp/audio_embeddings/1.pt"
+            audio_wav_path = "/tmp/audio_embeddings/sum.wav"
+            torch.save(audio_embeddings, emb_path)
+            sf.write(audio_wav_path, audio, sr)
+            # Prepare input dictionary (matches generate_infinitetalk.py format)
+            input_clip = {
+                "prompt": "",  # Empty prompt for talking head
+                "cond_video": image_or_video,
+                "cond_audio": {
+                    "person1": emb_path
+                },
+                "video_audio": audio_wav_path
+            }
+            # Calculate sample_shift based on resolution
+            sample_shift = 7 if resolution == "480p" else 11
+            # Call InfiniteTalk pipeline
+            video_tensor = wan_pipeline.generate_infinitetalk(
+                input_clip,
+                size_buckget=size,
+                motion_frame=9,  # Default motion frame
+                frame_num=81,  # Default frame num (4n+1 format)
+                shift=sample_shift,
+                sampling_steps=steps,
+                text_guide_scale=5.0,  # Default text guidance
+                audio_guide_scale=audio_guide_scale,
+                seed=seed,
+                offload_model=True,
+                max_frames_num=81,  # For clip mode
+                color_correction_strength=1.0,
+                extra_args=None
+            )
+            # Save video with audio
+            from wan.utils.multitalk_utils import save_video_ffmpeg
+            save_video_ffmpeg(
+                video_tensor,
+                output_path.replace(".mp4", ""),  # Function adds .mp4 extension
+                [audio_wav_path],
+                high_quality_save=False
+            )
         progress(0.9, desc="Finalizing...")