Update app.py
app.py
CHANGED
@@ -51,62 +51,45 @@ print(f"Reading {demo_script_path} to apply environment-specific modifications...
 try:
     modified_content = demo_script_path.read_text()

-    # Define the original model loading block using a list of lines for robustness.
-    original_model_lines = [
-        '        self.model = VibeVoiceForConditionalGenerationInference.from_pretrained(',
-        '            self.model_path,',
-        '            torch_dtype=torch.bfloat16,',
-        "            device_map='cuda',",
-        '            attn_implementation="flash_attention_2",',
-        '        )'
-    ]
-    original_model_block = "\n".join(original_model_lines)
-
-    # More robustly define the generation method signature to patch.
-    # We only need the first line to find our target.
-    original_method_signature = "    def generate_podcast_streaming(self,"
-
     if USE_ZEROGPU:
-        print("Configuring for ZeroGPU execution...")
+        print("Configuring for ZeroGPU execution while keeping Flash Attention...")

         # Add 'import spaces' if it's not already there.
         if "import spaces" not in modified_content:
             modified_content = "import spaces\n" + modified_content

-        # New block for GPU: keep bfloat16 and CUDA, but drop Flash Attention.
-        replacement_model_lines_gpu = [
-            '        self.model = VibeVoiceForConditionalGenerationInference.from_pretrained(',
-            '            self.model_path,',
-            '            torch_dtype=torch.bfloat16,',
-            "            device_map='cuda',",
-            '        )'
-        ]
-        replacement_model_block_gpu = "\n".join(replacement_model_lines_gpu)
+        # Define the generation method signature to add the decorator to.
+        # We target only the first line for robustness.
+        original_method_signature = "    def generate_podcast_streaming(self,"

-        # Define the replacement signature with the decorator.
+        # Define the replacement with the correctly indented decorator.
         replacement_method_signature_gpu = "    @spaces.GPU(duration=120)\n" + original_method_signature

         # --- Apply Patches for GPU ---
-
-        # Patch 1: Add the GPU decorator to the generation method.
+        # The only change needed is to add the decorator. We will NOT modify the
+        # from_pretrained call, leaving attn_implementation="flash_attention_2" in place.
         if original_method_signature in modified_content:
             modified_content = modified_content.replace(original_method_signature, replacement_method_signature_gpu)
             print("Successfully applied GPU decorator to the generation method.")
+            print("Model loading block remains unchanged to explicitly use Flash Attention.")
         else:
             print("\033[91mError: Could not find the generation method signature to apply the GPU decorator.\033[0m")
             sys.exit(1)

-        # Patch 2: Modify the model loading
-        if original_model_block in modified_content:
-            modified_content = modified_content.replace(original_model_block, replacement_model_block_gpu)
-            print("Successfully patched the model loading block for ZeroGPU.")
-        else:
-            print("\033[91mError: The original model loading block was not found. Patching may have failed.\033[0m")
-            sys.exit(1)
-
     else: # Pure CPU execution
         print("Modifying for pure CPU execution...")

+        # For the CPU path, we still need to replace the entire CUDA-specific block.
+        original_model_lines = [
+            '        self.model = VibeVoiceForConditionalGenerationInference.from_pretrained(',
+            '            self.model_path,',
+            '            torch_dtype=torch.bfloat16,',
+            "            device_map='cuda',",
+            '            attn_implementation="flash_attention_2",',
+            '        )'
+        ]
+        original_model_block = "\n".join(original_model_lines)
+
         # New block for CPU: Use float32 and map to CPU.
         replacement_model_lines_cpu = [
             '        self.model = VibeVoiceForConditionalGenerationInference.from_pretrained(',
@@ -122,7 +105,7 @@ try:
         modified_content = modified_content.replace(original_model_block, replacement_model_block_cpu)
         print("Script modified for CPU successfully.")
     else:
-        print("\033[91mError: The original model loading block was not found.\033[0m")
+        print("\033[91mError: The original model loading block was not found for CPU patching.\033[0m")
         sys.exit(1)

     # Write the dynamically modified content back to the demo file