VibeVoice-demo-dev

Paused

App Files Community

broadfield-dev committed on Aug 26

Commit

b338394

verified ·

1 Parent(s): 31adbe7

Update app.py

Browse files

Files changed (1) hide show

app.py +78 -78

app.py CHANGED Viewed

@@ -3,10 +3,6 @@ import subprocess
 import sys
 from pathlib import Path
-# --- 0. Hardcoded Toggle for Execution Environment ---
-# Ensure this is set to True to use the GPU with quantization
-USE_ZEROGPU = False
 # --- 1. Clone the VibeVoice Repository ---
 repo_dir = "VibeVoice"
 if not os.path.exists(repo_dir):
@@ -14,9 +10,7 @@ if not os.path.exists(repo_dir):
     try:
         subprocess.run(
             ["git", "clone", "https://github.com/microsoft/VibeVoice.git"],
-            check=True,
-            capture_output=True,
-            text=True
         )
         print("Repository cloned successfully.")
     except subprocess.CalledProcessError as e:
@@ -29,8 +23,7 @@ else:
 os.chdir(repo_dir)
 print(f"Changed directory to: {os.getcwd()}")
-# Install bitsandbytes for quantization to reduce memory usage
-print("Installing bitsandbytes for quantization...")
 try:
     subprocess.run(
         [sys.executable, "-m", "pip", "install", "bitsandbytes"],
@@ -45,96 +38,103 @@ print("Installing the VibeVoice package in editable mode...")
 try:
     subprocess.run(
         [sys.executable, "-m", "pip", "install", "-e", "."],
-        check=True,
-        capture_output=True,
-        text=True
     )
     print("Package installed successfully.")
 except subprocess.CalledProcessError as e:
     print(f"Error installing package: {e.stderr}")
     sys.exit(1)
-# --- 3. Modify the demo script for a memory-constrained environment ---
 demo_script_path = Path("demo/gradio_demo.py")
-print(f"Reading {demo_script_path} to apply environment-specific modifications...")
 try:
     modified_content = demo_script_path.read_text()
-    # Define the original model loading block to be replaced.
-    original_model_lines = [
-        '        self.model = VibeVoiceForConditionalGenerationInference.from_pretrained(',
-        '            self.model_path,',
-        '            torch_dtype=torch.bfloat16,',
-        "            device_map='cuda',",
-        '            attn_implementation="flash_attention_2",',
-        '        )'
-    ]
-    original_model_block = "\n".join(original_model_lines)
-    # Define the generation method signature to add the decorator to.
     original_method_signature = "    def generate_podcast_streaming(self,"
-    if USE_ZEROGPU:
-        print("Optimizing for ZeroGPU with 8-bit quantization...")
-        # Add necessary imports if they are not already there.
-        if "import spaces" not in modified_content:
-            modified_content = "import spaces\n" + modified_content
-        # New block for ZeroGPU with 8-bit quantization.
-        # This is the key change to solve the memory issue.
-        replacement_model_lines_gpu = [
-            '        self.model = VibeVoiceForConditionalGenerationInference.from_pretrained(',
-            '            self.model_path,',
-            '            load_in_8bit=True,',
-            '            device_map="auto",',
-            '        )'
-        ]
-        replacement_model_block_gpu = "\n".join(replacement_model_lines_gpu)
-        # Add the @spaces.GPU decorator with correct indentation.
-        replacement_method_signature_gpu = "    @spaces.GPU(duration=120)\n" + original_method_signature
-        # --- Apply Patches for GPU ---
-        # Patch 1: Decorate the generation method
-        if original_method_signature in modified_content:
-            modified_content = modified_content.replace(original_method_signature, replacement_method_signature_gpu)
-            print("Successfully applied GPU decorator to the generation method.")
-        else:
-            print("\033[91mError: Could not find the generation method signature to patch.\033[0m")
-            sys.exit(1)
-        # Patch 2: Modify the model loading to use 8-bit quantization
-        if original_model_block in modified_content:
-            modified_content = modified_content.replace(original_model_block, replacement_model_block_gpu)
-            print("Successfully patched model loading for 8-bit quantization.")
-        else:
-            print("\033[91mError: The original model loading block was not found.\033[0m")
-            sys.exit(1)
-    else: # Pure CPU execution (not recommended on ZeroGPU hardware)
-        # This block is unlikely to be used but kept for completeness
-        print("Modifying for pure CPU execution...")
-        replacement_model_lines_cpu = [
-            '        self.model = VibeVoiceForConditionalGenerationInference.from_pretrained(',
-            '            self.model_path,',
-            '            torch_dtype=torch.float32,',
-            '            device_map="cpu",',
-            '        )'
-        ]
-        replacement_model_block_cpu = "\n".join(replacement_model_lines_cpu)
-        if original_model_block in modified_content:
-            modified_content = modified_content.replace(original_model_block, replacement_model_block_cpu)
-        else:
-            print("\033[91mError: The original model loading block was not found for CPU patching.\033[0m")
-            sys.exit(1)
     demo_script_path.write_text(modified_content)
 except Exception as e:
     print(f"An error occurred while modifying the script: {e}")
     sys.exit(1)
 # --- 4. Launch the Gradio Demo ---

 import sys
 from pathlib import Path
 # --- 1. Clone the VibeVoice Repository ---
 repo_dir = "VibeVoice"
 if not os.path.exists(repo_dir):
     try:
         subprocess.run(
             ["git", "clone", "https://github.com/microsoft/VibeVoice.git"],
+            check=True, capture_output=True, text=True
         )
         print("Repository cloned successfully.")
     except subprocess.CalledProcessError as e:
 os.chdir(repo_dir)
 print(f"Changed directory to: {os.getcwd()}")
+print("Installing bitsandbytes for potential quantization...")
 try:
     subprocess.run(
         [sys.executable, "-m", "pip", "install", "bitsandbytes"],
 try:
     subprocess.run(
         [sys.executable, "-m", "pip", "install", "-e", "."],
+        check=True, capture_output=True, text=True
     )
     print("Package installed successfully.")
 except subprocess.CalledProcessError as e:
     print(f"Error installing package: {e.stderr}")
     sys.exit(1)
+# --- 3. Refactor the demo script for ZeroGPU compatibility ---
 demo_script_path = Path("demo/gradio_demo.py")
+print(f"Refactoring {demo_script_path} for ZeroGPU lazy loading...")
 try:
     modified_content = demo_script_path.read_text()
+    # --- Add necessary imports ---
+    if "import spaces" not in modified_content:
+        modified_content = "import spaces\n" + modified_content
+    # --- Patch 1: Prevent model loading at startup ---
+    # We comment out the self.load_model() call in the __init__ method.
+    # This stops the main CPU process from loading the heavyweight model.
+    original_init_line = "        self.load_model()"
+    replacement_init_line = "        # self.load_model() # Patched: Defer model loading to the GPU worker\n        self.model = None\n        self.processor = None"
+    if original_init_line in modified_content:
+        modified_content = modified_content.replace(original_init_line, replacement_init_line)
+        print("Successfully patched __init__ to prevent model loading on startup.")
+    else:
+        print(f"\033[91mError: Could not find '{original_init_line}' to patch.\033[0m")
+        sys.exit(1)
+    # --- Patch 2: Move model loading inside the generation function and add decorator ---
+    # This ensures the model is loaded "just-in-time" on the GPU worker.
     original_method_signature = "    def generate_podcast_streaming(self,"
+    # Define the model loading code to be inserted.
+    # We will use 8-bit quantization to be safe with memory.
+    lazy_load_code = """
+        # Patched: Lazy-load model and processor on the GPU worker
+        if self.model is None or self.processor is None:
+            print("Loading processor & model for the first time on GPU worker...")
+            self.processor = VibeVoiceProcessor.from_pretrained(self.model_path)
+            self.model = VibeVoiceForConditionalGenerationInference.from_pretrained(
+                self.model_path,
+                load_in_8bit=True,
+                device_map="auto",
+            )
+            self.model.eval()
+            self.model.model.noise_scheduler = self.model.model.noise_scheduler.from_config(
+                self.model.model.noise_scheduler.config,
+                algorithm_type='sde-dpmsolver++',
+                beta_schedule='squaredcos_cap_v2'
+            )
+            self.model.set_ddpm_inference_steps(num_steps=self.inference_steps)
+            print("Model and processor loaded successfully on GPU worker.")
+"""
+    # We add the decorator and the lazy loading code.
+    replacement_block = (
+        "    @spaces.GPU(duration=120)\n" +
+        original_method_signature +
+        "\n" +
+        " " * 8 + lazy_load_code.strip().replace("\n", "\n" + " " * 8)
+    )
+    if original_method_signature in modified_content:
+        # Find the start of the method and insert our block right after the signature.
+        # We need to find the full method signature to insert code into it.
+        method_start_index = modified_content.find(original_method_signature)
+        # Find the end of the signature line
+        signature_end_index = modified_content.find("-> Iterator[tuple]:", method_start_index) + len("-> Iterator[tuple]:")
+        # Reconstruct the content
+        pre_method = modified_content[:method_start_index]
+        method_signature_and_body = modified_content[method_start_index:]
+        # Decorate the original signature
+        decorated_signature = "    @spaces.GPU(duration=120)\n" + original_method_signature
+        method_signature_and_body = method_signature_and_body.replace(original_method_signature, decorated_signature)
+        # Insert the lazy loading code after the signature line
+        final_method = method_signature_and_body.replace("-> Iterator[tuple]:", "-> Iterator[tuple]:\n" + lazy_load_code, 1)
+        modified_content = pre_method + final_method
+        print("Successfully refactored generation method for lazy loading on GPU.")
+    else:
+        print(f"\033[91mError: Could not find '{original_method_signature}' to patch.\033[0m")
+        sys.exit(1)
     demo_script_path.write_text(modified_content)
+    print("Script patching complete.")
 except Exception as e:
     print(f"An error occurred while modifying the script: {e}")
+    import traceback
+    traceback.print_exc()
     sys.exit(1)
 # --- 4. Launch the Gradio Demo ---