Spaces: Running on Zero

update app
app.py CHANGED
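Summary: this commit re-themes the Space from steel blue to a custom orange-red palette (SteelBlueTheme becomes OrangeRedTheme), sets the OCR stage to Qwen/Qwen2.5-VL-7B-Instruct under qwen_-prefixed globals, simplifies the @spaces.GPU decorator (previously duration=120), strips section banners and inline comments, adds a subtitle and a gr.Examples block to the UI, and completes the demo.queue(max_size=40).launch(...) call with the new theme.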
@@ -10,15 +10,12 @@ from PIL import Image
 from threading import Thread
 from typing import Iterable, Optional, Tuple, List
 
-# --- Transformer & Model Imports ---
 from transformers import (
     Qwen2_5_VLForConditionalGeneration,
     AutoProcessor,
     TextIteratorStreamer,
 )
 
-# --- VibeVoice Imports ---
-# Assuming local folder structure exists for these imports
 try:
     from vibevoice.modular.modeling_vibevoice_streaming_inference import (
         VibeVoiceStreamingForConditionalGenerationInference,
@@ -28,39 +25,33 @@ try:
     )
 except ImportError:
     print("CRITICAL WARNING: 'vibevoice' modules not found. Ensure the vibevoice repository structure is present.")
-    # Mocking for syntax checking if files are missing during dry-run
     VibeVoiceStreamingForConditionalGenerationInference = None
     VibeVoiceStreamingProcessor = None
 
-# --- UI Theme Imports ---
 from gradio.themes import Soft
 from gradio.themes.utils import colors, fonts, sizes
 
-colors.steel_blue = colors.Color(
-    name="steel_blue",
-    ...
-    c700="#36638C",
-    c800="#2E5378",
-    c900="#264364",
-    c950="#1E3450",
+colors.orange_red = colors.Color(
+    name="orange_red",
+    c50="#FFF0E5",
+    c100="#FFE0CC",
+    c200="#FFC299",
+    c300="#FFA366",
+    c400="#FF8533",
+    c500="#FF4500",
+    c600="#E63E00",
+    c700="#CC3700",
+    c800="#B33000",
+    c900="#992900",
+    c950="#802200",
 )
 
-class SteelBlueTheme(Soft):
+class OrangeRedTheme(Soft):
     def __init__(
         self,
         *,
         primary_hue: colors.Color | str = colors.gray,
-        secondary_hue: colors.Color | str = colors.steel_blue,
+        secondary_hue: colors.Color | str = colors.orange_red,
         neutral_hue: colors.Color | str = colors.slate,
         text_size: sizes.Size | str = sizes.text_lg,
         font: fonts.Font | str | Iterable[fonts.Font | str] = (
@@ -87,8 +78,14 @@ class SteelBlueTheme(Soft):
         button_primary_text_color_hover="white",
         button_primary_background_fill="linear-gradient(90deg, *secondary_500, *secondary_600)",
         button_primary_background_fill_hover="linear-gradient(90deg, *secondary_600, *secondary_700)",
-        button_primary_background_fill_dark="linear-gradient(90deg, *secondary_600, *
-        button_primary_background_fill_hover_dark="linear-gradient(90deg, *secondary_500, *
+        button_primary_background_fill_dark="linear-gradient(90deg, *secondary_600, *secondary_700)",
+        button_primary_background_fill_hover_dark="linear-gradient(90deg, *secondary_500, *secondary_600)",
+        button_secondary_text_color="black",
+        button_secondary_text_color_hover="white",
+        button_secondary_background_fill="linear-gradient(90deg, *primary_300, *primary_300)",
+        button_secondary_background_fill_hover="linear-gradient(90deg, *primary_400, *primary_400)",
+        button_secondary_background_fill_dark="linear-gradient(90deg, *primary_500, *primary_600)",
+        button_secondary_background_fill_hover_dark="linear-gradient(90deg, *primary_500, *primary_500)",
         slider_color="*secondary_500",
         slider_color_dark="*secondary_600",
         block_title_text_weight="600",
@@ -100,7 +97,7 @@ class SteelBlueTheme(Soft):
         block_label_background_fill="*primary_200",
     )
 
-steel_blue_theme = SteelBlueTheme()
+orange_red_theme = OrangeRedTheme()
 
 css = """
 #main-title h1 {
@@ -114,20 +111,15 @@ css = """
 }
 """
 
-# ==========================================
-# 2. MODEL SETUP (Global)
-# ==========================================
-
 device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
 print(f"Using Main Device: {device}")
 
-OCR_MODEL_ID = ...
-
-print(f"Loading OCR Model: {OCR_MODEL_ID}...")
+QWEN_VL_MODEL_ID = "Qwen/Qwen2.5-VL-7B-Instruct"
+print(f"Loading OCR Model: {QWEN_VL_MODEL_ID}...")
 
-ocr_processor = AutoProcessor.from_pretrained(OCR_MODEL_ID, trust_remote_code=True)
-ocr_model = Qwen2_5_VLForConditionalGeneration.from_pretrained(
-    OCR_MODEL_ID,
+qwen_processor = AutoProcessor.from_pretrained(QWEN_VL_MODEL_ID, trust_remote_code=True)
+qwen_model = Qwen2_5_VLForConditionalGeneration.from_pretrained(
+    QWEN_VL_MODEL_ID,
     attn_implementation="flash_attention_2",
     trust_remote_code=True,
     torch_dtype=torch.float16
@@ -135,14 +127,11 @@ ocr_model = Qwen2_5_VLForConditionalGeneration.from_pretrained(
 
 print("OCR Model loaded successfully.")
 
-# --- B. Setup VibeVoice (TTS) ---
 TTS_MODEL_PATH = "microsoft/VibeVoice-Realtime-0.5B"
 print(f"Loading TTS Model: {TTS_MODEL_PATH}...")
 
-# Load processor
 tts_processor = VibeVoiceStreamingProcessor.from_pretrained(TTS_MODEL_PATH)
 
-# Load model on CPU initially (moved to GPU on demand to save VRAM)
 tts_model = VibeVoiceStreamingForConditionalGenerationInference.from_pretrained(
     TTS_MODEL_PATH,
     torch_dtype=torch.float16,
@@ -152,12 +141,10 @@ tts_model = VibeVoiceStreamingForConditionalGenerationInference.from_pretrained(
 tts_model.eval()
 tts_model.set_ddpm_inference_steps(num_steps=5)
 
-# Voice Mapper Class
 class VoiceMapper:
     """Maps speaker names to voice file paths"""
     def __init__(self):
         self.setup_voice_presets()
-        # Clean up names
         new_dict = {}
         for name, path in self.voice_presets.items():
             if "_" in name: name = name.split("_")[0]
@@ -169,8 +156,6 @@ class VoiceMapper:
         voices_dir = os.path.join(os.path.dirname(__file__), "demo/voices/streaming_model")
         if not os.path.exists(voices_dir):
             print(f"Warning: Voices directory not found at {voices_dir}")
-            # Create a placeholder if dir doesn't exist to prevent crash during init,
-            # though generation will fail if no files.
             self.voice_presets = {}
             self.available_voices = {}
             return
@@ -190,12 +175,10 @@ class VoiceMapper:
     def get_voice_path(self, speaker_name: str) -> str:
         if speaker_name in self.voice_presets:
             return self.voice_presets[speaker_name]
-        # Partial match
        speaker_lower = speaker_name.lower()
         for preset_name, path in self.voice_presets.items():
             if preset_name.lower() in speaker_lower or speaker_lower in preset_name.lower():
                 return path
-        # Default
         if self.voice_presets:
             return list(self.voice_presets.values())[0]
         return ""
@@ -203,12 +186,7 @@
 VOICE_MAPPER = VoiceMapper()
 print("TTS Model loaded successfully.")
 
-
-# ==========================================
-# 3. GENERATION FUNCTIONS
-# ==========================================
-
-@spaces.GPU(duration=120)
+@spaces.GPU
 def process_pipeline(
     image: Image.Image,
     query: str,
@@ -224,10 +202,8 @@ def process_pipeline(
     if image is None:
         return "Please upload an image.", None, "Error: No image provided."
 
-    # --- Step 1: OCR ---
     progress(0.1, desc="Analyzing Image (OCR)...")
 
-    # Clean query
     if not query:
         query = "OCR the content perfectly."
 
@@ -239,19 +215,16 @@
         ]
     }]
 
-    prompt_full = ocr_processor.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
+    prompt_full = qwen_processor.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
 
-
-    inputs = ocr_processor(
+    inputs = qwen_processor(
         text=[prompt_full],
         images=[image],
         return_tensors="pt",
         padding=True
     ).to(device)
 
-
-    # We use standard generate here instead of streamer to get the full string for TTS easily
-    generated_ids = ocr_model.generate(
+    generated_ids = qwen_model.generate(
        **inputs,
         max_new_tokens=ocr_max_tokens,
         do_sample=True,
@@ -262,31 +235,26 @@
     generated_ids_trimmed = [
         out_ids[len(in_ids):] for in_ids, out_ids in zip(inputs.input_ids, generated_ids)
     ]
-    extracted_text = ocr_processor.batch_decode(
+    extracted_text = qwen_processor.batch_decode(
         generated_ids_trimmed, skip_special_tokens=True, clean_up_tokenization_spaces=False
     )[0]
 
-    # Clean cleanup
     extracted_text = extracted_text.replace("<|im_end|>", "").strip()
 
     progress(0.5, desc=f"OCR Complete. Converting to speech ({len(extracted_text)} chars)...")
 
-    # --- Step 2: TTS ---
     if not extracted_text:
         return extracted_text, None, "OCR produced no text."
 
     try:
-        # Pre-process text
         full_script = extracted_text.replace("’", "'").replace('“', '"').replace('”', '"')
 
-        # Get voice
         voice_path = VOICE_MAPPER.get_voice_path(speaker_name)
         if not voice_path:
             return extracted_text, None, "Error: Voice file not found."
 
         all_prefilled_outputs = torch.load(voice_path, map_location="cuda", weights_only=False)
 
-        # Prepare inputs
         tts_inputs = tts_processor.process_input_with_cached_prompt(
             text=full_script,
             cached_prompt=all_prefilled_outputs,
@@ -295,13 +263,11 @@
             return_attention_mask=True,
         )
 
-        # Move TTS model to GPU
         tts_model.to("cuda")
         for k, v in tts_inputs.items():
             if torch.is_tensor(v):
                 tts_inputs[k] = v.to("cuda")
 
-        # Generate Audio
         with torch.cuda.amp.autocast():
             outputs = tts_model.generate(
                 **tts_inputs,
@@ -313,7 +279,6 @@
                 all_prefilled_outputs=copy.deepcopy(all_prefilled_outputs)
             )
 
-        # Move TTS back to CPU to be safe
         tts_model.to("cpu")
         torch.cuda.empty_cache()
 
@@ -329,7 +294,7 @@
                 output_path=output_path,
             )
 
-            status = f"✅ Success!
+            status = f"✅ Success! Text Length: {len(extracted_text)} chars."
             return extracted_text, output_path, status
         else:
             return extracted_text, None, "TTS Generation failed (no output)."
@@ -340,25 +305,14 @@ def process_pipeline(
         import traceback
         return extracted_text, None, f"Error during TTS: {str(e)}"
 
-# ==========================================
-# 4. GRADIO INTERFACE
-# ==========================================
-
-image_examples = [
-    ["OCR the content perfectly.", "examples/3.jpg"],
-    ["Perform OCR on the image.", "examples/1.jpg"],
-    ["Extract the contents. [page].", "examples/2.jpg"],
-]
-
 with gr.Blocks() as demo:
     gr.Markdown("# **Vision-to-VibeVoice-en**", elem_id="main-title")
-
+    gr.Markdown("Perform vision-to-audio inference with [Qwen2.5VL](https://huggingface.co/Qwen/Qwen2.5-VL-7B-Instruct) + [VibeVoice-Realtime-0.5B](https://huggingface.co/microsoft/VibeVoice-Realtime-0.5B).")
     with gr.Row():
-        # --- Left Column: Inputs ---
         with gr.Column(scale=1):
             gr.Markdown("### 1. Vision Input")
             image_upload = gr.Image(type="pil", label="Upload Image", height=300)
-            image_query = gr.Textbox(label="
+            image_query = gr.Textbox(label="Enter the prompt", value="Give a short description indicating whether the image is safe or unsafe.", placeholder="E.g., Read this page...")
 
             gr.Markdown("### 2. Voice Settings")
             voice_choices = list(VOICE_MAPPER.available_voices.keys())
@@ -373,23 +327,20 @@ with gr.Blocks() as demo:
             cfg_slider = gr.Slider(minimum=1.0, maximum=3.0, value=1.5, step=0.1, label="CFG Scale (Speech Fidelity)")
 
             with gr.Accordion("Advanced Options", open=False):
-                max_new_tokens = gr.Slider(label="Max
-                temperature = gr.Slider(label="
+                max_new_tokens = gr.Slider(label="Max Tokens", minimum=128, maximum=4096, step=128, value=2048)
+                temperature = gr.Slider(label="Temperature", minimum=0.1, maximum=2.0, step=0.1, value=0.1)
 
-            submit_btn = gr.Button("
+            submit_btn = gr.Button("Process Vision to Voice", variant="primary", size="lg")
 
-        # --- Right Column: Outputs ---
         with gr.Column(scale=1):
             gr.Markdown("### 3. Results", elem_id="output-title")
 
-            # Text Output
             text_output = gr.Textbox(
                 label="Extracted Text (Editable)",
                 interactive=True,
                 lines=10,
             )
 
-            # Audio Output
             audio_output = gr.Audio(
                 label="Generated Speech",
                 type="filepath",
@@ -397,8 +348,13 @@
             )
 
             status_output = gr.Textbox(label="Status Log", lines=2)
+
+    gr.Examples(
+        examples=[["Perform OCR on the image.", "examples/1.jpg"]],
+        inputs=[image_query, image_upload],
+        label="Example"
+    )
 
-    # --- Logic Connection ---
     submit_btn.click(
         fn=process_pipeline,
         inputs=[
@@ -413,4 +369,4 @@ with gr.Blocks() as demo:
     )
 
 if __name__ == "__main__":
-    demo.queue(max_size=
+    demo.queue(max_size=40).launch(css=css, theme=orange_red_theme, ssr_mode=False, show_error=True)
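The theme change above follows Gradio's standard recipe for custom palettes: register a colors.Color swatch, then pass it as a hue to a built-in theme. Below is a minimal, self-contained sketch of that pattern; the Blocks body is a placeholder for previewing the palette, not this Space's layout.

import gradio as gr
from gradio.themes import Soft
from gradio.themes.utils import colors

# Register a custom swatch; Color takes the standard c50-c950 ramp plus a name.
colors.orange_red = colors.Color(
    name="orange_red",
    c50="#FFF0E5", c100="#FFE0CC", c200="#FFC299", c300="#FFA366",
    c400="#FF8533", c500="#FF4500", c600="#E63E00", c700="#CC3700",
    c800="#B33000", c900="#992900", c950="#802200",
)

# Use the swatch as a hue of the stock Soft theme.
theme = Soft(primary_hue=colors.gray, secondary_hue=colors.orange_red)

with gr.Blocks(theme=theme) as demo:  # placeholder UI, not the Space's layout
    gr.Markdown("Theme preview")

if __name__ == "__main__":
    demo.launch()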
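For reference, the qwen_* path the commit wires in is the standard transformers recipe for Qwen2.5-VL. A minimal standalone sketch under stated assumptions: the messages layout is assumed (the diff shows only its closing brackets), the image path is hypothetical, and flash_attention_2 is omitted since it requires a compatible GPU build.

import torch
from PIL import Image
from transformers import AutoProcessor, Qwen2_5_VLForConditionalGeneration

MODEL_ID = "Qwen/Qwen2.5-VL-7B-Instruct"
device = "cuda" if torch.cuda.is_available() else "cpu"
dtype = torch.float16 if torch.cuda.is_available() else torch.float32

processor = AutoProcessor.from_pretrained(MODEL_ID, trust_remote_code=True)
model = Qwen2_5_VLForConditionalGeneration.from_pretrained(
    MODEL_ID, torch_dtype=dtype, trust_remote_code=True
).to(device)

image = Image.open("examples/1.jpg")  # hypothetical input image

# Assumed chat layout: one user turn holding an image slot plus the text query.
messages = [{
    "role": "user",
    "content": [
        {"type": "image"},
        {"type": "text", "text": "Perform OCR on the image."},
    ],
}]

prompt = processor.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
inputs = processor(text=[prompt], images=[image], return_tensors="pt", padding=True).to(device)

generated = model.generate(**inputs, max_new_tokens=2048, do_sample=True, temperature=0.1)
# Trim the prompt tokens from each sequence before decoding, as the app does.
trimmed = [out[len(inp):] for inp, out in zip(inputs.input_ids, generated)]
print(processor.batch_decode(trimmed, skip_special_tokens=True)[0])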