Spaces:

prithivMLmods
/

Vision-to-VibeVoice-en

Running on Zero

prithivMLmods committed 11 days ago

Commit

198a838

verified ·

1 Parent(s): d6f9fb3

update app

Files changed (1) hide show

app.py CHANGED Viewed

@@ -305,13 +305,16 @@ def process_pipeline(
         import traceback
         return extracted_text, None, f"Error during TTS: {str(e)}"
 with gr.Blocks() as demo:
     gr.Markdown("# **Vision-to-VibeVoice-en**", elem_id="main-title")
     gr.Markdown("Perform vision-to-audio inference with [Qwen2.5VL](https://huggingface.co/Qwen/Qwen2.5-VL-7B-Instruct) + [VibeVoice-Realtime-0.5B](https://huggingface.co/microsoft/VibeVoice-Realtime-0.5B).")
     with gr.Row():
         with gr.Column(scale=1):
             gr.Markdown("### 1. Vision Input")
-            image_upload = gr.Image(type="pil", label="Upload Image", height=300)
             image_query = gr.Textbox(label="Enter the prompt", value="Give a short description indicating whether the image is safe or unsafe.", placeholder="E.g., Read this page...")
             gr.Markdown("### 2. Voice Settings")
@@ -349,11 +352,11 @@ with gr.Blocks() as demo:
             status_output = gr.Textbox(label="Status Log", lines=2)
-    gr.Examples(
-        examples=[["Perform OCR on the image.", "examples/1.jpg"]],
-        inputs=[image_query, image_upload],
-        label="Example"
-    )
     submit_btn.click(
         fn=process_pipeline,

         import traceback
         return extracted_text, None, f"Error during TTS: {str(e)}"
+url = "https://huggingface.co/datasets/strangervisionhf/image-examples/resolve/main/2.jpg?download=true"
+example_image = Image.open(requests.get(url, stream=True).raw).convert("RGB")
 with gr.Blocks() as demo:
     gr.Markdown("# **Vision-to-VibeVoice-en**", elem_id="main-title")
     gr.Markdown("Perform vision-to-audio inference with [Qwen2.5VL](https://huggingface.co/Qwen/Qwen2.5-VL-7B-Instruct) + [VibeVoice-Realtime-0.5B](https://huggingface.co/microsoft/VibeVoice-Realtime-0.5B).")
     with gr.Row():
         with gr.Column(scale=1):
             gr.Markdown("### 1. Vision Input")
+            image_upload = gr.Image(type="pil", label="Upload Image", value=example_image, height=300)
             image_query = gr.Textbox(label="Enter the prompt", value="Give a short description indicating whether the image is safe or unsafe.", placeholder="E.g., Read this page...")
             gr.Markdown("### 2. Voice Settings")
             status_output = gr.Textbox(label="Status Log", lines=2)
+        gr.Examples(
+            examples=[["Perform OCR on the image.", "examples/1.jpg"]],
+            inputs=[image_query, image_upload],
+            label="Example"
+        )
     submit_btn.click(
         fn=process_pipeline,