"""Gradio streaming front-end for the OpenVINO Phi-3.5-vision VLM pipeline."""

import gradio as gr
from openvino_genai import VLMPipeline
import openvino as ov
from huggingface_hub import snapshot_download
from PIL import Image
import numpy as np
import threading
from queue import Queue
import time

# 1. Setup Model
MODEL_ID = "OpenVINO/Phi-3.5-vision-instruct-int4-ov"
model_dir = snapshot_download(repo_id=MODEL_ID)
pipe = VLMPipeline(model_dir, "CPU")


# 2. Streaming Generator
def stream_generation(text_prompt, input_image=None):
    """Yield progressively longer generated text for the Gradio UI.

    Runs ``pipe.generate`` on a worker thread; tokens flow back through a
    queue so this generator can yield partial output as it arrives.

    Args:
        text_prompt: User prompt string.
        input_image: Optional PIL image; converted to an RGB uint8 array
            with a leading batch axis before being wrapped in an ov.Tensor.

    Yields:
        str: The full text generated so far (Gradio re-renders the output
        textbox on each yield).

    Raises:
        Exception: Re-raises any error from the generation thread so the
        failure surfaces in the UI instead of hanging it.
    """
    # Queue carries items from the model thread to the Gradio thread.
    # Protocol: str tokens, an Exception on failure, None as end sentinel.
    streamer_queue = Queue()

    def streamer_callback(subword):
        # Invoked by OpenVINO GenAI for every newly decoded token.
        streamer_queue.put(subword)
        return False  # False -> continue generation

    def run_generation():
        # Runs on the worker thread; must ALWAYS emit the None sentinel,
        # otherwise the consumer loop below would block forever.
        config = {
            "max_new_tokens": 512,
            "do_sample": True,
            "temperature": 0.7,
        }
        try:
            if input_image is not None:
                # HWC uint8 RGB with a leading batch dimension.
                image_data = np.array(input_image.convert("RGB"))[None]
                ov_image = ov.Tensor(image_data)
                pipe.generate(
                    text_prompt,
                    images=ov_image,
                    streamer=streamer_callback,
                    **config,
                )
            else:
                pipe.generate(text_prompt, streamer=streamer_callback, **config)
        except Exception as exc:
            # Hand the error to the consumer instead of dying silently.
            streamer_queue.put(exc)
        finally:
            # End signal — guaranteed even when generate() raised.
            streamer_queue.put(None)

    # daemon=True so a wedged generation cannot keep the process alive.
    thread = threading.Thread(target=run_generation, daemon=True)
    thread.start()

    # Yield accumulated text as tokens arrive.
    generated_text = ""
    while True:
        token = streamer_queue.get()
        if token is None:  # end signal
            break
        if isinstance(token, Exception):
            raise token  # surface the generation error in the Gradio UI
        generated_text += token
        yield generated_text
    thread.join()


# 3. Gradio Interface
with gr.Blocks() as demo:
    gr.Markdown("# Phi-3.5 Streaming Backend")
    with gr.Row():
        txt = gr.Textbox()
        img = gr.Image(type="pil")
    out = gr.Textbox()
    btn = gr.Button("Generate")
    # Generator callbacks stream through Gradio's request queue.
    btn.click(stream_generation, inputs=[txt, img], outputs=out)

demo.queue().launch()