# phi-3.5 / app.py
# (Hugging Face Space metadata: uploaded by knud055, commit 7b758b5 verified)
import gradio as gr
from openvino_genai import VLMPipeline
import openvino as ov
from huggingface_hub import snapshot_download
from PIL import Image
import numpy as np
import threading
from queue import Queue
import time
# 1. Setup Model
# Download (or reuse the local cache of) the INT4-quantized Phi-3.5-vision
# model from the Hugging Face Hub, then build an OpenVINO vision-language
# pipeline targeting the CPU device. Runs once at import time.
MODEL_ID = "OpenVINO/Phi-3.5-vision-instruct-int4-ov"
model_dir = snapshot_download(repo_id=MODEL_ID)
pipe = VLMPipeline(model_dir, "CPU")
# 2. Streaming Generator
def stream_generation(text_prompt, input_image=None):
    """Generate a streamed response from the VLM pipeline.

    Runs ``pipe.generate`` in a background thread; OpenVINO pushes each new
    subword into a queue via the streamer callback, and this generator yields
    the accumulated text after every token so Gradio can render it live.

    Args:
        text_prompt: The user's text prompt.
        input_image: Optional PIL image; when given it is converted to an
            ``ov.Tensor`` and passed to the pipeline.

    Yields:
        str: The full generated text so far, growing token by token.
    """
    # Queue to pass tokens from the model thread to the Gradio thread.
    streamer_queue = Queue()

    def streamer_callback(subword):
        # Called by OpenVINO for every new token.
        streamer_queue.put(subword)
        return False  # False => continue generation

    def run_generation():
        config = {
            "max_new_tokens": 512,
            "do_sample": True,
            "temperature": 0.7,
        }
        try:
            if input_image is not None:
                # Add a leading batch dimension; ov.Tensor expects NHWC here.
                # NOTE(review): layout assumed from original code — confirm
                # against the openvino_genai VLMPipeline docs.
                image_data = np.array(input_image.convert("RGB"))[None]
                ov_image = ov.Tensor(image_data)
                pipe.generate(text_prompt, images=ov_image,
                              streamer=streamer_callback, **config)
            else:
                pipe.generate(text_prompt, streamer=streamer_callback, **config)
        finally:
            # Always signal completion — even if generate() raised — so the
            # consumer loop below can never block on the queue forever.
            streamer_queue.put(None)

    # Start generation in a background thread.
    thread = threading.Thread(target=run_generation)
    thread.start()

    # Yield the accumulated text as tokens arrive.
    generated_text = ""
    while True:
        token = streamer_queue.get()
        if token is None:  # end-of-generation sentinel
            break
        generated_text += token
        yield generated_text
    # Reap the worker so no thread is left dangling between requests.
    thread.join()
# 3. Gradio Interface
# Minimal streaming UI: prompt textbox + optional image in, streamed text out.
with gr.Blocks() as demo:
    gr.Markdown("# Phi-3.5 Streaming Backend")
    with gr.Row():
        txt = gr.Textbox()
        img = gr.Image(type="pil")
    out = gr.Textbox()
    btn = gr.Button("Generate")
    # stream_generation is a generator, so each yield updates `out` live.
    # Note: no separate queue=True needed in recent Gradio, but good practice.
    btn.click(stream_generation, inputs=[txt, img], outputs=out)

# queue() enables generator-based (streaming) event handlers.
demo.queue().launch()