File size: 2,191 Bytes
305f7a9
4a1980e
 
 
305f7a9
4a1980e
7b758b5
 
 
305f7a9
7b758b5
4a1980e
 
 
305f7a9
7b758b5
 
 
 
 
 
 
 
 
 
 
 
4a1980e
7b758b5
4a1980e
7b758b5
4a1980e
 
7b758b5
305f7a9
7b758b5
4a1980e
7b758b5
 
305f7a9
7b758b5
4a1980e
7b758b5
 
305f7a9
7b758b5
 
 
 
 
 
 
 
 
 
 
 
 
 
305f7a9
7b758b5
305f7a9
7b758b5
 
 
 
4a1980e
7b758b5
 
305f7a9
7b758b5
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
import gradio as gr
from openvino_genai import VLMPipeline
import openvino as ov
from huggingface_hub import snapshot_download
from PIL import Image
import numpy as np
import threading
from queue import Queue
import time

# 1. Setup Model
# Download the INT4-quantized Phi-3.5-vision model from the Hugging Face Hub
# (cached locally by snapshot_download) and build the OpenVINO GenAI
# vision-language pipeline on CPU. `pipe` is the module-level pipeline used
# by stream_generation() below.
MODEL_ID = "OpenVINO/Phi-3.5-vision-instruct-int4-ov"
model_dir = snapshot_download(repo_id=MODEL_ID)  # returns local path to the downloaded snapshot
pipe = VLMPipeline(model_dir, "CPU")  # NOTE(review): device is hard-coded to CPU

# 2. Streaming Generator
def stream_generation(text_prompt, input_image=None):
    """Stream model output token-by-token as a generator of growing strings.

    Runs `pipe.generate` on a background thread and hands each new subword to
    the caller through a queue, yielding the accumulated text after every
    token (the shape Gradio expects from a streaming event handler).

    Args:
        text_prompt: The user's text prompt.
        input_image: Optional PIL image; when given it is converted to an
            RGB numpy array with a leading batch axis and wrapped in an
            ov.Tensor for the VLM pipeline.

    Yields:
        The generated text so far, growing with each new token.
    """
    # Queue to pass tokens from the model thread to the Gradio thread
    streamer_queue = Queue()

    # Callback invoked by OpenVINO GenAI for every new subword/token.
    def streamer_callback(subword):
        streamer_queue.put(subword)
        return False  # False => keep generating

    # Worker that drives generation on a background thread.
    def run_generation():
        config = {
            "max_new_tokens": 512,
            "do_sample": True,
            "temperature": 0.7,
        }

        # BUG FIX: the sentinel must be enqueued even if generate() raises,
        # otherwise the consumer loop below blocks on get() forever and the
        # UI hangs. try/finally guarantees the end-of-stream signal.
        try:
            if input_image is not None:
                # PIL image -> RGB ndarray with a leading batch dimension.
                image_data = np.array(input_image.convert("RGB"))[None]
                ov_image = ov.Tensor(image_data)
                pipe.generate(text_prompt, images=ov_image, streamer=streamer_callback, **config)
            else:
                pipe.generate(text_prompt, streamer=streamer_callback, **config)
        finally:
            # Signal that we are done (or that generation failed).
            streamer_queue.put(None)

    # Daemon thread so a hung generation cannot block interpreter shutdown.
    thread = threading.Thread(target=run_generation, daemon=True)
    thread.start()

    # Drain the queue, yielding the accumulated text after each token.
    generated_text = ""
    while True:
        token = streamer_queue.get()
        if token is None:  # end-of-stream sentinel
            break
        generated_text += token
        yield generated_text

    # Reap the worker; it has already finished once the sentinel arrived.
    thread.join()

# 3. Gradio Interface
# Minimal Blocks layout: prompt + optional image in, streamed text out.
with gr.Blocks() as demo:
    gr.Markdown("# Phi-3.5 Streaming Backend")
    with gr.Row():
        prompt_box = gr.Textbox()
        image_input = gr.Image(type="pil")
    output_box = gr.Textbox()
    generate_btn = gr.Button("Generate")

    # stream_generation is a generator, so Gradio streams partial outputs.
    generate_btn.click(stream_generation, inputs=[prompt_box, image_input], outputs=output_box)

# queue() enables the streaming event handling used by the click above.
demo.queue().launch()