File size: 3,585 Bytes
18bfa5d
 
 
195b7ab
18bfa5d
195b7ab
18bfa5d
 
 
2b450ba
18bfa5d
195b7ab
18bfa5d
195b7ab
18bfa5d
 
195b7ab
 
80bb39b
 
 
 
 
195b7ab
80bb39b
 
 
 
 
 
195b7ab
 
 
 
 
80bb39b
 
 
 
 
 
 
 
 
6a80dac
18bfa5d
 
 
 
 
 
195b7ab
 
18bfa5d
 
195b7ab
 
18bfa5d
195b7ab
18bfa5d
 
 
 
 
 
195b7ab
 
 
 
 
 
 
 
18bfa5d
6a80dac
18bfa5d
 
195b7ab
 
 
 
 
 
6a80dac
 
 
 
195b7ab
 
18bfa5d
 
195b7ab
18bfa5d
195b7ab
6a80dac
80bb39b
 
 
 
 
 
18bfa5d
 
 
 
80bb39b
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
import gradio as gr
from transformers import pipeline
from PIL import Image
import io

# ---------- optional: PDF -> PIL first page ----------
def pdf_first_page_to_pil(file_bytes: bytes) -> Image.Image:
    """Render the first page of a PDF (given as raw bytes) to an RGB PIL image.

    Args:
        file_bytes: the complete PDF file contents.

    Returns:
        A PIL ``Image`` of page 1, rendered at 200 dpi.

    Raises:
        ValueError: if the PDF contains no pages.
        fitz file errors: if the bytes are not a readable PDF.
    """
    import fitz  # PyMuPDF; imported lazily so image-only use works without it
    with fitz.open(stream=file_bytes, filetype="pdf") as doc:
        # Guard explicitly: doc[0] on an empty document raises a cryptic
        # IndexError; give the caller a clear message instead.
        if doc.page_count == 0:
            raise ValueError("PDF has no pages")
        # 200 dpi keeps text legible without producing a huge bitmap.
        pix = doc[0].get_pixmap(dpi=200)
        return Image.open(io.BytesIO(pix.tobytes("png"))).convert("RGB")

# ---------- init model ----------
# Created once at module import so every request shares the same pipeline
# instance (first run will download the model weights).
pipe = pipeline("image-text-to-text", model="Qwen/Qwen2-VL-2B-Instruct")

# ---------- robust extractor: returns ONLY the model text ----------
def _only_model_text(out) -> str:
    # Case 1: pipelines often return {"generated_text": "..."}
    if isinstance(out, dict) and "generated_text" in out:
        return out["generated_text"]

    # Case 2: list of dicts (mixed roles)
    if isinstance(out, list):
        # Prefer any dict with generated_text first
        for item in out:
            if isinstance(item, dict) and "generated_text" in item:
                return item["generated_text"]
        # Otherwise find assistant role
        for item in out:
            if isinstance(item, dict) and item.get("role") == "assistant":
                content = item.get("content")
                if isinstance(content, str):
                    return content
                if isinstance(content, list):
                    # collect text pieces within the assistant content
                    chunks = []
                    for c in content:
                        if isinstance(c, dict) and c.get("type") == "text" and isinstance(c.get("text"), str):
                            chunks.append(c["text"])
                    if chunks:
                        return "\n".join(chunks)
    # Fallback
    return str(out)

def infer(file_obj, prompt):
    """Run the VLM on an uploaded image (or first PDF page) with a text prompt.

    Args:
        file_obj: Gradio upload — either a tempfile-like object with ``.name``
                  or (newer Gradio versions) a plain filepath string.
        prompt:   the user's question about the file.

    Returns:
        The model's text reply, or a human-readable error message string.
    """
    if file_obj is None:
        return "Please upload an image or PDF."
    if not prompt or not prompt.strip():
        return "Please enter a prompt."

    # gr.File's return type differs across Gradio versions: accept both a
    # plain filepath string and a file-like object exposing .name.
    path = file_obj if isinstance(file_obj, str) else (getattr(file_obj, "name", "") or "")

    # read file
    with open(path, "rb") as f:
        raw = f.read()

    # Decide PDF vs image by extension, with a %PDF magic-byte fallback for
    # uploads that lack a .pdf suffix.
    try:
        if path.lower().endswith(".pdf") or raw[:4] == b"%PDF":
            pil_img = pdf_first_page_to_pil(raw)
        else:
            pil_img = Image.open(io.BytesIO(raw)).convert("RGB")
    except Exception as e:
        return f"Failed to read the file: {e}"

    # Qwen2-VL chat format: one user turn holding the image then the text.
    messages = [{
        "role": "user",
        "content": [
            {"type": "image", "image": pil_img},
            {"type": "text", "text": prompt.strip()}
        ]
    }]

    # run model
    out = pipe(text=messages, max_new_tokens=256)

    # Strip the pipeline wrapping so the UI shows only the assistant's reply.
    return _only_model_text(out)

# ---------- Gradio UI ----------
with gr.Blocks(
    title="Qwen2-VL-2B — File + Prompt",
    css="""
    /* make the output box grow nicely */
    #resp_out textarea {min-height: 220px;}
    """
) as demo:
    # Components render in creation order: header, upload row, prompt,
    # run button, then the response box.
    gr.Markdown("### Qwen2-VL-2B — Upload an image (or PDF first page) and ask a question.")
    with gr.Row():
        uploaded_file = gr.File(label="Upload image or PDF", file_types=["image", ".pdf"])
    question_box = gr.Textbox(lines=3, label="Prompt", placeholder="Ask anything…")
    submit = gr.Button("Run")

    # Response area; the elem_id ties it to the min-height CSS rule above.
    answer_box = gr.Textbox(
        elem_id="resp_out",
        label="Model Response",
        show_copy_button=True,
        lines=8,
    )

    submit.click(fn=infer, inputs=[uploaded_file, question_box], outputs=[answer_box])

if __name__ == "__main__":
    demo.launch()