# Doc_type_agent / app.py — Hugging Face Space by KarthiEz (commit 795beee)
import gradio as gr
import torch
from transformers import AutoProcessor, AutoModelForVision2Seq
MODEL_ID = "Qwen/Qwen2.5-VL-3B-Instruct"

# Pick the best available device and a matching dtype (fp16 only makes sense on GPU).
device = "cuda" if torch.cuda.is_available() else "cpu"
dtype = torch.float16 if device == "cuda" else torch.float32

print(f"🚀 Loading model on {device} ...")
processor = AutoProcessor.from_pretrained(MODEL_ID, trust_remote_code=True)
model = AutoModelForVision2Seq.from_pretrained(
    MODEL_ID,
    torch_dtype=dtype,
    device_map="auto" if device == "cuda" else None,
    trust_remote_code=True,
)
# BUG FIX: when device_map="auto" is used (CUDA path), accelerate has already
# dispatched the weights across devices; calling .to() on a dispatched model
# raises a RuntimeError ("You can't move a model that has been dispatched...").
# Only move the model manually when it was loaded without a device_map.
if device != "cuda":
    model.to(device)
model.eval()
print("✅ Model loaded successfully!")
def ask_about_image(image, prompt):
    """Answer a free-form question about an uploaded image with Qwen2.5-VL.

    Args:
        image: PIL image from the Gradio `gr.Image(type="pil")` component,
            or None when nothing was uploaded.
        prompt: User question as a string (may be empty/whitespace).

    Returns:
        The model's answer as a plain string, or a usage hint when either
        input is missing.
    """
    if image is None or not prompt or not prompt.strip():
        return "Please upload an image and enter a question."

    # Build a single multimodal user turn via the official chat template.
    messages = [
        {
            "role": "user",
            "content": [
                {"type": "image"},
                {"type": "text", "text": prompt.strip()},
            ],
        }
    ]

    # Render the chat template (injects the correct image placeholder tokens)
    # and append the assistant prefix so generation starts at the answer.
    templated = processor.apply_chat_template(
        messages,
        tokenize=False,
        add_generation_prompt=True,
    )

    # Important: pass lists for batched API consistency.
    inputs = processor(
        text=[templated],
        images=[image],
        return_tensors="pt",
    ).to(device)

    # Safety pads: some checkpoints ship without pad/eos in generation_config.
    if model.generation_config.pad_token_id is None and processor.tokenizer.pad_token_id is not None:
        model.generation_config.pad_token_id = processor.tokenizer.pad_token_id
    if model.generation_config.eos_token_id is None and processor.tokenizer.eos_token_id is not None:
        model.generation_config.eos_token_id = processor.tokenizer.eos_token_id

    with torch.no_grad():
        output_ids = model.generate(
            **inputs,
            max_new_tokens=256,
            do_sample=False,  # deterministic; set True for more creative outputs
        )

    # BUG FIX: generate() returns prompt tokens + new tokens. Decoding the
    # full sequence would echo the entire chat template and question back to
    # the user. Slice off the prompt so only the generated answer is decoded.
    prompt_len = inputs["input_ids"].shape[1]
    generated_ids = output_ids[:, prompt_len:]
    answer = processor.batch_decode(generated_ids, skip_special_tokens=True)[0]
    return answer.strip()
# Keep the response box compact and scrollable instead of growing unbounded.
RESPONSE_BOX_CSS = """
#resp_box textarea {
min-height: 96px;
max-height: 180px;
overflow: auto;
resize: none;
line-height: 1.2;
white-space: pre-wrap;
}
#resp_box label { margin-bottom: 4px; }
"""

with gr.Blocks(css=RESPONSE_BOX_CSS) as demo:
    gr.Markdown("## 🧠 Qwen2.5-VL-3B — Visual Reasoning Assistant")

    # Left: image upload. Right: question input + submit button.
    with gr.Row():
        image_input = gr.Image(type="pil", label="Upload an Image")
        with gr.Column():
            question_box = gr.Textbox(
                label="Ask about this image",
                placeholder="e.g. What type of document is this? Is there a stamp or signature?",
            )
            submit_btn = gr.Button("Ask")

    # Read-only, compact area showing only the assistant's answer.
    answer_box = gr.Textbox(
        label="Response",
        lines=4,
        interactive=False,
        elem_id="resp_box",
    )

    # Wire the button to the inference function; output is the model's text only.
    submit_btn.click(fn=ask_about_image, inputs=[image_input, question_box], outputs=[answer_box])

demo.launch()