# Doc_type_agent / app.py — Hugging Face Space by KarthiEz (commit 795beee)
import gradio as gr
import torch
from transformers import AutoProcessor, AutoModelForVision2Seq
MODEL_ID = "Qwen/Qwen2.5-VL-3B-Instruct"

# Pick the best available device and a matching dtype (fp16 only makes sense on GPU).
device = "cuda" if torch.cuda.is_available() else "cpu"
dtype = torch.float16 if device == "cuda" else torch.float32

print(f"🚀 Loading model on {device} ...")
processor = AutoProcessor.from_pretrained(MODEL_ID, trust_remote_code=True)
model = AutoModelForVision2Seq.from_pretrained(
    MODEL_ID,
    torch_dtype=dtype,
    device_map="auto" if device == "cuda" else None,
    trust_remote_code=True,
)
# BUG FIX: when device_map="auto" is used (CUDA path), accelerate has already
# dispatched the weights across devices; calling .to() on a dispatched model
# raises a RuntimeError ("You can't move a model that has been dispatched...").
# Only move the model manually when it was loaded without a device_map.
if device != "cuda":
    model.to(device)
model.eval()
print("✅ Model loaded successfully!")
def ask_about_image(image, prompt):
    """Answer a free-form question about an uploaded image with Qwen2.5-VL.

    Args:
        image: PIL image from the Gradio `gr.Image(type="pil")` component,
            or None when nothing was uploaded.
        prompt: User question as a string (may be empty/whitespace).

    Returns:
        The model's answer as a plain string, or a usage hint when either
        input is missing.
    """
    if image is None or not prompt or not prompt.strip():
        return "Please upload an image and enter a question."

    # Build a single multimodal user turn via the official chat template.
    messages = [
        {
            "role": "user",
            "content": [
                {"type": "image"},
                {"type": "text", "text": prompt.strip()},
            ],
        }
    ]

    # Render the chat template (injects the correct image placeholder tokens)
    # and append the assistant prefix so generation starts at the answer.
    templated = processor.apply_chat_template(
        messages,
        tokenize=False,
        add_generation_prompt=True,
    )

    # Important: pass lists for batched API consistency.
    inputs = processor(
        text=[templated],
        images=[image],
        return_tensors="pt",
    ).to(device)

    # Safety pads: some checkpoints ship without pad/eos in generation_config.
    if model.generation_config.pad_token_id is None and processor.tokenizer.pad_token_id is not None:
        model.generation_config.pad_token_id = processor.tokenizer.pad_token_id
    if model.generation_config.eos_token_id is None and processor.tokenizer.eos_token_id is not None:
        model.generation_config.eos_token_id = processor.tokenizer.eos_token_id

    with torch.no_grad():
        output_ids = model.generate(
            **inputs,
            max_new_tokens=256,
            do_sample=False,  # deterministic; set True for more creative outputs
        )

    # BUG FIX: generate() returns prompt tokens + new tokens. Decoding the
    # full sequence would echo the entire chat template and question back to
    # the user. Slice off the prompt so only the generated answer is decoded.
    prompt_len = inputs["input_ids"].shape[1]
    generated_ids = output_ids[:, prompt_len:]
    answer = processor.batch_decode(generated_ids, skip_special_tokens=True)[0]
    return answer.strip()
# Keep the response box compact and scrollable instead of growing unbounded.
RESPONSE_BOX_CSS = """
#resp_box textarea {
min-height: 96px;
max-height: 180px;
overflow: auto;
resize: none;
line-height: 1.2;
white-space: pre-wrap;
}
#resp_box label { margin-bottom: 4px; }
"""

with gr.Blocks(css=RESPONSE_BOX_CSS) as demo:
    gr.Markdown("## 🧠 Qwen2.5-VL-3B — Visual Reasoning Assistant")

    # Left: image upload. Right: question input + submit button.
    with gr.Row():
        image_input = gr.Image(type="pil", label="Upload an Image")
        with gr.Column():
            question_box = gr.Textbox(
                label="Ask about this image",
                placeholder="e.g. What type of document is this? Is there a stamp or signature?",
            )
            submit_btn = gr.Button("Ask")

    # Read-only, compact area showing only the assistant's answer.
    answer_box = gr.Textbox(
        label="Response",
        lines=4,
        interactive=False,
        elem_id="resp_box",
    )

    # Wire the button to the inference function; output is the model's text only.
    submit_btn.click(fn=ask_about_image, inputs=[image_input, question_box], outputs=[answer_box])

demo.launch()