qwen3-vl / handler.py
wealthcoders's picture
Update handler.py
83af6e8 verified
from transformers import Qwen3VLForConditionalGeneration, AutoProcessor
from typing import Dict, List, Any
import torch
import io
from PIL import Image
import base64
import time
import uuid
# Instruction text sent to the model together with every document image.
# The text is assembled one line per entry and joined with newlines; the
# resulting string has no trailing newline.
prompt = "\n".join(
    (
        "**Task**:",
        "Analyze this document image exhaustively and output in Markdown format.",
        "**Rules**:",
        "- Do not add any comments, provide content only;",
        "- Extract ALL visible text exactly as written;",
        "- Preserve possible additional languages;",
        "- Maintain line breaks, indentation, and spacing;",
        "- Never translate non-English text.",
        "- Do not add unnecessary or additional information. Do not add any links or images. Do not add Chinese symbols.",
        "**Important**: the output format must be Markdown (use bold text, headlines, so on).",
    )
)
class EndpointHandler:
    """Endpoint handler that transcribes a document image to Markdown.

    The checkpoint is loaded once at construction time. Each call sends one
    base64-encoded image together with the module-level ``prompt`` to the
    model and wraps the decoded reply in a chat-completion-style dictionary.
    """

    # Identifier reported in the "model" field of every response. It is kept
    # independent of the ``path`` given to ``__init__`` so the response does
    # not change with the location the weights were loaded from.
    MODEL_NAME = "Qwen/Qwen3-VL-8B-Instruct"

    # Upper bound on the number of tokens generated per request.
    MAX_NEW_TOKENS = 2048

    def __init__(self, path: str = "Qwen/Qwen3-VL-8B-Instruct"):
        """Load the processor and the model from ``path``.

        Args:
            path: Checkpoint location or identifier handed to both
                ``from_pretrained`` calls.
        """
        self.processor = AutoProcessor.from_pretrained(path)
        # NOTE(review): no dtype is passed, so the library default applies --
        # confirm that this is the intended precision for deployment.
        self.model = Qwen3VLForConditionalGeneration.from_pretrained(path, device_map="auto")
        self.model.eval()

    def __call__(self, data: Dict[str, Any]) -> Dict[str, Any]:
        """Run the model on one image and return a chat-completion-style dict.

        Args:
            data: Request payload of the form
                ``{"inputs": {"base64": "<base64 image>"}}``. The base64
                string may carry a ``data:...;base64,`` prefix.

        Returns:
            A dictionary with the keys ``id``, ``object``, ``created``,
            ``model``, ``usage`` and ``choices``. The generated text is at
            ``choices[0]["message"]["content"]``.

        Raises:
            ValueError: If the payload has no non-empty ``inputs.base64``
                value. Errors raised while decoding or opening the image are
                not intercepted here.
        """
        # Validate and decode first so a malformed request fails before any
        # model work is started.
        pil_img = self._load_image(data)

        messages = [
            {
                "role": "user",
                "content": [
                    {"type": "image", "image": pil_img},  # PIL image passed directly
                    {"type": "text", "text": prompt},
                ],
            }
        ]

        model_inputs = self.processor.apply_chat_template(
            messages,
            tokenize=True,
            add_generation_prompt=True,
            return_dict=True,
            return_tensors="pt",
        )
        model_inputs = model_inputs.to(self.model.device)

        generated_ids = self.model.generate(
            **model_inputs, max_new_tokens=self.MAX_NEW_TOKENS
        )
        # Each output row starts with its prompt ids; drop them so that only
        # the newly generated ids are decoded.
        generated_ids_trimmed = [
            out_ids[len(in_ids):]
            for in_ids, out_ids in zip(model_inputs.input_ids, generated_ids)
        ]
        output_text = self.processor.batch_decode(
            generated_ids_trimmed,
            skip_special_tokens=True,
            clean_up_tokenization_spaces=False,
        )

        # Exactly one conversation is submitted, so index 0 is the only row.
        prompt_tokens = len(model_inputs.input_ids[0])
        completion_ids = generated_ids_trimmed[0]
        completion_tokens = len(completion_ids)

        return {
            "id": f"chatcmpl-{uuid.uuid4().hex}",
            "object": "chat.completion",
            "created": int(time.time()),
            "model": self.MODEL_NAME,
            "usage": {
                "prompt_tokens": prompt_tokens,
                "completion_tokens": completion_tokens,
                "total_tokens": prompt_tokens + completion_tokens,
            },
            "choices": [
                {
                    "message": {
                        "role": "assistant",
                        "content": output_text[0],
                    },
                    "finish_reason": self._finish_reason(completion_ids),
                    "index": 0,
                }
            ],
        }

    @staticmethod
    def _load_image(data: Dict[str, Any]):
        """Extract ``data["inputs"]["base64"]`` and return it as an RGB image."""
        try:
            encoded = data["inputs"]["base64"]
        except (KeyError, TypeError) as err:
            raise ValueError(
                'request payload must look like {"inputs": {"base64": "<image>"}}'
            ) from err

        # Without this step a data-URI header would be folded into the image
        # bytes, because b64decode skips characters outside the alphabet
        # instead of rejecting them.
        if isinstance(encoded, str) and encoded.startswith("data:"):
            encoded = encoded.partition(",")[2]

        if not encoded:
            raise ValueError('"inputs.base64" must be a non-empty base64 string')

        img_bytes = base64.b64decode(encoded)
        return Image.open(io.BytesIO(img_bytes)).convert("RGB")

    def _finish_reason(self, completion_ids) -> str:
        """Return ``"length"`` if generation hit the token limit, else ``"stop"``."""
        if len(completion_ids) < self.MAX_NEW_TOKENS:
            return "stop"

        # The limit was reached; it still counts as a normal stop when the
        # final token happens to be an end-of-sequence token.
        generation_config = getattr(self.model, "generation_config", None)
        eos = getattr(generation_config, "eos_token_id", None)
        if eos is None:
            eos_ids = set()
        elif isinstance(eos, int):
            eos_ids = {eos}
        else:
            eos_ids = set(eos)
        return "stop" if int(completion_ids[-1]) in eos_ids else "length"