File size: 3,165 Bytes

3fdba7b
ca17628
 
8128625
 
0ba1f5a
40f9682
 
8128625
 
 
 
 
 
 
 
 
 
83af6e8
3fdba7b
 
49630f8
16ceee2
 
 
 
49630f8
 
3fdba7b
5bdd01a
 
8128625
ba690ad
8128625
 
 
 
 
 
 
 
 
 
 
3fdba7b
 
16ceee2
8128625
3fdba7b
 
 
 
 
16ceee2
3fdba7b
147fcdd
1fe38b4
 
 
bd0b411
1fe38b4
 
05dd32e

from transformers import Qwen3VLForConditionalGeneration, AutoProcessor
from typing import Dict, List, Any
import torch
import io
from PIL import Image
import base64
import time
import uuid

# Instruction text that EndpointHandler.__call__ sends to the model as the
# text part of the user message, next to the image.
# NOTE: the indentation and trailing spaces inside the literal are part of the
# string itself, so re-indenting these lines changes what the model receives.
prompt = """**Task**: 
          Analyze this document image exhaustively and output in Markdown format. 
          **Rules**:  
            - Do not add any comments, provide content only;
            - Extract ALL visible text exactly as written;
            - Preserve possible additional languages;
            - Maintain line breaks, indentation, and spacing;
            - Never translate non-English text.
            - Do not add unnecessary or additional information. Do not add any links or images. Do not add Chinese symbols.
        **Important**: the output format must be Markdown (use bold text, headlines, so on)."""

class EndpointHandler:
    """Inference handler that transcribes one document image to Markdown.

    The processor and model are loaded once at construction time. Each call
    takes a single base64-encoded image, sends it together with the
    module-level ``prompt`` to the model, and wraps the decoded text in a
    chat-completion-shaped dictionary.
    """

    # Generation budget used when the request does not override it.
    DEFAULT_MAX_NEW_TOKENS = 2048

    def __init__(self, path: str = "Qwen/Qwen3-VL-8B-Instruct"):
        """Load the processor and the model from *path*.

        Args:
            path: Model identifier or directory handed to ``from_pretrained``.
        """
        self.processor = AutoProcessor.from_pretrained(path)
        # NOTE(review): no dtype is requested here, so the weights load in the
        # library's default precision - confirm that fits the target hardware.
        self.model = Qwen3VLForConditionalGeneration.from_pretrained(path, device_map="auto")
        self.model.eval()

    @staticmethod
    def _extract_image_bytes(data: Dict[str, Any]) -> bytes:
        """Return the decoded image bytes found at ``data["inputs"]["base64"]``.

        A ``data:<mime>;base64,`` prefix is removed before decoding; without
        that, the prefix characters would be decoded as if they were payload.

        Raises:
            ValueError: If the payload is missing, has the wrong type, is
                empty, or is not decodable base64.
        """
        payload = data.get("inputs") if isinstance(data, dict) else None
        if not isinstance(payload, dict):
            raise ValueError('request must contain an "inputs" object with a "base64" field')

        encoded = payload.get("base64")
        if not isinstance(encoded, (str, bytes, bytearray)) or not encoded:
            raise ValueError('"inputs.base64" must be a non-empty base64 string')

        if isinstance(encoded, str):
            head, sep, tail = encoded.partition(",")
            if sep and head.lstrip().lower().startswith("data:"):
                encoded = tail

        # binascii.Error (raised on bad padding) is a ValueError subclass.
        raw = base64.b64decode(encoded)
        if not raw:
            raise ValueError('"inputs.base64" did not decode to any image data')
        return raw

    @classmethod
    def _resolve_max_new_tokens(cls, data: Dict[str, Any]) -> int:
        """Return the generation budget, honouring ``parameters.max_new_tokens``.

        A missing or non-dict ``parameters`` entry is treated as absent so that
        requests accepted before this option existed keep working.

        Raises:
            ValueError: If ``max_new_tokens`` is given but is not a positive int.
        """
        parameters = data.get("parameters") if isinstance(data, dict) else None
        if not isinstance(parameters, dict):
            return cls.DEFAULT_MAX_NEW_TOKENS

        value = parameters.get("max_new_tokens", cls.DEFAULT_MAX_NEW_TOKENS)
        # bool is an int subclass; reject it explicitly.
        if isinstance(value, bool) or not isinstance(value, int) or value < 1:
            raise ValueError('"parameters.max_new_tokens" must be a positive integer')
        return value

    @staticmethod
    def _finish_reason(completion_ids: List[int], eos_token_id: Any, max_new_tokens: int) -> str:
        """Return ``"length"`` if generation was cut off by the budget, else ``"stop"``.

        Args:
            completion_ids: Newly generated token ids, prompt excluded.
            eos_token_id: A single id, an iterable of ids, or ``None``.
            max_new_tokens: Budget that was passed to ``generate``.
        """
        if eos_token_id is None:
            eos_ids = set()
        elif isinstance(eos_token_id, int):
            eos_ids = {eos_token_id}
        else:
            eos_ids = set(eos_token_id)

        # Using the whole budget only counts as truncation when the last
        # token is not an end-of-sequence marker.
        truncated = (
            bool(completion_ids)
            and len(completion_ids) >= max_new_tokens
            and completion_ids[-1] not in eos_ids
        )
        return "length" if truncated else "stop"

    def __call__(self, data: Dict[str, Any]) -> Dict[str, Any]:
        """Transcribe the image in *data* and return a chat-completion dict.

        Args:
            data: Request body. ``data["inputs"]["base64"]`` holds the image
                (plain base64 or a ``data:`` URL). Optionally
                ``data["parameters"]["max_new_tokens"]`` overrides the
                generation budget.

        Returns:
            A dictionary with ``id``, ``object``, ``created``, ``model``,
            ``usage`` and a single-element ``choices`` list whose message
            content is the generated text.

        Raises:
            ValueError: If the request payload is missing or malformed.
        """
        # Validate everything cheap before touching the model.
        image_bytes = self._extract_image_bytes(data)
        max_new_tokens = self._resolve_max_new_tokens(data)

        pil_img = Image.open(io.BytesIO(image_bytes)).convert("RGB")

        messages = [
            {
                "role": "user",
                "content": [
                    {"type": "image", "image": pil_img},      # pass PIL image directly
                    {"type": "text", "text": prompt},
                ]
            }
        ]

        model_inputs = self.processor.apply_chat_template(
            messages,
            tokenize=True,
            add_generation_prompt=True,
            return_dict=True,
            return_tensors="pt"
        )
        model_inputs = model_inputs.to(self.model.device)

        generated_ids = self.model.generate(**model_inputs, max_new_tokens=max_new_tokens)
        # generate() returns prompt + completion; keep only the new tokens.
        generated_ids_trimmed = [
            out_ids[len(in_ids):]
            for in_ids, out_ids in zip(model_inputs.input_ids, generated_ids)
        ]
        output_text = self.processor.batch_decode(
            generated_ids_trimmed, skip_special_tokens=True, clean_up_tokenization_spaces=False
        )

        # One image per request, so only the first row is reported.
        prompt_tokens = len(model_inputs.input_ids[0])
        completion_ids = generated_ids_trimmed[0].tolist()
        completion_tokens = len(completion_ids)
        eos_token_id = getattr(
            getattr(self.model, "generation_config", None), "eos_token_id", None
        )

        return {
            "id": f"chatcmpl-{uuid.uuid4().hex}",
            "object": "chat.completion",
            "created": int(time.time()),
            # Reported name is fixed; it does not follow the ``path`` argument.
            "model": "Qwen/Qwen3-VL-8B-Instruct",
            "usage": {
                "prompt_tokens": prompt_tokens,
                "completion_tokens": completion_tokens,
                "total_tokens": prompt_tokens + completion_tokens,
            },
            "choices": [
                {
                    "message": {
                        "role": "assistant",
                        "content": output_text[0]
                    },
                    "finish_reason": self._finish_reason(
                        completion_ids, eos_token_id, max_new_tokens
                    ),
                    "index": 0
                }
            ]
        }