"""Custom EndpointHandler for serving the PaddleOCR-VL image-text-to-text model."""

import subprocess
import sys
import os

# Install transformers from source at startup; the model repo targets a dev
# build (v5.2.0.dev0) that may not be available as a released package.
subprocess.check_call([
    sys.executable, "-m", "pip", "install", "--quiet", "--upgrade",
    "git+https://github.com/huggingface/transformers.git",
])
|
|
# Newer transformers releases dropped the transformers.file_utils module, but
# the model's remote code may still import it. Register a small shim that
# forwards the needed helpers from transformers.utils.
import transformers

if not hasattr(transformers, "file_utils"):
    import types

    file_utils_shim = types.ModuleType("transformers.file_utils")
    file_utils_shim.is_tf_available = transformers.utils.is_tf_available
    file_utils_shim.is_torch_available = transformers.utils.is_torch_available
    sys.modules["transformers.file_utils"] = file_utils_shim
    transformers.file_utils = file_utils_shim
|
|
from transformers import AutoProcessor, AutoModelForImageTextToText
import torch
from PIL import Image
import requests
from io import BytesIO
import base64
|
|
MODEL_ID = "strangervisionhf/PaddleOCR-VL-1.5-hf-transformers-v5.2.0.dev0"


class EndpointHandler:
    def __init__(self, path=""):
        # Load the processor and model once at startup; device_map="auto"
        # places the weights on GPU when one is available.
        self.processor = AutoProcessor.from_pretrained(MODEL_ID, trust_remote_code=True)
        self.model = AutoModelForImageTextToText.from_pretrained(
            MODEL_ID,
            trust_remote_code=True,
            torch_dtype=torch.float16,
            device_map="auto",
        )
        self.model.eval()

    def __call__(self, data: dict):
        # Accept both {"inputs": {...}} payloads and flat payloads.
        inputs_data = data.get("inputs", data)

        image_src = inputs_data.get("image", "")
        if not image_src:
            return {"error": "No image provided"}

        # The image may arrive either as a URL or as a base64-encoded string.
        if image_src.startswith(("http://", "https://")):
            response = requests.get(image_src, timeout=30)
            response.raise_for_status()
            image = Image.open(BytesIO(response.content)).convert("RGB")
        else:
            image = Image.open(BytesIO(base64.b64decode(image_src))).convert("RGB")

        prompt = inputs_data.get("text", "Recognize text in the image.")

        messages = [
            {
                "role": "user",
                "content": [
                    {"type": "image", "image": image},
                    {"type": "text", "text": prompt},
                ],
            }
        ]

        # Render the chat template to a prompt string, then tokenize it
        # together with the image and move everything to the model's device.
        text_input = self.processor.apply_chat_template(
            messages, tokenize=False, add_generation_prompt=True
        )
        model_inputs = self.processor(
            text=[text_input],
            images=[image],
            return_tensors="pt",
        ).to(self.model.device)

        # Greedy decoding (do_sample=False) keeps OCR output deterministic.
        with torch.no_grad():
            output_ids = self.model.generate(
                **model_inputs,
                max_new_tokens=512,
                do_sample=False,
            )

        # Strip the prompt tokens from each sequence and decode only the
        # newly generated text.
        output_ids_trimmed = [
            o[len(i):] for i, o in zip(model_inputs.input_ids, output_ids)
        ]
        output_text = self.processor.batch_decode(
            output_ids_trimmed, skip_special_tokens=True
        )

        return {"generated_text": output_text[0]}
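

# Illustrative local smoke test (a minimal sketch; the endpoint runtime only
# imports EndpointHandler and never runs this block). It builds a tiny blank
# test image in memory, base64-encodes it, and calls the handler the same way
# a JSON request payload would.
if __name__ == "__main__":
    buffer = BytesIO()
    Image.new("RGB", (64, 64), color="white").save(buffer, format="PNG")
    encoded_image = base64.b64encode(buffer.getvalue()).decode("utf-8")

    handler = EndpointHandler()
    result = handler({"inputs": {"image": encoded_image, "text": "Recognize text in the image."}})
    print(result)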