| | from typing import Dict, Any |
| | import torch |
| | from transformers import AutoProcessor, Qwen2VLForConditionalGeneration |
| | from PIL import Image |
| | import requests |
| | from io import BytesIO |
| |
|
| | |
| | device = torch.device("cuda" if torch.cuda.is_available() else "cpu") |
| |
|
| | class EndpointHandler: |
| | def __init__(self, path: str = "morthens/qwen2-vl-inference"): |
| | |
| | self.processor = AutoProcessor.from_pretrained(path) |
| | self.model = Qwen2VLForConditionalGeneration.from_pretrained( |
| | path, |
| | torch_dtype="auto", |
| | device_map="auto" |
| | ) |
| | |
| | self.model.to(device) |
| |
|
| | def __call__(self, data: Dict[str, Any]) -> Dict[str, Any]: |
| | |
| | image_url = data.get("image_url", "") |
| | text = data.get("text", "") |
| |
|
| | |
| | try: |
| | response = requests.get(image_url) |
| | response.raise_for_status() |
| | image = Image.open(BytesIO(response.content)) |
| | except Exception as e: |
| | return {"error": f"Failed to fetch or process image: {str(e)}"} |
| |
|
| | |
| | inputs = self.processor( |
| | text=[text], |
| | images=[image], |
| | padding=True, |
| | return_tensors="pt" |
| | ) |
| |
|
| | |
| | inputs = {key: value.to(device) for key, value in inputs.items()} |
| |
|
| | |
| | output_ids = self.model.generate( |
| | **inputs, |
| | max_new_tokens=128 |
| | ) |
| |
|
| | |
| | output_text = self.processor.batch_decode( |
| | output_ids, |
| | skip_special_tokens=True, |
| | clean_up_tokenization_spaces=True |
| | )[0] |
| |
|
| | |
| | return {"prediction": output_text} |
| |
|
| |
|