| | from typing import Dict, List, Any |
| | import torch |
| | from transformers import AutoProcessor, Pix2StructVisionModel |
| | from PIL import Image |
| | import pdb |
| | import requests |
| |
|
class EndpointHandler():
    """Callable handler that turns an image URL into a Pix2Struct embedding.

    The processor and vision encoder are loaded once in ``__init__``; each
    call downloads one image, runs the encoder, and returns the mean of the
    encoder's last hidden state as a flat list of floats.
    """

    # Checkpoint used for both the processor and the vision encoder.
    MODEL_ID = "google/pix2struct-textcaps-base"
    # Seconds to wait on the image download before giving up, so a stalled
    # remote host cannot block the handler indefinitely.
    REQUEST_TIMEOUT = 30

    def __init__(self, path=""):
        """Load the processor and vision model.

        Args:
            path: Accepted for interface compatibility but not used; the
                weights are always loaded from ``MODEL_ID``.
        """
        self.processor = AutoProcessor.from_pretrained(self.MODEL_ID)
        self.model = Pix2StructVisionModel.from_pretrained(self.MODEL_ID)

    def __call__(self, data: Any) -> Dict[str, List[float]]:
        """Compute an embedding for the image referenced by ``data``.

        Args:
            data: Either a dict whose ``"inputs"`` entry is the image URL,
                or the URL string itself. The dict is not modified.

        Returns:
            ``{"embedding": [...]}`` where the list is the last hidden state
            averaged over dim 1 and flattened.

        Raises:
            ValueError: If no non-empty URL string can be extracted.
            requests.RequestException: If the download fails, times out, or
                returns an HTTP error status.
        """
        # Read (rather than pop) so the caller's payload is left intact.
        url = data.get("inputs", data) if isinstance(data, dict) else data
        if not isinstance(url, str) or not url:
            raise ValueError(
                'expected an image URL string under the "inputs" key'
            )

        # The context manager releases the connection even if decoding fails.
        with requests.get(
            url, stream=True, timeout=self.REQUEST_TIMEOUT
        ) as response:
            # Fail on 4xx/5xx instead of handing an error page to PIL.
            response.raise_for_status()
            # Have urllib3 undo any gzip/deflate transfer encoding.
            response.raw.decode_content = True
            image = Image.open(response.raw)
            # Image.open is lazy; read the pixels before the stream closes.
            image.load()

        inputs = self.processor(images=image, return_tensors="pt")

        with torch.no_grad():
            outputs = self.model(**inputs)

        last_hidden_state = outputs['last_hidden_state']
        # NOTE(review): this averages over every position along dim 1; if the
        # processor pads the patch sequence, padded positions are included --
        # confirm whether masked pooling is wanted before changing it, since
        # that would alter previously produced embeddings.
        embedding = torch.mean(last_hidden_state, dim=1).flatten().tolist()
        return {"embedding": embedding}
| |
|
# Manual smoke test. Guarded so that importing this module (for example from
# a serving runtime that only needs EndpointHandler) does not load a second
# copy of the model or make a network request as an import side effect.
if __name__ == "__main__":
    handler = EndpointHandler()
    output = handler({"inputs": "https://figma-staging-api.s3.us-west-2.amazonaws.com/images/a8c6a0cc-c022-4f3a-9fc5-ac8582c964dd"})
    print(output)
| |
|