from typing import  Dict, List, Any
from PIL import Image
import clip
import torch
import requests
import io

# Pick the torch device once at import time: the GPU when torch reports that
# CUDA is available, the CPU otherwise.
if torch.cuda.is_available():
    device = torch.device('cuda')
else:
    device = torch.device('cpu')

class EndpointHandler():
    """Callable endpoint handler that embeds one image with CLIP ``ViT-B/32``.

    The model and its preprocessing transform are loaded once in
    ``__init__``; every call then turns a request payload into a single
    image feature vector.
    """

    # Seconds to wait for a remote image. Without a timeout a stalled server
    # would block the handler indefinitely.
    REQUEST_TIMEOUT = 10

    def __init__(self, path=""):
        """
        Load the CLIP model and its preprocessing transform onto ``device``.

        Args:
            path (:obj:`str`):
                not used by this handler; kept so callers that pass a
                positional path argument keep working.
        """
        # load the optimized model
        self.model, self.preprocess = clip.load('ViT-B/32', device)
        self.model.eval()
        self.model = self.model.to(device)

    @staticmethod
    def _as_url(text):
        """Return ``text`` as a fetchable URL, or ``None`` if it is not one."""
        candidate = text.strip()
        lowered = candidate.lower()
        if lowered.startswith(("http://", "https://")):
            return candidate
        if lowered.startswith("www."):
            # requests refuses URLs without a scheme, so supply one.
            return "http://" + candidate
        return None

    def _load_image(self, inputs):
        """Turn the request payload into a PIL image.

        Raises:
            ValueError: if ``inputs`` is neither a URL string nor a dict
                with an ``"image"`` entry.
        """
        if isinstance(inputs, dict):
            if "image" not in inputs:
                raise ValueError(
                    'dict inputs must contain an "image" entry'
                )
            # Whatever is stored under "image" is handed to PIL unchanged.
            return Image.open(inputs["image"])

        if isinstance(inputs, str):
            url = self._as_url(inputs)
            if url is None:
                raise ValueError(
                    "string inputs must be an http(s):// or www. URL"
                )
            # NOTE(review): the URL comes straight from the request payload;
            # restrict allowed hosts if callers are untrusted.
            response = requests.get(url, timeout=self.REQUEST_TIMEOUT)
            # Fail on HTTP errors instead of feeding an error page to PIL.
            response.raise_for_status()
            return Image.open(io.BytesIO(response.content))

        raise ValueError(
            "unsupported inputs type: " + type(inputs).__name__
        )

    def __call__(self, data: Any) -> Dict[str, List[float]]:
        """
        Args:
            data (:obj:):
                includes the input data and the parameters for the inference.
                The value under "inputs" (or ``data`` itself when that key is
                absent) must be either an ``http(s)://`` / ``www.`` URL
                string, or a dict whose "image" entry can be opened by
                ``PIL.Image.open``. Other strings (for example base64 data)
                are not supported. The "inputs" key is removed from ``data``.
        Return:
            A :obj:`dict`:. The object returned should be a dict like {"feature_vector": [0.6331314444541931,0.8802216053009033,...,-0.7866355180740356,]} containing :
                - "feature_vector": A list of floats corresponding to the image embedding.
        Raises:
            ValueError: if the payload is not one of the supported shapes.
        """
        inputs = data.pop("inputs", data) if isinstance(data, dict) else data

        # Resolve the image first so malformed payloads fail before any
        # model work is done.
        img = self._load_image(inputs)

        # unsqueeze(0) adds a leading dimension so the model sees a batch of one.
        image_input = self.preprocess(img).unsqueeze(0).to(device)

        # Calculate features
        with torch.no_grad():
            image_features = self.model.encode_image(image_input)
        # postprocess the prediction: keep the only row of the batch
        return {"feature_vector": image_features.tolist()[0]}