"""CLIP-based painting-genre recognition.

Loads a fine-tuned CLIP checkpoint and classifies an input image into one
of the WikiArt painting genres via image/text similarity scoring.
"""
import torch
from transformers import CLIPModel, CLIPProcessor

# Genre index -> label. NOTE: index 2 previously carried the typo
# "enre_painting"; the WikiArt genre name is "genre_painting".
_GENRE_LABELS = {
    0: "abstract_painting",
    1: "cityscape",
    2: "genre_painting",
    3: "illustration",
    4: "landscape",
    5: "nude_painting",
    6: "portrait",
    7: "religious_painting",
    8: "sketch_and_study",
    9: "still_life",
}


def transform_genre_to_label(genre: int) -> str:
    """Return the label for a genre index, or "Unknown Genre" for any other value."""
    return _GENRE_LABELS.get(genre, "Unknown Genre")


# Eleven class ids: the ten known genres plus id 10, which falls through to
# the "Unknown Genre" bucket.
genres = set(range(11))
# Enumerate a sorted sequence so the id assignment is deterministic
# (set iteration order is an implementation detail, not a guarantee).
label2id = {transform_genre_to_label(genre): i for i, genre in enumerate(sorted(genres))}
id2label = {i: label for label, i in label2id.items()}
labels = list(label2id)
# One text prompt per class id, used by CLIP for image/text similarity.
label_prompt = [
    f"the genre of the painting is {transform_genre_to_label(genre)}"
    for genre in range(11)
]

MODEL_NAME = "flaviupop/CLIP-Finetuned-Painting-Genre-Recognition"


class ImageAnalyzer:
    """Wraps a fine-tuned CLIP model for painting-genre prediction."""

    def __init__(self):
        # Fine-tuned checkpoint supplies the weights; the stock OpenAI
        # processor supplies tokenization and image preprocessing.
        self.model = CLIPModel.from_pretrained(MODEL_NAME)
        self.processor = CLIPProcessor.from_pretrained("openai/clip-vit-base-patch32")

    def predict_genre(self, input_image) -> str:
        """Return the predicted genre label for *input_image*.

        ``input_image`` is whatever ``CLIPProcessor`` accepts (presumably a
        PIL image or ndarray — confirm against callers).
        """
        inputs = self.processor(
            text=label_prompt, images=input_image, return_tensors="pt", padding=True
        )
        # Inference only: disable autograd bookkeeping.
        with torch.no_grad():
            outputs = self.model(**inputs)
        logits_per_image = outputs.logits_per_image  # image-text similarity scores
        probs = logits_per_image.softmax(dim=1)  # softmax -> label probabilities
        # .item() converts the 0-dim argmax tensor to a plain int so the
        # label lookup receives an integer key, not a tensor.
        result = torch.argmax(probs).item()
        return transform_genre_to_label(result)


image_analyzer = ImageAnalyzer()