import torch
import gradio as gr
from PIL import Image
from urllib.request import urlopen
from open_clip import create_model_from_pretrained, get_tokenizer

# Hugging Face Hub identifier shared by the model and tokenizer loaders so the
# two can never drift apart.
MODEL_ID = 'hf-hub:microsoft/BiomedCLIP-PubMedBERT_256-vit_base_patch16_224'

# Load the model (together with its image preprocessing transform) and the
# matching tokenizer from the Hugging Face Hub
model, preprocess = create_model_from_pretrained(MODEL_ID)
tokenizer = get_tokenizer(MODEL_ID)

# Prompt prefix prepended to every candidate label for zero-shot classification
template = 'this is a photo of '

# Device configuration: prefer Apple MPS, then CUDA, then fall back to CPU.
# torch.backends.mps.is_available() is the documented availability check;
# torch.mps.is_available() is not present in every PyTorch release.
if torch.backends.mps.is_available():
    device = torch.device('mps')
elif torch.cuda.is_available():
    device = torch.device('cuda')
else:
    device = torch.device('cpu')
model.to(device)
model.eval()

def classify_image(image, candidate_labels):
    """Rank comma-separated candidate labels for an image.

    Args:
        image: PIL image to classify, or ``None`` when nothing was uploaded.
        candidate_labels: Comma-separated label string. Whitespace around each
            label is stripped and blank entries are ignored.

    Returns:
        A list of ``{"label": str, "score": float}`` dicts sorted by
        descending score. Scores are a softmax over the supplied labels. The
        list is empty when there is no image or no usable label.
    """
    # Drop blank entries (empty textbox, trailing comma) so they are not
    # scored as a bare-template prompt.
    stripped = (part.strip() for part in (candidate_labels or "").split(","))
    labels = [label for label in stripped if label]

    # Nothing to classify: return before touching the model.
    if image is None or not labels:
        return []

    context_length = 256

    # Preprocess the image into a single-item batch
    image_input = preprocess(image).unsqueeze(0).to(device)

    # Tokenize one templated prompt per candidate label
    texts = tokenizer(
        [template + label for label in labels],
        context_length=context_length,
    ).to(device)

    # Perform inference; no_grad already prevents graph tracking, so no
    # explicit detach is needed.
    with torch.no_grad():
        image_features, text_features, logit_scale = model(image_input, texts)
        probs = (logit_scale * image_features @ text_features.t()).softmax(dim=-1)

    # Pair each label with its probability and order best-first
    scores = probs[0].cpu().tolist()
    ranked = sorted(zip(labels, scores), key=lambda pair: pair[1], reverse=True)
    return [{"label": label, "score": score} for label, score in ranked]

# Input widgets: the image to classify and the comma-separated label list
image_box = gr.Image(type="pil", label="Upload Image")
labels_box = gr.Textbox(
    lines=2,
    placeholder="Enter candidate labels, separated by commas...",
)

# Wire classify_image into a Gradio interface that shows its result as JSON
iface = gr.Interface(
    fn=classify_image,
    inputs=[image_box, labels_box],
    outputs=gr.JSON(),
    title="Zero-Shot Image Classification",
    description=(
        "Technical example of using the `microsoft/BiomedCLIP-PubMedBERT_256-vit_base_patch16_224` model.\n"
        "Upload an image and enter candidate labels to classify the image."
    ),
)

# Start serving the interface
iface.launch()