"""Gradio demo with three tabs: closed-vocabulary object detection (DETR),
zero-shot object detection (OWLv2), and CLIP text/image embedding used to
order a set of uploaded images along one or two text-defined axes."""

import gradio as gr
import numpy as np
from io import BytesIO
from PIL import Image as PImage
from torch import cuda
from sklearn.metrics.pairwise import euclidean_distances, cosine_distances
from transformers import AutoModel, AutoProcessor, pipeline

from utils import draw_results, embed_image, embed_word, idxs_along_axes, idxs_by_dist, make_image

DEVICE = "cuda" if cuda.is_available() else "cpu"

# CLIP_MODEL = "google/siglip2-large-patch16-256"
CLIP_MODEL = "openai/clip-vit-large-patch14"
DETR_MODEL = "facebook/detr-resnet-50"
OWL_MODEL = "google/owlv2-base-patch16"

detr = pipeline(task="object-detection", model=DETR_MODEL, device=DEVICE)
owl = pipeline(task="zero-shot-object-detection", model=OWL_MODEL, device=DEVICE)

clip_processor = AutoProcessor.from_pretrained(CLIP_MODEL)
# FIX: the original combined device_map="auto" (accelerate dispatch, which may
# shard/offload layers) with .to(DEVICE); that is redundant at best and raises
# in recent transformers when any layer is offloaded. Load once, move once.
clip = AutoModel.from_pretrained(CLIP_MODEL).to(DEVICE)


def run_detr(img):
    """Detect COCO objects in `img` (PIL image) and return it annotated.

    Returns the image produced by `draw_results` with DETR's predictions.
    """
    predictions = detr(img)
    return draw_results(img, predictions)


def run_owl(img, classes_str):
    """Zero-shot detect the comma-separated labels in `classes_str` on `img`.

    Returns the image annotated with OWLv2's predictions.
    """
    # FIX: drop empty entries — a blank textbox or trailing comma would
    # otherwise yield "" as a candidate label, which is meaningless and can
    # break the zero-shot pipeline.
    classes = [c.strip() for c in classes_str.split(",") if c.strip()]
    predictions = owl(img, candidate_labels=classes)
    return draw_results(img, predictions)


def run_clip(files, word0, word1=""):
    """Order uploaded images by CLIP similarity to one or two text prompts.

    With only `word0`, images are sorted by embedding distance to it; with
    both words, images are arranged along the two text-defined axes.
    Returns a composite image built by `make_image`.
    """
    w0e = embed_word(word0, clip_processor, clip, DEVICE)
    w1e = embed_word(word1, clip_processor, clip, DEVICE)

    ies = []
    imgs = []
    for f in files:
        # FIX: close the underlying file handle deterministically instead of
        # leaking it until GC (PImage.open keeps the fd open while lazy).
        with PImage.open(f.name) as src:
            img = src.convert("RGB")
        # Normalize height to 256 px, preserving aspect ratio.
        img = img.resize((int(256 * img.width / img.height), 256))
        imgs.append(img)
        ies.append(embed_image(img, clip_processor, clip, DEVICE))

    if word1 == "":
        ordered_idxs = idxs_by_dist(ies, w0e)
        return make_image(imgs, ordered_idxs)
    else:
        ordered_idxs = idxs_along_axes(ies, (w0e, w1e))
        return make_image(imgs, ordered_idxs)


# NOTE(review): defined but not wired into any gr.Examples below — presumably
# intended as CLIP descriptor-pair examples; confirm before removing.
examples = [
    ("painted portrait young person", "painted portrait old person"),
    ("painted portrait happy person", "painted portrait worried person"),
]

with gr.Blocks() as demo:
    gr.Interface(
        title="Object Detection",
        description="[DETR](https://huggingface.co/facebook/detr-resnet-50) model from facebook (2020), trained on [COCO 2017](https://github.com/amikelive/coco-labels/blob/master/coco-labels-2014_2017.txt) dataset and labels.",
        api_name="object",
        fn=run_detr,
        inputs=gr.Image(type="pil"),
        outputs=gr.Image(format="jpeg"),
        flagging_mode="never",
    )
    gr.Interface(
        title="Zero-Shot Object Detection",
        # FIX: description previously linked owlv2-large-patch14-ensemble,
        # which is not the model loaded above (owlv2-base-patch16).
        description="[OWLv2](https://huggingface.co/google/owlv2-base-patch16) model from google (2023).",
        api_name="zero",
        fn=run_owl,
        inputs=[gr.Image(type="pil"), gr.Textbox(label="Object", show_label=True)],
        outputs=gr.Image(format="jpeg"),
        flagging_mode="never",
    )
    gr.Interface(
        title="Contrastive Embedding",
        description="[CLIP](https://huggingface.co/openai/clip-vit-large-patch14) model from openai (2021).",
        api_name="clip",
        fn=run_clip,
        inputs=[
            gr.File(file_count="multiple"),
            gr.Textbox(label="1st Descriptor", show_label=True),
            gr.Textbox(label="2nd Descriptor", show_label=True),
        ],
        outputs=gr.Image(format="jpeg"),
        flagging_mode="never",
    )

if __name__ == "__main__":
    demo.launch()