import gradio as gr
import numpy as np
from PIL import Image
from transformers import CLIPProcessor, CLIPModel, DetrFeatureExtractor, DetrForObjectDetection, AutoFeatureExtractor, AutoModelForObjectDetection
import torch

# DETR object detector: proposes candidate bounding boxes in the input image
feature_extractor = AutoFeatureExtractor.from_pretrained("nielsr/detr-resnet-50")
dmodel = AutoModelForObjectDetection.from_pretrained("nielsr/detr-resnet-50")

# CLIP: scores each cropped region against the input text
model = CLIPModel.from_pretrained("openai/clip-vit-base-patch32")
processor = CLIPProcessor.from_pretrained("openai/clip-vit-base-patch32")

# Gradio input/output components
i1 = gr.inputs.Image(type="pil", label="Input image")
i2 = gr.inputs.Textbox(label="Input text")
i3 = gr.inputs.Number(default=0.96, label="Detection confidence threshold")
o1 = gr.outputs.Image(type="pil", label="Cropped part")
o2 = gr.outputs.Textbox(label="Similarity score")

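# Pipeline overview: DETR proposes candidate object regions, each region is
# cropped out of the input image, and CLIP ranks the crops against the text
# prompt; the best-matching crop and its softmax score are returned.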
def extract_image(image, text, prob, num=1):

    # Run DETR on the input image
    inputs = feature_extractor(images=image, return_tensors="pt")
    outputs = dmodel(**inputs)

    logits = outputs.logits
    bboxes = outputs.pred_boxes

    # Keep only detections whose class confidence exceeds the threshold
    probas = outputs.logits.softmax(-1)[0, :, :-1]
    keep = probas.max(-1).values > prob

    # Rescale the normalized boxes to the original image size (height, width)
    outs = feature_extractor.post_process(outputs, torch.tensor(image.size[::-1]).unsqueeze(0))
    bboxes_scaled = outs[0]['boxes'][keep].detach().numpy()
    labels = outs[0]['labels'][keep].detach().numpy()
    scores = outs[0]['scores'][keep].detach().numpy()

    # Crop each kept bounding box out of the original image
    images_list = []
    im_arr = np.array(image)
    for box in bboxes_scaled:
        xmin = int(box[0])
        ymin = int(box[1])
        xmax = int(box[2])
        ymax = int(box[3])

        roi = im_arr[ymin:ymax, xmin:xmax]
        roi_im = Image.fromarray(roi)
        images_list.append(roi_im)

    # Score every crop against the text prompt with CLIP
    clip_inputs = processor(text=[text], images=images_list, return_tensors="pt", padding=True)
    clip_outputs = model(**clip_inputs)
    logits_per_text = clip_outputs.logits_per_text  # shape (1, num_crops)
    probs = logits_per_text.softmax(-1)
    l_idx = np.argsort(probs[-1].detach().numpy())[::-1][0:num]

    # Collect the top-`num` crops and their scores
    final_ims = []
    for i, crop in enumerate(images_list):
        if i in l_idx:
            json_dict = {}
            json_dict['image'] = crop
            json_dict['score'] = probs[-1].detach().numpy()[i]
            final_ims.append(json_dict)

    # Return the highest-scoring crop and its score
    fi = sorted(final_ims, key=lambda item: item['score'], reverse=True)
    return fi[0]['image'], fi[0]['score']

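# A minimal sketch of calling extract_image directly, outside Gradio (image and
# prompt taken from the examples below; the output file name is hypothetical):
#
#   img = Image.open("ex3.jpg")
#   crop, score = extract_image(img, "black bag", 0.96)
#   crop.save("cropped.jpg")
#   print(score)
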
title = "ClipnCrop"
description = "Extract the part of an image that best matches a text prompt, using OpenAI's CLIP and Facebook's DETR implemented in Hugging Face Transformers."
examples = [['ex3.jpg', 'black bag', 0.96], ['ex2.jpg', 'man in red dress', 0.85]]
article = "<p style='text-align: center'><a href='https://github.com/Vishnunkumar/clipcrop' target='_blank'>clipcrop</a></p>"

gr.Interface(fn=extract_image, inputs=[i1, i2, i3], outputs=[o1, o2], title=title, description=description, article=article, examples=examples, enable_queue=True).launch()
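# Note: gr.inputs / gr.outputs and the enable_queue flag follow an older Gradio
# API; on current Gradio releases the equivalent components (gr.Image,
# gr.Textbox, gr.Number) are passed to gr.Interface directly, with queueing
# enabled via .queue().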