File size: 2,417 Bytes
e98dbd2
 
 
 
 
a29447f
d969a42
e98dbd2
 
856cff1
e98dbd2
 
 
 
856cff1
76085cf
e98dbd2
 
856cff1
 
a29447f
e98dbd2
 
 
 
 
 
 
 
 
 
ece2f26
e98dbd2
 
 
 
 
 
 
 
 
 
 
 
ece2f26
3def095
e98dbd2
c9409e6
ece2f26
ffe2dca
ece2f26
e98dbd2
ece2f26
e98dbd2
2152207
e98dbd2
 
 
 
 
92159f4
 
ece2f26
376ce67
92159f4
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
import os

import torch
import torch.nn.functional as F
import torchvision.transforms as T
from uniformer import uniformer_small
from imagenet_class_index import imagenet_classnames

import gradio as gr
from huggingface_hub import hf_hub_download

# Device on which to run the model
# Set to cuda to load on GPU
device = "cpu"

# Download the pretrained checkpoint from the Hugging Face Hub.
# `repo_id` must be the "<namespace>/<repo_name>" identifier, not the full
# repository URL; the hub client builds the download URL itself.
model_path = hf_hub_download(repo_id="Andy1621/uniformer", filename="uniformer_small_in1k.pth", revision="main")

# Build the network and load the pretrained weights, which are read from the
# 'model' entry of the checkpoint. The checkpoint is always deserialized onto
# the CPU first; the model is moved to `device` afterwards.
model = uniformer_small()
state_dict = torch.load(model_path, map_location='cpu')
model.load_state_dict(state_dict['model'])

# Set to eval mode and move to desired device
model = model.to(device)
model = model.eval()

# Create an id to label name mapping (the second element of each entry in
# `imagenet_classnames` is used as the class name)
imagenet_id_to_classname = {k: v[1] for k, v in imagenet_classnames.items()}


def inference(img):
    """Classify an image with the module-level UniFormer model.

    Args:
        img: Image to classify. It is fed straight into the torchvision
            preprocessing pipeline below (the Gradio input component in this
            file is configured with ``type='pil'``).

    Returns:
        dict: Maps class names from ``imagenet_id_to_classname`` to their
        softmax probability as a Python float.
    """
    image_transform = T.Compose(
        [
            T.Resize(224),
            T.CenterCrop(224),
            T.ToTensor(),
            T.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225]),
        ]
    )

    # The model expects inputs of shape: B x C x H x W.
    # Move the batch to the same device as the model so that switching
    # `device` to "cuda" does not cause a device-mismatch error.
    batch = image_transform(img).unsqueeze(0).to(device)

    # Pure inference: disable autograd so no graph is built per request.
    with torch.no_grad():
        logits = model(batch)
        probabilities = F.softmax(logits, dim=1).flatten()

    # One bulk transfer to Python floats instead of one tensor read per class.
    scores = probabilities.tolist()

    return {imagenet_id_to_classname[str(i)]: score for i, score in enumerate(scores)}
    

# Gradio components: the image is handed to `inference` as a PIL image, and
# the output widget displays the five highest-scoring labels.
inputs = gr.inputs.Image(type='pil')
label = gr.outputs.Label(num_top_classes=5)

# Text displayed on the demo page
title = "UniFormer-S"
description = "Gradio demo for UniFormer: To use it, simply upload your image, or click one of the examples to load them. Read more at the links below."
article = "<p style='text-align: center'><a href='https://arxiv.org/abs/2201.09450' target='_blank'>UniFormer: Unifying Convolution and Self-attention for Visual Recognition</a> | <a href='https://github.com/Sense-X/UniFormer' target='_blank'>Github Repo</a></p>"

# Example inputs offered in the UI; each row holds the single image argument
examples = [
    ['library.jpeg'],
    ['cat.png'],
    ['dog.png'],
    ['panda.png'],
]

demo = gr.Interface(
    fn=inference,
    inputs=inputs,
    outputs=label,
    title=title,
    description=description,
    article=article,
    examples=examples,
)

# NOTE(review): `cache_examples` is passed to launch() here; some Gradio
# releases expect it on the Interface constructor instead - confirm against
# the Gradio version pinned for this app.
demo.launch(enable_queue=True, cache_examples=True)