File size: 6,279 Bytes
e2c6636
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
83a6c8a
e2c6636
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
import requests
from PIL import Image
import gradio as gr
from transformers import AutoProcessor, Blip2ForConditionalGeneration
import torch


# Custom CSS injected into the Blocks app: centers and styles the main chat
# column, pins the prompt textbox to the bottom of the viewport, and makes the
# chatbot component scrollable.
# NOTE(review): this constant is currently unused — gr.Blocks below is built
# with an inline css= string instead; confirm whether it should be passed in.
css = """
#column_container {
  position: relative;
  height: 800px;
  max-width: 700px;
  display: flex;
  flex-direction: column;
  background-color: lightgray;
  border: 1px solid gray;
  border-radius: 5px;
  padding: 10px;
  box-shadow: 2px 2px 5px gray;
  margin-left: auto; 
  margin-right: auto;
}
#input_prompt {
  position: fixed;
  bottom: 0;
  max-width: 680px;
}
#chatbot-component {
  overflow: auto;
}
"""

# Load the BLIP-2 (OPT-2.7B) processor and model once at import time.
# The model weights are loaded in float16 to halve memory use; this pairs with
# the .to(device, torch.float16) cast applied to inputs inside predict().
processor = AutoProcessor.from_pretrained("Salesforce/blip2-opt-2.7b")
model = Blip2ForConditionalGeneration.from_pretrained("Salesforce/blip2-opt-2.7b", torch_dtype=torch.float16) 

# Prefer GPU when available; float16 inference on CPU may be slow or
# unsupported depending on the torch build — TODO confirm CPU fallback works.
device = "cuda" if torch.cuda.is_available() else "cpu"
model.to(device)

def upload_button_config():
    """Return a Gradio update that hides the upload button.

    Wired to chatbot.change so the button disappears once a
    conversation has started.
    """
    hide_button = gr.update(visible=False)
    return hide_button

def upload_textbox_config(text_in):
    """Return a Gradio update that reveals the question textbox.

    `text_in` is unused here but required so the function matches the
    inputs/outputs wiring of the upload event listener.
    """
    show_textbox = gr.update(visible=True)
    return show_textbox

#takes input and generates the Response
def predict(btn_upload, counter,image_hid, input, history):
    """Chatbot step handler shared by the upload and submit event listeners.

    On the very first call (counter == 0) it treats btn_upload as the freshly
    uploaded image: resizes it to 512px width, saves it to disk, and echoes it
    into the chat as an <img> tag. On every later call it runs BLIP-2 Q&A on
    the image carried in `image_hid` using the typed question.

    Args:
        btn_upload: upload payload from gr.UploadButton; passed to Image.open,
            so presumably a path or file-like object — TODO confirm.
        counter: turn counter from the hidden gr.Number; 0 means the image
            has not been shown yet.
        image_hid: image carried between turns via the hidden gr.Image.
        input: the user's question text (shadows the `input` builtin).
        history: list of (user, bot) tuples, or None on the first turn.

    Returns:
        Tuple matching the event listener outputs:
        (chatbot history, state history, saved image filename, incremented
        counter, image for the hidden component).
    """
    
    if counter == 0:
      image_in = Image.open(btn_upload)
      #Resizing the image to a fixed 512px width, preserving aspect ratio
      basewidth = 512
      wpercent = (basewidth/float(image_in.size[0]))
      hsize = int((float(image_in.size[1])*float(wpercent)))
      image_in = image_in.resize((basewidth,hsize)) #, Image.Resampling.LANCZOS)
      # Save the image to a fixed local file so Gradio can serve it via /file=
      #seed = random.randint(0, 1000000)
      img_name = "uploaded_image.png" #f"./edited_image_{seed}.png"
      image_in.save(img_name)
      # First turn: show the uploaded image itself as the bot "response"
      history = history or []
      response = '<img src="/file=' + img_name + '">'
      history.append((input, response))
      counter += 1
      return history, history, img_name, counter, image_in

    # Subsequent turns: run BLIP-2 on the carried image with the new question.
    #image = Image.open(btn_upload)
    print(f"prompt is :{input}") #Question: Is this photo unusual? Answer:
    prompt = f"Question: {input} Answer: "
    # Cast inputs to the model's device/dtype (float16, matching load-time dtype)
    inputs = processor(image_hid, text=prompt, return_tensors="pt").to(device, torch.float16)
    
    # Generate a short answer (capped at 10 new tokens) and decode it
    generated_ids = model.generate(**inputs, max_new_tokens=10)
    generated_text = processor.batch_decode(generated_ids, skip_special_tokens=True)[0].strip()
    print(f"generated_text is : {generated_text}")

    # Append the (question, answer) pair to the chat state
    history = history or []
    response = generated_text #'<img src="/file=' + img_name + '">'
    history.append((input, response))
    counter += 1
    return history, history, "uploaded_image.png", counter, image_hid

#Blocks Layout
# Builds the two-column UI (intro panel + chat column), wires the event
# listeners, and launches the app. Fix: corrected the user-facing typo
# "commensense" -> "commonsense" in the intro HTML.
with gr.Blocks(css="#chatbot-component {height: 800px}") as demo:  
  with gr.Row():
    with gr.Column(scale=1):
        #with gr.Accordion("See details"):
        # Static intro/header describing the BLIP-2 model and its authors
        gr.HTML("""<div style="text-align: center; max-width: 700px; margin: 0 auto;">
                    <div
                style="
                    display: inline-flex;
                    align-items: center;
                    gap: 0.8rem;
                    font-size: 1.75rem;
                "
                >
                <h1 style="font-weight: 900; margin-bottom: 7px; margin-top: 5px;">
                    Bringing Visual Conversations to Life with BLIP2
                </h1>
                </div>
                <p style="margin-bottom: 10px; font-size: 94%">
                Blip2 is functioning as an <b>instructed zero-shot image-to-text generation</b> model using OPT-2.7B in this Space. 
                It shows a wide range of capabilities including visual conversation, visual knowledge reasoning, visual commonsense reasoning, storytelling, 
                personalized image-to-text generation etc.<br>
                BLIP-2 by <a href="https://huggingface.co/Salesforce" target="_blank">Salesforce</a> is now available in🤗Transformers! 
                This model was contributed by <a href="https://twitter.com/NielsRogge" target="_blank">nielsr</a>. 
                The BLIP-2 model was proposed in <a href="https://arxiv.org/abs/2301.12597" target="_blank">BLIP-2: Bootstrapping Language-Image Pre-training with Frozen Image Encoders and Large Language Models</a> 
                by Junnan Li, Dongxu Li, Silvio Savarese, Steven Hoi.<br><br>
                </p></div>""")
  
    with gr.Column(elem_id = "column_container", scale=2):
        #text_in = gr.Textbox(value='', placeholder="Type your questions here and press enter", elem_id = "input_prompt", visible=False, label='Great! Now you can ask questions to get more information about the image')
        # Visible widgets: upload button, chat window, and (initially hidden) prompt box
        btn_upload = gr.UploadButton("Upload image!", file_types=["image"], file_count="single", elem_id="upload_button")
        chatbot = gr.Chatbot(elem_id = 'chatbot-component', label='Converse with Images')
        text_in = gr.Textbox(value='', placeholder="Type your questions here and press enter", elem_id = "input_prompt", visible=False, label='Great! Now you can ask questions to get more information about the image')
        # Hidden state carriers used by predict(): chat history, turn counter,
        # saved image filename, and the image itself
        state_in = gr.State()
        counter_out = gr.Number(visible=False, value=0, precision=0)
        text_out = gr.Textbox(visible=False)  #getting imag name out
        image_hid = gr.Image(visible=False) #, type='pil')

  #Using Event Listeners
  # On upload: run the first predict() turn, then reveal the question textbox
  btn_upload.upload(predict, [btn_upload, counter_out, image_hid, text_in, state_in], [chatbot, state_in, text_out, counter_out, image_hid])
  btn_upload.upload(fn = upload_textbox_config, inputs=text_in, outputs = text_in)

  # On question submit: run a Q&A turn against the stored image
  text_in.submit(predict, [btn_upload, counter_out, image_hid, text_in, state_in], [chatbot, state_in, text_out, counter_out, image_hid])
  #text_in.submit(previous, [image_hid], [image_oneup])
  
  # Once the chat has content, hide the upload button
  chatbot.change(fn = upload_button_config, outputs=btn_upload) #, scroll_to_output = True)
  #text_in.submit(None, [], [], _js = "() => document.getElementById('#chatbot-component').scrollTop = document.getElementById('#chatbot-component').scrollHeight")

  #with gr.Accordion("Release Notes", open=False):
  #gr.Markdown(help_text)
  gr.HTML("""<a href="https://huggingface.co/spaces/ysharma/InstructPix2Pix_Chatbot?duplicate=true"><img src="https://bit.ly/3gLdBN6" alt="Duplicate Space"></a>Duplicate Space with GPU Upgrade for fast Inference & no queue<br>""")
    
# Queue requests (up to 10 concurrent workers) and start the server
demo.queue(concurrency_count=10)
demo.launch(debug=True) #, width="80%", height=2000)