Tojichok committed on
Commit
2d34d61
·
verified ·
1 Parent(s): 8f8bd38

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +46 -280
app.py CHANGED
@@ -1,285 +1,51 @@
1
- from io import BytesIO
2
-
3
- import string
4
  import gradio as gr
5
- import requests
6
- from utils import Endpoint, get_token
7
-
8
-
9
def encode_image(image):
    """Serialize a PIL-style image to an in-memory JPEG stream.

    Args:
        image: Object exposing the PIL ``Image`` interface used here
            (``mode``, ``convert`` and ``save``).

    Returns:
        BytesIO: Buffer holding the JPEG bytes, rewound to position 0 so it
        can be passed directly as an upload file object.
    """
    # JPEG has no alpha channel or palette; PIL refuses to save such modes
    # (e.g. RGBA or P from a PNG), so normalise them to RGB first.
    if image.mode not in ("RGB", "L", "CMYK"):
        image = image.convert("RGB")

    buffered = BytesIO()
    image.save(buffered, format="JPEG")
    buffered.seek(0)

    return buffered
15
-
16
-
17
def query_chat_api(
    image, prompt, decoding_method, temperature, len_penalty, repetition_penalty
):
    """POST an image and a prompt to the backend's ``/api/generate`` route.

    Args:
        image: Image passed to ``encode_image`` and uploaded as the
            ``image`` file field.
        prompt: Text sent as the ``prompt`` form field.
        decoding_method: Nucleus sampling is requested only when this equals
            ``"Nucleus sampling"``.
        temperature: Forwarded as ``temperature``.
        len_penalty: Forwarded as ``length_penalty``.
        repetition_penalty: Forwarded as ``repetition_penalty``.

    Returns:
        The decoded JSON body when the server answers HTTP 200; otherwise the
        string ``"Error: "`` followed by the response text.

    Raises:
        requests.exceptions.RequestException: On connection failure or when
            the timeout below expires.
    """
    url = endpoint.url + "/api/generate"

    headers = {
        "User-Agent": "BLIP-2 HuggingFace Space",
        "Auth-Token": get_token(),
    }

    data = {
        "prompt": prompt,
        "use_nucleus_sampling": decoding_method == "Nucleus sampling",
        "temperature": temperature,
        "length_penalty": len_penalty,
        "repetition_penalty": repetition_penalty,
    }

    files = {"image": encode_image(image)}

    # requests waits indefinitely by default; bound the wait so one stalled
    # backend call cannot block this worker forever.
    # (connect timeout, read timeout) in seconds.
    response = requests.post(
        url, data=data, files=files, headers=headers, timeout=(10, 300)
    )

    if response.status_code == 200:
        return response.json()
    return "Error: " + response.text
46
-
47
-
48
def query_caption_api(
    image, decoding_method, temperature, len_penalty, repetition_penalty
):
    """POST an image to the backend's ``/api/caption`` route.

    Args:
        image: Image passed to ``encode_image`` and uploaded as the
            ``image`` file field.
        decoding_method: Nucleus sampling is requested only when this equals
            ``"Nucleus sampling"``.
        temperature: Forwarded as ``temperature``.
        len_penalty: Forwarded as ``length_penalty``.
        repetition_penalty: Forwarded as ``repetition_penalty``.

    Returns:
        The decoded JSON body when the server answers HTTP 200; otherwise the
        string ``"Error: "`` followed by the response text.

    Raises:
        requests.exceptions.RequestException: On connection failure or when
            the timeout below expires.
    """
    url = endpoint.url + "/api/caption"

    headers = {
        "User-Agent": "BLIP-2 HuggingFace Space",
        "Auth-Token": get_token(),
    }

    data = {
        "use_nucleus_sampling": decoding_method == "Nucleus sampling",
        "temperature": temperature,
        "length_penalty": len_penalty,
        "repetition_penalty": repetition_penalty,
    }

    files = {"image": encode_image(image)}

    # requests waits indefinitely by default; bound the wait so one stalled
    # backend call cannot block this worker forever.
    # (connect timeout, read timeout) in seconds.
    response = requests.post(
        url, data=data, files=files, headers=headers, timeout=(10, 300)
    )

    if response.status_code == 200:
        return response.json()
    return "Error: " + response.text
76
-
77
-
78
def postprocess_output(output):
    """Make sure the first generated answer ends with punctuation.

    Args:
        output: List whose first element is the generated text. A bare
            string (the ``"Error: ..."`` value the API helpers return on a
            non-200 response) is also accepted.

    Returns:
        list: For list input, the same list object, with ``"."`` appended to
        the first element when it does not already end in a character from
        ``string.punctuation``. An empty list, or a list whose first element
        is empty, is returned unchanged. A bare string is returned unmodified
        inside a one-element list, so callers can always treat the result as
        a list.
    """
    # Error messages arrive as plain strings; indexing/assigning into them
    # would fail, so hand them back as a single list item.
    if isinstance(output, str):
        return [output]

    # Nothing to punctuate.
    if not output or not output[0]:
        return output

    # If the last character is not punctuation, add a full stop.
    if output[0][-1] not in string.punctuation:
        output[0] += "."

    return output
84
-
85
-
86
def inference_chat(
    image,
    text_input,
    decoding_method,
    temperature,
    length_penalty,
    repetition_penalty,
    history=None,
):
    """Run one chat turn and return the updated chatbot view and history.

    Args:
        image: Image forwarded to ``query_chat_api``.
        text_input: The user's new message.
        decoding_method: Forwarded to ``query_chat_api``.
        temperature: Forwarded to ``query_chat_api``.
        length_penalty: Forwarded to ``query_chat_api``.
        repetition_penalty: Forwarded to ``query_chat_api``.
        history: Flat list of alternating user/model messages. It is
            extended in place. ``None`` starts a fresh conversation.

    Returns:
        dict: ``{chatbot: chat, state: history}`` where ``chat`` is the
        history regrouped into ``(user, model)`` tuples.
    """
    # A literal ``[]`` default would be shared by every call that omits
    # ``history``, leaking one conversation into the next.
    if history is None:
        history = []

    history.append(text_input)

    # The whole conversation so far is sent as the prompt.
    prompt = " ".join(history)

    output = query_chat_api(
        image, prompt, decoding_method, temperature, length_penalty, repetition_penalty
    )
    if isinstance(output, str):
        # On failure the API helper returns an "Error: ..." string; keep it
        # as one message instead of extending history character by character.
        output = [output]
    else:
        output = postprocess_output(output)
    history += output

    # Pair consecutive entries: (user message, model reply).
    chat = [
        (history[i], history[i + 1]) for i in range(0, len(history) - 1, 2)
    ]

    return {chatbot: chat, state: history}
111
-
112
-
113
def inference_caption(
    image,
    decoding_method,
    temperature,
    length_penalty,
    repetition_penalty,
):
    """Caption an image through ``query_caption_api``.

    Args:
        image: Image forwarded to ``query_caption_api``.
        decoding_method: Forwarded to ``query_caption_api``.
        temperature: Forwarded to ``query_caption_api``.
        length_penalty: Forwarded to ``query_caption_api``.
        repetition_penalty: Forwarded to ``query_caption_api``.

    Returns:
        The first element of the API result, or the full ``"Error: ..."``
        string when the API helper reports a failure.
    """
    output = query_caption_api(
        image, decoding_method, temperature, length_penalty, repetition_penalty
    )

    # On a non-200 response the helper returns a plain string; indexing it
    # would show only its first character ("E").
    if isinstance(output, str):
        return output

    return output[0]
125
-
126
-
127
# Static page content rendered at the top of the demo.
title = """<h1 align="center">BLIP-2</h1>"""
description = """Gradio demo for BLIP-2, image-to-text generation from Salesforce Research. To use it, simply upload your image, or click one of the examples to load them.
<br> <strong>Disclaimer</strong>: This is a research prototype and is not intended for production use. No data including but not restricted to text and images is collected."""
article = """<strong>Paper</strong>: <a href='https://arxiv.org/abs/2301.12597' target='_blank'>BLIP-2: Bootstrapping Language-Image Pre-training with Frozen Image Encoders and Large Language Models</a>
<br> <strong>Code</strong>: BLIP2 is now integrated into GitHub repo: <a href='https://github.com/salesforce/LAVIS' target='_blank'>LAVIS: a One-stop Library for Language and Vision</a>
<br> <strong>🤗 `transformers` integration</strong>: You can now use `transformers` to use our BLIP-2 models! Check out the <a href='https://huggingface.co/docs/transformers/main/en/model_doc/blip-2' target='_blank'> official docs </a>
<p> <strong>Project Page</strong>: <a href='https://github.com/salesforce/LAVIS/tree/main/projects/blip2' target='_blank'> BLIP2 on LAVIS</a>
<br> <strong>Description</strong>: Captioning results from <strong>BLIP2_OPT_6.7B</strong>. Chat results from <strong>BLIP2_FlanT5xxl</strong>.

<p><strong>We have now suspended the official BLIP2 demo from March 23. 2023. </strong>
<p><strong>For example usage, see notebooks https://github.com/salesforce/LAVIS/tree/main/examples.</strong>
"""

# Backend descriptor; the query helpers read ``endpoint.url``.
endpoint = Endpoint()

# (image file, chat prompt) pairs offered as clickable examples.
examples = [
    ["house.png", "How could someone get out of the house?"],
    ["flower.jpg", "Question: What is this flower and where is it's origin? Answer:"],
    ["pizza.jpg", "What are steps to cook it?"],
    ["sunset.jpg", "Here is a romantic message going along the photo:"],
    ["forbidden_city.webp", "In what dynasties was this place built?"],
]

# NOTE(review): the layout nesting below was reconstructed from a listing
# that had lost its indentation — confirm against the rendered UI.
# The CSS targets "#component-21", which presumably depends on the order in
# which components are created, so keep the creation order unchanged.
with gr.Blocks(
    css="""
    .message.svelte-w6rprc.svelte-w6rprc.svelte-w6rprc {font-size: 20px; margin-top: 20px}
    #component-21 > div.wrap.svelte-w6rprc {height: 600px;}
    """
) as iface:
    # Flat list of alternating user/model chat messages.
    state = gr.State([])

    gr.Markdown(title)
    gr.Markdown(description)
    gr.Markdown(article)

    with gr.Row():
        with gr.Column(scale=1):
            image_input = gr.Image(type="pil", interactive=False)

            sampling = gr.Radio(
                choices=["Beam search", "Nucleus sampling"],
                value="Beam search",
                label="Text Decoding Method",
                interactive=True,
            )

            temperature = gr.Slider(
                minimum=0.5,
                maximum=1.0,
                value=1.0,
                step=0.1,
                interactive=True,
                label="Temperature (used with nucleus sampling)",
            )

            len_penalty = gr.Slider(
                minimum=-1.0,
                maximum=2.0,
                value=1.0,
                step=0.2,
                interactive=True,
                label="Length Penalty (set to larger for longer sequence, used with beam search)",
            )

            rep_penalty = gr.Slider(
                minimum=1.0,
                maximum=5.0,
                value=1.5,
                step=0.5,
                interactive=True,
                label="Repeat Penalty (larger value prevents repetition)",
            )

        with gr.Column(scale=1.8):

            with gr.Column():
                caption_output = gr.Textbox(lines=1, label="Caption Output")
                caption_button = gr.Button(
                    value="Caption it!", interactive=True, variant="primary"
                )
                caption_button.click(
                    inference_caption,
                    [
                        image_input,
                        sampling,
                        temperature,
                        len_penalty,
                        rep_penalty,
                    ],
                    [caption_output],
                )

            gr.Markdown("""Trying prompting your input for chat; e.g. example prompt for QA, \"Question: {} Answer:\" Use proper punctuation (e.g., question mark).""")
            with gr.Row():
                with gr.Column(
                    scale=1.5,
                ):
                    chatbot = gr.Chatbot(
                        label="Chat Output (from FlanT5)",
                    )

                with gr.Column(scale=1):
                    chat_input = gr.Textbox(lines=1, label="Chat Input")

                    # Pressing Enter and clicking "Submit" run the same chat
                    # turn, so both events share one input/output wiring.
                    chat_inputs = [
                        image_input,
                        chat_input,
                        sampling,
                        temperature,
                        len_penalty,
                        rep_penalty,
                        state,
                    ]
                    chat_outputs = [chatbot, state]

                    chat_input.submit(inference_chat, chat_inputs, chat_outputs)

                    with gr.Row():
                        clear_button = gr.Button(value="Clear", interactive=True)
                        clear_button.click(
                            lambda: ("", [], []),
                            [],
                            [chat_input, chatbot, state],
                            queue=False,
                        )

                        submit_button = gr.Button(
                            value="Submit", interactive=True, variant="primary"
                        )
                        submit_button.click(
                            inference_chat, chat_inputs, chat_outputs
                        )

            # A new image invalidates the conversation and the caption.
            # The chatbot is reset with an empty list, matching "Clear".
            image_input.change(
                lambda: ([], "", []),
                [],
                [chatbot, caption_output, state],
                queue=False,
            )

    # Not assigned back to ``examples``, so the list above keeps its meaning.
    gr.Examples(
        examples=examples,
        inputs=[image_input, chat_input],
    )

iface.queue(concurrency_count=1, api_open=False, max_size=10)
iface.launch(enable_queue=True)
 
1
+ import os, io, requests
 
 
2
  import gradio as gr
3
+ from PIL import Image
4
+ import torch
5
+ from transformers import Blip2Processor, Blip2ForConditionalGeneration
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
6
 
7
# 1) Load BLIP-2
_MODEL_ID = "Salesforce/blip2-opt-2.7b"
# Fall back to CPU when no GPU is present instead of failing at import time
# on the hard-coded "cuda" device. Half precision is used only on the GPU;
# on CPU the weights stay in float32.
_DEVICE = "cuda" if torch.cuda.is_available() else "cpu"
_DTYPE = torch.float16 if _DEVICE == "cuda" else torch.float32
model = Blip2ForConditionalGeneration.from_pretrained(
    _MODEL_ID, torch_dtype=_DTYPE
).to(_DEVICE)
processor = Blip2Processor.from_pretrained(_MODEL_ID)

# 2) TMDb API
# The key is read from the environment; it is an empty string when unset.
TMDB_KEY = os.environ.get("TMDB_API_KEY", "")
TMDB_SEARCH_URL = "https://api.themoviedb.org/3/search/movie"
14
 
15
def caption_and_search(image: Image.Image, dummy=None):
    """Caption an image with BLIP-2, then search TMDb for that caption.

    Args:
        image: PIL image (movie poster or frame). ``None`` yields an empty
            result without running the model.
        dummy: Unused. Kept, now optional, so callers that still pass a
            second positional argument keep working.

    Returns:
        dict: ``{"caption": str, "results": list}`` where ``results`` holds
        up to three ``{"title": ..., "url": ...}`` entries built from the
        TMDb search response.

    Raises:
        requests.exceptions.RequestException: If the TMDb request fails,
            times out, or returns an HTTP error status.
    """
    if image is None:
        return {"caption": "", "results": []}

    # --- Caption generation ---
    # Cast the pixel tensor to the model's dtype as well as its device;
    # a half-precision model otherwise receives float32 inputs.
    inputs = processor(images=image, return_tensors="pt").to(
        model.device, model.dtype
    )
    gen = model.generate(**inputs, max_new_tokens=50)
    caption = processor.decode(gen[0], skip_special_tokens=True).strip()

    # An empty query cannot match anything, so skip the network call.
    if not caption:
        return {"caption": caption, "results": []}

    # --- TMDb search ---
    params = {"api_key": TMDB_KEY, "query": caption}
    # Bound the wait, and surface HTTP errors (e.g. a missing or invalid
    # API key) instead of silently reporting "no matches".
    resp = requests.get(TMDB_SEARCH_URL, params=params, timeout=10)
    resp.raise_for_status()
    payload = resp.json()

    # Entries without an id cannot be linked to, so they are skipped.
    matches = [m for m in payload.get("results", []) if "id" in m][:3]
    results = [
        {
            "title": m.get("title"),
            "url": f"https://www.themoviedb.org/movie/{m['id']}",
        }
        for m in matches
    ]
    return {"caption": caption, "results": results}
34
+
35
# 3) Gradio interface
def _caption_and_search_outputs(image):
    """Adapt ``caption_and_search`` to the two output components below."""
    # Gradio interprets a returned dict as a {component: value} mapping, so
    # the {"caption", "results"} dict is unpacked into one value per output.
    result = caption_and_search(image, None)
    return result["caption"], result["results"]


iface = gr.Interface(
    fn=_caption_and_search_outputs,
    # Only the image is a real input; no hidden placeholder field is needed.
    inputs=[
        gr.Image(type="pil", label="Постер/кадр фильма"),
    ],
    outputs=[
        gr.Textbox(label="Generated Caption"),
        gr.JSON(label="Top 3 Matches (title + link)"),
    ],
    title="Movie Poster Caption & Search",
    description="BLIP-2 → TMDb search: получаем описание и ссылки на фильмы",
)

if __name__ == "__main__":
    iface.launch()