cabelo commited on
Commit
3b15220
·
1 Parent(s): 64e4a83

Add application file

Browse files
Files changed (2) hide show
  1. app.py +12 -0
  2. gradio_helper.py +285 -0
app.py ADDED
@@ -0,0 +1,12 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
1
"""Launch the MultiCortex NeuroENEM Gradio demo.

Loads the int8 OpenVINO export of the Gemma-3 GAIA PT-BR 4B
vision-language model on CPU and serves it through the chat UI built by
gradio_helper.make_demo.
"""
from gradio_helper import make_demo
from optimum.intel.openvino import OVModelForVisualCausalLM
from transformers import AutoProcessor

# Local directory (or Hub repo id) holding the quantized OpenVINO model.
pathM = "Gemma-3-Gaia-PT-BR-4b-it-int8-ov"

# Load the model on CPU via OpenVINO, plus its matching processor.
model = OVModelForVisualCausalLM.from_pretrained(pathM, device="CPU")
processor = AutoProcessor.from_pretrained(pathM)

neuroEnem = make_demo(model, processor)

# share=True exposes a public Gradio link; debug=True blocks and logs.
neuroEnem.launch(share=True, debug=True)
gradio_helper.py ADDED
@@ -0,0 +1,285 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import os
2
+ import re
3
+ import tempfile
4
+ from collections.abc import Iterator
5
+ from threading import Thread
6
+
7
+ from pathlib import Path
8
+ import cv2
9
+ import gradio as gr
10
+ import requests
11
+ from PIL import Image
12
+ from transformers import TextIteratorStreamer
13
+
14
# Maximum number of images allowed across a whole conversation;
# overridable through the MAX_NUM_IMAGES environment variable.
MAX_NUM_IMAGES = int(os.getenv("MAX_NUM_IMAGES", "5"))

# Example image file names mapped to their download URLs; fetched lazily
# by download_example_images() so the UI's example prompts can reference
# local paths.
example_images = {
    "barchart.png": "https://github.com/user-attachments/assets/7779e110-691a-40db-b7db-f226cd4d06bd",
    "sunset.png": "https://github.com/user-attachments/assets/da3edb79-ae36-4973-9eaf-6ef712425faa",
    "colors.png": "https://github.com/user-attachments/assets/d8e027f5-27d9-4d4d-9195-e89f8b972cb0",
    "sign.png": "https://github.com/user-attachments/assets/491c4af5-dc55-477b-9dc0-0960742980f2",
    "integral.png": "https://github.com/user-attachments/assets/8e9662f2-01fe-485d-8110-b5ce2d0d2b27",
    "house.png": "https://github.com/user-attachments/assets/a395f740-6e9a-4fa7-823b-e2862b910891",
}
# User-facing description (Portuguese); rendered as HTML inside the UI.
DESCRIPTION = """\
O MultiCortex NeuroENEM é uma Inteligência Artificial criado para executar em computadores normais usando a tecnologia openVINO. o sistema faz o processamento com o modelo Gemma3 GAIA PT-BR 4B it para Português do Brasil, um modelo de linguagem de visão com desempenho excepcional em uma ampla gama de tarefas (superou o modelo básico Gemma no benchmark ENEM 2024). Você pode enviar imagens, imagens intercaladas e vídeos. Observe que a entrada de vídeo suporta apenas conversas de uma só vez e entrada MP4. <br> <b>Autor: Alessandro de Oliveira Faria - cabelo@multicortex.ai</b>
"""


# Remote logo image embedded in the description HTML below.
logo_path = "https://service.assuntonerd.com.br/imgs/neuroenem.png"

# Define the title and description with HTML so they align correctly
# (the <style> tag also hides the default Gradio footer).
title_with_logo = '<style>footer {display:none !important}</style><div style="display: flex; align-items: center;">' \
                  '<span>MultiCortex NeuroENEM for CPU</span></div>'

description_with_logo = f'<div style="display: flex; align-items: center;">' \
                        f'<img src="{logo_path}" style="height: 100px; margin-right: 10px;" />' \
                        f'<span>{DESCRIPTION}</span></div>'
38
+
39
def download_example_images():
    """Fetch the example images into the working directory if missing.

    Downloads each URL in ``example_images`` and saves it under its file
    name so the Gradio example prompts can reference local paths.
    """
    for file_name, url in example_images.items():
        if Path(file_name).exists():
            continue  # already cached locally
        # Fix: the original decoded response.raw unconditionally, which
        # fails confusingly on HTTP errors and could hang forever with no
        # timeout; check the status first and bound the request time.
        response = requests.get(url, stream=True, timeout=30)
        response.raise_for_status()
        Image.open(response.raw).save(file_name)
43
+
44
+
45
def count_files_in_new_message(paths: list[str]) -> tuple[int, int]:
    """Count images and videos among newly attached file paths.

    Any path ending in ``.mp4`` counts as a video; every other file is
    treated as an image. Returns ``(image_count, video_count)``.
    """
    videos = sum(1 for p in paths if p.endswith(".mp4"))
    return len(paths) - videos, videos
54
+
55
+
56
def count_files_in_history(history: list[dict]) -> tuple[int, int]:
    """Count images and videos already present in the chat history.

    Only user turns whose content is a file tuple (not plain text) are
    counted; the first element of that tuple is the file path. Returns
    ``(image_count, video_count)``.
    """
    images, videos = 0, 0
    for entry in history:
        content = entry["content"]
        if entry["role"] != "user" or isinstance(content, str):
            continue  # assistant turns and plain-text user turns carry no files
        if content[0].endswith(".mp4"):
            videos += 1
        else:
            images += 1
    return images, videos
67
+
68
+
69
def validate_media_constraints(message: dict, history: list[dict]) -> bool:
    """Check the new message plus history against the media limits.

    Rules enforced, in order: at most one video overall; a video cannot
    be mixed with images or with ``<image>`` tags; at most
    ``MAX_NUM_IMAGES`` images; when ``<image>`` tags are present, their
    count must equal the number of newly attached images. Emits a Gradio
    warning and returns False on the first violated rule.
    """
    new_images, new_videos = count_files_in_new_message(message["files"])
    old_images, old_videos = count_files_in_history(history)
    total_images = old_images + new_images
    total_videos = old_videos + new_videos

    if total_videos > 1:
        gr.Warning("Only one video is supported.")
        return False
    if total_videos == 1:
        if total_images > 0:
            gr.Warning("Não é permitido misturar imagens e vídeos.")
            return False
        if "<image>" in message["text"]:
            gr.Warning("O uso de tags <image> com arquivos de vídeo não é suportado.")
            return False
    # TODO: Add frame count validation for videos similar to image count limits # noqa: FIX002, TD002, TD003
    if total_videos == 0 and total_images > MAX_NUM_IMAGES:
        gr.Warning(f"Você pode carregar até {MAX_NUM_IMAGES} imagens.")
        return False
    if "<image>" in message["text"] and message["text"].count("<image>") != new_images:
        gr.Warning("O número de tags <image> no texto não corresponde ao número de imagens.")
        return False
    return True
92
+
93
+
94
def downsample_video(video_path: str) -> list[tuple[Image.Image, float]]:
    """Sample a video at roughly three frames per second.

    Returns a list of ``(PIL image, timestamp_seconds)`` pairs. Frames
    that fail to decode are skipped silently.
    """
    vidcap = cv2.VideoCapture(video_path)
    fps = vidcap.get(cv2.CAP_PROP_FPS)
    total_frames = int(vidcap.get(cv2.CAP_PROP_FRAME_COUNT))

    # Fix: the original used int(fps / 3) directly, which is 0 whenever
    # fps < 3 (or the FPS metadata is missing and reads as 0), making
    # range() raise "step argument must not be zero". Clamp to >= 1.
    frame_interval = max(1, int(fps / 3))
    frames: list[tuple[Image.Image, float]] = []

    for i in range(0, total_frames, frame_interval):
        vidcap.set(cv2.CAP_PROP_POS_FRAMES, i)
        success, image = vidcap.read()
        if success:
            # OpenCV decodes BGR; PIL expects RGB.
            image = cv2.cvtColor(image, cv2.COLOR_BGR2RGB)
            pil_image = Image.fromarray(image)
            # Guard the timestamp against fps == 0 metadata as well.
            timestamp = round(i / fps, 2) if fps else 0.0
            frames.append((pil_image, timestamp))

    vidcap.release()
    return frames
113
+
114
+
115
def process_video(video_path: str) -> list[dict]:
    """Turn a video into interleaved text/image chat content.

    Each sampled frame is written to a temporary PNG (``delete=False`` so
    the path outlives this call for the model to read) and emitted as a
    "Frame <timestamp>:" label followed by an image entry.
    """
    content: list[dict] = []
    for pil_image, timestamp in downsample_video(video_path):
        with tempfile.NamedTemporaryFile(delete=False, suffix=".png") as temp_file:
            # NOTE(review): saving by name while the handle is open works on
            # POSIX; confirm behavior if this ever runs on Windows.
            pil_image.save(temp_file.name)
            content.append({"type": "text", "text": f"Frame {timestamp}:"})
            content.append({"type": "image", "url": temp_file.name})
    return content
125
+
126
+
127
def process_interleaved_images(message: dict) -> list[dict]:
    """Split text containing ``<image>`` tags into interleaved content.

    Each ``<image>`` tag is replaced by the next attached file, in order;
    surrounding text fragments become stripped text entries.
    """
    # Capturing group keeps the "<image>" delimiters in the split result.
    parts = re.split(r"(<image>)", message["text"])

    content: list[dict] = []
    image_index = 0
    for part in parts:
        if part == "<image>":
            content.append({"type": "image", "url": message["files"][image_index]})
            image_index += 1
        elif part.strip():
            content.append({"type": "text", "text": part.strip()})
        # Fix: the original had a third branch that appended empty and
        # whitespace-only fragments (re.split yields "" at tag boundaries)
        # as text entries; they carry no information and are now dropped.
    return content
141
+
142
+
143
def process_new_user_message(message: dict) -> list[dict]:
    """Build the chat-content list for a freshly submitted user message.

    Dispatch order: no attachments -> text only; a video attachment ->
    text followed by sampled frames; ``<image>`` tags in the text ->
    interleaved text/images; otherwise -> text followed by every image.
    """
    files = message["files"]
    text = message["text"]

    if not files:
        return [{"type": "text", "text": text}]

    if files[0].endswith(".mp4"):
        return [{"type": "text", "text": text}, *process_video(files[0])]

    if "<image>" in text:
        return process_interleaved_images(message)

    image_entries = [{"type": "image", "url": path} for path in files]
    return [{"type": "text", "text": text}, *image_entries]
157
+
158
+
159
def process_history(history: list[dict]) -> list[dict]:
    """Convert Gradio chat history into the processor's message format.

    Consecutive non-assistant items (text or file tuples) are merged into
    one user turn; each assistant item flushes the pending user content
    first. NOTE(review): user content pending after the last assistant
    turn is dropped — ChatInterface passes the new message separately, so
    this looks intentional; confirm histories never end on a user turn.
    """
    messages: list[dict] = []
    pending: list[dict] = []
    for entry in history:
        content = entry["content"]
        if entry["role"] == "assistant":
            if pending:
                messages.append({"role": "user", "content": pending})
                pending = []
            messages.append(
                {"role": "assistant", "content": [{"type": "text", "text": content}]}
            )
        elif isinstance(content, str):
            pending.append({"type": "text", "text": content})
        else:
            # File-based content arrives as a tuple whose first element
            # is the uploaded file's path.
            pending.append({"type": "image", "url": content[0]})
    return messages
175
+
176
+
177
def make_demo(model, processor):
    """Build the NeuroENEM chat UI around *model* and *processor*.

    Parameters:
        model: a loaded model exposing ``generate`` and ``device``
            (here an OpenVINO visual causal-LM — see app.py).
        processor: the matching processor providing
            ``apply_chat_template`` and tokenizer decoding.

    Returns:
        The configured ``gr.ChatInterface``; the caller launches it.
    """
    # Ensure the example image files exist locally before the UI refers to them.
    download_example_images()

    def run(message: dict, history: list[dict], system_prompt: str = "", max_new_tokens: int = 512) -> Iterator[str]:
        """Stream the model's reply for one chat turn.

        Yields the accumulated output after each decoded chunk; yields ""
        once and stops if the media constraints are violated.
        """
        if not validate_media_constraints(message, history):
            yield ""
            return

        # Assemble the conversation: optional system turn, prior history,
        # then the newly submitted user message.
        messages = []
        if system_prompt:
            messages.append({"role": "system", "content": [{"type": "text", "text": system_prompt}]})
        messages.extend(process_history(history))
        messages.append({"role": "user", "content": process_new_user_message(message)})

        inputs = processor.apply_chat_template(
            messages,
            add_generation_prompt=True,
            tokenize=True,
            return_dict=True,
            return_tensors="pt",
        ).to(device=model.device)

        # Generate on a background thread so tokens can be streamed to
        # the UI as they are produced.
        streamer = TextIteratorStreamer(processor, skip_prompt=True, skip_special_tokens=True)
        generate_kwargs = dict(
            inputs,
            streamer=streamer,
            max_new_tokens=max_new_tokens,
        )
        t = Thread(target=model.generate, kwargs=generate_kwargs)
        t.start()

        output = ""
        for delta in streamer:
            output += delta
            yield output

    # Clickable example prompts (Portuguese); the file entries reference
    # the images fetched by download_example_images() above.
    examples = [
        [
            {
                "text": "Preciso ficar no Japão por 10 dias, visitando Tóquio, Kyoto e Osaka. Pense no número de atrações em cada uma delas e reserve um tempo para cada cidade. Faça recomendações de transporte público.",
                "files": [],
            }
        ],
        [
            {
                "text": "Escreva o código matplotlib para gerar o mesmo gráfico de barras.",
                "files": ["barchart.png"],
            }
        ],
        [
            {
                "text": "Escreva uma história curta sobre o que pode ter acontecido nesta casa.",
                "files": ["house.png"],
            }
        ],
        [
            {
                "text": "Resolva esta integral.",
                "files": ["integral.png"],
            }
        ],
        [
            {
                "text": "O que diz a placa?",
                "files": ["sign.png"],
            }
        ],
        [
            {
                "text": "Lista todos os objetos na imagem e suas cores.",
                "files": ["colors.png"],
            }
        ],
        [
            {
                "text": "Descreva a atmosfera da cena.",
                "files": ["sunset.png"],
            }
        ],
    ]

    demo = gr.ChatInterface(
        fn=run,
        type="messages",
        chatbot=gr.Chatbot(type="messages", scale=1),
        # Accept multiple image files plus .mp4 videos in one message.
        textbox=gr.MultimodalTextbox(file_types=["image", ".mp4"], file_count="multiple", autofocus=True),
        multimodal=True,
        additional_inputs=[
            gr.Textbox(label="System Prompt", value="Você é um assistente útil."),
            gr.Slider(label="Max New Tokens", minimum=100, maximum=2000, step=10, value=700),
        ],
        stop_btn=False,
        title=title_with_logo,
        description=description_with_logo,
        examples=examples,
        run_examples_on_click=False,
        cache_examples=False,
        # Purge cached upload/example files after 30 minutes.
        delete_cache=(1800, 1800),
    )


    return demo
280
+
281
+
282
+ # title="NeuroENEM for CPU",
283
+ # description=DESCRIPTION,
284
+
285
+ # gr.Dropdown(["CPU", "GPU", "NPU"], label="Device", info="Your device!"),