openfree committed on
Commit
59d9300
·
verified ·
1 Parent(s): f881e03

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +409 -254
app.py CHANGED
@@ -2,10 +2,13 @@
2
 
3
  import os
4
  import re
 
5
  import gc
6
  from collections.abc import Iterator
7
  from threading import Thread
 
8
  import requests
 
9
  import gradio as gr
10
  import spaces
11
  import torch
@@ -15,7 +18,12 @@ from PIL import Image
15
  from transformers import AutoProcessor, Gemma3ForConditionalGeneration, TextIteratorStreamer
16
  import time
17
  import warnings
18
- from typing import Union
 
 
 
 
 
19
 
20
  warnings.filterwarnings('ignore')
21
 
@@ -24,7 +32,9 @@ print("๐ŸŽฎ ๋กœ๋ด‡ ์‹œ๊ฐ ์‹œ์Šคํ…œ ์ดˆ๊ธฐํ™” (Gemma3-R1984-4B)...")
24
  ##############################################################################
25
  # 상수 정의
26
  ##############################################################################
 
27
  MAX_INPUT_LENGTH = 2096
 
28
  SERPHOUSE_API_KEY = os.getenv("SERPHOUSE_API_KEY", "")
29
 
30
  ##############################################################################
@@ -75,17 +85,17 @@ def do_web_search(query: str) -> str:
75
  "domain": "google.com",
76
  "serp_type": "web",
77
  "device": "desktop",
78
- "lang": "ko",
79
- "num": "5" # 5๊ฐœ๋กœ ์ œํ•œ (๋กœ๋ด‡ ์ž‘์—…์— ์ถฉ๋ถ„)
80
  }
81
 
82
  headers = {
83
  "Authorization": f"Bearer {SERPHOUSE_API_KEY}"
84
  }
85
 
86
- logger.info(f"๋กœ๋ด‡ ์ž‘์—… ๊ด€๋ จ ๊ฒ€์ƒ‰: {query}")
87
 
88
- response = requests.get(url, headers=headers, params=params, timeout=30)
89
  response.raise_for_status()
90
 
91
  data = response.json()
@@ -94,22 +104,91 @@ def do_web_search(query: str) -> str:
94
  organic = results.get("organic", []) if isinstance(results, dict) else []
95
 
96
  if not organic:
97
- return "๊ฒ€์ƒ‰ ๊ฒฐ๊ณผ๊ฐ€ ์—†์Šต๋‹ˆ๋‹ค."
98
-
99
- # ๋กœ๋ด‡ ์ž‘์—…์— ์œ ์šฉํ•œ ์ •๋ณด๋งŒ ์ถ”์ถœ
100
- search_info = "# ๋กœ๋ด‡ ์ž‘์—… ์ฐธ๊ณ  ์ •๋ณด\n\n"
101
- for idx, item in enumerate(organic[:5], 1):
102
- title = item.get("title", "")
103
- snippet = item.get("snippet", "")
104
- link = item.get("link", "")
 
 
 
105
 
106
- search_info += f"{idx}. {title}\n{snippet}\n์ถœ์ฒ˜: {link}\n\n"
 
 
 
 
 
 
 
 
 
 
 
 
107
 
108
- return search_info
 
109
 
110
  except Exception as e:
111
  logger.error(f"์›น ๊ฒ€์ƒ‰ ์‹คํŒจ: {e}")
112
- return ""
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
113
 
114
  ##############################################################################
115
  # 모델 로드
@@ -145,12 +224,12 @@ def load_model():
145
  return False
146
 
147
  ##############################################################################
148
- # ๋กœ๋ด‡ ์ž‘์—… ๋ถ„์„ ํ•จ์ˆ˜
149
  ##############################################################################
150
  @spaces.GPU(duration=60)
151
- def analyze_for_robot_task(
152
  image: Union[np.ndarray, Image.Image],
153
- task_description: str,
154
  task_type: str = "general",
155
  use_web_search: bool = False,
156
  enable_thinking: bool = True,
@@ -168,78 +247,44 @@ def analyze_for_robot_task(
168
  if isinstance(image, np.ndarray):
169
  image = Image.fromarray(image).convert('RGB')
170
 
171
- # ํƒœ์Šคํฌ๋ณ„ ์‹œ์Šคํ…œ ํ”„๋กฌํ”„ํŠธ
172
  system_prompts = {
173
- "general": """๋‹น์‹ ์€ ๊ณ ๊ธ‰ ๋กœ๋ด‡ ์‹œ๊ฐ ์‹œ์Šคํ…œ์ž…๋‹ˆ๋‹ค.
174
- ์ด๋ฏธ์ง€๋ฅผ ๋ถ„์„ํ•˜์—ฌ ๋กœ๋ด‡์ด ์ˆ˜ํ–‰ํ•  ์ˆ˜ ์žˆ๋Š” ์ž‘์—…์„ ์‹๋ณ„ํ•˜๊ณ  ์„ค๋ช…ํ•˜์„ธ์š”.
175
- ํ™˜๊ฒฝ์˜ ๊ฐ์ฒด, ์œ„์น˜, ์ž ์žฌ์  ์œ„ํ—˜ ์š”์†Œ๋ฅผ ํฌํ•จํ•˜์—ฌ ์ƒ์„ธํžˆ ๋ถ„์„ํ•˜์„ธ์š”.""",
176
-
177
- "planning": """๋‹น์‹ ์€ ๋กœ๋ด‡ ์ž‘์—… ๊ณ„ํš ์ „๋ฌธ๊ฐ€์ž…๋‹ˆ๋‹ค.
178
- ์ฃผ์–ด์ง„ ์ž‘์—…์„ ์œ„ํ•œ ๋‹จ๊ณ„๋ณ„ ์‹คํ–‰ ๊ณ„ํš์„ ์ž‘์„ฑํ•˜์„ธ์š”.
179
- ๊ฐ ๋‹จ๊ณ„๋Š” ๋ช…ํ™•ํ•˜๊ณ  ์‹คํ–‰ ๊ฐ€๋Šฅํ•ด์•ผ ํ•ฉ๋‹ˆ๋‹ค.
180
-
181
- ํ˜•์‹:
182
- Step_1: [๋™์ž‘] - [์ƒ์„ธ ์„ค๋ช…]
183
- Step_2: [๋™์ž‘] - [์ƒ์„ธ ์„ค๋ช…]
184
- ...
185
- Step_n: [๋™์ž‘] - [์ƒ์„ธ ์„ค๋ช…]
186
-
187
- ์•ˆ์ „ ๊ณ ๋ ค์‚ฌํ•ญ: [์ฃผ์˜์‚ฌํ•ญ]""",
188
-
189
- "grounding": """๋‹น์‹ ์€ ์ •๋ฐ€ํ•œ ๊ฐ์ฒด ์œ„์น˜ ํŒŒ์•… ์‹œ์Šคํ…œ์ž…๋‹ˆ๋‹ค.
190
- ์š”์ฒญ๋œ ๊ฐ์ฒด์˜ ์ •ํ™•ํ•œ ์œ„์น˜๋ฅผ ์ฐพ์•„ ๋ฐ”์šด๋”ฉ ๋ฐ•์Šค ์ขŒํ‘œ๋ฅผ ์ œ๊ณตํ•˜์„ธ์š”.
191
-
192
- ์ถœ๋ ฅ ํ˜•์‹: [x1, y1, x2, y2]
193
- - (x1, y1): ์ขŒ์ƒ๋‹จ ์ขŒํ‘œ
194
- - (x2, y2): ์šฐํ•˜๋‹จ ์ขŒํ‘œ
195
- - ์ขŒํ‘œ๋Š” 0-1000 ๋ฒ”์œ„์˜ ์ •์ˆ˜""",
196
-
197
- "affordance": """๋‹น์‹ ์€ ๋กœ๋ด‡ ๊ทธ๋ฆฌํผ ์ œ์–ด ์ „๋ฌธ๊ฐ€์ž…๋‹ˆ๋‹ค.
198
- ์ฃผ์–ด์ง„ ์ž‘์—…์„ ์œ„ํ•œ ์ตœ์ ์˜ ํŒŒ์ง€ ์œ„์น˜์™€ ๋ฐฉ๋ฒ•์„ ๋ถ„์„ํ•˜์„ธ์š”.
199
-
200
- ์ถœ๋ ฅ ํ˜•์‹:
201
- ํŒŒ์ง€ ์˜์—ญ: [x1, y1, x2, y2]
202
- ํŒŒ์ง€ ๋ฐฉํ–ฅ: [์ˆ˜์ง/์ˆ˜ํ‰/๊ฐ๋„]
203
- ํŒŒ์ง€ ๊ฐ•๋„: [์•ฝํ•จ/์ค‘๊ฐ„/๊ฐ•ํ•จ]
204
- ์ฃผ์˜์‚ฌํ•ญ: [ํŠน๋ณ„ ๊ณ ๋ ค์‚ฌํ•ญ]""",
205
-
206
- "trajectory": """๋‹น์‹ ์€ ๋กœ๋ด‡ ๊ฒฝ๋กœ ๊ณ„ํš ์ „๋ฌธ๊ฐ€์ž…๋‹ˆ๋‹ค.
207
- ์‹œ์ž‘์ ์—์„œ ๋ชฉํ‘œ์ ๊นŒ์ง€์˜ ์ตœ์  ๊ฒฝ๋กœ๋ฅผ ๊ณ„ํšํ•˜์„ธ์š”.
208
-
209
- ์ถœ๋ ฅ ํ˜•์‹:
210
- ๊ฒฝ๋กœ: [(x1,y1), (x2,y2), ..., (xn,yn)]
211
- ์˜ˆ์ƒ ์†Œ์š” ์‹œ๊ฐ„: [์ดˆ]
212
- ์žฅ์• ๋ฌผ ํšŒํ”ผ: [์„ค๋ช…]
213
- ์•ˆ์ „ ์—ฌ์œ : [cm]"""
214
  }
215
 
216
  system_prompt = system_prompts.get(task_type, system_prompts["general"])
217
 
218
  # Chain-of-Thought 추가
219
  if enable_thinking:
220
- system_prompt += "\n\n๋จผ์ € <thinking> ํƒœ๊ทธ ์•ˆ์— ๋ถ„์„ ๊ณผ์ •์„ ์ƒ์„ธํžˆ ์ž‘์„ฑํ•œ ํ›„, <answer> ํƒœ๊ทธ ์•ˆ์— ์ตœ์ข… ๊ฒฐ๊ณผ๋ฅผ ์ œ์‹œํ•˜์„ธ์š”."
221
-
222
- # ์›น ๊ฒ€์ƒ‰ ์ˆ˜ํ–‰ (๋กœ๋ด‡ ์ž‘์—… ๊ด€๋ จ)
223
- web_info = ""
224
- if use_web_search and task_type in ["planning", "affordance"]:
225
- # ๋กœ๋ด‡ ์ž‘์—… ๊ด€๋ จ ํ‚ค์›Œ๋“œ๋กœ ๊ฒ€์ƒ‰
226
- search_query = f"๋กœ๋ด‡ {task_description} ๋ฐฉ๋ฒ• ๊ฐ€์ด๋“œ"
227
- web_info = do_web_search(search_query)
228
 
229
- # ์ตœ์ข… ์‹œ์Šคํ…œ ํ”„๋กฌํ”„ํŠธ
230
- final_system = web_info + "\n\n" + system_prompt if web_info else system_prompt
 
 
 
 
 
 
231
 
232
  # 메시지 구성
233
  messages = [
234
  {
235
  "role": "system",
236
- "content": [{"type": "text", "text": final_system}]
237
  },
238
  {
239
  "role": "user",
240
  "content": [
241
  {"type": "image", "url": image},
242
- {"type": "text", "text": f"์ž‘์—…: {task_description}"}
243
  ]
244
  }
245
  ]
@@ -279,220 +324,312 @@ Step_n: [๋™์ž‘] - [์ƒ์„ธ ์„ค๋ช…]
279
  return response
280
 
281
  except Exception as e:
282
- logger.error(f"๋กœ๋ด‡ ์ž‘์—… ๋ถ„์„ ์˜ค๋ฅ˜: {e}")
283
  import traceback
284
  return f"โŒ ๋ถ„์„ ์˜ค๋ฅ˜: {str(e)}\n{traceback.format_exc()}"
285
  finally:
286
  clear_cuda_cache()
287
 
288
  ##############################################################################
289
- # Gradio UI (๋กœ๋ด‡ ์ž‘์—… ์ค‘์‹ฌ)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
290
  ##############################################################################
291
  css = """
292
  .robot-header {
293
  text-align: center;
294
  background: linear-gradient(135deg, #1e3c72 0%, #2a5298 50%, #667eea 100%);
295
  color: white;
296
- padding: 25px;
297
- border-radius: 15px;
298
  margin-bottom: 20px;
299
- box-shadow: 0 5px 15px rgba(0, 0, 0, 0.3);
300
- }
301
- .robot-header h1 {
302
- margin: 0 0 10px 0;
303
- font-size: 2.5em;
304
  }
305
  .status-box {
306
  text-align: center;
307
- padding: 12px;
308
- border-radius: 8px;
309
- margin: 15px 0;
310
  font-weight: bold;
311
- font-size: 1.1em;
312
  }
313
- .task-card {
314
- background: white;
315
- padding: 20px;
316
- border-radius: 10px;
317
- box-shadow: 0 2px 10px rgba(0, 0, 0, 0.1);
318
- margin-bottom: 20px;
319
  }
320
  .task-button {
321
- min-height: 80px;
322
- font-size: 1.2em;
323
- font-weight: bold;
324
- transition: all 0.3s ease;
325
- }
326
- .task-button:hover {
327
- transform: translateY(-2px);
328
- box-shadow: 0 5px 15px rgba(0, 0, 0, 0.2);
329
  }
330
  .webcam-container {
331
  border: 3px solid #2a5298;
332
- border-radius: 15px;
333
- padding: 15px;
334
- background: #f8f9fa;
335
- }
336
- .result-container {
337
- background: #f0f8ff;
338
- border-left: 5px solid #2a5298;
339
- padding: 20px;
340
  border-radius: 10px;
 
 
341
  }
342
  """
343
 
344
- with gr.Blocks(title="๐Ÿค– ๋กœ๋ด‡ ์‹œ๊ฐ ์‹œ์Šคํ…œ", css=css) as demo:
345
  gr.HTML("""
346
  <div class="robot-header">
347
  <h1>๐Ÿค– ๋กœ๋ด‡ ์‹œ๊ฐ ์‹œ์Šคํ…œ</h1>
348
- <p style="font-size: 1.2em; margin: 0;">์‹ค์‹œ๊ฐ„ ์ž‘์—… ๋ถ„์„ ๋ฐ ๊ณ„ํš ์ˆ˜๋ฆฝ</p>
349
- <p style="font-size: 0.9em; opacity: 0.9; margin-top: 5px;">Powered by Gemma3-R1984-4B</p>
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
350
  </div>
351
  """)
352
 
353
  with gr.Row():
354
- # ์™ผ์ชฝ: ์›น์บ  ์˜์—ญ
355
  with gr.Column(scale=1):
 
 
356
  with gr.Group(elem_classes="webcam-container"):
357
- gr.Markdown("## ๐Ÿ“ท ์‹ค์‹œ๊ฐ„ ์›น์บ ")
358
-
359
  webcam = gr.Image(
360
  sources=["webcam"],
361
  streaming=True,
362
  type="numpy",
363
  label="์‹ค์‹œ๊ฐ„ ์ŠคํŠธ๋ฆฌ๋ฐ",
364
- height=400
365
  )
366
 
367
- with gr.Row():
368
- capture_btn = gr.Button("๐Ÿ“ธ ์บก์ฒ˜", variant="primary", size="lg")
369
- clear_btn = gr.Button("๐Ÿ—‘๏ธ ์ดˆ๊ธฐํ™”", size="lg")
370
-
371
  captured_image = gr.Image(
372
  label="์บก์ฒ˜๋œ ์ด๋ฏธ์ง€",
373
- height=300,
374
  visible=False
375
  )
 
 
 
 
 
 
 
 
 
 
 
 
 
 
376
 
377
- # ์˜ค๋ฅธ์ชฝ: ์ž‘์—… ์ œ์–ด ๋ฐ ๊ฒฐ๊ณผ
378
  with gr.Column(scale=2):
379
- # ์ž‘์—… ์„ค์ •
380
- with gr.Group(elem_classes="task-card"):
381
- gr.Markdown("## โš™๏ธ ์ž‘์—… ์„ค์ •")
382
-
383
- task_input = gr.Textbox(
384
- label="์ž‘์—… ์„ค๋ช…",
385
- placeholder="์˜ˆ: ๋นจ๊ฐ„์ƒ‰ ์ปต์„ ์žก์•„์„œ ํŒŒ๋ž€์ƒ‰ ๋ฐ•์Šค์— ๋„ฃ๊ธฐ",
386
- value="์ด ์žฅ๋ฉด์—์„œ ์ˆ˜ํ–‰ํ•  ๋กœ๋ด‡ ์ž‘์—…์„ ์„ค๋ช…ํ•˜์„ธ์š”",
387
- lines=2
388
- )
389
-
390
- with gr.Row():
391
- use_search = gr.Checkbox(
392
- label="๐Ÿ” ์›น ๊ฒ€์ƒ‰ ์‚ฌ์šฉ",
393
- value=False,
394
- info="์ž‘์—… ์ˆ˜ํ–‰ ๋ฐฉ๋ฒ•์„ ์›น์—์„œ ๊ฒ€์ƒ‰"
395
  )
396
 
397
- show_thinking = gr.Checkbox(
398
- label="๐Ÿค” ์ถ”๋ก  ๊ณผ์ • ํ‘œ์‹œ",
399
- value=True,
400
- info="AI์˜ ๋ถ„์„ ๊ณผ์ •์„ ํˆฌ๋ช…ํ•˜๊ฒŒ ํ‘œ์‹œ"
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
401
  )
402
-
403
- max_tokens = gr.Slider(
404
- label="์‘๋‹ต ๊ธธ์ด",
405
- minimum=256,
406
- maximum=2048,
407
- value=1024,
408
- step=256
409
- )
410
 
411
- # ๋กœ๋ด‡ ์ž‘์—… ๋ฒ„ํŠผ
412
- with gr.Group(elem_classes="task-card"):
413
- gr.Markdown("## ๐ŸŽฏ ๋กœ๋ด‡ ์ž‘์—… ๋ถ„์„")
414
-
415
- with gr.Row():
416
- analyze_btn = gr.Button(
417
- "๐Ÿ” ์žฅ๋ฉด ๋ถ„์„",
418
- variant="secondary",
419
- elem_classes="task-button"
420
- )
421
- planning_btn = gr.Button(
422
- "๐Ÿ“‹ ์ž‘์—… ๊ณ„ํš",
423
- variant="primary",
424
- elem_classes="task-button"
425
- )
426
-
427
- with gr.Row():
428
- grounding_btn = gr.Button(
429
- "๐Ÿ“ ๊ฐ์ฒด ์œ„์น˜",
430
- variant="secondary",
431
- elem_classes="task-button"
432
- )
433
- affordance_btn = gr.Button(
434
- "๐Ÿค ํŒŒ์ง€์  ๋ถ„์„",
435
- variant="secondary",
436
- elem_classes="task-button"
437
- )
438
-
439
- trajectory_btn = gr.Button(
440
- "๐Ÿ›ค๏ธ ๊ฒฝ๋กœ ๊ณ„ํš",
441
- variant="secondary",
442
- elem_classes="task-button",
443
- elem_id="trajectory"
444
- )
445
 
446
- # ์ƒํƒœ ํ‘œ์‹œ
447
  status_display = gr.HTML(
448
- '<div class="status-box" style="background:#d4edda; color:#155724;">๐ŸŸข ์‹œ์Šคํ…œ ์ค€๋น„ ์™„๋ฃŒ</div>'
449
  )
450
 
451
- # ๋ถ„์„ ๊ฒฐ๊ณผ
452
- with gr.Group(elem_classes="result-container"):
453
- gr.Markdown("## ๐Ÿ“Š ๋ถ„์„ ๊ฒฐ๊ณผ")
454
- result_output = gr.Textbox(
455
- label="",
456
- lines=20,
457
- max_lines=40,
458
- show_copy_button=True
459
- )
460
-
461
- # ์ƒํƒœ ๊ด€๋ฆฌ
462
- captured_state = gr.State(None)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
463
 
464
  # 이벤트 핸들러
465
- def capture_frame(frame):
 
 
466
  """์›น์บ  ํ”„๋ ˆ์ž„ ์บก์ฒ˜"""
467
  if frame is None:
468
- return None, gr.update(visible=False), '<div class="status-box" style="background:#f8d7da; color:#721c24;">โŒ ์›น์บ ์ด ์—ฐ๊ฒฐ๋˜์ง€ ์•Š์•˜์Šต๋‹ˆ๋‹ค</div>'
469
-
470
- return frame, gr.update(value=frame, visible=True), '<div class="status-box" style="background:#d1ecf1; color:#0c5460;">๐Ÿ“ธ ์ด๋ฏธ์ง€๊ฐ€ ์บก์ฒ˜๋˜์—ˆ์Šต๋‹ˆ๋‹ค</div>'
471
 
472
  def clear_capture():
473
  """์บก์ฒ˜ ์ดˆ๊ธฐํ™”"""
474
- return None, gr.update(visible=False), '<div class="status-box" style="background:#d4edda; color:#155724;">๐ŸŸข ์‹œ์Šคํ…œ ์ค€๋น„ ์™„๋ฃŒ</div>', ""
475
 
476
- def analyze_task(image, task_desc, task_type, use_search, thinking, tokens):
477
- """๋กœ๋ด‡ ์ž‘์—… ๋ถ„์„"""
478
  if image is None:
479
- return "โŒ ๋จผ์ € ์ด๋ฏธ์ง€๋ฅผ ์บก์ฒ˜ํ•˜์„ธ์š”!", '<div class="status-box" style="background:#f8d7da; color:#721c24;">โŒ ์ด๋ฏธ์ง€๊ฐ€ ์—†์Šต๋‹ˆ๋‹ค</div>'
480
 
481
- # ์ž‘์—… ์œ ํ˜•๋ณ„ ์ƒํƒœ ๋ฉ”์‹œ์ง€
482
- task_messages = {
483
- "general": "์žฅ๋ฉด ๋ถ„์„",
484
- "planning": "์ž‘์—… ๊ณ„ํš ์ˆ˜๋ฆฝ",
485
- "grounding": "๊ฐ์ฒด ์œ„์น˜ ํŒŒ์•…",
486
- "affordance": "ํŒŒ์ง€์  ๋ถ„์„",
487
- "trajectory": "๊ฒฝ๋กœ ๊ณ„ํš"
488
- }
489
-
490
- status_msg = f'<div class="status-box" style="background:#cce5ff; color:#004085;">๐Ÿ”„ {task_messages[task_type]} ์ค‘...</div>'
491
 
492
- # ๋ถ„์„ ์ˆ˜ํ–‰
493
- result = analyze_for_robot_task(
494
  image=image,
495
- task_description=task_desc,
496
  task_type=task_type,
497
  use_web_search=use_search,
498
  enable_thinking=thinking,
@@ -500,83 +637,101 @@ with gr.Blocks(title="๐Ÿค– ๋กœ๋ด‡ ์‹œ๊ฐ ์‹œ์Šคํ…œ", css=css) as demo:
500
  )
501
 
502
  # 결과 포맷팅
503
- timestamp = time.strftime("%Y-%m-%d %H:%M:%S")
 
 
 
 
 
 
504
 
505
- formatted_result = f"""โ•”โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•—
506
- โ•‘ ๐Ÿค– ๋กœ๋ด‡ ์ž‘์—… ๋ถ„์„ ๊ฒฐ๊ณผ โ•‘
507
- โ•šโ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•
508
-
509
- ๐Ÿ“‹ ์ž‘์—… ์œ ํ˜•: {task_messages[task_type]}
510
- ๐Ÿ“ ์ž‘์—… ์„ค๋ช…: {task_desc}
511
- โฐ ๋ถ„์„ ์‹œ๊ฐ„: {timestamp}
512
 
513
- โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•๏ฟฝ๏ฟฝ๏ฟฝโ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•
514
 
 
515
  {result}
516
 
517
- โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•
518
-
519
  ๐ŸŽฏ ๋ชจ๋ธ: {model_name}
520
- ๐Ÿ”ง ๋ถ„์„ ์™„๋ฃŒ"""
521
 
522
- complete_status = '<div class="status-box" style="background:#d4edda; color:#155724;">โœ… ๋ถ„์„์ด ์™„๋ฃŒ๋˜์—ˆ์Šต๋‹ˆ๋‹ค</div>'
523
  return formatted_result, complete_status
524
 
525
- # ๋ฒ„ํŠผ ์ด๋ฒคํŠธ ์—ฐ๊ฒฐ
526
- capture_btn.click(
527
- fn=lambda frame: capture_frame(frame) if frame is not None else capture_frame(webcam.value),
528
  inputs=[webcam],
529
- outputs=[captured_state, captured_image, status_display]
530
  )
531
 
532
- clear_btn.click(
533
- fn=clear_capture,
534
- outputs=[captured_state, captured_image, status_display, result_output]
 
 
535
  )
536
 
537
- # ์ž‘์—… ๋ถ„์„ ๋ฒ„ํŠผ๋“ค
538
- analyze_btn.click(
539
- fn=lambda img, desc, s, t, tk: analyze_task(img, desc, "general", s, t, tk),
540
- inputs=[captured_state, task_input, use_search, show_thinking, max_tokens],
541
- outputs=[result_output, status_display]
542
  )
543
 
 
544
  planning_btn.click(
545
- fn=lambda img, desc, s, t, tk: analyze_task(img, desc, "planning", s, t, tk),
546
- inputs=[captured_state, task_input, use_search, show_thinking, max_tokens],
547
  outputs=[result_output, status_display]
548
  )
549
 
550
  grounding_btn.click(
551
- fn=lambda img, desc, s, t, tk: analyze_task(img, desc, "grounding", s, t, tk),
552
- inputs=[captured_state, task_input, use_search, show_thinking, max_tokens],
553
  outputs=[result_output, status_display]
554
  )
555
 
556
  affordance_btn.click(
557
- fn=lambda img, desc, s, t, tk: analyze_task(img, desc, "affordance", s, t, tk),
558
- inputs=[captured_state, task_input, use_search, show_thinking, max_tokens],
559
  outputs=[result_output, status_display]
560
  )
561
 
562
  trajectory_btn.click(
563
- fn=lambda img, desc, s, t, tk: analyze_task(img, desc, "trajectory", s, t, tk),
564
- inputs=[captured_state, task_input, use_search, show_thinking, max_tokens],
565
  outputs=[result_output, status_display]
566
  )
567
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
568
  # 초기 모델 로드
569
  def initial_load():
570
  load_model()
571
- return '<div class="status-box" style="background:#d4edda; color:#155724;">๐Ÿš€ ์‹œ์Šคํ…œ์ด ์ค€๋น„๋˜์—ˆ์Šต๋‹ˆ๋‹ค!</div>'
572
 
573
  demo.load(
574
  fn=initial_load,
575
- outputs=[status_display]
576
  )
577
 
578
  if __name__ == "__main__":
579
- print("๐Ÿš€ ๋กœ๋ด‡ ์‹œ๊ฐ ์‹œ์Šคํ…œ ์‹œ์ž‘...")
580
  demo.launch(
581
  server_name="0.0.0.0",
582
  server_port=7860,
 
2
 
3
  import os
4
  import re
5
+ import tempfile
6
  import gc
7
  from collections.abc import Iterator
8
  from threading import Thread
9
+ import json
10
  import requests
11
+ import cv2
12
  import gradio as gr
13
  import spaces
14
  import torch
 
18
  from transformers import AutoProcessor, Gemma3ForConditionalGeneration, TextIteratorStreamer
19
  import time
20
  import warnings
21
+ from typing import Dict, List, Optional, Union
22
+
23
+ # CSV/TXT 분석
24
+ import pandas as pd
25
+ # PDF 텍스트 추출
26
+ import PyPDF2
27
 
28
  warnings.filterwarnings('ignore')
29
 
 
32
  ##############################################################################
33
  # 상수 정의
34
  ##############################################################################
35
+ MAX_CONTENT_CHARS = 2000
36
  MAX_INPUT_LENGTH = 2096
37
+ MAX_NUM_IMAGES = 5
38
  SERPHOUSE_API_KEY = os.getenv("SERPHOUSE_API_KEY", "")
39
 
40
  ##############################################################################
 
85
  "domain": "google.com",
86
  "serp_type": "web",
87
  "device": "desktop",
88
+ "lang": "ko", # ํ•œ๊ตญ์–ด ์šฐ์„ 
89
+ "num": "10" # 10๊ฐœ๋กœ ์ œํ•œ
90
  }
91
 
92
  headers = {
93
  "Authorization": f"Bearer {SERPHOUSE_API_KEY}"
94
  }
95
 
96
+ logger.info(f"์›น ๊ฒ€์ƒ‰ ์ค‘... ๊ฒ€์ƒ‰์–ด: {query}")
97
 
98
+ response = requests.get(url, headers=headers, params=params, timeout=60)
99
  response.raise_for_status()
100
 
101
  data = response.json()
 
104
  organic = results.get("organic", []) if isinstance(results, dict) else []
105
 
106
  if not organic:
107
+ return "๊ฒ€์ƒ‰ ๊ฒฐ๊ณผ๋ฅผ ์ฐพ์„ ์ˆ˜ ์—†์Šต๋‹ˆ๋‹ค."
108
+
109
+ max_results = min(10, len(organic))
110
+ limited_organic = organic[:max_results]
111
+
112
+ summary_lines = []
113
+ for idx, item in enumerate(limited_organic, start=1):
114
+ title = item.get("title", "์ œ๋ชฉ ์—†์Œ")
115
+ link = item.get("link", "#")
116
+ snippet = item.get("snippet", "์„ค๋ช… ์—†์Œ")
117
+ displayed_link = item.get("displayed_link", link)
118
 
119
+ summary_lines.append(
120
+ f"### ๊ฒฐ๊ณผ {idx}: {title}\n\n"
121
+ f"{snippet}\n\n"
122
+ f"**์ถœ์ฒ˜**: [{displayed_link}]({link})\n\n"
123
+ f"---\n"
124
+ )
125
+
126
+ instructions = """# ์›น ๊ฒ€์ƒ‰ ๊ฒฐ๊ณผ
127
+ ์•„๋ž˜๋Š” ๊ฒ€์ƒ‰ ๊ฒฐ๊ณผ์ž…๋‹ˆ๋‹ค. ๋‹ต๋ณ€ ์‹œ ์ด ์ •๋ณด๋ฅผ ํ™œ์šฉํ•˜์„ธ์š”:
128
+ 1. ๊ฐ ๊ฒฐ๊ณผ์˜ ์ œ๋ชฉ, ๋‚ด์šฉ, ์ถœ์ฒ˜ ๋งํฌ๋ฅผ ์ฐธ์กฐํ•˜์„ธ์š”
129
+ 2. ๊ด€๋ จ ์ถœ์ฒ˜๋ฅผ ๋ช…์‹œ์ ์œผ๋กœ ์ธ์šฉํ•˜์„ธ์š”
130
+ 3. ์—ฌ๋Ÿฌ ์ถœ์ฒ˜์˜ ์ •๋ณด๋ฅผ ์ข…ํ•ฉํ•˜์—ฌ ๋‹ต๋ณ€ํ•˜์„ธ์š”
131
+ """
132
 
133
+ search_results = instructions + "\n".join(summary_lines)
134
+ return search_results
135
 
136
  except Exception as e:
137
  logger.error(f"์›น ๊ฒ€์ƒ‰ ์‹คํŒจ: {e}")
138
+ return f"์›น ๊ฒ€์ƒ‰ ์‹คํŒจ: {str(e)}"
139
+
140
+ ##############################################################################
141
+ # 문서 처리 함수
142
+ ##############################################################################
143
+ def analyze_csv_file(path: str) -> str:
144
+ """CSV ํŒŒ์ผ ๋ถ„์„"""
145
+ try:
146
+ df = pd.read_csv(path)
147
+ if df.shape[0] > 50 or df.shape[1] > 10:
148
+ df = df.iloc[:50, :10]
149
+ df_str = df.to_string()
150
+ if len(df_str) > MAX_CONTENT_CHARS:
151
+ df_str = df_str[:MAX_CONTENT_CHARS] + "\n...(์ค‘๋žต)..."
152
+ return f"**[CSV ํŒŒ์ผ: {os.path.basename(path)}]**\n\n{df_str}"
153
+ except Exception as e:
154
+ return f"CSV ์ฝ๊ธฐ ์‹คํŒจ ({os.path.basename(path)}): {str(e)}"
155
+
156
+ def analyze_txt_file(path: str) -> str:
157
+ """TXT ํŒŒ์ผ ๋ถ„์„"""
158
+ try:
159
+ with open(path, "r", encoding="utf-8") as f:
160
+ text = f.read()
161
+ if len(text) > MAX_CONTENT_CHARS:
162
+ text = text[:MAX_CONTENT_CHARS] + "\n...(์ค‘๋žต)..."
163
+ return f"**[TXT ํŒŒ์ผ: {os.path.basename(path)}]**\n\n{text}"
164
+ except Exception as e:
165
+ return f"TXT ์ฝ๊ธฐ ์‹คํŒจ ({os.path.basename(path)}): {str(e)}"
166
+
167
+ def pdf_to_markdown(pdf_path: str) -> str:
168
+ """PDF๋ฅผ ๋งˆํฌ๋‹ค์šด์œผ๋กœ ๋ณ€ํ™˜"""
169
+ text_chunks = []
170
+ try:
171
+ with open(pdf_path, "rb") as f:
172
+ reader = PyPDF2.PdfReader(f)
173
+ max_pages = min(5, len(reader.pages))
174
+ for page_num in range(max_pages):
175
+ page = reader.pages[page_num]
176
+ page_text = page.extract_text() or ""
177
+ page_text = page_text.strip()
178
+ if page_text:
179
+ if len(page_text) > MAX_CONTENT_CHARS // max_pages:
180
+ page_text = page_text[:MAX_CONTENT_CHARS // max_pages] + "...(์ค‘๋žต)"
181
+ text_chunks.append(f"## ํŽ˜์ด์ง€ {page_num+1}\n\n{page_text}\n")
182
+ if len(reader.pages) > max_pages:
183
+ text_chunks.append(f"\n...({max_pages}/{len(reader.pages)} ํŽ˜์ด์ง€ ํ‘œ์‹œ)...")
184
+ except Exception as e:
185
+ return f"PDF ์ฝ๊ธฐ ์‹คํŒจ ({os.path.basename(pdf_path)}): {str(e)}"
186
+
187
+ full_text = "\n".join(text_chunks)
188
+ if len(full_text) > MAX_CONTENT_CHARS:
189
+ full_text = full_text[:MAX_CONTENT_CHARS] + "\n...(์ค‘๋žต)..."
190
+
191
+ return f"**[PDF ํŒŒ์ผ: {os.path.basename(pdf_path)}]**\n\n{full_text}"
192
 
193
  ##############################################################################
194
  # 모델 로드
 
224
  return False
225
 
226
  ##############################################################################
227
+ # 이미지 분석 (로봇 태스크 중심)
228
  ##############################################################################
229
  @spaces.GPU(duration=60)
230
+ def analyze_image_for_robot(
231
  image: Union[np.ndarray, Image.Image],
232
+ prompt: str,
233
  task_type: str = "general",
234
  use_web_search: bool = False,
235
  enable_thinking: bool = True,
 
247
  if isinstance(image, np.ndarray):
248
  image = Image.fromarray(image).convert('RGB')
249
 
250
+ # ํƒœ์Šคํฌ๋ณ„ ์‹œ์Šคํ…œ ํ”„๋กฌํ”„ํŠธ ๊ตฌ์„ฑ
251
  system_prompts = {
252
+ "general": "๋‹น์‹ ์€ ๋กœ๋ด‡ ์‹œ๊ฐ ์‹œ์Šคํ…œ์ž…๋‹ˆ๋‹ค. ์ด๋ฏธ์ง€๋ฅผ ์ž์„ธํžˆ ๋ถ„์„ํ•˜๊ณ  ์„ค๋ช…ํ•˜์„ธ์š”.",
253
+ "planning": """๋‹น์‹ ์€ ๋กœ๋ด‡ ์ž‘์—… ๊ณ„ํš์„ ์ˆ˜๋ฆฝํ•˜๋Š” AI์ž…๋‹ˆ๋‹ค.
254
+ ์ฃผ์–ด์ง„ ์ด๋ฏธ์ง€์™€ ์ž‘์—…์„ ๋ถ„์„ํ•˜์—ฌ ๋‹จ๊ณ„๋ณ„ ์‹คํ–‰ ๊ณ„ํš์„ ์ž‘์„ฑํ•˜์„ธ์š”.
255
+ ํ˜•์‹: Step_1: xxx\nStep_2: xxx\n...\nStep_n: xxx""",
256
+ "grounding": "๋‹น์‹ ์€ ๊ฐ์ฒด ์œ„์น˜๋ฅผ ์ฐพ๋Š” ๋กœ๋ด‡ ์‹œ๊ฐ ์‹œ์Šคํ…œ์ž…๋‹ˆ๋‹ค. ์š”์ฒญ๋œ ๊ฐ์ฒด์˜ ์œ„์น˜๋ฅผ [x1, y1, x2, y2] ์ขŒํ‘œ๋กœ ๋ฐ˜ํ™˜ํ•˜์„ธ์š”.",
257
+ "affordance": "๋‹น์‹ ์€ ๋กœ๋ด‡ ํŒŒ์ง€์ ์„ ๋ถ„์„ํ•˜๋Š” AI์ž…๋‹ˆ๋‹ค. ์ฃผ์–ด์ง„ ์ž‘์—…์„ ์œ„ํ•œ ์ตœ์ ์˜ ํŒŒ์ง€ ์˜์—ญ์„ [x1, y1, x2, y2] ์ขŒํ‘œ๋กœ ์˜ˆ์ธกํ•˜์„ธ์š”.",
258
+ "trajectory": "๋‹น์‹ ์€ ๋กœ๋ด‡ ๊ฒฝ๋กœ๋ฅผ ๊ณ„ํšํ•˜๋Š” AI์ž…๋‹ˆ๋‹ค. ๋ชฉํ‘œ ์ง€์ ๊นŒ์ง€์˜ ๊ฒฝ๋กœ๋ฅผ [(x1,y1), (x2,y2), ...] ํ˜•์‹์œผ๋กœ ์ œ์‹œํ•˜์„ธ์š”.",
259
+ "pointing": "๋‹น์‹ ์€ ๋‹ค์ค‘ ์ง€์ ์„ ์ง€์ •ํ•˜๋Š” ๋กœ๋ด‡ ์‹œ๊ฐ ์‹œ์Šคํ…œ์ž…๋‹ˆ๋‹ค. ์š”์ฒญ๋œ ์œ„์น˜๋“ค์„ [(x1,y1), (x2,y2), ...] ํ˜•์‹์œผ๋กœ ๋ฐ˜ํ™˜ํ•˜์„ธ์š”."
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
260
  }
261
 
262
  system_prompt = system_prompts.get(task_type, system_prompts["general"])
263
 
264
  # Chain-of-Thought 추가
265
  if enable_thinking:
266
+ system_prompt += "\n\n์ถ”๋ก  ๊ณผ์ •์„ <thinking></thinking> ํƒœ๊ทธ ์•ˆ์— ์ƒ์„ธํžˆ ์ž‘์„ฑํ•œ ํ›„ ์ตœ์ข… ๋‹ต๋ณ€์„ ์ œ์‹œํ•˜์„ธ์š”."
 
 
 
 
 
 
 
267
 
268
+ # 웹 검색 수행
269
+ combined_system = system_prompt
270
+ if use_web_search:
271
+ keywords = extract_keywords(prompt, top_k=5)
272
+ if keywords:
273
+ logger.info(f"์›น ๊ฒ€์ƒ‰ ํ‚ค์›Œ๋“œ: {keywords}")
274
+ search_results = do_web_search(keywords)
275
+ combined_system = f"{search_results}\n\n{system_prompt}"
276
 
277
  # 메시지 구성
278
  messages = [
279
  {
280
  "role": "system",
281
+ "content": [{"type": "text", "text": combined_system}]
282
  },
283
  {
284
  "role": "user",
285
  "content": [
286
  {"type": "image", "url": image},
287
+ {"type": "text", "text": prompt}
288
  ]
289
  }
290
  ]
 
324
  return response
325
 
326
  except Exception as e:
327
+ logger.error(f"์ด๋ฏธ์ง€ ๋ถ„์„ ์˜ค๋ฅ˜: {e}")
328
  import traceback
329
  return f"โŒ ๋ถ„์„ ์˜ค๋ฅ˜: {str(e)}\n{traceback.format_exc()}"
330
  finally:
331
  clear_cuda_cache()
332
 
333
  ##############################################################################
334
+ # 문서 분석 (스트리밍)
335
+ ##############################################################################
336
+ def _model_gen_with_oom_catch(**kwargs):
337
+ """OOM ์ฒ˜๋ฆฌ๋ฅผ ์œ„ํ•œ ์ƒ์„ฑ ํ•จ์ˆ˜"""
338
+ global model
339
+ try:
340
+ model.generate(**kwargs)
341
+ except torch.cuda.OutOfMemoryError:
342
+ raise RuntimeError("GPU ๋ฉ”๋ชจ๋ฆฌ ๋ถ€์กฑ. Max Tokens๋ฅผ ์ค„์—ฌ์ฃผ์„ธ์š”.")
343
+ finally:
344
+ clear_cuda_cache()
345
+
346
+ @spaces.GPU(duration=120)
347
+ def analyze_documents_streaming(
348
+ files: List[str],
349
+ prompt: str,
350
+ use_web_search: bool = False,
351
+ max_new_tokens: int = 2048
352
+ ) -> Iterator[str]:
353
+ """๋ฌธ์„œ ๋ถ„์„ (์ŠคํŠธ๋ฆฌ๋ฐ)"""
354
+ global model, processor
355
+
356
+ if not model_loaded:
357
+ if not load_model():
358
+ yield "โŒ ๋ชจ๋ธ ๋กœ๋”ฉ ์‹คํŒจ"
359
+ return
360
+
361
+ try:
362
+ # ์‹œ์Šคํ…œ ํ”„๋กฌํ”„ํŠธ
363
+ system_content = "๋‹น์‹ ์€ ๋ฌธ์„œ๋ฅผ ๋ถ„์„ํ•˜๊ณ  ์š”์•ฝํ•˜๋Š” ์ „๋ฌธ AI์ž…๋‹ˆ๋‹ค."
364
+
365
+ # ์›น ๊ฒ€์ƒ‰
366
+ if use_web_search:
367
+ keywords = extract_keywords(prompt, top_k=5)
368
+ if keywords:
369
+ search_results = do_web_search(keywords)
370
+ system_content = f"{search_results}\n\n{system_content}"
371
+
372
+ # ๋ฌธ์„œ ๋‚ด์šฉ ์ฒ˜๋ฆฌ
373
+ doc_contents = []
374
+ for file_path in files:
375
+ if file_path.lower().endswith('.csv'):
376
+ content = analyze_csv_file(file_path)
377
+ elif file_path.lower().endswith('.txt'):
378
+ content = analyze_txt_file(file_path)
379
+ elif file_path.lower().endswith('.pdf'):
380
+ content = pdf_to_markdown(file_path)
381
+ else:
382
+ continue
383
+ doc_contents.append(content)
384
+
385
+ # ๋ฉ”์‹œ์ง€ ๊ตฌ์„ฑ
386
+ messages = [
387
+ {
388
+ "role": "system",
389
+ "content": [{"type": "text", "text": system_content}]
390
+ },
391
+ {
392
+ "role": "user",
393
+ "content": [
394
+ {"type": "text", "text": "\n\n".join(doc_contents) + f"\n\n{prompt}"}
395
+ ]
396
+ }
397
+ ]
398
+
399
+ # ์ž…๋ ฅ ์ฒ˜๋ฆฌ
400
+ inputs = processor.apply_chat_template(
401
+ messages,
402
+ add_generation_prompt=True,
403
+ tokenize=True,
404
+ return_dict=True,
405
+ return_tensors="pt",
406
+ ).to(device=model.device, dtype=torch.bfloat16)
407
+
408
+ # ์ŠคํŠธ๋ฆฌ๋ฐ ์„ค์ •
409
+ streamer = TextIteratorStreamer(processor, timeout=30.0, skip_prompt=True, skip_special_tokens=True)
410
+ gen_kwargs = dict(
411
+ inputs,
412
+ streamer=streamer,
413
+ max_new_tokens=max_new_tokens,
414
+ temperature=0.8,
415
+ top_p=0.9,
416
+ )
417
+
418
+ # ๋ณ„๋„ ์Šค๋ ˆ๋“œ์—์„œ ์ƒ์„ฑ
419
+ t = Thread(target=_model_gen_with_oom_catch, kwargs=gen_kwargs)
420
+ t.start()
421
+
422
+ # ์ŠคํŠธ๋ฆฌ๋ฐ ์ถœ๋ ฅ
423
+ output = ""
424
+ for new_text in streamer:
425
+ output += new_text
426
+ yield output
427
+
428
+ except Exception as e:
429
+ logger.error(f"๋ฌธ์„œ ๋ถ„์„ ์˜ค๋ฅ˜: {e}")
430
+ yield f"โŒ ์˜ค๋ฅ˜ ๋ฐœ์ƒ: {str(e)}"
431
+ finally:
432
+ clear_cuda_cache()
433
+
434
+ ##############################################################################
435
+ # Gradio UI (로봇 시각화 중심)
436
  ##############################################################################
437
  css = """
438
  .robot-header {
439
  text-align: center;
440
  background: linear-gradient(135deg, #1e3c72 0%, #2a5298 50%, #667eea 100%);
441
  color: white;
442
+ padding: 20px;
443
+ border-radius: 10px;
444
  margin-bottom: 20px;
445
+ box-shadow: 0 4px 6px rgba(0, 0, 0, 0.1);
 
 
 
 
446
  }
447
  .status-box {
448
  text-align: center;
449
+ padding: 10px;
450
+ border-radius: 5px;
451
+ margin: 10px 0;
452
  font-weight: bold;
 
453
  }
454
+ .info-box {
455
+ background: #f0f0f0;
456
+ padding: 15px;
457
+ border-radius: 8px;
458
+ margin: 10px 0;
459
+ border-left: 4px solid #2a5298;
460
  }
461
  .task-button {
462
+ min-height: 60px;
463
+ font-size: 1.1em;
 
 
 
 
 
 
464
  }
465
  .webcam-container {
466
  border: 3px solid #2a5298;
 
 
 
 
 
 
 
 
467
  border-radius: 10px;
468
+ padding: 10px;
469
+ background: #f8f9fa;
470
  }
471
  """
472
 
473
+ with gr.Blocks(title="๐Ÿค– ๋กœ๋ด‡ ์‹œ๊ฐ ์‹œ์Šคํ…œ (Gemma3-4B)", css=css) as demo:
474
  gr.HTML("""
475
  <div class="robot-header">
476
  <h1>๐Ÿค– ๋กœ๋ด‡ ์‹œ๊ฐ ์‹œ์Šคํ…œ</h1>
477
+ <h3>๐ŸŽฎ Gemma3-R1984-4B + ๐Ÿ“ท ์‹ค์‹œ๊ฐ„ ์›น์บ  + ๐Ÿ” ์›น ๊ฒ€์ƒ‰</h3>
478
+ <p>โšก ์ตœ์‹  ๋ฉ€ํ‹ฐ๋ชจ๋‹ฌ AI๋กœ ๋กœ๋ด‡ ์ž‘์—… ๋ถ„์„ ๋ฐ ๊ณ„ํš ์ˆ˜๋ฆฝ!</p>
479
+ </div>
480
+ """)
481
+
482
+ gr.HTML("""
483
+ <div class="info-box">
484
+ <h4>๐ŸŒŸ ์‹œ์Šคํ…œ ํŠน์ง•:</h4>
485
+ <ul>
486
+ <li>๐Ÿ–ผ๏ธ ๊ณ ๊ธ‰ ์ด๋ฏธ์ง€/๋น„๋””์˜ค ๋ถ„์„ (Gemma3-4B VLM)</li>
487
+ <li>๐Ÿ“‹ ๋‹ค๋‹จ๊ณ„ ์ž‘์—… ๊ณ„ํš ๋ฐ ์ถ”๋ก </li>
488
+ <li>๐Ÿ“ ์ •๋ฐ€ํ•œ ๊ฐ์ฒด ์œ„์น˜ ํŒŒ์•… (Grounding)</li>
489
+ <li>๐Ÿค ๋กœ๋ด‡ ํŒŒ์ง€์  ๋ถ„์„ (Affordance)</li>
490
+ <li>๐Ÿ›ค๏ธ ๊ฒฝ๋กœ ๊ณ„ํš (Trajectory Planning)</li>
491
+ <li>๐Ÿ” ์‹ค์‹œ๊ฐ„ ์›น ๊ฒ€์ƒ‰ ํ†ตํ•ฉ</li>
492
+ <li>๐Ÿ“„ ๋ฌธ์„œ ๋ถ„์„ (PDF, CSV, TXT)</li>
493
+ </ul>
494
  </div>
495
  """)
496
 
497
  with gr.Row():
498
+ # ์™ผ์ชฝ: ์›น์บ  ๋ฐ ์ž…๋ ฅ
499
  with gr.Column(scale=1):
500
+ gr.Markdown("### ๐Ÿ“ท ์‹ค์‹œ๊ฐ„ ์›น์บ ")
501
+
502
  with gr.Group(elem_classes="webcam-container"):
 
 
503
  webcam = gr.Image(
504
  sources=["webcam"],
505
  streaming=True,
506
  type="numpy",
507
  label="์‹ค์‹œ๊ฐ„ ์ŠคํŠธ๋ฆฌ๋ฐ",
508
+ height=350
509
  )
510
 
511
+ # ์บก์ฒ˜๋œ ์ด๋ฏธ์ง€ ํ‘œ์‹œ
 
 
 
512
  captured_image = gr.Image(
513
  label="์บก์ฒ˜๋œ ์ด๋ฏธ์ง€",
514
+ height=200,
515
  visible=False
516
  )
517
+
518
+ # ๋กœ๋ด‡ ์ž‘์—… ๋ฒ„ํŠผ๋“ค
519
+ gr.Markdown("### ๐ŸŽฏ ๋กœ๋ด‡ ์ž‘์—… ์„ ํƒ")
520
+ with gr.Row():
521
+ capture_btn = gr.Button("๐Ÿ“ธ ์บก์ฒ˜", variant="primary", elem_classes="task-button")
522
+ clear_capture_btn = gr.Button("๐Ÿ—‘๏ธ ์ดˆ๊ธฐํ™”", elem_classes="task-button")
523
+
524
+ with gr.Row():
525
+ planning_btn = gr.Button("๐Ÿ“‹ ์ž‘์—… ๊ณ„ํš", elem_classes="task-button")
526
+ grounding_btn = gr.Button("๐Ÿ“ ๊ฐ์ฒด ์œ„์น˜", elem_classes="task-button")
527
+
528
+ with gr.Row():
529
+ affordance_btn = gr.Button("๐Ÿค ํŒŒ์ง€์  ๋ถ„์„", elem_classes="task-button")
530
+ trajectory_btn = gr.Button("๐Ÿ›ค๏ฟฝ๏ฟฝ๏ฟฝ ๊ฒฝ๋กœ ๊ณ„ํš", elem_classes="task-button")
531
 
532
+ # ์˜ค๋ฅธ์ชฝ: ๋ถ„์„ ์„ค์ • ๋ฐ ๊ฒฐ๊ณผ
533
  with gr.Column(scale=2):
534
+ gr.Markdown("### โš™๏ธ ๋ถ„์„ ์„ค์ •")
535
+
536
+ with gr.Row():
537
+ with gr.Column():
538
+ task_prompt = gr.Textbox(
539
+ label="์ž‘์—… ์„ค๋ช… / ์งˆ๋ฌธ",
540
+ placeholder="์˜ˆ: ํ…Œ์ด๋ธ” ์œ„์˜ ์ปต์„ ์žก์•„์„œ ์‹ฑํฌ๋Œ€์— ๋†“๊ธฐ",
541
+ value="์ด ์žฅ๋ฉด์—์„œ ๋กœ๋ด‡์ด ์ˆ˜ํ–‰ํ•  ์ˆ˜ ์žˆ๋Š” ์ž‘์—…์„ ๋ถ„์„ํ•˜์„ธ์š”.",
542
+ lines=2
 
 
 
 
 
 
 
543
  )
544
 
545
+ with gr.Row():
546
+ use_web_search = gr.Checkbox(
547
+ label="๐Ÿ” ์›น ๊ฒ€์ƒ‰ ์‚ฌ์šฉ",
548
+ value=False,
549
+ info="๊ด€๋ จ ์ •๋ณด๋ฅผ ์›น์—์„œ ๊ฒ€์ƒ‰ํ•ฉ๋‹ˆ๋‹ค"
550
+ )
551
+
552
+ enable_thinking = gr.Checkbox(
553
+ label="๐Ÿค” ์ถ”๋ก  ๊ณผ์ • ํ‘œ์‹œ",
554
+ value=True,
555
+ info="Chain-of-Thought ์ถ”๋ก  ๊ณผ์ •์„ ๋ณด์—ฌ์ค๋‹ˆ๋‹ค"
556
+ )
557
+
558
+ max_tokens = gr.Slider(
559
+ label="์ตœ๋Œ€ ํ† ํฐ ์ˆ˜",
560
+ minimum=256,
561
+ maximum=4096,
562
+ value=1024,
563
+ step=256
564
  )
 
 
 
 
 
 
 
 
565
 
566
+ gr.Markdown("### ๐Ÿ“Š ๋ถ„์„ ๊ฒฐ๊ณผ")
567
+ result_output = gr.Textbox(
568
+ label="AI ๋ถ„์„ ๊ฒฐ๊ณผ",
569
+ lines=20,
570
+ max_lines=40,
571
+ show_copy_button=True,
572
+ elem_id="result"
573
+ )
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
574
 
 
575
  status_display = gr.HTML(
576
+ '<div class="status-box" style="background:#d4edda; color:#155724;">๐ŸŽฎ ์‹œ์Šคํ…œ ์ค€๋น„ ์™„๋ฃŒ</div>'
577
  )
578
 
579
+ # ๋ฌธ์„œ ๋ถ„์„ ํƒญ
580
+ with gr.Tab("๐Ÿ“„ ๋ฌธ์„œ ๋ถ„์„"):
581
+ with gr.Row():
582
+ with gr.Column():
583
+ doc_files = gr.File(
584
+ label="๋ฌธ์„œ ์—…๋กœ๋“œ",
585
+ file_count="multiple",
586
+ file_types=[".pdf", ".csv", ".txt"],
587
+ type="filepath"
588
+ )
589
+
590
+ doc_prompt = gr.Textbox(
591
+ label="๋ถ„์„ ์š”์ฒญ",
592
+ placeholder="์˜ˆ: ์ด ๋ฌธ์„œ๋“ค์˜ ํ•ต์‹ฌ ๋‚ด์šฉ์„ ์š”์•ฝํ•˜๊ณ  ๋น„๊ต ๋ถ„์„ํ•˜์„ธ์š”.",
593
+ lines=3
594
+ )
595
+
596
+ doc_web_search = gr.Checkbox(
597
+ label="๐Ÿ” ์›น ๊ฒ€์ƒ‰ ์‚ฌ์šฉ",
598
+ value=False
599
+ )
600
+
601
+ analyze_docs_btn = gr.Button("๐Ÿ“Š ๋ฌธ์„œ ๋ถ„์„", variant="primary")
602
+
603
+ with gr.Column():
604
+ doc_result = gr.Textbox(
605
+ label="๋ถ„์„ ๊ฒฐ๊ณผ",
606
+ lines=25,
607
+ max_lines=50
608
+ )
609
 
610
 # Event handlers
611
+ webcam_state = gr.State(None)
612
+
613
def capture_webcam(frame):
    """Capture the current webcam frame.

    Args:
        frame: Value read from ``webcam_state`` when the capture button is
            clicked, or ``None`` when no frame has been stored yet.

    Returns:
        A 3-tuple routed to ``webcam_state``, ``captured_image`` and
        ``status_display``: the frame (or ``None``), an update that shows the
        frame in the captured-image component (or ``None``), and a status
        HTML snippet.
    """
    if frame is not None:
        # Success path: reveal the hidden preview component with the frame.
        captured_html = '<div class="status-box" style="background:#d4edda; color:#155724;">โœ… ์ด๋ฏธ์ง€ ์บก์ฒ˜ ์™„๋ฃŒ</div>'
        preview_update = gr.update(value=frame, visible=True)
        return frame, preview_update, captured_html

    missing_html = '<div class="status-box" style="background:#f8d7da; color:#721c24;">โŒ ์›น์บ  ํ”„๋ ˆ์ž„ ์—†์Œ</div>'
    return None, None, missing_html
 
618
 
619
def clear_capture():
    """Reset the capture: drop the stored frame and hide the preview.

    Returns:
        A 3-tuple routed to ``webcam_state``, ``captured_image`` and
        ``status_display``: ``None`` for the state, an update that clears and
        hides the captured-image component, and the "ready" status HTML.
    """
    ready_html = '<div class="status-box" style="background:#d4edda; color:#155724;">๐ŸŽฎ ์‹œ์Šคํ…œ ์ค€๋น„ ์™„๋ฃŒ</div>'
    # Clear the value as well as hiding the component: the task buttons read
    # their image from ``captured_image``, so merely hiding it would leave the
    # stale capture available for analysis after a reset.
    return None, gr.update(value=None, visible=False), ready_html
622
 
623
+ def analyze_with_task(image, prompt, task_type, use_search, thinking, tokens):
624
+ """ํŠน์ • ํƒœ์Šคํฌ๋กœ ์ด๋ฏธ์ง€ ๋ถ„์„"""
625
  if image is None:
626
+ return "โŒ ๋จผ์ € ์ด๋ฏธ์ง€๋ฅผ ์บก์ฒ˜ํ•˜์„ธ์š”.", '<div class="status-box" style="background:#f8d7da; color:#721c24;">โŒ ์ด๋ฏธ์ง€ ์—†์Œ</div>'
627
 
628
+ status = f'<div class="status-box" style="background:#cce5ff; color:#004085;">๐Ÿš€ {task_type} ๋ถ„์„ ์ค‘...</div>'
 
 
 
 
 
 
 
 
 
629
 
630
+ result = analyze_image_for_robot(
 
631
  image=image,
632
+ prompt=prompt,
633
  task_type=task_type,
634
  use_web_search=use_search,
635
  enable_thinking=thinking,
 
637
  )
638
 
639
  # ๊ฒฐ๊ณผ ํฌ๋งทํŒ…
640
+ timestamp = time.strftime("%H:%M:%S")
641
+ task_names = {
642
+ "planning": "์ž‘์—… ๊ณ„ํš",
643
+ "grounding": "๊ฐ์ฒด ์œ„์น˜ ํŒŒ์•…",
644
+ "affordance": "ํŒŒ์ง€์  ๋ถ„์„",
645
+ "trajectory": "๊ฒฝ๋กœ ๊ณ„ํš"
646
+ }
647
 
648
+ formatted_result = f"""๐Ÿค– ๋กœ๋ด‡ {task_names.get(task_type, '๋ถ„์„')} ๊ฒฐ๊ณผ:
 
 
 
 
 
 
649
 
650
+ ๐Ÿ“ธ **์ž‘์—…**: {prompt}
651
 
652
+ ๐Ÿ“ **๋ถ„์„ ๊ฒฐ๊ณผ**:
653
  {result}
654
 
655
+ โฐ ๋ถ„์„ ์‹œ๊ฐ„: {timestamp}
 
656
  ๐ŸŽฏ ๋ชจ๋ธ: {model_name}
657
+ ๐Ÿ”ง ํƒœ์Šคํฌ: {task_type}"""
658
 
659
+ complete_status = '<div class="status-box" style="background:#d4edda; color:#155724;">โœ… ๋ถ„์„ ์™„๋ฃŒ!</div>'
660
  return formatted_result, complete_status
661
 
662
+ # ์›น์บ  ์ŠคํŠธ๋ฆฌ๋ฐ
663
# Webcam streaming: copy every streamed frame into the session state as-is.
webcam.stream(fn=lambda x: x, inputs=[webcam], outputs=[webcam_state])

# Capture and reset buttons both write to the same three components.
_capture_targets = (webcam_state, captured_image, status_display)
capture_btn.click(
    fn=capture_webcam,
    inputs=[webcam_state],
    outputs=list(_capture_targets),
)
clear_capture_btn.click(
    fn=clear_capture,
    outputs=list(_capture_targets),
)


def _task_handler(task_type):
    """Return a five-argument lambda that runs ``analyze_with_task`` for ``task_type``."""
    return lambda img, p, s, t, tk: analyze_with_task(img, p, task_type, s, t, tk)


# Task buttons: identical inputs/outputs, only the task type differs.
_task_inputs = (captured_image, task_prompt, use_web_search, enable_thinking, max_tokens)
_task_outputs = (result_output, status_display)
for _task_button, _task_type in (
    (planning_btn, "planning"),
    (grounding_btn, "grounding"),
    (affordance_btn, "affordance"),
    (trajectory_btn, "trajectory"),
):
    _task_button.click(
        fn=_task_handler(_task_type),
        inputs=list(_task_inputs),
        outputs=list(_task_outputs),
    )
706
 
707
+ # Document analysis
708
def analyze_docs(files, prompt, use_search):
    """Stream the document-analysis output to the result textbox.

    The previous version drained ``analyze_documents_streaming`` and returned
    only its last chunk, so nothing was shown until the whole analysis had
    finished. This version is a generator that forwards every chunk as it
    arrives; the last value shown is the same as before.

    Args:
        files: Uploaded document paths from ``doc_files``; ``None`` or empty
            when nothing was uploaded.
        prompt: The analysis request text.
        use_search: Whether web search should be used.

    Yields:
        A fixed message when no files were given; otherwise each chunk from
        ``analyze_documents_streaming`` (presumably the cumulative text so
        far, since the original kept only the latest chunk -- TODO confirm).
    """
    if not files:
        yield "โŒ ๋ฌธ์„œ๋ฅผ ์—…๋กœ๋“œํ•˜์„ธ์š”."
        return

    produced_any = False
    for chunk in analyze_documents_streaming(files, prompt, use_search):
        produced_any = True
        yield chunk

    # Keep the old fallback: an empty stream still results in an empty string.
    if not produced_any:
        yield ""
716
+
717
+ analyze_docs_btn.click(
718
+ fn=analyze_docs,
719
+ inputs=[doc_files, doc_prompt, doc_web_search],
720
+ outputs=[doc_result]
721
+ )
722
+
723
 # Initial model load
724
def initial_load():
    """Load the model and return a readiness message.

    Registered below with ``outputs=None``, so the returned string is not
    routed to any component.
    """
    ready_message = "์‹œ์Šคํ…œ ์ค€๋น„ ์™„๋ฃŒ! ๐Ÿš€"
    load_model()
    return ready_message


# Run the model load once when the page is opened.
demo.load(fn=initial_load, outputs=None)
732
 
733
  if __name__ == "__main__":
734
+ print("๐Ÿš€ ๋กœ๋ด‡ ์‹œ๊ฐ ์‹œ์Šคํ…œ ์‹œ์ž‘ (Gemma3-R1984-4B)...")
735
  demo.launch(
736
  server_name="0.0.0.0",
737
  server_port=7860,