openfree committed on
Commit
f881e03
·
verified ·
1 Parent(s): fd897d3

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +254 -409
app.py CHANGED
@@ -2,13 +2,10 @@
2
 
3
  import os
4
  import re
5
- import tempfile
6
  import gc
7
  from collections.abc import Iterator
8
  from threading import Thread
9
- import json
10
  import requests
11
- import cv2
12
  import gradio as gr
13
  import spaces
14
  import torch
@@ -18,12 +15,7 @@ from PIL import Image
18
  from transformers import AutoProcessor, Gemma3ForConditionalGeneration, TextIteratorStreamer
19
  import time
20
  import warnings
21
- from typing import Dict, List, Optional, Union
22
-
23
- # CSV/TXT ๋ถ„์„
24
- import pandas as pd
25
- # PDF ํ…์ŠคํŠธ ์ถ”์ถœ
26
- import PyPDF2
27
 
28
  warnings.filterwarnings('ignore')
29
 
@@ -32,9 +24,7 @@ print("🎮 로봇 시각 시스템 초기화 (Gemma3-R1984-4B)...")
32
  ##############################################################################
33
  # ์ƒ์ˆ˜ ์ •์˜
34
  ##############################################################################
35
- MAX_CONTENT_CHARS = 2000
36
  MAX_INPUT_LENGTH = 2096
37
- MAX_NUM_IMAGES = 5
38
  SERPHOUSE_API_KEY = os.getenv("SERPHOUSE_API_KEY", "")
39
 
40
  ##############################################################################
@@ -85,17 +75,17 @@ def do_web_search(query: str) -> str:
85
  "domain": "google.com",
86
  "serp_type": "web",
87
  "device": "desktop",
88
- "lang": "ko", # ํ•œ๊ตญ์–ด ์šฐ์„ 
89
- "num": "10" # 10๊ฐœ๋กœ ์ œํ•œ
90
  }
91
 
92
  headers = {
93
  "Authorization": f"Bearer {SERPHOUSE_API_KEY}"
94
  }
95
 
96
- logger.info(f"์›น ๊ฒ€์ƒ‰ ์ค‘... ๊ฒ€์ƒ‰์–ด: {query}")
97
 
98
- response = requests.get(url, headers=headers, params=params, timeout=60)
99
  response.raise_for_status()
100
 
101
  data = response.json()
@@ -104,91 +94,22 @@ def do_web_search(query: str) -> str:
104
  organic = results.get("organic", []) if isinstance(results, dict) else []
105
 
106
  if not organic:
107
- return "๊ฒ€์ƒ‰ ๊ฒฐ๊ณผ๋ฅผ ์ฐพ์„ ์ˆ˜ ์—†์Šต๋‹ˆ๋‹ค."
108
-
109
- max_results = min(10, len(organic))
110
- limited_organic = organic[:max_results]
111
-
112
- summary_lines = []
113
- for idx, item in enumerate(limited_organic, start=1):
114
- title = item.get("title", "์ œ๋ชฉ ์—†์Œ")
115
- link = item.get("link", "#")
116
- snippet = item.get("snippet", "์„ค๋ช… ์—†์Œ")
117
- displayed_link = item.get("displayed_link", link)
118
 
119
- summary_lines.append(
120
- f"### ๊ฒฐ๊ณผ {idx}: {title}\n\n"
121
- f"{snippet}\n\n"
122
- f"**์ถœ์ฒ˜**: [{displayed_link}]({link})\n\n"
123
- f"---\n"
124
- )
125
-
126
- instructions = """# ์›น ๊ฒ€์ƒ‰ ๊ฒฐ๊ณผ
127
- ์•„๋ž˜๋Š” ๊ฒ€์ƒ‰ ๊ฒฐ๊ณผ์ž…๋‹ˆ๋‹ค. ๋‹ต๋ณ€ ์‹œ ์ด ์ •๋ณด๋ฅผ ํ™œ์šฉํ•˜์„ธ์š”:
128
- 1. ๊ฐ ๊ฒฐ๊ณผ์˜ ์ œ๋ชฉ, ๋‚ด์šฉ, ์ถœ์ฒ˜ ๋งํฌ๋ฅผ ์ฐธ์กฐํ•˜์„ธ์š”
129
- 2. ๊ด€๋ จ ์ถœ์ฒ˜๋ฅผ ๋ช…์‹œ์ ์œผ๋กœ ์ธ์šฉํ•˜์„ธ์š”
130
- 3. ์—ฌ๋Ÿฌ ์ถœ์ฒ˜์˜ ์ •๋ณด๋ฅผ ์ข…ํ•ฉํ•˜์—ฌ ๋‹ต๋ณ€ํ•˜์„ธ์š”
131
- """
132
 
133
- search_results = instructions + "\n".join(summary_lines)
134
- return search_results
135
 
136
  except Exception as e:
137
  logger.error(f"์›น ๊ฒ€์ƒ‰ ์‹คํŒจ: {e}")
138
- return f"์›น ๊ฒ€์ƒ‰ ์‹คํŒจ: {str(e)}"
139
-
140
- ##############################################################################
141
- # ๋ฌธ์„œ ์ฒ˜๋ฆฌ ํ•จ์ˆ˜
142
- ##############################################################################
143
- def analyze_csv_file(path: str) -> str:
144
- """CSV ํŒŒ์ผ ๋ถ„์„"""
145
- try:
146
- df = pd.read_csv(path)
147
- if df.shape[0] > 50 or df.shape[1] > 10:
148
- df = df.iloc[:50, :10]
149
- df_str = df.to_string()
150
- if len(df_str) > MAX_CONTENT_CHARS:
151
- df_str = df_str[:MAX_CONTENT_CHARS] + "\n...(์ค‘๋žต)..."
152
- return f"**[CSV ํŒŒ์ผ: {os.path.basename(path)}]**\n\n{df_str}"
153
- except Exception as e:
154
- return f"CSV ์ฝ๊ธฐ ์‹คํŒจ ({os.path.basename(path)}): {str(e)}"
155
-
156
- def analyze_txt_file(path: str) -> str:
157
- """TXT ํŒŒ์ผ ๋ถ„์„"""
158
- try:
159
- with open(path, "r", encoding="utf-8") as f:
160
- text = f.read()
161
- if len(text) > MAX_CONTENT_CHARS:
162
- text = text[:MAX_CONTENT_CHARS] + "\n...(์ค‘๋žต)..."
163
- return f"**[TXT ํŒŒ์ผ: {os.path.basename(path)}]**\n\n{text}"
164
- except Exception as e:
165
- return f"TXT ์ฝ๊ธฐ ์‹คํŒจ ({os.path.basename(path)}): {str(e)}"
166
-
167
- def pdf_to_markdown(pdf_path: str) -> str:
168
- """PDF๋ฅผ ๋งˆํฌ๋‹ค์šด์œผ๋กœ ๋ณ€ํ™˜"""
169
- text_chunks = []
170
- try:
171
- with open(pdf_path, "rb") as f:
172
- reader = PyPDF2.PdfReader(f)
173
- max_pages = min(5, len(reader.pages))
174
- for page_num in range(max_pages):
175
- page = reader.pages[page_num]
176
- page_text = page.extract_text() or ""
177
- page_text = page_text.strip()
178
- if page_text:
179
- if len(page_text) > MAX_CONTENT_CHARS // max_pages:
180
- page_text = page_text[:MAX_CONTENT_CHARS // max_pages] + "...(์ค‘๋žต)"
181
- text_chunks.append(f"## ํŽ˜์ด์ง€ {page_num+1}\n\n{page_text}\n")
182
- if len(reader.pages) > max_pages:
183
- text_chunks.append(f"\n...({max_pages}/{len(reader.pages)} ํŽ˜์ด์ง€ ํ‘œ์‹œ)...")
184
- except Exception as e:
185
- return f"PDF ์ฝ๊ธฐ ์‹คํŒจ ({os.path.basename(pdf_path)}): {str(e)}"
186
-
187
- full_text = "\n".join(text_chunks)
188
- if len(full_text) > MAX_CONTENT_CHARS:
189
- full_text = full_text[:MAX_CONTENT_CHARS] + "\n...(์ค‘๋žต)..."
190
-
191
- return f"**[PDF ํŒŒ์ผ: {os.path.basename(pdf_path)}]**\n\n{full_text}"
192
 
193
  ##############################################################################
194
  # ๋ชจ๋ธ ๋กœ๋“œ
@@ -224,12 +145,12 @@ def load_model():
224
  return False
225
 
226
  ##############################################################################
227
- # ์ด๋ฏธ์ง€ ๋ถ„์„ (๋กœ๋ด‡ ํƒœ์Šคํฌ ์ค‘์‹ฌ)
228
  ##############################################################################
229
  @spaces.GPU(duration=60)
230
- def analyze_image_for_robot(
231
  image: Union[np.ndarray, Image.Image],
232
- prompt: str,
233
  task_type: str = "general",
234
  use_web_search: bool = False,
235
  enable_thinking: bool = True,
@@ -247,44 +168,78 @@ def analyze_image_for_robot(
247
  if isinstance(image, np.ndarray):
248
  image = Image.fromarray(image).convert('RGB')
249
 
250
- # ํƒœ์Šคํฌ๋ณ„ ์‹œ์Šคํ…œ ํ”„๋กฌํ”„ํŠธ ๊ตฌ์„ฑ
251
  system_prompts = {
252
- "general": "๋‹น์‹ ์€ ๋กœ๋ด‡ ์‹œ๊ฐ ์‹œ์Šคํ…œ์ž…๋‹ˆ๋‹ค. ์ด๋ฏธ์ง€๋ฅผ ์ž์„ธํžˆ ๋ถ„์„ํ•˜๊ณ  ์„ค๋ช…ํ•˜์„ธ์š”.",
253
- "planning": """๋‹น์‹ ์€ ๋กœ๋ด‡ ์ž‘์—… ๊ณ„ํš์„ ์ˆ˜๋ฆฝํ•˜๋Š” AI์ž…๋‹ˆ๋‹ค.
254
- ์ฃผ์–ด์ง„ ์ด๋ฏธ์ง€์™€ ์ž‘์—…์„ ๋ถ„์„ํ•˜์—ฌ ๋‹จ๊ณ„๋ณ„ ์‹คํ–‰ ๊ณ„ํš์„ ์ž‘์„ฑํ•˜์„ธ์š”.
255
- ํ˜•์‹: Step_1: xxx\nStep_2: xxx\n...\nStep_n: xxx""",
256
- "grounding": "๋‹น์‹ ์€ ๊ฐ์ฒด ์œ„์น˜๋ฅผ ์ฐพ๋Š” ๋กœ๋ด‡ ์‹œ๊ฐ ์‹œ์Šคํ…œ์ž…๋‹ˆ๋‹ค. ์š”์ฒญ๋œ ๊ฐ์ฒด์˜ ์œ„์น˜๋ฅผ [x1, y1, x2, y2] ์ขŒํ‘œ๋กœ ๋ฐ˜ํ™˜ํ•˜์„ธ์š”.",
257
- "affordance": "๋‹น์‹ ์€ ๋กœ๋ด‡ ํŒŒ์ง€์ ์„ ๋ถ„์„ํ•˜๋Š” AI์ž…๋‹ˆ๋‹ค. ์ฃผ์–ด์ง„ ์ž‘์—…์„ ์œ„ํ•œ ์ตœ์ ์˜ ํŒŒ์ง€ ์˜์—ญ์„ [x1, y1, x2, y2] ์ขŒํ‘œ๋กœ ์˜ˆ์ธกํ•˜์„ธ์š”.",
258
- "trajectory": "๋‹น์‹ ์€ ๋กœ๋ด‡ ๊ฒฝ๋กœ๋ฅผ ๊ณ„ํšํ•˜๋Š” AI์ž…๋‹ˆ๋‹ค. ๋ชฉํ‘œ ์ง€์ ๊นŒ์ง€์˜ ๊ฒฝ๋กœ๋ฅผ [(x1,y1), (x2,y2), ...] ํ˜•์‹์œผ๋กœ ์ œ์‹œํ•˜์„ธ์š”.",
259
- "pointing": "๋‹น์‹ ์€ ๋‹ค์ค‘ ์ง€์ ์„ ์ง€์ •ํ•˜๋Š” ๋กœ๋ด‡ ์‹œ๊ฐ ์‹œ์Šคํ…œ์ž…๋‹ˆ๋‹ค. ์š”์ฒญ๋œ ์œ„์น˜๋“ค์„ [(x1,y1), (x2,y2), ...] ํ˜•์‹์œผ๋กœ ๋ฐ˜ํ™˜ํ•˜์„ธ์š”."
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
260
  }
261
 
262
  system_prompt = system_prompts.get(task_type, system_prompts["general"])
263
 
264
  # Chain-of-Thought ์ถ”๊ฐ€
265
  if enable_thinking:
266
- system_prompt += "\n\n์ถ”๋ก  ๊ณผ์ •์„ <thinking></thinking> ํƒœ๊ทธ ์•ˆ์— ์ƒ์„ธํžˆ ์ž‘์„ฑํ•œ ํ›„ ์ตœ์ข… ๋‹ต๋ณ€์„ ์ œ์‹œ๏ฟฝ๏ฟฝ์„ธ์š”."
 
 
 
 
 
 
 
267
 
268
- # ์›น ๊ฒ€์ƒ‰ ์ˆ˜ํ–‰
269
- combined_system = system_prompt
270
- if use_web_search:
271
- keywords = extract_keywords(prompt, top_k=5)
272
- if keywords:
273
- logger.info(f"์›น ๊ฒ€์ƒ‰ ํ‚ค์›Œ๋“œ: {keywords}")
274
- search_results = do_web_search(keywords)
275
- combined_system = f"{search_results}\n\n{system_prompt}"
276
 
277
  # ๋ฉ”์‹œ์ง€ ๊ตฌ์„ฑ
278
  messages = [
279
  {
280
  "role": "system",
281
- "content": [{"type": "text", "text": combined_system}]
282
  },
283
  {
284
  "role": "user",
285
  "content": [
286
  {"type": "image", "url": image},
287
- {"type": "text", "text": prompt}
288
  ]
289
  }
290
  ]
@@ -324,312 +279,220 @@ def analyze_image_for_robot(
324
  return response
325
 
326
  except Exception as e:
327
- logger.error(f"์ด๋ฏธ์ง€ ๋ถ„์„ ์˜ค๋ฅ˜: {e}")
328
  import traceback
329
  return f"โŒ ๋ถ„์„ ์˜ค๋ฅ˜: {str(e)}\n{traceback.format_exc()}"
330
  finally:
331
  clear_cuda_cache()
332
 
333
  ##############################################################################
334
- # ๋ฌธ์„œ ๋ถ„์„ (์ŠคํŠธ๋ฆฌ๋ฐ)
335
- ##############################################################################
336
- def _model_gen_with_oom_catch(**kwargs):
337
- """OOM ์ฒ˜๋ฆฌ๋ฅผ ์œ„ํ•œ ์ƒ์„ฑ ํ•จ์ˆ˜"""
338
- global model
339
- try:
340
- model.generate(**kwargs)
341
- except torch.cuda.OutOfMemoryError:
342
- raise RuntimeError("GPU ๋ฉ”๋ชจ๋ฆฌ ๋ถ€์กฑ. Max Tokens๋ฅผ ์ค„์—ฌ์ฃผ์„ธ์š”.")
343
- finally:
344
- clear_cuda_cache()
345
-
346
- @spaces.GPU(duration=120)
347
- def analyze_documents_streaming(
348
- files: List[str],
349
- prompt: str,
350
- use_web_search: bool = False,
351
- max_new_tokens: int = 2048
352
- ) -> Iterator[str]:
353
- """๋ฌธ์„œ ๋ถ„์„ (์ŠคํŠธ๋ฆฌ๋ฐ)"""
354
- global model, processor
355
-
356
- if not model_loaded:
357
- if not load_model():
358
- yield "โŒ ๋ชจ๋ธ ๋กœ๋”ฉ ์‹คํŒจ"
359
- return
360
-
361
- try:
362
- # ์‹œ์Šคํ…œ ํ”„๋กฌํ”„ํŠธ
363
- system_content = "๋‹น์‹ ์€ ๋ฌธ์„œ๋ฅผ ๋ถ„์„ํ•˜๊ณ  ์š”์•ฝํ•˜๋Š” ์ „๋ฌธ AI์ž…๋‹ˆ๋‹ค."
364
-
365
- # ์›น ๊ฒ€์ƒ‰
366
- if use_web_search:
367
- keywords = extract_keywords(prompt, top_k=5)
368
- if keywords:
369
- search_results = do_web_search(keywords)
370
- system_content = f"{search_results}\n\n{system_content}"
371
-
372
- # ๋ฌธ์„œ ๋‚ด์šฉ ์ฒ˜๋ฆฌ
373
- doc_contents = []
374
- for file_path in files:
375
- if file_path.lower().endswith('.csv'):
376
- content = analyze_csv_file(file_path)
377
- elif file_path.lower().endswith('.txt'):
378
- content = analyze_txt_file(file_path)
379
- elif file_path.lower().endswith('.pdf'):
380
- content = pdf_to_markdown(file_path)
381
- else:
382
- continue
383
- doc_contents.append(content)
384
-
385
- # ๋ฉ”์‹œ์ง€ ๊ตฌ์„ฑ
386
- messages = [
387
- {
388
- "role": "system",
389
- "content": [{"type": "text", "text": system_content}]
390
- },
391
- {
392
- "role": "user",
393
- "content": [
394
- {"type": "text", "text": "\n\n".join(doc_contents) + f"\n\n{prompt}"}
395
- ]
396
- }
397
- ]
398
-
399
- # ์ž…๋ ฅ ์ฒ˜๋ฆฌ
400
- inputs = processor.apply_chat_template(
401
- messages,
402
- add_generation_prompt=True,
403
- tokenize=True,
404
- return_dict=True,
405
- return_tensors="pt",
406
- ).to(device=model.device, dtype=torch.bfloat16)
407
-
408
- # ์ŠคํŠธ๋ฆฌ๋ฐ ์„ค์ •
409
- streamer = TextIteratorStreamer(processor, timeout=30.0, skip_prompt=True, skip_special_tokens=True)
410
- gen_kwargs = dict(
411
- inputs,
412
- streamer=streamer,
413
- max_new_tokens=max_new_tokens,
414
- temperature=0.8,
415
- top_p=0.9,
416
- )
417
-
418
- # ๋ณ„๋„ ์Šค๋ ˆ๋“œ์—์„œ ์ƒ์„ฑ
419
- t = Thread(target=_model_gen_with_oom_catch, kwargs=gen_kwargs)
420
- t.start()
421
-
422
- # ์ŠคํŠธ๋ฆฌ๋ฐ ์ถœ๋ ฅ
423
- output = ""
424
- for new_text in streamer:
425
- output += new_text
426
- yield output
427
-
428
- except Exception as e:
429
- logger.error(f"๋ฌธ์„œ ๋ถ„์„ ์˜ค๋ฅ˜: {e}")
430
- yield f"โŒ ์˜ค๋ฅ˜ ๋ฐœ์ƒ: {str(e)}"
431
- finally:
432
- clear_cuda_cache()
433
-
434
- ##############################################################################
435
- # Gradio UI (๋กœ๋ด‡ ์‹œ๊ฐํ™” ์ค‘์‹ฌ)
436
  ##############################################################################
437
  css = """
438
  .robot-header {
439
  text-align: center;
440
  background: linear-gradient(135deg, #1e3c72 0%, #2a5298 50%, #667eea 100%);
441
  color: white;
442
- padding: 20px;
443
- border-radius: 10px;
444
  margin-bottom: 20px;
445
- box-shadow: 0 4px 6px rgba(0, 0, 0, 0.1);
 
 
 
 
446
  }
447
  .status-box {
448
  text-align: center;
449
- padding: 10px;
450
- border-radius: 5px;
451
- margin: 10px 0;
452
  font-weight: bold;
 
453
  }
454
- .info-box {
455
- background: #f0f0f0;
456
- padding: 15px;
457
- border-radius: 8px;
458
- margin: 10px 0;
459
- border-left: 4px solid #2a5298;
460
  }
461
  .task-button {
462
- min-height: 60px;
463
- font-size: 1.1em;
 
 
 
 
 
 
464
  }
465
  .webcam-container {
466
  border: 3px solid #2a5298;
467
- border-radius: 10px;
468
- padding: 10px;
469
  background: #f8f9fa;
470
  }
 
 
 
 
 
 
471
  """
472
 
473
- with gr.Blocks(title="๐Ÿค– ๋กœ๋ด‡ ์‹œ๊ฐ ์‹œ์Šคํ…œ (Gemma3-4B)", css=css) as demo:
474
  gr.HTML("""
475
  <div class="robot-header">
476
  <h1>๐Ÿค– ๋กœ๋ด‡ ์‹œ๊ฐ ์‹œ์Šคํ…œ</h1>
477
- <h3>๐ŸŽฎ Gemma3-R1984-4B + ๐Ÿ“ท ์‹ค์‹œ๊ฐ„ ์›น์บ  + ๐Ÿ” ์›น ๊ฒ€์ƒ‰</h3>
478
- <p>โšก ์ตœ์‹  ๋ฉ€ํ‹ฐ๋ชจ๋‹ฌ AI๋กœ ๋กœ๋ด‡ ์ž‘์—… ๋ถ„์„ ๋ฐ ๊ณ„ํš ์ˆ˜๋ฆฝ!</p>
479
- </div>
480
- """)
481
-
482
- gr.HTML("""
483
- <div class="info-box">
484
- <h4>๐ŸŒŸ ์‹œ์Šคํ…œ ํŠน์ง•:</h4>
485
- <ul>
486
- <li>๐Ÿ–ผ๏ธ ๊ณ ๊ธ‰ ์ด๋ฏธ์ง€/๋น„๋””์˜ค ๋ถ„์„ (Gemma3-4B VLM)</li>
487
- <li>๐Ÿ“‹ ๋‹ค๋‹จ๊ณ„ ์ž‘์—… ๊ณ„ํš ๋ฐ ์ถ”๋ก </li>
488
- <li>๐Ÿ“ ์ •๋ฐ€ํ•œ ๊ฐ์ฒด ์œ„์น˜ ํŒŒ์•… (Grounding)</li>
489
- <li>๐Ÿค ๋กœ๋ด‡ ํŒŒ์ง€์  ๋ถ„์„ (Affordance)</li>
490
- <li>๐Ÿ›ค๏ธ ๊ฒฝ๋กœ ๊ณ„ํš (Trajectory Planning)</li>
491
- <li>๐Ÿ” ์‹ค์‹œ๊ฐ„ ์›น ๊ฒ€์ƒ‰ ํ†ตํ•ฉ</li>
492
- <li>๐Ÿ“„ ๋ฌธ์„œ ๋ถ„์„ (PDF, CSV, TXT)</li>
493
- </ul>
494
  </div>
495
  """)
496
 
497
  with gr.Row():
498
- # ์™ผ์ชฝ: ์›น์บ  ๋ฐ ์ž…๋ ฅ
499
  with gr.Column(scale=1):
500
- gr.Markdown("### ๐Ÿ“ท ์‹ค์‹œ๊ฐ„ ์›น์บ ")
501
-
502
  with gr.Group(elem_classes="webcam-container"):
 
 
503
  webcam = gr.Image(
504
  sources=["webcam"],
505
  streaming=True,
506
  type="numpy",
507
  label="์‹ค์‹œ๊ฐ„ ์ŠคํŠธ๋ฆฌ๋ฐ",
508
- height=350
509
  )
510
 
511
- # ์บก์ฒ˜๋œ ์ด๋ฏธ์ง€ ํ‘œ์‹œ
 
 
 
512
  captured_image = gr.Image(
513
  label="์บก์ฒ˜๋œ ์ด๋ฏธ์ง€",
514
- height=200,
515
  visible=False
516
  )
517
-
518
- # ๋กœ๋ด‡ ์ž‘์—… ๋ฒ„ํŠผ๋“ค
519
- gr.Markdown("### ๐ŸŽฏ ๋กœ๋ด‡ ์ž‘์—… ์„ ํƒ")
520
- with gr.Row():
521
- capture_btn = gr.Button("๐Ÿ“ธ ์บก์ฒ˜", variant="primary", elem_classes="task-button")
522
- clear_capture_btn = gr.Button("๐Ÿ—‘๏ธ ์ดˆ๊ธฐํ™”", elem_classes="task-button")
523
-
524
- with gr.Row():
525
- planning_btn = gr.Button("๐Ÿ“‹ ์ž‘์—… ๊ณ„ํš", elem_classes="task-button")
526
- grounding_btn = gr.Button("๐Ÿ“ ๊ฐ์ฒด ์œ„์น˜", elem_classes="task-button")
527
-
528
- with gr.Row():
529
- affordance_btn = gr.Button("๐Ÿค ํŒŒ์ง€์  ๋ถ„์„", elem_classes="task-button")
530
- trajectory_btn = gr.Button("๐Ÿ›ค๏ธ ๏ฟฝ๏ฟฝ๋กœ ๊ณ„ํš", elem_classes="task-button")
531
 
532
- # ์˜ค๋ฅธ์ชฝ: ๋ถ„์„ ์„ค์ • ๋ฐ ๊ฒฐ๊ณผ
533
  with gr.Column(scale=2):
534
- gr.Markdown("### โš™๏ธ ๋ถ„์„ ์„ค์ •")
535
-
536
- with gr.Row():
537
- with gr.Column():
538
- task_prompt = gr.Textbox(
539
- label="์ž‘์—… ์„ค๋ช… / ์งˆ๋ฌธ",
540
- placeholder="์˜ˆ: ํ…Œ์ด๋ธ” ์œ„์˜ ์ปต์„ ์žก์•„์„œ ์‹ฑํฌ๋Œ€์— ๋†“๊ธฐ",
541
- value="์ด ์žฅ๋ฉด์—์„œ ๋กœ๋ด‡์ด ์ˆ˜ํ–‰ํ•  ์ˆ˜ ์žˆ๋Š” ์ž‘์—…์„ ๋ถ„์„ํ•˜์„ธ์š”.",
542
- lines=2
 
 
 
 
 
 
 
543
  )
544
 
545
- with gr.Row():
546
- use_web_search = gr.Checkbox(
547
- label="๐Ÿ” ์›น ๊ฒ€์ƒ‰ ์‚ฌ์šฉ",
548
- value=False,
549
- info="๊ด€๋ จ ์ •๋ณด๋ฅผ ์›น์—์„œ ๊ฒ€์ƒ‰ํ•ฉ๋‹ˆ๋‹ค"
550
- )
551
-
552
- enable_thinking = gr.Checkbox(
553
- label="๐Ÿค” ์ถ”๋ก  ๊ณผ์ • ํ‘œ์‹œ",
554
- value=True,
555
- info="Chain-of-Thought ์ถ”๋ก  ๊ณผ์ •์„ ๋ณด์—ฌ์ค๋‹ˆ๋‹ค"
556
- )
557
-
558
- max_tokens = gr.Slider(
559
- label="์ตœ๋Œ€ ํ† ํฐ ์ˆ˜",
560
- minimum=256,
561
- maximum=4096,
562
- value=1024,
563
- step=256
564
  )
565
-
566
- gr.Markdown("### ๐Ÿ“Š ๋ถ„์„ ๊ฒฐ๊ณผ")
567
- result_output = gr.Textbox(
568
- label="AI ๋ถ„์„ ๊ฒฐ๊ณผ",
569
- lines=20,
570
- max_lines=40,
571
- show_copy_button=True,
572
- elem_id="result"
573
- )
574
-
575
- status_display = gr.HTML(
576
- '<div class="status-box" style="background:#d4edda; color:#155724;">๐ŸŽฎ ์‹œ์Šคํ…œ ์ค€๋น„ ์™„๋ฃŒ</div>'
577
- )
578
-
579
- # ๋ฌธ์„œ ๋ถ„์„ ํƒญ
580
- with gr.Tab("๐Ÿ“„ ๋ฌธ์„œ ๋ถ„์„"):
581
- with gr.Row():
582
- with gr.Column():
583
- doc_files = gr.File(
584
- label="๋ฌธ์„œ ์—…๋กœ๋“œ",
585
- file_count="multiple",
586
- file_types=[".pdf", ".csv", ".txt"],
587
- type="filepath"
588
- )
589
 
590
- doc_prompt = gr.Textbox(
591
- label="๋ถ„์„ ์š”์ฒญ",
592
- placeholder="์˜ˆ: ์ด ๋ฌธ์„œ๋“ค์˜ ํ•ต์‹ฌ ๋‚ด์šฉ์„ ์š”์•ฝํ•˜๊ณ  ๋น„๊ต ๋ถ„์„ํ•˜์„ธ์š”.",
593
- lines=3
 
 
594
  )
 
 
 
 
595
 
596
- doc_web_search = gr.Checkbox(
597
- label="๐Ÿ” ์›น ๊ฒ€์ƒ‰ ์‚ฌ์šฉ",
598
- value=False
599
- )
 
 
 
 
 
 
 
600
 
601
- analyze_docs_btn = gr.Button("๐Ÿ“Š ๋ฌธ์„œ ๋ถ„์„", variant="primary")
602
-
603
- with gr.Column():
604
- doc_result = gr.Textbox(
605
- label="๋ถ„์„ ๊ฒฐ๊ณผ",
606
- lines=25,
607
- max_lines=50
 
 
 
 
 
 
 
 
 
 
608
  )
 
 
 
 
 
609
 
610
- # ์ด๋ฒคํŠธ ํ•ธ๋“ค๋Ÿฌ
611
- webcam_state = gr.State(None)
 
 
 
 
 
 
 
612
 
613
- def capture_webcam(frame):
 
 
 
 
614
  """์›น์บ  ํ”„๋ ˆ์ž„ ์บก์ฒ˜"""
615
  if frame is None:
616
- return None, None, '<div class="status-box" style="background:#f8d7da; color:#721c24;">โŒ ์›น์บ  ํ”„๋ ˆ์ž„ ์—†์Œ</div>'
617
- return frame, gr.update(value=frame, visible=True), '<div class="status-box" style="background:#d4edda; color:#155724;">โœ… ์ด๋ฏธ์ง€ ์บก์ฒ˜ ์™„๋ฃŒ</div>'
 
618
 
619
  def clear_capture():
620
  """์บก์ฒ˜ ์ดˆ๊ธฐํ™”"""
621
- return None, gr.update(visible=False), '<div class="status-box" style="background:#d4edda; color:#155724;">๐ŸŽฎ ์‹œ์Šคํ…œ ์ค€๋น„ ์™„๋ฃŒ</div>'
622
 
623
- def analyze_with_task(image, prompt, task_type, use_search, thinking, tokens):
624
- """ํŠน์ • ํƒœ์Šคํฌ๋กœ ์ด๋ฏธ์ง€ ๋ถ„์„"""
625
  if image is None:
626
- return "โŒ ๋จผ์ € ์ด๋ฏธ์ง€๋ฅผ ์บก์ฒ˜ํ•˜์„ธ์š”.", '<div class="status-box" style="background:#f8d7da; color:#721c24;">โŒ ์ด๋ฏธ์ง€ ์—†์Œ</div>'
627
 
628
- status = f'<div class="status-box" style="background:#cce5ff; color:#004085;">๐Ÿš€ {task_type} ๋ถ„์„ ์ค‘...</div>'
 
 
 
 
 
 
 
 
 
629
 
630
- result = analyze_image_for_robot(
 
631
  image=image,
632
- prompt=prompt,
633
  task_type=task_type,
634
  use_web_search=use_search,
635
  enable_thinking=thinking,
@@ -637,101 +500,83 @@ with gr.Blocks(title="🤖 로봇 시각 시스템 (Gemma3-4B)", css=css) as dem
637
  )
638
 
639
  # ๊ฒฐ๊ณผ ํฌ๋งทํŒ…
640
- timestamp = time.strftime("%H:%M:%S")
641
- task_names = {
642
- "planning": "์ž‘์—… ๊ณ„ํš",
643
- "grounding": "๊ฐ์ฒด ์œ„์น˜ ํŒŒ์•…",
644
- "affordance": "ํŒŒ์ง€์  ๋ถ„์„",
645
- "trajectory": "๊ฒฝ๋กœ ๊ณ„ํš"
646
- }
647
 
648
- formatted_result = f"""๐Ÿค– ๋กœ๋ด‡ {task_names.get(task_type, '๋ถ„์„')} ๊ฒฐ๊ณผ:
 
 
 
 
 
 
649
 
650
- ๐Ÿ“ธ **์ž‘๏ฟฝ๏ฟฝ**: {prompt}
651
 
652
- ๐Ÿ“ **๋ถ„์„ ๊ฒฐ๊ณผ**:
653
  {result}
654
 
655
- โฐ ๋ถ„์„ ์‹œ๊ฐ„: {timestamp}
 
656
  ๐ŸŽฏ ๋ชจ๋ธ: {model_name}
657
- ๐Ÿ”ง ํƒœ์Šคํฌ: {task_type}"""
658
 
659
- complete_status = '<div class="status-box" style="background:#d4edda; color:#155724;">โœ… ๋ถ„์„ ์™„๋ฃŒ!</div>'
660
  return formatted_result, complete_status
661
 
662
- # ์›น์บ  ์ŠคํŠธ๋ฆฌ๋ฐ
663
- webcam.stream(
664
- fn=lambda x: x,
665
  inputs=[webcam],
666
- outputs=[webcam_state]
667
  )
668
 
669
- # ์บก์ฒ˜ ๋ฒ„ํŠผ
670
- capture_btn.click(
671
- fn=capture_webcam,
672
- inputs=[webcam_state],
673
- outputs=[webcam_state, captured_image, status_display]
674
  )
675
 
676
- # ์ดˆ๊ธฐํ™” ๋ฒ„ํŠผ
677
- clear_capture_btn.click(
678
- fn=clear_capture,
679
- outputs=[webcam_state, captured_image, status_display]
 
680
  )
681
 
682
- # ์ž‘์—… ๋ฒ„ํŠผ๋“ค
683
  planning_btn.click(
684
- fn=lambda img, p, s, t, tk: analyze_with_task(img, p, "planning", s, t, tk),
685
- inputs=[captured_image, task_prompt, use_web_search, enable_thinking, max_tokens],
686
  outputs=[result_output, status_display]
687
  )
688
 
689
  grounding_btn.click(
690
- fn=lambda img, p, s, t, tk: analyze_with_task(img, p, "grounding", s, t, tk),
691
- inputs=[captured_image, task_prompt, use_web_search, enable_thinking, max_tokens],
692
  outputs=[result_output, status_display]
693
  )
694
 
695
  affordance_btn.click(
696
- fn=lambda img, p, s, t, tk: analyze_with_task(img, p, "affordance", s, t, tk),
697
- inputs=[captured_image, task_prompt, use_web_search, enable_thinking, max_tokens],
698
  outputs=[result_output, status_display]
699
  )
700
 
701
  trajectory_btn.click(
702
- fn=lambda img, p, s, t, tk: analyze_with_task(img, p, "trajectory", s, t, tk),
703
- inputs=[captured_image, task_prompt, use_web_search, enable_thinking, max_tokens],
704
  outputs=[result_output, status_display]
705
  )
706
 
707
- # ๋ฌธ์„œ ๋ถ„์„
708
- def analyze_docs(files, prompt, use_search):
709
- if not files:
710
- return "โŒ ๋ฌธ์„œ๋ฅผ ์—…๋กœ๋“œํ•˜์„ธ์š”."
711
-
712
- output = ""
713
- for chunk in analyze_documents_streaming(files, prompt, use_search):
714
- output = chunk
715
- return output
716
-
717
- analyze_docs_btn.click(
718
- fn=analyze_docs,
719
- inputs=[doc_files, doc_prompt, doc_web_search],
720
- outputs=[doc_result]
721
- )
722
-
723
  # ์ดˆ๊ธฐ ๋ชจ๋ธ ๋กœ๋“œ
724
  def initial_load():
725
  load_model()
726
- return "์‹œ์Šคํ…œ ์ค€๋น„ ์™„๋ฃŒ! ๐Ÿš€"
727
 
728
  demo.load(
729
  fn=initial_load,
730
- outputs=None
731
  )
732
 
733
  if __name__ == "__main__":
734
- print("๐Ÿš€ ๋กœ๋ด‡ ์‹œ๊ฐ ์‹œ์Šคํ…œ ์‹œ์ž‘ (Gemma3-R1984-4B)...")
735
  demo.launch(
736
  server_name="0.0.0.0",
737
  server_port=7860,
 
2
 
3
  import os
4
  import re
 
5
  import gc
6
  from collections.abc import Iterator
7
  from threading import Thread
 
8
  import requests
 
9
  import gradio as gr
10
  import spaces
11
  import torch
 
15
  from transformers import AutoProcessor, Gemma3ForConditionalGeneration, TextIteratorStreamer
16
  import time
17
  import warnings
18
+ from typing import Union
 
 
 
 
 
19
 
20
  warnings.filterwarnings('ignore')
21
 
 
24
  ##############################################################################
25
  # ์ƒ์ˆ˜ ์ •์˜
26
  ##############################################################################
 
27
  MAX_INPUT_LENGTH = 2096
 
28
  SERPHOUSE_API_KEY = os.getenv("SERPHOUSE_API_KEY", "")
29
 
30
  ##############################################################################
 
75
  "domain": "google.com",
76
  "serp_type": "web",
77
  "device": "desktop",
78
+ "lang": "ko",
79
+ "num": "5" # 5๊ฐœ๋กœ ์ œํ•œ (๋กœ๋ด‡ ์ž‘์—…์— ์ถฉ๋ถ„)
80
  }
81
 
82
  headers = {
83
  "Authorization": f"Bearer {SERPHOUSE_API_KEY}"
84
  }
85
 
86
+ logger.info(f"๋กœ๋ด‡ ์ž‘์—… ๊ด€๋ จ ๊ฒ€์ƒ‰: {query}")
87
 
88
+ response = requests.get(url, headers=headers, params=params, timeout=30)
89
  response.raise_for_status()
90
 
91
  data = response.json()
 
94
  organic = results.get("organic", []) if isinstance(results, dict) else []
95
 
96
  if not organic:
97
+ return "๊ฒ€์ƒ‰ ๊ฒฐ๊ณผ๊ฐ€ ์—†์Šต๋‹ˆ๋‹ค."
98
+
99
+ # ๋กœ๋ด‡ ์ž‘์—…์— ์œ ์šฉํ•œ ์ •๋ณด๋งŒ ์ถ”์ถœ
100
+ search_info = "# ๋กœ๋ด‡ ์ž‘์—… ์ฐธ๊ณ  ์ •๋ณด\n\n"
101
+ for idx, item in enumerate(organic[:5], 1):
102
+ title = item.get("title", "")
103
+ snippet = item.get("snippet", "")
104
+ link = item.get("link", "")
 
 
 
105
 
106
+ search_info += f"{idx}. {title}\n{snippet}\n์ถœ์ฒ˜: {link}\n\n"
 
 
 
 
 
 
 
 
 
 
 
 
107
 
108
+ return search_info
 
109
 
110
  except Exception as e:
111
  logger.error(f"์›น ๊ฒ€์ƒ‰ ์‹คํŒจ: {e}")
112
+ return ""
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
113
 
114
  ##############################################################################
115
  # ๋ชจ๋ธ ๋กœ๋“œ
 
145
  return False
146
 
147
  ##############################################################################
148
+ # ๋กœ๋ด‡ ์ž‘์—… ๋ถ„์„ ํ•จ์ˆ˜
149
  ##############################################################################
150
  @spaces.GPU(duration=60)
151
+ def analyze_for_robot_task(
152
  image: Union[np.ndarray, Image.Image],
153
+ task_description: str,
154
  task_type: str = "general",
155
  use_web_search: bool = False,
156
  enable_thinking: bool = True,
 
168
  if isinstance(image, np.ndarray):
169
  image = Image.fromarray(image).convert('RGB')
170
 
171
+ # ํƒœ์Šคํฌ๋ณ„ ์‹œ์Šคํ…œ ํ”„๋กฌํ”„ํŠธ
172
  system_prompts = {
173
+ "general": """๋‹น์‹ ์€ ๊ณ ๊ธ‰ ๋กœ๋ด‡ ์‹œ๊ฐ ์‹œ์Šคํ…œ์ž…๋‹ˆ๋‹ค.
174
+ ์ด๋ฏธ์ง€๋ฅผ ๋ถ„์„ํ•˜์—ฌ ๋กœ๋ด‡์ด ์ˆ˜ํ–‰ํ•  ์ˆ˜ ์žˆ๋Š” ์ž‘์—…์„ ์‹๋ณ„ํ•˜๊ณ  ์„ค๋ช…ํ•˜์„ธ์š”.
175
+ ํ™˜๊ฒฝ์˜ ๊ฐ์ฒด, ์œ„์น˜, ์ž ์žฌ์  ์œ„ํ—˜ ์š”์†Œ๋ฅผ ํฌํ•จํ•˜์—ฌ ์ƒ์„ธํžˆ ๋ถ„์„ํ•˜์„ธ์š”.""",
176
+
177
+ "planning": """๋‹น์‹ ์€ ๋กœ๋ด‡ ์ž‘์—… ๊ณ„ํš ์ „๋ฌธ๊ฐ€์ž…๋‹ˆ๋‹ค.
178
+ ์ฃผ์–ด์ง„ ์ž‘์—…์„ ์œ„ํ•œ ๋‹จ๊ณ„๋ณ„ ์‹คํ–‰ ๊ณ„ํš์„ ์ž‘์„ฑํ•˜์„ธ์š”.
179
+ ๊ฐ ๋‹จ๊ณ„๋Š” ๋ช…ํ™•ํ•˜๊ณ  ์‹คํ–‰ ๊ฐ€๋Šฅํ•ด์•ผ ํ•ฉ๋‹ˆ๋‹ค.
180
+
181
+ ํ˜•์‹:
182
+ Step_1: [๋™์ž‘] - [์ƒ์„ธ ์„ค๋ช…]
183
+ Step_2: [๋™์ž‘] - [์ƒ์„ธ ์„ค๋ช…]
184
+ ...
185
+ Step_n: [๋™์ž‘] - [์ƒ์„ธ ์„ค๋ช…]
186
+
187
+ ์•ˆ์ „ ๊ณ ๋ ค์‚ฌํ•ญ: [์ฃผ์˜์‚ฌํ•ญ]""",
188
+
189
+ "grounding": """๋‹น์‹ ์€ ์ •๋ฐ€ํ•œ ๊ฐ์ฒด ์œ„์น˜ ํŒŒ์•… ์‹œ์Šคํ…œ์ž…๋‹ˆ๋‹ค.
190
+ ์š”์ฒญ๋œ ๊ฐ์ฒด์˜ ์ •ํ™•ํ•œ ์œ„์น˜๋ฅผ ์ฐพ์•„ ๋ฐ”์šด๋”ฉ ๋ฐ•์Šค ์ขŒํ‘œ๋ฅผ ์ œ๊ณตํ•˜์„ธ์š”.
191
+
192
+ ์ถœ๋ ฅ ํ˜•์‹: [x1, y1, x2, y2]
193
+ - (x1, y1): ์ขŒ์ƒ๋‹จ ์ขŒํ‘œ
194
+ - (x2, y2): ์šฐํ•˜๋‹จ ์ขŒํ‘œ
195
+ - ์ขŒํ‘œ๋Š” 0-1000 ๋ฒ”์œ„์˜ ์ •์ˆ˜""",
196
+
197
+ "affordance": """๋‹น์‹ ์€ ๋กœ๋ด‡ ๊ทธ๋ฆฌํผ ์ œ์–ด ์ „๋ฌธ๊ฐ€์ž…๋‹ˆ๋‹ค.
198
+ ์ฃผ์–ด์ง„ ์ž‘์—…์„ ์œ„ํ•œ ์ตœ์ ์˜ ํŒŒ์ง€ ์œ„์น˜์™€ ๋ฐฉ๋ฒ•์„ ๋ถ„์„ํ•˜์„ธ์š”.
199
+
200
+ ์ถœ๋ ฅ ํ˜•์‹:
201
+ ํŒŒ์ง€ ์˜์—ญ: [x1, y1, x2, y2]
202
+ ํŒŒ์ง€ ๋ฐฉํ–ฅ: [์ˆ˜์ง/์ˆ˜ํ‰/๊ฐ๋„]
203
+ ํŒŒ์ง€ ๊ฐ•๋„: [์•ฝํ•จ/์ค‘๊ฐ„/๊ฐ•ํ•จ]
204
+ ์ฃผ์˜์‚ฌํ•ญ: [ํŠน๋ณ„ ๊ณ ๋ ค์‚ฌํ•ญ]""",
205
+
206
+ "trajectory": """๋‹น์‹ ์€ ๋กœ๋ด‡ ๊ฒฝ๋กœ ๊ณ„ํš ์ „๋ฌธ๊ฐ€์ž…๋‹ˆ๋‹ค.
207
+ ์‹œ์ž‘์ ์—์„œ ๋ชฉํ‘œ์ ๊นŒ์ง€์˜ ์ตœ์  ๊ฒฝ๋กœ๋ฅผ ๊ณ„ํšํ•˜์„ธ์š”.
208
+
209
+ ์ถœ๋ ฅ ํ˜•์‹:
210
+ ๊ฒฝ๋กœ: [(x1,y1), (x2,y2), ..., (xn,yn)]
211
+ ์˜ˆ์ƒ ์†Œ์š” ์‹œ๊ฐ„: [์ดˆ]
212
+ ์žฅ์• ๋ฌผ ํšŒํ”ผ: [์„ค๋ช…]
213
+ ์•ˆ์ „ ์—ฌ์œ : [cm]"""
214
  }
215
 
216
  system_prompt = system_prompts.get(task_type, system_prompts["general"])
217
 
218
  # Chain-of-Thought ์ถ”๊ฐ€
219
  if enable_thinking:
220
+ system_prompt += "\n\n๋จผ์ € <thinking> ํƒœ๊ทธ ์•ˆ์— ๋ถ„์„ ๊ณผ์ •์„ ์ƒ์„ธํžˆ ์ž‘์„ฑํ•œ ํ›„, <answer> ํƒœ๊ทธ ์•ˆ์— ์ตœ์ข… ๊ฒฐ๊ณผ๋ฅผ ์ œ์‹œํ•˜์„ธ์š”."
221
+
222
+ # ์›น ๊ฒ€์ƒ‰ ์ˆ˜ํ–‰ (๋กœ๋ด‡ ์ž‘์—… ๊ด€๋ จ)
223
+ web_info = ""
224
+ if use_web_search and task_type in ["planning", "affordance"]:
225
+ # ๋กœ๋ด‡ ์ž‘์—… ๊ด€๋ จ ํ‚ค์›Œ๋“œ๋กœ ๊ฒ€์ƒ‰
226
+ search_query = f"๋กœ๋ด‡ {task_description} ๋ฐฉ๋ฒ• ๊ฐ€์ด๋“œ"
227
+ web_info = do_web_search(search_query)
228
 
229
+ # ์ตœ์ข… ์‹œ์Šคํ…œ ํ”„๋กฌํ”„ํŠธ
230
+ final_system = web_info + "\n\n" + system_prompt if web_info else system_prompt
 
 
 
 
 
 
231
 
232
  # ๋ฉ”์‹œ์ง€ ๊ตฌ์„ฑ
233
  messages = [
234
  {
235
  "role": "system",
236
+ "content": [{"type": "text", "text": final_system}]
237
  },
238
  {
239
  "role": "user",
240
  "content": [
241
  {"type": "image", "url": image},
242
+ {"type": "text", "text": f"์ž‘์—…: {task_description}"}
243
  ]
244
  }
245
  ]
 
279
  return response
280
 
281
  except Exception as e:
282
+ logger.error(f"๋กœ๋ด‡ ์ž‘์—… ๋ถ„์„ ์˜ค๋ฅ˜: {e}")
283
  import traceback
284
  return f"โŒ ๋ถ„์„ ์˜ค๋ฅ˜: {str(e)}\n{traceback.format_exc()}"
285
  finally:
286
  clear_cuda_cache()
287
 
288
  ##############################################################################
289
+ # Gradio UI (๋กœ๋ด‡ ์ž‘์—… ์ค‘์‹ฌ)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
290
  ##############################################################################
291
  css = """
292
  .robot-header {
293
  text-align: center;
294
  background: linear-gradient(135deg, #1e3c72 0%, #2a5298 50%, #667eea 100%);
295
  color: white;
296
+ padding: 25px;
297
+ border-radius: 15px;
298
  margin-bottom: 20px;
299
+ box-shadow: 0 5px 15px rgba(0, 0, 0, 0.3);
300
+ }
301
+ .robot-header h1 {
302
+ margin: 0 0 10px 0;
303
+ font-size: 2.5em;
304
  }
305
  .status-box {
306
  text-align: center;
307
+ padding: 12px;
308
+ border-radius: 8px;
309
+ margin: 15px 0;
310
  font-weight: bold;
311
+ font-size: 1.1em;
312
  }
313
+ .task-card {
314
+ background: white;
315
+ padding: 20px;
316
+ border-radius: 10px;
317
+ box-shadow: 0 2px 10px rgba(0, 0, 0, 0.1);
318
+ margin-bottom: 20px;
319
  }
320
  .task-button {
321
+ min-height: 80px;
322
+ font-size: 1.2em;
323
+ font-weight: bold;
324
+ transition: all 0.3s ease;
325
+ }
326
+ .task-button:hover {
327
+ transform: translateY(-2px);
328
+ box-shadow: 0 5px 15px rgba(0, 0, 0, 0.2);
329
  }
330
  .webcam-container {
331
  border: 3px solid #2a5298;
332
+ border-radius: 15px;
333
+ padding: 15px;
334
  background: #f8f9fa;
335
  }
336
+ .result-container {
337
+ background: #f0f8ff;
338
+ border-left: 5px solid #2a5298;
339
+ padding: 20px;
340
+ border-radius: 10px;
341
+ }
342
  """
343
 
344
+ with gr.Blocks(title="๐Ÿค– ๋กœ๋ด‡ ์‹œ๊ฐ ์‹œ์Šคํ…œ", css=css) as demo:
345
  gr.HTML("""
346
  <div class="robot-header">
347
  <h1>๐Ÿค– ๋กœ๋ด‡ ์‹œ๊ฐ ์‹œ์Šคํ…œ</h1>
348
+ <p style="font-size: 1.2em; margin: 0;">์‹ค์‹œ๊ฐ„ ์ž‘์—… ๋ถ„์„ ๋ฐ ๊ณ„ํš ์ˆ˜๋ฆฝ</p>
349
+ <p style="font-size: 0.9em; opacity: 0.9; margin-top: 5px;">Powered by Gemma3-R1984-4B</p>
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
350
  </div>
351
  """)
352
 
353
  with gr.Row():
354
+ # ์™ผ์ชฝ: ์›น์บ  ์˜์—ญ
355
  with gr.Column(scale=1):
 
 
356
  with gr.Group(elem_classes="webcam-container"):
357
+ gr.Markdown("## ๐Ÿ“ท ์‹ค์‹œ๊ฐ„ ์›น์บ ")
358
+
359
  webcam = gr.Image(
360
  sources=["webcam"],
361
  streaming=True,
362
  type="numpy",
363
  label="์‹ค์‹œ๊ฐ„ ์ŠคํŠธ๋ฆฌ๋ฐ",
364
+ height=400
365
  )
366
 
367
+ with gr.Row():
368
+ capture_btn = gr.Button("๐Ÿ“ธ ์บก์ฒ˜", variant="primary", size="lg")
369
+ clear_btn = gr.Button("๐Ÿ—‘๏ธ ์ดˆ๊ธฐํ™”", size="lg")
370
+
371
  captured_image = gr.Image(
372
  label="์บก์ฒ˜๋œ ์ด๋ฏธ์ง€",
373
+ height=300,
374
  visible=False
375
  )
 
 
 
 
 
 
 
 
 
 
 
 
 
 
376
 
377
+ # ์˜ค๋ฅธ์ชฝ: ์ž‘์—… ์ œ์–ด ๋ฐ ๊ฒฐ๊ณผ
378
  with gr.Column(scale=2):
379
+ # ์ž‘์—… ์„ค์ •
380
+ with gr.Group(elem_classes="task-card"):
381
+ gr.Markdown("## โš™๏ธ ์ž‘์—… ์„ค์ •")
382
+
383
+ task_input = gr.Textbox(
384
+ label="์ž‘์—… ์„ค๋ช…",
385
+ placeholder="์˜ˆ: ๋นจ๊ฐ„์ƒ‰ ์ปต์„ ์žก์•„์„œ ํŒŒ๋ž€์ƒ‰ ๋ฐ•์Šค์— ๋„ฃ๊ธฐ",
386
+ value="์ด ์žฅ๋ฉด์—์„œ ์ˆ˜ํ–‰ํ•  ๋กœ๋ด‡ ์ž‘์—…์„ ์„ค๋ช…ํ•˜์„ธ์š”",
387
+ lines=2
388
+ )
389
+
390
+ with gr.Row():
391
+ use_search = gr.Checkbox(
392
+ label="๐Ÿ” ์›น ๊ฒ€์ƒ‰ ์‚ฌ์šฉ",
393
+ value=False,
394
+ info="์ž‘์—… ์ˆ˜ํ–‰ ๋ฐฉ๋ฒ•์„ ์›น์—์„œ ๊ฒ€์ƒ‰"
395
  )
396
 
397
+ show_thinking = gr.Checkbox(
398
+ label="๐Ÿค” ์ถ”๋ก  ๊ณผ์ • ํ‘œ์‹œ",
399
+ value=True,
400
+ info="AI์˜ ๋ถ„์„ ๊ณผ์ •์„ ํˆฌ๋ช…ํ•˜๊ฒŒ ํ‘œ์‹œ"
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
401
  )
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
402
 
403
+ max_tokens = gr.Slider(
404
+ label="์‘๋‹ต ๊ธธ์ด",
405
+ minimum=256,
406
+ maximum=2048,
407
+ value=1024,
408
+ step=256
409
  )
410
+
411
+ # ๋กœ๋ด‡ ์ž‘์—… ๋ฒ„ํŠผ
412
+ with gr.Group(elem_classes="task-card"):
413
+ gr.Markdown("## ๐ŸŽฏ ๋กœ๋ด‡ ์ž‘์—… ๋ถ„์„")
414
 
415
+ with gr.Row():
416
+ analyze_btn = gr.Button(
417
+ "๐Ÿ” ์žฅ๋ฉด ๋ถ„์„",
418
+ variant="secondary",
419
+ elem_classes="task-button"
420
+ )
421
+ planning_btn = gr.Button(
422
+ "๐Ÿ“‹ ์ž‘์—… ๊ณ„ํš",
423
+ variant="primary",
424
+ elem_classes="task-button"
425
+ )
426
 
427
+ with gr.Row():
428
+ grounding_btn = gr.Button(
429
+ "๐Ÿ“ ๊ฐ์ฒด ์œ„์น˜",
430
+ variant="secondary",
431
+ elem_classes="task-button"
432
+ )
433
+ affordance_btn = gr.Button(
434
+ "๐Ÿค ํŒŒ์ง€์  ๋ถ„์„",
435
+ variant="secondary",
436
+ elem_classes="task-button"
437
+ )
438
+
439
+ trajectory_btn = gr.Button(
440
+ "๐Ÿ›ค๏ธ ๊ฒฝ๋กœ ๊ณ„ํš",
441
+ variant="secondary",
442
+ elem_classes="task-button",
443
+ elem_id="trajectory"
444
  )
445
+
446
+ # ์ƒํƒœ ํ‘œ์‹œ
447
+ status_display = gr.HTML(
448
+ '<div class="status-box" style="background:#d4edda; color:#155724;">๐ŸŸข ์‹œ์Šคํ…œ ์ค€๋น„ ์™„๋ฃŒ</div>'
449
+ )
450
 
451
+ # ๋ถ„์„ ๊ฒฐ๊ณผ
452
+ with gr.Group(elem_classes="result-container"):
453
+ gr.Markdown("## ๐Ÿ“Š ๋ถ„์„ ๊ฒฐ๊ณผ")
454
+ result_output = gr.Textbox(
455
+ label="",
456
+ lines=20,
457
+ max_lines=40,
458
+ show_copy_button=True
459
+ )
460
 
461
+ # ์ƒํƒœ ๊ด€๋ฆฌ
462
+ captured_state = gr.State(None)
463
+
464
+ # ์ด๋ฒคํŠธ ํ•ธ๋“ค๋Ÿฌ
465
+ def capture_frame(frame):
466
  """์›น์บ  ํ”„๋ ˆ์ž„ ์บก์ฒ˜"""
467
  if frame is None:
468
+ return None, gr.update(visible=False), '<div class="status-box" style="background:#f8d7da; color:#721c24;">โŒ ์›น์บ ์ด ์—ฐ๊ฒฐ๋˜์ง€ ์•Š์•˜์Šต๋‹ˆ๋‹ค</div>'
469
+
470
+ return frame, gr.update(value=frame, visible=True), '<div class="status-box" style="background:#d1ecf1; color:#0c5460;">๐Ÿ“ธ ์ด๋ฏธ์ง€๊ฐ€ ์บก์ฒ˜๋˜์—ˆ์Šต๋‹ˆ๋‹ค</div>'
471
 
472
  def clear_capture():
473
  """์บก์ฒ˜ ์ดˆ๊ธฐํ™”"""
474
+ return None, gr.update(visible=False), '<div class="status-box" style="background:#d4edda; color:#155724;">๐ŸŸข ์‹œ์Šคํ…œ ์ค€๋น„ ์™„๋ฃŒ</div>', ""
475
 
476
+ def analyze_task(image, task_desc, task_type, use_search, thinking, tokens):
477
+ """๋กœ๋ด‡ ์ž‘์—… ๋ถ„์„"""
478
  if image is None:
479
+ return "โŒ ๋จผ์ € ์ด๋ฏธ์ง€๋ฅผ ์บก์ฒ˜ํ•˜์„ธ์š”!", '<div class="status-box" style="background:#f8d7da; color:#721c24;">โŒ ์ด๋ฏธ์ง€๊ฐ€ ์—†์Šต๋‹ˆ๋‹ค</div>'
480
 
481
+ # ์ž‘์—… ์œ ํ˜•๋ณ„ ์ƒํƒœ ๋ฉ”์‹œ์ง€
482
+ task_messages = {
483
+ "general": "์žฅ๋ฉด ๋ถ„์„",
484
+ "planning": "์ž‘์—… ๊ณ„ํš ์ˆ˜๋ฆฝ",
485
+ "grounding": "๊ฐ์ฒด ์œ„์น˜ ํŒŒ์•…",
486
+ "affordance": "ํŒŒ์ง€์  ๋ถ„์„",
487
+ "trajectory": "๊ฒฝ๋กœ ๊ณ„ํš"
488
+ }
489
+
490
+ status_msg = f'<div class="status-box" style="background:#cce5ff; color:#004085;">๐Ÿ”„ {task_messages[task_type]} ์ค‘...</div>'
491
 
492
+ # ๋ถ„์„ ์ˆ˜ํ–‰
493
+ result = analyze_for_robot_task(
494
  image=image,
495
+ task_description=task_desc,
496
  task_type=task_type,
497
  use_web_search=use_search,
498
  enable_thinking=thinking,
 
500
  )
501
 
502
  # ๊ฒฐ๊ณผ ํฌ๋งทํŒ…
503
+ timestamp = time.strftime("%Y-%m-%d %H:%M:%S")
 
 
 
 
 
 
504
 
505
+ formatted_result = f"""โ•”โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•—
506
+ โ•‘ ๐Ÿค– ๋กœ๋ด‡ ์ž‘์—… ๋ถ„์„ ๊ฒฐ๊ณผ โ•‘
507
+ โ•šโ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•
508
+
509
+ ๐Ÿ“‹ ์ž‘์—… ์œ ํ˜•: {task_messages[task_type]}
510
+ ๐Ÿ“ ์ž‘์—… ์„ค๋ช…: {task_desc}
511
+ โฐ ๋ถ„์„ ์‹œ๊ฐ„: {timestamp}
512
 
513
+ โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•
514
 
 
515
  {result}
516
 
517
+ โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•
518
+
519
  ๐ŸŽฏ ๋ชจ๋ธ: {model_name}
520
+ ๐Ÿ”ง ๋ถ„์„ ์™„๋ฃŒ"""
521
 
522
+ complete_status = '<div class="status-box" style="background:#d4edda; color:#155724;">โœ… ๋ถ„์„์ด ์™„๋ฃŒ๋˜์—ˆ์Šต๋‹ˆ๋‹ค</div>'
523
  return formatted_result, complete_status
524
 
525
+ # ๋ฒ„ํŠผ ์ด๋ฒคํŠธ ์—ฐ๊ฒฐ
526
+ capture_btn.click(
527
+ fn=lambda frame: capture_frame(frame) if frame is not None else capture_frame(webcam.value),
528
  inputs=[webcam],
529
+ outputs=[captured_state, captured_image, status_display]
530
  )
531
 
532
+ clear_btn.click(
533
+ fn=clear_capture,
534
+ outputs=[captured_state, captured_image, status_display, result_output]
 
 
535
  )
536
 
537
+ # ์ž‘์—… ๋ถ„์„ ๋ฒ„ํŠผ๋“ค
538
+ analyze_btn.click(
539
+ fn=lambda img, desc, s, t, tk: analyze_task(img, desc, "general", s, t, tk),
540
+ inputs=[captured_state, task_input, use_search, show_thinking, max_tokens],
541
+ outputs=[result_output, status_display]
542
  )
543
 
 
544
  planning_btn.click(
545
+ fn=lambda img, desc, s, t, tk: analyze_task(img, desc, "planning", s, t, tk),
546
+ inputs=[captured_state, task_input, use_search, show_thinking, max_tokens],
547
  outputs=[result_output, status_display]
548
  )
549
 
550
  grounding_btn.click(
551
+ fn=lambda img, desc, s, t, tk: analyze_task(img, desc, "grounding", s, t, tk),
552
+ inputs=[captured_state, task_input, use_search, show_thinking, max_tokens],
553
  outputs=[result_output, status_display]
554
  )
555
 
556
  affordance_btn.click(
557
+ fn=lambda img, desc, s, t, tk: analyze_task(img, desc, "affordance", s, t, tk),
558
+ inputs=[captured_state, task_input, use_search, show_thinking, max_tokens],
559
  outputs=[result_output, status_display]
560
  )
561
 
562
  trajectory_btn.click(
563
+ fn=lambda img, desc, s, t, tk: analyze_task(img, desc, "trajectory", s, t, tk),
564
+ inputs=[captured_state, task_input, use_search, show_thinking, max_tokens],
565
  outputs=[result_output, status_display]
566
  )
567
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
568
  # ์ดˆ๊ธฐ ๋ชจ๋ธ ๋กœ๋“œ
569
  def initial_load():
570
  load_model()
571
+ return '<div class="status-box" style="background:#d4edda; color:#155724;">๐Ÿš€ ์‹œ์Šคํ…œ์ด ์ค€๋น„๋˜์—ˆ์Šต๋‹ˆ๋‹ค!</div>'
572
 
573
  demo.load(
574
  fn=initial_load,
575
+ outputs=[status_display]
576
  )
577
 
578
  if __name__ == "__main__":
579
+ print("๐Ÿš€ ๋กœ๋ด‡ ์‹œ๊ฐ ์‹œ์Šคํ…œ ์‹œ์ž‘...")
580
  demo.launch(
581
  server_name="0.0.0.0",
582
  server_port=7860,