openfree committed on
Commit
fd897d3
·
verified ·
1 Parent(s): 0995092

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +546 -654
app.py CHANGED
@@ -1,9 +1,9 @@
1
- #!/usr/bin/env python
2
 
3
  import os
4
  import re
5
  import tempfile
6
- import gc # garbage collector ์ถ”๊ฐ€
7
  from collections.abc import Iterator
8
  from threading import Thread
9
  import json
@@ -12,17 +12,41 @@ import cv2
12
  import gradio as gr
13
  import spaces
14
  import torch
 
15
  from loguru import logger
16
  from PIL import Image
17
  from transformers import AutoProcessor, Gemma3ForConditionalGeneration, TextIteratorStreamer
 
 
 
18
 
19
  # CSV/TXT ๋ถ„์„
20
  import pandas as pd
21
  # PDF ํ…์ŠคํŠธ ์ถ”์ถœ
22
  import PyPDF2
23
 
 
 
 
 
 
 
 
 
 
 
 
 
24
  ##############################################################################
25
- # ๋ฉ”๋ชจ๋ฆฌ ์ •๋ฆฌ ํ•จ์ˆ˜ ์ถ”๊ฐ€
 
 
 
 
 
 
 
 
26
  ##############################################################################
27
  def clear_cuda_cache():
28
  """CUDA ์บ์‹œ๋ฅผ ๋ช…์‹œ์ ์œผ๋กœ ๋น„์›๋‹ˆ๋‹ค."""
@@ -31,177 +55,117 @@ def clear_cuda_cache():
31
  gc.collect()
32
 
33
  ##############################################################################
34
- # SERPHouse API key from environment variable
35
- ##############################################################################
36
- SERPHOUSE_API_KEY = os.getenv("SERPHOUSE_API_KEY", "")
37
-
38
- ##############################################################################
39
- # ๊ฐ„๋‹จํ•œ ํ‚ค์›Œ๋“œ ์ถ”์ถœ ํ•จ์ˆ˜ (ํ•œ๊ธ€ + ์•ŒํŒŒ๋ฒณ + ์ˆซ์ž + ๊ณต๋ฐฑ ๋ณด์กด)
40
  ##############################################################################
41
  def extract_keywords(text: str, top_k: int = 5) -> str:
42
- """
43
- 1) ํ•œ๊ธ€(๊ฐ€-ํžฃ), ์˜์–ด(a-zA-Z), ์ˆซ์ž(0-9), ๊ณต๋ฐฑ๋งŒ ๋‚จ๊น€
44
- 2) ๊ณต๋ฐฑ ๊ธฐ์ค€ ํ† ํฐ ๋ถ„๋ฆฌ
45
- 3) ์ตœ๋Œ€ top_k๊ฐœ๋งŒ
46
- """
47
  text = re.sub(r"[^a-zA-Z0-9๊ฐ€-ํžฃ\s]", "", text)
48
  tokens = text.split()
49
- key_tokens = tokens[:top_k]
 
 
 
 
 
 
 
 
50
  return " ".join(key_tokens)
51
 
52
  ##############################################################################
53
- # SerpHouse Live endpoint ํ˜ธ์ถœ
54
- # - ์ƒ์œ„ 20๊ฐœ ๊ฒฐ๊ณผ JSON์„ LLM์— ๋„˜๊ธธ ๋•Œ link, snippet ๋“ฑ ๋ชจ๋‘ ํฌํ•จ
55
  ##############################################################################
56
  def do_web_search(query: str) -> str:
57
- """
58
- ์ƒ์œ„ 20๊ฐœ 'organic' ๊ฒฐ๊ณผ item ์ „์ฒด(์ œ๋ชฉ, link, snippet ๋“ฑ)๋ฅผ
59
- JSON ๋ฌธ์ž์—ด ํ˜•ํƒœ๋กœ ๋ฐ˜ํ™˜
60
- """
61
  try:
62
  url = "https://api.serphouse.com/serp/live"
63
 
64
- # ๊ธฐ๋ณธ GET ๋ฐฉ์‹์œผ๋กœ ํŒŒ๋ผ๋ฏธํ„ฐ ๊ฐ„์†Œํ™”ํ•˜๊ณ  ๊ฒฐ๊ณผ ์ˆ˜๋ฅผ 20๊ฐœ๋กœ ์ œํ•œ
65
  params = {
66
  "q": query,
67
  "domain": "google.com",
68
- "serp_type": "web", # ๊ธฐ๋ณธ ์›น ๊ฒ€์ƒ‰
69
  "device": "desktop",
70
- "lang": "en",
71
- "num": "20" # ์ตœ๋Œ€ 20๊ฐœ ๊ฒฐ๊ณผ๋งŒ ์š”์ฒญ
72
  }
73
 
74
  headers = {
75
  "Authorization": f"Bearer {SERPHOUSE_API_KEY}"
76
  }
77
 
78
- logger.info(f"SerpHouse API ํ˜ธ์ถœ ์ค‘... ๊ฒ€์ƒ‰์–ด: {query}")
79
- logger.info(f"์š”์ฒญ URL: {url} - ํŒŒ๋ผ๋ฏธํ„ฐ: {params}")
80
 
81
- # GET ์š”์ฒญ ์ˆ˜ํ–‰
82
  response = requests.get(url, headers=headers, params=params, timeout=60)
83
  response.raise_for_status()
84
 
85
- logger.info(f"SerpHouse API ์‘๋‹ต ์ƒํƒœ ์ฝ”๋“œ: {response.status_code}")
86
  data = response.json()
87
 
88
- # ๋‹ค์–‘ํ•œ ์‘๋‹ต ๊ตฌ์กฐ ์ฒ˜๋ฆฌ
89
  results = data.get("results", {})
90
- organic = None
91
-
92
- # ๊ฐ€๋Šฅํ•œ ์‘๋‹ต ๊ตฌ์กฐ 1
93
- if isinstance(results, dict) and "organic" in results:
94
- organic = results["organic"]
95
-
96
- # ๊ฐ€๋Šฅํ•œ ์‘๋‹ต ๊ตฌ์กฐ 2 (์ค‘์ฒฉ๋œ results)
97
- elif isinstance(results, dict) and "results" in results:
98
- if isinstance(results["results"], dict) and "organic" in results["results"]:
99
- organic = results["results"]["organic"]
100
 
101
- # ๊ฐ€๋Šฅํ•œ ์‘๋‹ต ๊ตฌ์กฐ 3 (์ตœ์ƒ์œ„ organic)
102
- elif "organic" in data:
103
- organic = data["organic"]
104
-
105
  if not organic:
106
- logger.warning("์‘๋‹ต์—์„œ organic ๊ฒฐ๊ณผ๋ฅผ ์ฐพ์„ ์ˆ˜ ์—†์Šต๋‹ˆ๋‹ค.")
107
- logger.debug(f"์‘๋‹ต ๊ตฌ์กฐ: {list(data.keys())}")
108
- if isinstance(results, dict):
109
- logger.debug(f"results ๊ตฌ์กฐ: {list(results.keys())}")
110
- return "No web search results found or unexpected API response structure."
111
-
112
- # ๊ฒฐ๊ณผ ์ˆ˜ ์ œํ•œ ๋ฐ ์ปจํ…์ŠคํŠธ ๊ธธ์ด ์ตœ์ ํ™”
113
- max_results = min(20, len(organic))
114
  limited_organic = organic[:max_results]
115
 
116
- # ๊ฒฐ๊ณผ ํ˜•์‹ ๊ฐœ์„  - ๋งˆํฌ๋‹ค์šด ํ˜•์‹์œผ๋กœ ์ถœ๋ ฅํ•˜์—ฌ ๊ฐ€๋…์„ฑ ํ–ฅ์ƒ
117
  summary_lines = []
118
  for idx, item in enumerate(limited_organic, start=1):
119
- title = item.get("title", "No title")
120
  link = item.get("link", "#")
121
- snippet = item.get("snippet", "No description")
122
  displayed_link = item.get("displayed_link", link)
123
 
124
- # ๋งˆํฌ๋‹ค์šด ํ˜•์‹ (๋งํฌ ํด๋ฆญ ๊ฐ€๋Šฅ)
125
  summary_lines.append(
126
- f"### Result {idx}: {title}\n\n"
127
  f"{snippet}\n\n"
128
  f"**์ถœ์ฒ˜**: [{displayed_link}]({link})\n\n"
129
  f"---\n"
130
  )
131
 
132
- # ๋ชจ๋ธ์—๊ฒŒ ๋ช…ํ™•ํ•œ ์ง€์นจ ์ถ”๊ฐ€
133
- instructions = """
134
- # ์›น ๊ฒ€์ƒ‰ ๊ฒฐ๊ณผ
135
- ์•„๋ž˜๋Š” ๊ฒ€์ƒ‰ ๊ฒฐ๊ณผ์ž…๋‹ˆ๋‹ค. ์งˆ๋ฌธ์— ๋‹ต๋ณ€ํ•  ๋•Œ ์ด ์ •๋ณด๋ฅผ ํ™œ์šฉํ•˜์„ธ์š”:
136
- 1. ๊ฐ ๊ฒฐ๊ณผ์˜ ์ œ๋ชฉ, ๋‚ด์šฉ, ์ถœ์ฒ˜ ๋งํฌ๋ฅผ ์ฐธ๊ณ ํ•˜์„ธ์š”
137
- 2. ๋‹ต๋ณ€์— ๊ด€๋ จ ์ •๋ณด์˜ ์ถœ์ฒ˜๋ฅผ ๋ช…์‹œ์ ์œผ๋กœ ์ธ์šฉํ•˜์„ธ์š” (์˜ˆ: "X ์ถœ์ฒ˜์— ๋”ฐ๋ฅด๋ฉด...")
138
- 3. ์‘๋‹ต์— ์‹ค์ œ ์ถœ์ฒ˜ ๋งํฌ๋ฅผ ํฌํ•จํ•˜์„ธ์š”
139
- 4. ์—ฌ๋Ÿฌ ์ถœ์ฒ˜์˜ ์ •๋ณด๋ฅผ ์ข…ํ•ฉํ•˜์—ฌ ๋‹ต๋ณ€ํ•˜์„ธ์š”
140
  """
141
 
142
  search_results = instructions + "\n".join(summary_lines)
143
- logger.info(f"๊ฒ€์ƒ‰ ๊ฒฐ๊ณผ {len(limited_organic)}๊ฐœ ์ฒ˜๋ฆฌ ์™„๋ฃŒ")
144
  return search_results
145
 
146
  except Exception as e:
147
- logger.error(f"Web search failed: {e}")
148
- return f"Web search failed: {str(e)}"
149
-
150
-
151
- ##############################################################################
152
- # ๋ชจ๋ธ/ํ”„๋กœ์„ธ์„œ ๋กœ๋”ฉ
153
- ##############################################################################
154
- MAX_CONTENT_CHARS = 2000
155
- MAX_INPUT_LENGTH = 2096 # ์ตœ๋Œ€ ์ž…๋ ฅ ํ† ํฐ ์ˆ˜ ์ œํ•œ ์ถ”๊ฐ€
156
- model_id = os.getenv("MODEL_ID", "VIDraft/Gemma-3-R1984-4B")
157
-
158
- processor = AutoProcessor.from_pretrained(model_id, padding_side="left")
159
- model = Gemma3ForConditionalGeneration.from_pretrained(
160
- model_id,
161
- device_map="auto",
162
- torch_dtype=torch.bfloat16,
163
- attn_implementation="eager" # ๊ฐ€๋Šฅํ•˜๋‹ค๋ฉด "flash_attention_2"๋กœ ๋ณ€๊ฒฝ
164
- )
165
- MAX_NUM_IMAGES = int(os.getenv("MAX_NUM_IMAGES", "5"))
166
-
167
 
168
  ##############################################################################
169
- # CSV, TXT, PDF ๋ถ„์„ ํ•จ์ˆ˜
170
  ##############################################################################
171
  def analyze_csv_file(path: str) -> str:
172
- """
173
- CSV ํŒŒ์ผ์„ ์ „์ฒด ๋ฌธ์ž์—ด๋กœ ๋ณ€ํ™˜. ๋„ˆ๋ฌด ๊ธธ ๊ฒฝ์šฐ ์ผ๋ถ€๋งŒ ํ‘œ์‹œ.
174
- """
175
  try:
176
  df = pd.read_csv(path)
177
  if df.shape[0] > 50 or df.shape[1] > 10:
178
  df = df.iloc[:50, :10]
179
  df_str = df.to_string()
180
  if len(df_str) > MAX_CONTENT_CHARS:
181
- df_str = df_str[:MAX_CONTENT_CHARS] + "\n...(truncated)..."
182
- return f"**[CSV File: {os.path.basename(path)}]**\n\n{df_str}"
183
  except Exception as e:
184
- return f"Failed to read CSV ({os.path.basename(path)}): {str(e)}"
185
-
186
 
187
  def analyze_txt_file(path: str) -> str:
188
- """
189
- TXT ํŒŒ์ผ ์ „๋ฌธ ์ฝ๊ธฐ. ๋„ˆ๋ฌด ๊ธธ๋ฉด ์ผ๋ถ€๋งŒ ํ‘œ์‹œ.
190
- """
191
  try:
192
  with open(path, "r", encoding="utf-8") as f:
193
  text = f.read()
194
  if len(text) > MAX_CONTENT_CHARS:
195
- text = text[:MAX_CONTENT_CHARS] + "\n...(truncated)..."
196
- return f"**[TXT File: {os.path.basename(path)}]**\n\n{text}"
197
  except Exception as e:
198
- return f"Failed to read TXT ({os.path.basename(path)}): {str(e)}"
199
-
200
 
201
  def pdf_to_markdown(pdf_path: str) -> str:
202
- """
203
- PDF ํ…์ŠคํŠธ๋ฅผ Markdown์œผ๋กœ ๋ณ€ํ™˜. ํŽ˜์ด์ง€๋ณ„๋กœ ๊ฐ„๋‹จํžˆ ํ…์ŠคํŠธ ์ถ”์ถœ.
204
- """
205
  text_chunks = []
206
  try:
207
  with open(pdf_path, "rb") as f:
@@ -213,321 +177,226 @@ def pdf_to_markdown(pdf_path: str) -> str:
213
  page_text = page_text.strip()
214
  if page_text:
215
  if len(page_text) > MAX_CONTENT_CHARS // max_pages:
216
- page_text = page_text[:MAX_CONTENT_CHARS // max_pages] + "...(truncated)"
217
- text_chunks.append(f"## Page {page_num+1}\n\n{page_text}\n")
218
  if len(reader.pages) > max_pages:
219
- text_chunks.append(f"\n...(Showing {max_pages} of {len(reader.pages)} pages)...")
220
  except Exception as e:
221
- return f"Failed to read PDF ({os.path.basename(pdf_path)}): {str(e)}"
222
 
223
  full_text = "\n".join(text_chunks)
224
  if len(full_text) > MAX_CONTENT_CHARS:
225
- full_text = full_text[:MAX_CONTENT_CHARS] + "\n...(truncated)..."
226
-
227
- return f"**[PDF File: {os.path.basename(pdf_path)}]**\n\n{full_text}"
228
 
 
229
 
230
  ##############################################################################
231
- # ์ด๋ฏธ์ง€/๋น„๋””์˜ค ์—…๋กœ๋“œ ์ œํ•œ ๊ฒ€์‚ฌ
232
  ##############################################################################
233
- def count_files_in_new_message(paths: list[str]) -> tuple[int, int]:
234
- image_count = 0
235
- video_count = 0
236
- for path in paths:
237
- if path.endswith(".mp4"):
238
- video_count += 1
239
- elif re.search(r"\.(png|jpg|jpeg|gif|webp)$", path, re.IGNORECASE):
240
- image_count += 1
241
- return image_count, video_count
242
-
243
-
244
- def count_files_in_history(history: list[dict]) -> tuple[int, int]:
245
- image_count = 0
246
- video_count = 0
247
- for item in history:
248
- if item["role"] != "user" or isinstance(item["content"], str):
249
- continue
250
- if isinstance(item["content"], list) and len(item["content"]) > 0:
251
- file_path = item["content"][0]
252
- if isinstance(file_path, str):
253
- if file_path.endswith(".mp4"):
254
- video_count += 1
255
- elif re.search(r"\.(png|jpg|jpeg|gif|webp)$", file_path, re.IGNORECASE):
256
- image_count += 1
257
- return image_count, video_count
258
-
259
-
260
- def validate_media_constraints(message: dict, history: list[dict]) -> bool:
261
- media_files = []
262
- for f in message["files"]:
263
- if re.search(r"\.(png|jpg|jpeg|gif|webp)$", f, re.IGNORECASE) or f.endswith(".mp4"):
264
- media_files.append(f)
265
-
266
- new_image_count, new_video_count = count_files_in_new_message(media_files)
267
- history_image_count, history_video_count = count_files_in_history(history)
268
- image_count = history_image_count + new_image_count
269
- video_count = history_video_count + new_video_count
270
-
271
- if video_count > 1:
272
- gr.Warning("Only one video is supported.")
273
- return False
274
- if video_count == 1:
275
- if image_count > 0:
276
- gr.Warning("Mixing images and videos is not allowed.")
277
- return False
278
- if "<image>" in message["text"]:
279
- gr.Warning("Using <image> tags with video files is not supported.")
280
- return False
281
- if video_count == 0 and image_count > MAX_NUM_IMAGES:
282
- gr.Warning(f"You can upload up to {MAX_NUM_IMAGES} images.")
283
- return False
284
-
285
- if "<image>" in message["text"]:
286
- image_files = [f for f in message["files"] if re.search(r"\.(png|jpg|jpeg|gif|webp)$", f, re.IGNORECASE)]
287
- image_tag_count = message["text"].count("<image>")
288
- if image_tag_count != len(image_files):
289
- gr.Warning("The number of <image> tags in the text does not match the number of image files.")
290
- return False
291
-
292
- return True
293
-
294
-
295
- ##############################################################################
296
- # ๋น„๋””์˜ค ์ฒ˜๋ฆฌ - ์ž„์‹œ ํŒŒ์ผ ์ถ”์  ์ฝ”๋“œ ์ถ”๊ฐ€
297
- ##############################################################################
298
- def downsample_video(video_path: str) -> list[tuple[Image.Image, float]]:
299
- vidcap = cv2.VideoCapture(video_path)
300
- fps = vidcap.get(cv2.CAP_PROP_FPS)
301
- total_frames = int(vidcap.get(cv2.CAP_PROP_FRAME_COUNT))
302
- frame_interval = max(int(fps), int(total_frames / 10))
303
- frames = []
304
-
305
- for i in range(0, total_frames, frame_interval):
306
- vidcap.set(cv2.CAP_PROP_POS_FRAMES, i)
307
- success, image = vidcap.read()
308
- if success:
309
- image = cv2.cvtColor(image, cv2.COLOR_BGR2RGB)
310
- # ์ด๋ฏธ์ง€ ํฌ๊ธฐ ์ค„์ด๊ธฐ ์ถ”๊ฐ€
311
- image = cv2.resize(image, (0, 0), fx=0.5, fy=0.5)
312
- pil_image = Image.fromarray(image)
313
- timestamp = round(i / fps, 2)
314
- frames.append((pil_image, timestamp))
315
- if len(frames) >= 5:
316
- break
317
-
318
- vidcap.release()
319
- return frames
320
-
321
-
322
- def process_video(video_path: str) -> tuple[list[dict], list[str]]:
323
- content = []
324
- temp_files = [] # ์ž„์‹œ ํŒŒ์ผ ์ถ”์ ์„ ์œ„ํ•œ ๋ฆฌ์ŠคํŠธ
325
 
326
- frames = downsample_video(video_path)
327
- for frame in frames:
328
- pil_image, timestamp = frame
329
- with tempfile.NamedTemporaryFile(delete=False, suffix=".png") as temp_file:
330
- pil_image.save(temp_file.name)
331
- temp_files.append(temp_file.name) # ์ถ”์ ์„ ์œ„ํ•ด ๊ฒฝ๋กœ ์ €์žฅ
332
- content.append({"type": "text", "text": f"Frame {timestamp}:"})
333
- content.append({"type": "image", "url": temp_file.name})
334
 
335
- return content, temp_files
336
-
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
337
 
338
  ##############################################################################
339
- # interleaved <image> ์ฒ˜๋ฆฌ
340
  ##############################################################################
341
- def process_interleaved_images(message: dict) -> list[dict]:
342
- parts = re.split(r"(<image>)", message["text"])
343
- content = []
344
- image_index = 0
345
-
346
- image_files = [f for f in message["files"] if re.search(r"\.(png|jpg|jpeg|gif|webp)$", f, re.IGNORECASE)]
 
 
 
 
 
347
 
348
- for part in parts:
349
- if part == "<image>" and image_index < len(image_files):
350
- content.append({"type": "image", "url": image_files[image_index]})
351
- image_index += 1
352
- elif part.strip():
353
- content.append({"type": "text", "text": part.strip()})
354
- else:
355
- if isinstance(part, str) and part != "<image>":
356
- content.append({"type": "text", "text": part})
357
- return content
358
-
359
-
360
- ##############################################################################
361
- # PDF + CSV + TXT + ์ด๋ฏธ์ง€/๋น„๋””์˜ค
362
- ##############################################################################
363
- def is_image_file(file_path: str) -> bool:
364
- return bool(re.search(r"\.(png|jpg|jpeg|gif|webp)$", file_path, re.IGNORECASE))
365
-
366
- def is_video_file(file_path: str) -> bool:
367
- return file_path.endswith(".mp4")
368
-
369
- def is_document_file(file_path: str) -> bool:
370
- return (
371
- file_path.lower().endswith(".pdf")
372
- or file_path.lower().endswith(".csv")
373
- or file_path.lower().endswith(".txt")
374
- )
375
-
376
-
377
- def process_new_user_message(message: dict) -> tuple[list[dict], list[str]]:
378
- temp_files = [] # ์ž„์‹œ ํŒŒ์ผ ์ถ”์ ์šฉ ๋ฆฌ์ŠคํŠธ
379
 
380
- if not message["files"]:
381
- return [{"type": "text", "text": message["text"]}], temp_files
382
-
383
- video_files = [f for f in message["files"] if is_video_file(f)]
384
- image_files = [f for f in message["files"] if is_image_file(f)]
385
- csv_files = [f for f in message["files"] if f.lower().endswith(".csv")]
386
- txt_files = [f for f in message["files"] if f.lower().endswith(".txt")]
387
- pdf_files = [f for f in message["files"] if f.lower().endswith(".pdf")]
388
-
389
- content_list = [{"type": "text", "text": message["text"]}]
390
-
391
- for csv_path in csv_files:
392
- csv_analysis = analyze_csv_file(csv_path)
393
- content_list.append({"type": "text", "text": csv_analysis})
394
-
395
- for txt_path in txt_files:
396
- txt_analysis = analyze_txt_file(txt_path)
397
- content_list.append({"type": "text", "text": txt_analysis})
398
-
399
- for pdf_path in pdf_files:
400
- pdf_markdown = pdf_to_markdown(pdf_path)
401
- content_list.append({"type": "text", "text": pdf_markdown})
402
-
403
- if video_files:
404
- video_content, video_temp_files = process_video(video_files[0])
405
- content_list += video_content
406
- temp_files.extend(video_temp_files)
407
- return content_list, temp_files
408
-
409
- if "<image>" in message["text"] and image_files:
410
- interleaved_content = process_interleaved_images({"text": message["text"], "files": image_files})
411
- if content_list and content_list[0]["type"] == "text":
412
- content_list = content_list[1:]
413
- return interleaved_content + content_list, temp_files
414
- else:
415
- for img_path in image_files:
416
- content_list.append({"type": "image", "url": img_path})
417
-
418
- return content_list, temp_files
419
-
420
-
421
- ##############################################################################
422
- # history -> LLM ๋ฉ”์‹œ์ง€ ๋ณ€ํ™˜
423
- ##############################################################################
424
- def process_history(history: list[dict]) -> list[dict]:
425
- messages = []
426
- current_user_content: list[dict] = []
427
- for item in history:
428
- if item["role"] == "assistant":
429
- if current_user_content:
430
- messages.append({"role": "user", "content": current_user_content})
431
- current_user_content = []
432
- messages.append({"role": "assistant", "content": [{"type": "text", "text": item["content"]}]})
433
- else:
434
- content = item["content"]
435
- if isinstance(content, str):
436
- current_user_content.append({"type": "text", "text": content})
437
- elif isinstance(content, list) and len(content) > 0:
438
- file_path = content[0]
439
- if is_image_file(file_path):
440
- current_user_content.append({"type": "image", "url": file_path})
441
- else:
442
- current_user_content.append({"type": "text", "text": f"[File: {os.path.basename(file_path)}]"})
443
-
444
- if current_user_content:
445
- messages.append({"role": "user", "content": current_user_content})
446
 
447
- return messages
448
-
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
449
 
450
  ##############################################################################
451
- # ๋ชจ๋ธ ์ƒ์„ฑ ํ•จ์ˆ˜์—์„œ OOM ์บ์น˜
452
  ##############################################################################
453
  def _model_gen_with_oom_catch(**kwargs):
454
- """
455
- ๋ณ„๋„ ์Šค๋ ˆ๋“œ์—์„œ OutOfMemoryError๋ฅผ ์žก์•„์ฃผ๊ธฐ ์œ„ํ•ด
456
- """
457
  try:
458
  model.generate(**kwargs)
459
  except torch.cuda.OutOfMemoryError:
460
- raise RuntimeError(
461
- "[OutOfMemoryError] GPU ๋ฉ”๋ชจ๋ฆฌ๊ฐ€ ๋ถ€์กฑํ•ฉ๋‹ˆ๋‹ค. "
462
- "Max New Tokens์„ ์ค„์ด๊ฑฐ๋‚˜, ํ”„๋กฌํ”„ํŠธ ๊ธธ์ด๋ฅผ ์ค„์—ฌ์ฃผ์„ธ์š”."
463
- )
464
  finally:
465
- # ์ƒ์„ฑ ์™„๋ฃŒ ํ›„ ํ•œ๋ฒˆ ๋” ์บ์‹œ ๋น„์šฐ๊ธฐ
466
  clear_cuda_cache()
467
 
468
-
469
- ##############################################################################
470
- # ๋ฉ”์ธ ์ถ”๋ก  ํ•จ์ˆ˜ (web search ์ฒดํฌ ์‹œ ์ž๋™ ํ‚ค์›Œ๋“œ์ถ”์ถœ->๊ฒ€์ƒ‰->๊ฒฐ๊ณผ system msg)
471
- ##############################################################################
472
  @spaces.GPU(duration=120)
473
- def run(
474
- message: dict,
475
- history: list[dict],
476
- system_prompt: str = "",
477
- max_new_tokens: int = 512,
478
  use_web_search: bool = False,
479
- web_search_query: str = "",
480
  ) -> Iterator[str]:
481
-
482
- if not validate_media_constraints(message, history):
483
- yield ""
484
- return
485
-
486
- temp_files = [] # ์ž„์‹œ ํŒŒ์ผ ์ถ”์ ์šฉ
 
487
 
488
  try:
489
- combined_system_msg = ""
490
-
491
- # ๋‚ด๋ถ€์ ์œผ๋กœ๋งŒ ์‚ฌ์šฉ (UI์—์„œ๋Š” ๋ณด์ด์ง€ ์•Š์Œ)
492
- if system_prompt.strip():
493
- combined_system_msg += f"[System Prompt]\n{system_prompt.strip()}\n\n"
494
-
495
  if use_web_search:
496
- user_text = message["text"]
497
- ws_query = extract_keywords(user_text, top_k=5)
498
- if ws_query.strip():
499
- logger.info(f"[Auto WebSearch Keyword] {ws_query!r}")
500
- ws_result = do_web_search(ws_query)
501
- combined_system_msg += f"[Search top-20 Full Items Based on user prompt]\n{ws_result}\n\n"
502
- # >>> ์ถ”๊ฐ€๋œ ์•ˆ๋‚ด ๋ฌธ๊ตฌ (๊ฒ€์ƒ‰ ๊ฒฐ๊ณผ์˜ link ๋“ฑ ์ถœ์ฒ˜๋ฅผ ํ™œ์šฉ)
503
- combined_system_msg += "[์ฐธ๊ณ : ์œ„ ๊ฒ€์ƒ‰๊ฒฐ๊ณผ ๋‚ด์šฉ๊ณผ link๋ฅผ ์ถœ์ฒ˜๋กœ ์ธ์šฉํ•˜์—ฌ ๋‹ต๋ณ€ํ•ด ์ฃผ์„ธ์š”.]\n\n"
504
- combined_system_msg += """
505
- [์ค‘์š” ์ง€์‹œ์‚ฌํ•ญ]
506
- 1. ๋‹ต๋ณ€์— ๊ฒ€์ƒ‰ ๊ฒฐ๊ณผ์—์„œ ์ฐพ์€ ์ •๋ณด์˜ ์ถœ์ฒ˜๋ฅผ ๋ฐ˜๋“œ์‹œ ์ธ์šฉํ•˜์„ธ์š”.
507
- 2. ์ถœ์ฒ˜ ์ธ์šฉ ์‹œ "[์ถœ์ฒ˜ ์ œ๋ชฉ](๋งํฌ)" ํ˜•์‹์˜ ๋งˆํฌ๋‹ค์šด ๋งํฌ๋ฅผ ์‚ฌ์šฉํ•˜์„ธ์š”.
508
- 3. ์—ฌ๋Ÿฌ ์ถœ์ฒ˜์˜ ์ •๋ณด๋ฅผ ์ข…ํ•ฉํ•˜์—ฌ ๋‹ต๋ณ€ํ•˜์„ธ์š”.
509
- 4. ๋‹ต๋ณ€ ๋งˆ์ง€๋ง‰์— "์ฐธ๊ณ  ์ž๋ฃŒ:" ์„น์…˜์„ ์ถ”๊ฐ€ํ•˜๊ณ  ์‚ฌ์šฉํ•œ ์ฃผ์š” ์ถœ์ฒ˜ ๋งํฌ๋ฅผ ๋‚˜์—ดํ•˜์„ธ์š”.
510
- """
511
  else:
512
- combined_system_msg += "[No valid keywords found, skipping WebSearch]\n\n"
513
-
514
- messages = []
515
- if combined_system_msg.strip():
516
- messages.append({
 
517
  "role": "system",
518
- "content": [{"type": "text", "text": combined_system_msg.strip()}],
519
- })
520
-
521
- messages.extend(process_history(history))
522
-
523
- user_content, user_temp_files = process_new_user_message(message)
524
- temp_files.extend(user_temp_files) # ์ž„์‹œ ํŒŒ์ผ ์ถ”์ 
 
 
525
 
526
- for item in user_content:
527
- if item["type"] == "text" and len(item["text"]) > MAX_CONTENT_CHARS:
528
- item["text"] = item["text"][:MAX_CONTENT_CHARS] + "\n...(truncated)..."
529
- messages.append({"role": "user", "content": user_content})
530
-
531
  inputs = processor.apply_chat_template(
532
  messages,
533
  add_generation_prompt=True,
@@ -536,314 +405,337 @@ def run(
536
  return_tensors="pt",
537
  ).to(device=model.device, dtype=torch.bfloat16)
538
 
539
- # ์ž…๋ ฅ ํ† ํฐ ์ˆ˜ ์ œํ•œ ์ถ”๊ฐ€
540
- if inputs.input_ids.shape[1] > MAX_INPUT_LENGTH:
541
- inputs.input_ids = inputs.input_ids[:, -MAX_INPUT_LENGTH:]
542
- if 'attention_mask' in inputs:
543
- inputs.attention_mask = inputs.attention_mask[:, -MAX_INPUT_LENGTH:]
544
-
545
  streamer = TextIteratorStreamer(processor, timeout=30.0, skip_prompt=True, skip_special_tokens=True)
546
  gen_kwargs = dict(
547
  inputs,
548
  streamer=streamer,
549
  max_new_tokens=max_new_tokens,
 
 
550
  )
551
-
 
552
  t = Thread(target=_model_gen_with_oom_catch, kwargs=gen_kwargs)
553
  t.start()
554
-
 
555
  output = ""
556
  for new_text in streamer:
557
  output += new_text
558
  yield output
559
-
560
  except Exception as e:
561
- logger.error(f"Error in run: {str(e)}")
562
- yield f"์ฃ„์†กํ•ฉ๋‹ˆ๋‹ค. ์˜ค๋ฅ˜๊ฐ€ ๋ฐœ์ƒํ–ˆ์Šต๋‹ˆ๋‹ค: {str(e)}"
563
-
564
  finally:
565
- # ์ž„์‹œ ํŒŒ์ผ ์‚ญ์ œ
566
- for temp_file in temp_files:
567
- try:
568
- if os.path.exists(temp_file):
569
- os.unlink(temp_file)
570
- logger.info(f"Deleted temp file: {temp_file}")
571
- except Exception as e:
572
- logger.warning(f"Failed to delete temp file {temp_file}: {e}")
573
-
574
- # ๋ช…์‹œ์  ๋ฉ”๋ชจ๋ฆฌ ์ •๋ฆฌ
575
- try:
576
- del inputs, streamer
577
- except:
578
- pass
579
-
580
  clear_cuda_cache()
581
 
582
-
583
-
584
  ##############################################################################
585
- # ์˜ˆ์‹œ๋“ค (๋ชจ๋‘ ์˜์–ด๋กœ)
586
- ##############################################################################
587
- examples = [
588
- [
589
- {
590
- "text": "Compare the contents of the two PDF files.",
591
- "files": [
592
- "assets/additional-examples/before.pdf",
593
- "assets/additional-examples/after.pdf",
594
- ],
595
- }
596
- ],
597
- [
598
- {
599
- "text": "Summarize and analyze the contents of the CSV file.",
600
- "files": ["assets/additional-examples/sample-csv.csv"],
601
- }
602
- ],
603
- [
604
- {
605
- "text": "Assume the role of a friendly and understanding girlfriend. Describe this video.",
606
- "files": ["assets/additional-examples/tmp.mp4"],
607
- }
608
- ],
609
- [
610
- {
611
- "text": "Describe the cover and read the text on it.",
612
- "files": ["assets/additional-examples/maz.jpg"],
613
- }
614
- ],
615
- [
616
- {
617
- "text": "I already have this supplement <image> and I plan to buy this product <image>. Are there any precautions when taking them together?",
618
- "files": ["assets/additional-examples/pill1.png", "assets/additional-examples/pill2.png"],
619
- }
620
- ],
621
- [
622
- {
623
- "text": "Solve this integral.",
624
- "files": ["assets/additional-examples/4.png"],
625
- }
626
- ],
627
- [
628
- {
629
- "text": "When was this ticket issued, and what is its price?",
630
- "files": ["assets/additional-examples/2.png"],
631
- }
632
- ],
633
- [
634
- {
635
- "text": "Based on the sequence of these images, create a short story.",
636
- "files": [
637
- "assets/sample-images/09-1.png",
638
- "assets/sample-images/09-2.png",
639
- "assets/sample-images/09-3.png",
640
- "assets/sample-images/09-4.png",
641
- "assets/sample-images/09-5.png",
642
- ],
643
- }
644
- ],
645
- [
646
- {
647
- "text": "Write Python code using matplotlib to plot a bar chart that matches this image.",
648
- "files": ["assets/additional-examples/barchart.png"],
649
- }
650
- ],
651
- [
652
- {
653
- "text": "Read the text in the image and write it out in Markdown format.",
654
- "files": ["assets/additional-examples/3.png"],
655
- }
656
- ],
657
- [
658
- {
659
- "text": "What does this sign say?",
660
- "files": ["assets/sample-images/02.png"],
661
- }
662
- ],
663
- [
664
- {
665
- "text": "Compare the two images and describe their similarities and differences.",
666
- "files": ["assets/sample-images/03.png"],
667
- }
668
- ],
669
- ]
670
-
671
- ##############################################################################
672
- # Gradio UI (Blocks) ๊ตฌ์„ฑ (์ขŒ์ธก ์‚ฌ์ด๋“œ ๋ฉ”๋‰ด ์—†์ด ์ „์ฒดํ™”๋ฉด ์ฑ„ํŒ…)
673
  ##############################################################################
674
  css = """
675
- /* 1) UI๋ฅผ ์ฒ˜์Œ๋ถ€ํ„ฐ ๊ฐ€์žฅ ๋„“๊ฒŒ (width 100%) ๊ณ ์ •ํ•˜์—ฌ ํ‘œ์‹œ */
676
- .gradio-container {
677
- background: rgba(255, 255, 255, 0.7); /* ๋ฐฐ๊ฒฝ ํˆฌ๋ช…๋„ ์ฆ๊ฐ€ */
678
- padding: 30px 40px;
679
- margin: 20px auto; /* ์œ„์•„๋ž˜ ์—ฌ๋ฐฑ๋งŒ ์œ ์ง€ */
680
- width: 100% !important;
681
- max-width: none !important; /* 1200px ์ œํ•œ ์ œ๊ฑฐ */
 
682
  }
683
- .fillable {
684
- width: 100% !important;
685
- max-width: 100% !important;
686
- }
687
- /* 2) ๋ฐฐ๊ฒฝ์„ ์™„์ „ํžˆ ํˆฌ๋ช…ํ•˜๊ฒŒ ๋ณ€๊ฒฝ */
688
- body {
689
- background: transparent; /* ์™„์ „ ํˆฌ๋ช… ๋ฐฐ๊ฒฝ */
690
- margin: 0;
691
- padding: 0;
692
- font-family: 'Helvetica Neue', Helvetica, Arial, sans-serif;
693
- color: #333;
694
- }
695
- /* ๋ฒ„ํŠผ ์ƒ‰์ƒ ์™„์ „ํžˆ ์ œ๊ฑฐํ•˜๊ณ  ํˆฌ๋ช…ํ•˜๊ฒŒ */
696
- button, .btn {
697
- background: transparent !important; /* ์ƒ‰์ƒ ์™„์ „ํžˆ ์ œ๊ฑฐ */
698
- border: 1px solid #ddd; /* ๊ฒฝ๊ณ„์„ ๋งŒ ์‚ด์ง ์ถ”๊ฐ€ */
699
- color: #333;
700
- padding: 12px 24px;
701
- text-transform: uppercase;
702
  font-weight: bold;
703
- letter-spacing: 1px;
704
- cursor: pointer;
705
- }
706
- button:hover, .btn:hover {
707
- background: rgba(0, 0, 0, 0.05) !important; /* ํ˜ธ๋ฒ„ ์‹œ ์•„์ฃผ ์‚ด์ง ์–ด๋‘ก๊ฒŒ๋งŒ */
708
  }
709
-
710
- /* examples ๊ด€๋ จ ๋ชจ๋“  ์ƒ‰์ƒ ์ œ๊ฑฐ */
711
- #examples_container, .examples-container {
712
- margin: auto;
713
- width: 90%;
714
- background: transparent !important;
715
- }
716
- #examples_row, .examples-row {
717
- justify-content: center;
718
- background: transparent !important;
719
- }
720
-
721
- /* examples ๋ฒ„ํŠผ ๋‚ด๋ถ€์˜ ๋ชจ๋“  ์ƒ‰์ƒ ์ œ๊ฑฐ */
722
- .gr-samples-table button,
723
- .gr-samples-table .gr-button,
724
- .gr-samples-table .gr-sample-btn,
725
- .gr-examples button,
726
- .gr-examples .gr-button,
727
- .gr-examples .gr-sample-btn,
728
- .examples button,
729
- .examples .gr-button,
730
- .examples .gr-sample-btn {
731
- background: transparent !important;
732
- border: 1px solid #ddd;
733
- color: #333;
734
- }
735
-
736
- /* examples ๋ฒ„ํŠผ ํ˜ธ๋ฒ„ ์‹œ์—๋„ ์ƒ‰์ƒ ์—†๊ฒŒ */
737
- .gr-samples-table button:hover,
738
- .gr-samples-table .gr-button:hover,
739
- .gr-samples-table .gr-sample-btn:hover,
740
- .gr-examples button:hover,
741
- .gr-examples .gr-button:hover,
742
- .gr-examples .gr-sample-btn:hover,
743
- .examples button:hover,
744
- .examples .gr-button:hover,
745
- .examples .gr-sample-btn:hover {
746
- background: rgba(0, 0, 0, 0.05) !important;
747
- }
748
-
749
- /* ์ฑ„ํŒ… ์ธํ„ฐํŽ˜์ด์Šค ์š”์†Œ๋“ค๋„ ํˆฌ๋ช…ํ•˜๊ฒŒ */
750
- .chatbox, .chatbot, .message {
751
- background: transparent !important;
752
  }
753
-
754
- /* ์ž…๋ ฅ์ฐฝ ํˆฌ๋ช…๋„ ์กฐ์ • */
755
- .multimodal-textbox, textarea, input {
756
- background: rgba(255, 255, 255, 0.5) !important;
757
  }
758
-
759
- /* ๋ชจ๋“  ์ปจํ…Œ์ด๋„ˆ ์š”์†Œ์— ๋ฐฐ๊ฒฝ์ƒ‰ ์ œ๊ฑฐ */
760
- .container, .wrap, .box, .panel, .gr-panel {
761
- background: transparent !important;
762
- }
763
-
764
- /* ์˜ˆ์ œ ์„น์…˜์˜ ๋ชจ๋“  ์š”์†Œ์—์„œ ๋ฐฐ๊ฒฝ์ƒ‰ ์ œ๊ฑฐ */
765
- .gr-examples-container, .gr-examples, .gr-sample, .gr-sample-row, .gr-sample-cell {
766
- background: transparent !important;
767
  }
768
  """
769
 
770
- title_html = """
771
- <h1 align="center" style="margin-bottom: 0.2em; font-size: 1.6em;"> ๐Ÿค— Gemma3-R1984-4B </h1>
772
- <p align="center" style="font-size:1.1em; color:#555;">
773
- โœ…Agentic AI Platform โœ…Reasoning & Uncensored โœ…Multimodal & VLM โœ…Deep-Research & RAG <br>
774
- Operates on an โœ…'NVIDIA L40s / A100(ZeroGPU) GPU' as an independent local server, enhancing security and preventing information leakage.<br>
775
- @Model Rpository: VIDraft/Gemma-3-R1984-4B, @Based by 'Google Gemma-3-4b', @Powered by 'MOUSE-II'(VIDRAFT)
776
- </p>
777
- """
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
778
 
 
779
 
780
- with gr.Blocks(css=css, title="Gemma3-R1984-4B") as demo:
781
- gr.Markdown(title_html)
782
 
783
- # Display the web search option (while the system prompt and token slider remain hidden)
784
- web_search_checkbox = gr.Checkbox(
785
- label="Deep Research",
786
- value=False
 
 
 
 
 
 
 
 
787
  )
788
-
789
- # Used internally but not visible to the user
790
- system_prompt_box = gr.Textbox(
791
- lines=3,
792
- value="You are a deep thinking AI that may use extremely long chains of thought to thoroughly analyze the problem and deliberate using systematic reasoning processes to arrive at a correct solution before answering.",
793
- visible=False # hidden from view
794
  )
795
 
796
- max_tokens_slider = gr.Slider(
797
- label="Max New Tokens",
798
- minimum=100,
799
- maximum=8000,
800
- step=50,
801
- value=1000,
802
- visible=False # hidden from view
803
  )
804
 
805
- web_search_text = gr.Textbox(
806
- lines=1,
807
- label="(Unused) Web Search Query",
808
- placeholder="No direct input needed",
809
- visible=False # hidden from view
810
  )
811
 
812
- # Configure the chat interface
813
- chat = gr.ChatInterface(
814
- fn=run,
815
- type="messages",
816
- chatbot=gr.Chatbot(type="messages", scale=1, allow_tags=["image"]),
817
- textbox=gr.MultimodalTextbox(
818
- file_types=[
819
- ".webp", ".png", ".jpg", ".jpeg", ".gif",
820
- ".mp4", ".csv", ".txt", ".pdf"
821
- ],
822
- file_count="multiple",
823
- autofocus=True
824
- ),
825
- multimodal=True,
826
- additional_inputs=[
827
- system_prompt_box,
828
- max_tokens_slider,
829
- web_search_checkbox,
830
- web_search_text,
831
- ],
832
- stop_btn=False,
833
- title='<a href="https://discord.gg/openfreeai" target="_blank">https://discord.gg/openfreeai</a>',
834
- examples=examples,
835
- run_examples_on_click=False,
836
- cache_examples=False,
837
- css_paths=None,
838
- delete_cache=(1800, 1800),
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
839
  )
840
-
841
- # Example section - since examples are already set in ChatInterface, this is for display only
842
- with gr.Row(elem_id="examples_row"):
843
- with gr.Column(scale=12, elem_id="examples_container"):
844
- gr.Markdown("### Example Inputs (click to load)")
845
-
846
 
847
  if __name__ == "__main__":
848
- # Run locally
849
- demo.launch()
 
 
 
 
 
 
 
1
+ #!/usr/bin/env python3
2
 
3
  import os
4
  import re
5
  import tempfile
6
+ import gc
7
  from collections.abc import Iterator
8
  from threading import Thread
9
  import json
 
12
  import gradio as gr
13
  import spaces
14
  import torch
15
+ import numpy as np
16
  from loguru import logger
17
  from PIL import Image
18
  from transformers import AutoProcessor, Gemma3ForConditionalGeneration, TextIteratorStreamer
19
+ import time
20
+ import warnings
21
+ from typing import Dict, List, Optional, Union
22
 
23
  # CSV/TXT ๋ถ„์„
24
  import pandas as pd
25
  # PDF ํ…์ŠคํŠธ ์ถ”์ถœ
26
  import PyPDF2
27
 
28
+ warnings.filterwarnings('ignore')
29
+
30
+ print("๐ŸŽฎ ๋กœ๋ด‡ ์‹œ๊ฐ ์‹œ์Šคํ…œ ์ดˆ๊ธฐํ™” (Gemma3-R1984-4B)...")
31
+
32
+ ##############################################################################
33
+ # ์ƒ์ˆ˜ ์ •์˜
34
+ ##############################################################################
35
+ MAX_CONTENT_CHARS = 2000
36
+ MAX_INPUT_LENGTH = 2096
37
+ MAX_NUM_IMAGES = 5
38
+ SERPHOUSE_API_KEY = os.getenv("SERPHOUSE_API_KEY", "")
39
+
40
  ##############################################################################
41
+ # ์ „์—ญ ๋ณ€์ˆ˜
42
+ ##############################################################################
43
+ model = None
44
+ processor = None
45
+ model_loaded = False
46
+ model_name = "Gemma3-R1984-4B"
47
+
48
+ ##############################################################################
49
+ # ๋ฉ”๋ชจ๋ฆฌ ๊ด€๋ฆฌ
50
  ##############################################################################
51
  def clear_cuda_cache():
52
  """CUDA ์บ์‹œ๋ฅผ ๋ช…์‹œ์ ์œผ๋กœ ๋น„์›๋‹ˆ๋‹ค."""
 
55
  gc.collect()
56
 
57
  ##############################################################################
58
+ # ํ‚ค์›Œ๋“œ ์ถ”์ถœ ํ•จ์ˆ˜
 
 
 
 
 
59
  ##############################################################################
60
  def extract_keywords(text: str, top_k: int = 5) -> str:
61
+ """ํ‚ค์›Œ๋“œ ์ถ”์ถœ"""
 
 
 
 
62
  text = re.sub(r"[^a-zA-Z0-9๊ฐ€-ํžฃ\s]", "", text)
63
  tokens = text.split()
64
+
65
+ seen = set()
66
+ unique_tokens = []
67
+ for token in tokens:
68
+ if token not in seen and len(token) > 1:
69
+ seen.add(token)
70
+ unique_tokens.append(token)
71
+
72
+ key_tokens = unique_tokens[:top_k]
73
  return " ".join(key_tokens)
74
 
75
  ##############################################################################
76
+ # ์›น ๊ฒ€์ƒ‰ ํ•จ์ˆ˜
 
77
  ##############################################################################
78
  def do_web_search(query: str) -> str:
79
+ """SerpHouse API๋ฅผ ์‚ฌ์šฉํ•œ ์›น ๊ฒ€์ƒ‰"""
 
 
 
80
  try:
81
  url = "https://api.serphouse.com/serp/live"
82
 
 
83
  params = {
84
  "q": query,
85
  "domain": "google.com",
86
+ "serp_type": "web",
87
  "device": "desktop",
88
+ "lang": "ko", # ํ•œ๊ตญ์–ด ์šฐ์„ 
89
+ "num": "10" # 10๊ฐœ๋กœ ์ œํ•œ
90
  }
91
 
92
  headers = {
93
  "Authorization": f"Bearer {SERPHOUSE_API_KEY}"
94
  }
95
 
96
+ logger.info(f"์›น ๊ฒ€์ƒ‰ ์ค‘... ๊ฒ€์ƒ‰์–ด: {query}")
 
97
 
 
98
  response = requests.get(url, headers=headers, params=params, timeout=60)
99
  response.raise_for_status()
100
 
 
101
  data = response.json()
102
 
 
103
  results = data.get("results", {})
104
+ organic = results.get("organic", []) if isinstance(results, dict) else []
 
 
 
 
 
 
 
 
 
105
 
 
 
 
 
106
  if not organic:
107
+ return "๊ฒ€์ƒ‰ ๊ฒฐ๊ณผ๋ฅผ ์ฐพ์„ ์ˆ˜ ์—†์Šต๋‹ˆ๋‹ค."
108
+
109
+ max_results = min(10, len(organic))
 
 
 
 
 
110
  limited_organic = organic[:max_results]
111
 
 
112
  summary_lines = []
113
  for idx, item in enumerate(limited_organic, start=1):
114
+ title = item.get("title", "์ œ๋ชฉ ์—†์Œ")
115
  link = item.get("link", "#")
116
+ snippet = item.get("snippet", "์„ค๋ช… ์—†์Œ")
117
  displayed_link = item.get("displayed_link", link)
118
 
 
119
  summary_lines.append(
120
+ f"### ๊ฒฐ๊ณผ {idx}: {title}\n\n"
121
  f"{snippet}\n\n"
122
  f"**์ถœ์ฒ˜**: [{displayed_link}]({link})\n\n"
123
  f"---\n"
124
  )
125
 
126
+ instructions = """# ์›น ๊ฒ€์ƒ‰ ๊ฒฐ๊ณผ
127
+ ์•„๋ž˜๋Š” ๊ฒ€์ƒ‰ ๊ฒฐ๊ณผ์ž…๋‹ˆ๋‹ค. ๋‹ต๋ณ€ ์‹œ ์ด ์ •๋ณด๋ฅผ ํ™œ์šฉํ•˜์„ธ์š”:
128
+ 1. ๊ฐ ๊ฒฐ๊ณผ์˜ ์ œ๋ชฉ, ๋‚ด์šฉ, ์ถœ์ฒ˜ ๋งํฌ๋ฅผ ์ฐธ์กฐํ•˜์„ธ์š”
129
+ 2. ๊ด€๋ จ ์ถœ์ฒ˜๋ฅผ ๋ช…์‹œ์ ์œผ๋กœ ์ธ์šฉํ•˜์„ธ์š”
130
+ 3. ์—ฌ๋Ÿฌ ์ถœ์ฒ˜์˜ ์ •๋ณด๋ฅผ ์ข…ํ•ฉํ•˜์—ฌ ๋‹ต๋ณ€ํ•˜์„ธ์š”
 
 
 
131
  """
132
 
133
  search_results = instructions + "\n".join(summary_lines)
 
134
  return search_results
135
 
136
  except Exception as e:
137
+ logger.error(f"์›น ๊ฒ€์ƒ‰ ์‹คํŒจ: {e}")
138
+ return f"์›น ๊ฒ€์ƒ‰ ์‹คํŒจ: {str(e)}"
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
139
 
140
  ##############################################################################
141
+ # ๋ฌธ์„œ ์ฒ˜๋ฆฌ ํ•จ์ˆ˜
142
  ##############################################################################
143
  def analyze_csv_file(path: str) -> str:
144
+ """CSV ํŒŒ์ผ ๋ถ„์„"""
 
 
145
  try:
146
  df = pd.read_csv(path)
147
  if df.shape[0] > 50 or df.shape[1] > 10:
148
  df = df.iloc[:50, :10]
149
  df_str = df.to_string()
150
  if len(df_str) > MAX_CONTENT_CHARS:
151
+ df_str = df_str[:MAX_CONTENT_CHARS] + "\n...(์ค‘๋žต)..."
152
+ return f"**[CSV ํŒŒ์ผ: {os.path.basename(path)}]**\n\n{df_str}"
153
  except Exception as e:
154
+ return f"CSV ์ฝ๊ธฐ ์‹คํŒจ ({os.path.basename(path)}): {str(e)}"
 
155
 
156
  def analyze_txt_file(path: str) -> str:
157
+ """TXT ํŒŒ์ผ ๋ถ„์„"""
 
 
158
  try:
159
  with open(path, "r", encoding="utf-8") as f:
160
  text = f.read()
161
  if len(text) > MAX_CONTENT_CHARS:
162
+ text = text[:MAX_CONTENT_CHARS] + "\n...(์ค‘๋žต)..."
163
+ return f"**[TXT ํŒŒ์ผ: {os.path.basename(path)}]**\n\n{text}"
164
  except Exception as e:
165
+ return f"TXT ์ฝ๊ธฐ ์‹คํŒจ ({os.path.basename(path)}): {str(e)}"
 
166
 
167
  def pdf_to_markdown(pdf_path: str) -> str:
168
+ """PDF๋ฅผ ๋งˆํฌ๋‹ค์šด์œผ๋กœ ๋ณ€ํ™˜"""
 
 
169
  text_chunks = []
170
  try:
171
  with open(pdf_path, "rb") as f:
 
177
  page_text = page_text.strip()
178
  if page_text:
179
  if len(page_text) > MAX_CONTENT_CHARS // max_pages:
180
+ page_text = page_text[:MAX_CONTENT_CHARS // max_pages] + "...(์ค‘๋žต)"
181
+ text_chunks.append(f"## ํŽ˜์ด์ง€ {page_num+1}\n\n{page_text}\n")
182
  if len(reader.pages) > max_pages:
183
+ text_chunks.append(f"\n...({max_pages}/{len(reader.pages)} ํŽ˜์ด์ง€ ํ‘œ์‹œ)...")
184
  except Exception as e:
185
+ return f"PDF ์ฝ๊ธฐ ์‹คํŒจ ({os.path.basename(pdf_path)}): {str(e)}"
186
 
187
  full_text = "\n".join(text_chunks)
188
  if len(full_text) > MAX_CONTENT_CHARS:
189
+ full_text = full_text[:MAX_CONTENT_CHARS] + "\n...(์ค‘๋žต)..."
 
 
190
 
191
+ return f"**[PDF ํŒŒ์ผ: {os.path.basename(pdf_path)}]**\n\n{full_text}"
192
 
193
  ##############################################################################
194
+ # ๋ชจ๋ธ ๋กœ๋“œ
195
  ##############################################################################
196
+ @spaces.GPU(duration=120)
197
+ def load_model():
198
+ global model, processor, model_loaded
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
199
 
200
+ if model_loaded:
201
+ logger.info("๋ชจ๋ธ์ด ์ด๋ฏธ ๋กœ๋“œ๋˜์–ด ์žˆ์Šต๋‹ˆ๋‹ค.")
202
+ return True
 
 
 
 
 
203
 
204
+ try:
205
+ logger.info("Gemma3-R1984-4B ๋ชจ๋ธ ๋กœ๋”ฉ ์‹œ์ž‘...")
206
+ clear_cuda_cache()
207
+
208
+ model_id = os.getenv("MODEL_ID", "VIDraft/Gemma-3-R1984-4B")
209
+
210
+ processor = AutoProcessor.from_pretrained(model_id, padding_side="left")
211
+ model = Gemma3ForConditionalGeneration.from_pretrained(
212
+ model_id,
213
+ device_map="auto",
214
+ torch_dtype=torch.bfloat16,
215
+ attn_implementation="eager"
216
+ )
217
+
218
+ model_loaded = True
219
+ logger.info(f"โœ… {model_name} ๋กœ๋”ฉ ์™„๋ฃŒ!")
220
+ return True
221
+
222
+ except Exception as e:
223
+ logger.error(f"๋ชจ๋ธ ๋กœ๋”ฉ ์‹คํŒจ: {e}")
224
+ return False
225
 
226
  ##############################################################################
227
+ # ์ด๋ฏธ์ง€ ๋ถ„์„ (๋กœ๋ด‡ ํƒœ์Šคํฌ ์ค‘์‹ฌ)
228
  ##############################################################################
229
+ @spaces.GPU(duration=60)
230
+ def analyze_image_for_robot(
231
+ image: Union[np.ndarray, Image.Image],
232
+ prompt: str,
233
+ task_type: str = "general",
234
+ use_web_search: bool = False,
235
+ enable_thinking: bool = True,
236
+ max_new_tokens: int = 1024
237
+ ) -> str:
238
+ """๋กœ๋ด‡ ์ž‘์—…์„ ์œ„ํ•œ ์ด๋ฏธ์ง€ ๋ถ„์„"""
239
+ global model, processor
240
 
241
+ if not model_loaded:
242
+ if not load_model():
243
+ return "โŒ ๋ชจ๋ธ ๋กœ๋”ฉ ์‹คํŒจ"
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
244
 
245
+ try:
246
+ # numpy ๋ฐฐ์—ด์„ PIL ์ด๋ฏธ์ง€๋กœ ๋ณ€ํ™˜
247
+ if isinstance(image, np.ndarray):
248
+ image = Image.fromarray(image).convert('RGB')
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
249
 
250
+ # ํƒœ์Šคํฌ๋ณ„ ์‹œ์Šคํ…œ ํ”„๋กฌํ”„ํŠธ ๊ตฌ์„ฑ
251
+ system_prompts = {
252
+ "general": "๋‹น์‹ ์€ ๋กœ๋ด‡ ์‹œ๊ฐ ์‹œ์Šคํ…œ์ž…๋‹ˆ๋‹ค. ์ด๋ฏธ์ง€๋ฅผ ์ž์„ธํžˆ ๋ถ„์„ํ•˜๊ณ  ์„ค๋ช…ํ•˜์„ธ์š”.",
253
+ "planning": """๋‹น์‹ ์€ ๋กœ๋ด‡ ์ž‘์—… ๊ณ„ํš์„ ์ˆ˜๋ฆฝํ•˜๋Š” AI์ž…๋‹ˆ๋‹ค.
254
+ ์ฃผ์–ด์ง„ ์ด๋ฏธ์ง€์™€ ์ž‘์—…์„ ๋ถ„์„ํ•˜์—ฌ ๋‹จ๊ณ„๋ณ„ ์‹คํ–‰ ๊ณ„ํš์„ ์ž‘์„ฑํ•˜์„ธ์š”.
255
+ ํ˜•์‹: Step_1: xxx\nStep_2: xxx\n...\nStep_n: xxx""",
256
+ "grounding": "๋‹น์‹ ์€ ๊ฐ์ฒด ์œ„์น˜๋ฅผ ์ฐพ๋Š” ๋กœ๋ด‡ ์‹œ๊ฐ ์‹œ์Šคํ…œ์ž…๋‹ˆ๋‹ค. ์š”์ฒญ๋œ ๊ฐ์ฒด์˜ ์œ„์น˜๋ฅผ [x1, y1, x2, y2] ์ขŒํ‘œ๋กœ ๋ฐ˜ํ™˜ํ•˜์„ธ์š”.",
257
+ "affordance": "๋‹น์‹ ์€ ๋กœ๋ด‡ ํŒŒ์ง€์ ์„ ๋ถ„์„ํ•˜๋Š” AI์ž…๋‹ˆ๋‹ค. ์ฃผ์–ด์ง„ ์ž‘์—…์„ ์œ„ํ•œ ์ตœ์ ์˜ ํŒŒ์ง€ ์˜์—ญ์„ [x1, y1, x2, y2] ์ขŒํ‘œ๋กœ ์˜ˆ์ธกํ•˜์„ธ์š”.",
258
+ "trajectory": "๋‹น์‹ ์€ ๋กœ๋ด‡ ๊ฒฝ๋กœ๋ฅผ ๊ณ„ํšํ•˜๋Š” AI์ž…๋‹ˆ๋‹ค. ๋ชฉํ‘œ ์ง€์ ๊นŒ์ง€์˜ ๊ฒฝ๋กœ๋ฅผ [(x1,y1), (x2,y2), ...] ํ˜•์‹์œผ๋กœ ์ œ์‹œํ•˜์„ธ์š”.",
259
+ "pointing": "๋‹น์‹ ์€ ๋‹ค์ค‘ ์ง€์ ์„ ์ง€์ •ํ•˜๋Š” ๋กœ๋ด‡ ์‹œ๊ฐ ์‹œ์Šคํ…œ์ž…๋‹ˆ๋‹ค. ์š”์ฒญ๋œ ์œ„์น˜๋“ค์„ [(x1,y1), (x2,y2), ...] ํ˜•์‹์œผ๋กœ ๋ฐ˜ํ™˜ํ•˜์„ธ์š”."
260
+ }
261
+
262
+ system_prompt = system_prompts.get(task_type, system_prompts["general"])
263
+
264
+ # Chain-of-Thought ์ถ”๊ฐ€
265
+ if enable_thinking:
266
+ system_prompt += "\n\n์ถ”๋ก  ๊ณผ์ •์„ <thinking></thinking> ํƒœ๊ทธ ์•ˆ์— ์ƒ์„ธํžˆ ์ž‘์„ฑํ•œ ํ›„ ์ตœ์ข… ๋‹ต๋ณ€์„ ์ œ์‹œํ•˜์„ธ์š”."
267
+
268
+ # ์›น ๊ฒ€์ƒ‰ ์ˆ˜ํ–‰
269
+ combined_system = system_prompt
270
+ if use_web_search:
271
+ keywords = extract_keywords(prompt, top_k=5)
272
+ if keywords:
273
+ logger.info(f"์›น ๊ฒ€์ƒ‰ ํ‚ค์›Œ๋“œ: {keywords}")
274
+ search_results = do_web_search(keywords)
275
+ combined_system = f"{search_results}\n\n{system_prompt}"
276
+
277
+ # ๋ฉ”์‹œ์ง€ ๊ตฌ์„ฑ
278
+ messages = [
279
+ {
280
+ "role": "system",
281
+ "content": [{"type": "text", "text": combined_system}]
282
+ },
283
+ {
284
+ "role": "user",
285
+ "content": [
286
+ {"type": "image", "url": image},
287
+ {"type": "text", "text": prompt}
288
+ ]
289
+ }
290
+ ]
291
+
292
+ # ์ž…๋ ฅ ์ฒ˜๋ฆฌ
293
+ inputs = processor.apply_chat_template(
294
+ messages,
295
+ add_generation_prompt=True,
296
+ tokenize=True,
297
+ return_dict=True,
298
+ return_tensors="pt",
299
+ ).to(device=model.device, dtype=torch.bfloat16)
300
+
301
+ # ์ž…๋ ฅ ํ† ํฐ ์ˆ˜ ์ œํ•œ
302
+ if inputs.input_ids.shape[1] > MAX_INPUT_LENGTH:
303
+ inputs.input_ids = inputs.input_ids[:, -MAX_INPUT_LENGTH:]
304
+ if 'attention_mask' in inputs:
305
+ inputs.attention_mask = inputs.attention_mask[:, -MAX_INPUT_LENGTH:]
306
+
307
+ # ์ƒ์„ฑ
308
+ with torch.no_grad():
309
+ outputs = model.generate(
310
+ **inputs,
311
+ max_new_tokens=max_new_tokens,
312
+ do_sample=True,
313
+ temperature=0.7,
314
+ top_p=0.9,
315
+ )
316
+
317
+ # ๋””์ฝ”๋”ฉ
318
+ response = processor.decode(outputs[0], skip_special_tokens=True)
319
+
320
+ # ํ”„๋กฌํ”„ํŠธ ์ œ๊ฑฐ
321
+ if "Assistant:" in response:
322
+ response = response.split("Assistant:")[-1].strip()
323
+
324
+ return response
325
+
326
+ except Exception as e:
327
+ logger.error(f"์ด๋ฏธ์ง€ ๋ถ„์„ ์˜ค๋ฅ˜: {e}")
328
+ import traceback
329
+ return f"โŒ ๋ถ„์„ ์˜ค๋ฅ˜: {str(e)}\n{traceback.format_exc()}"
330
+ finally:
331
+ clear_cuda_cache()
332
 
333
  ##############################################################################
334
+ # ๋ฌธ์„œ ๋ถ„์„ (์ŠคํŠธ๋ฆฌ๋ฐ)
335
  ##############################################################################
336
  def _model_gen_with_oom_catch(**kwargs):
337
+ """OOM ์ฒ˜๋ฆฌ๋ฅผ ์œ„ํ•œ ์ƒ์„ฑ ํ•จ์ˆ˜"""
338
+ global model
 
339
  try:
340
  model.generate(**kwargs)
341
  except torch.cuda.OutOfMemoryError:
342
+ raise RuntimeError("GPU ๋ฉ”๋ชจ๋ฆฌ ๋ถ€์กฑ. Max Tokens๋ฅผ ์ค„์—ฌ์ฃผ์„ธ์š”.")
 
 
 
343
  finally:
 
344
  clear_cuda_cache()
345
 
 
 
 
 
346
  @spaces.GPU(duration=120)
347
+ def analyze_documents_streaming(
348
+ files: List[str],
349
+ prompt: str,
 
 
350
  use_web_search: bool = False,
351
+ max_new_tokens: int = 2048
352
  ) -> Iterator[str]:
353
+ """๋ฌธ์„œ ๋ถ„์„ (์ŠคํŠธ๋ฆฌ๋ฐ)"""
354
+ global model, processor
355
+
356
+ if not model_loaded:
357
+ if not load_model():
358
+ yield "โŒ ๋ชจ๋ธ ๋กœ๋”ฉ ์‹คํŒจ"
359
+ return
360
 
361
  try:
362
+ # ์‹œ์Šคํ…œ ํ”„๋กฌํ”„ํŠธ
363
+ system_content = "๋‹น์‹ ์€ ๋ฌธ์„œ๋ฅผ ๋ถ„์„ํ•˜๊ณ  ์š”์•ฝํ•˜๋Š” ์ „๋ฌธ AI์ž…๋‹ˆ๋‹ค."
364
+
365
+ # ์›น ๊ฒ€์ƒ‰
 
 
366
  if use_web_search:
367
+ keywords = extract_keywords(prompt, top_k=5)
368
+ if keywords:
369
+ search_results = do_web_search(keywords)
370
+ system_content = f"{search_results}\n\n{system_content}"
371
+
372
+ # ๋ฌธ์„œ ๋‚ด์šฉ ์ฒ˜๋ฆฌ
373
+ doc_contents = []
374
+ for file_path in files:
375
+ if file_path.lower().endswith('.csv'):
376
+ content = analyze_csv_file(file_path)
377
+ elif file_path.lower().endswith('.txt'):
378
+ content = analyze_txt_file(file_path)
379
+ elif file_path.lower().endswith('.pdf'):
380
+ content = pdf_to_markdown(file_path)
 
381
  else:
382
+ continue
383
+ doc_contents.append(content)
384
+
385
+ # ๋ฉ”์‹œ์ง€ ๊ตฌ์„ฑ
386
+ messages = [
387
+ {
388
  "role": "system",
389
+ "content": [{"type": "text", "text": system_content}]
390
+ },
391
+ {
392
+ "role": "user",
393
+ "content": [
394
+ {"type": "text", "text": "\n\n".join(doc_contents) + f"\n\n{prompt}"}
395
+ ]
396
+ }
397
+ ]
398
 
399
+ # ์ž…๋ ฅ ์ฒ˜๋ฆฌ
 
 
 
 
400
  inputs = processor.apply_chat_template(
401
  messages,
402
  add_generation_prompt=True,
 
405
  return_tensors="pt",
406
  ).to(device=model.device, dtype=torch.bfloat16)
407
 
408
+ # ์ŠคํŠธ๋ฆฌ๋ฐ ์„ค์ •
 
 
 
 
 
409
  streamer = TextIteratorStreamer(processor, timeout=30.0, skip_prompt=True, skip_special_tokens=True)
410
  gen_kwargs = dict(
411
  inputs,
412
  streamer=streamer,
413
  max_new_tokens=max_new_tokens,
414
+ temperature=0.8,
415
+ top_p=0.9,
416
  )
417
+
418
+ # ๋ณ„๋„ ์Šค๋ ˆ๋“œ์—์„œ ์ƒ์„ฑ
419
  t = Thread(target=_model_gen_with_oom_catch, kwargs=gen_kwargs)
420
  t.start()
421
+
422
+ # ์ŠคํŠธ๋ฆฌ๋ฐ ์ถœ๋ ฅ
423
  output = ""
424
  for new_text in streamer:
425
  output += new_text
426
  yield output
427
+
428
  except Exception as e:
429
+ logger.error(f"๋ฌธ์„œ ๋ถ„์„ ์˜ค๋ฅ˜: {e}")
430
+ yield f"โŒ ์˜ค๋ฅ˜ ๋ฐœ์ƒ: {str(e)}"
 
431
  finally:
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
432
  clear_cuda_cache()
433
 
 
 
434
  ##############################################################################
435
+ # Gradio UI (๋กœ๋ด‡ ์‹œ๊ฐํ™” ์ค‘์‹ฌ)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
436
  ##############################################################################
437
  css = """
438
+ .robot-header {
439
+ text-align: center;
440
+ background: linear-gradient(135deg, #1e3c72 0%, #2a5298 50%, #667eea 100%);
441
+ color: white;
442
+ padding: 20px;
443
+ border-radius: 10px;
444
+ margin-bottom: 20px;
445
+ box-shadow: 0 4px 6px rgba(0, 0, 0, 0.1);
446
  }
447
+ .status-box {
448
+ text-align: center;
449
+ padding: 10px;
450
+ border-radius: 5px;
451
+ margin: 10px 0;
 
 
 
 
 
 
 
 
 
 
 
 
 
 
452
  font-weight: bold;
 
 
 
 
 
453
  }
454
+ .info-box {
455
+ background: #f0f0f0;
456
+ padding: 15px;
457
+ border-radius: 8px;
458
+ margin: 10px 0;
459
+ border-left: 4px solid #2a5298;
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
460
  }
461
+ .task-button {
462
+ min-height: 60px;
463
+ font-size: 1.1em;
 
464
  }
465
+ .webcam-container {
466
+ border: 3px solid #2a5298;
467
+ border-radius: 10px;
468
+ padding: 10px;
469
+ background: #f8f9fa;
 
 
 
 
470
  }
471
  """
472
 
473
+ with gr.Blocks(title="๐Ÿค– ๋กœ๋ด‡ ์‹œ๊ฐ ์‹œ์Šคํ…œ (Gemma3-4B)", css=css) as demo:
474
+ gr.HTML("""
475
+ <div class="robot-header">
476
+ <h1>๐Ÿค– ๋กœ๋ด‡ ์‹œ๊ฐ ์‹œ์Šคํ…œ</h1>
477
+ <h3>๐ŸŽฎ Gemma3-R1984-4B + ๐Ÿ“ท ์‹ค์‹œ๊ฐ„ ์›น์บ  + ๐Ÿ” ์›น ๊ฒ€์ƒ‰</h3>
478
+ <p>โšก ์ตœ์‹  ๋ฉ€ํ‹ฐ๋ชจ๋‹ฌ AI๋กœ ๋กœ๋ด‡ ์ž‘์—… ๋ถ„์„ ๋ฐ ๊ณ„ํš ์ˆ˜๋ฆฝ!</p>
479
+ </div>
480
+ """)
481
+
482
+ gr.HTML("""
483
+ <div class="info-box">
484
+ <h4>๐ŸŒŸ ์‹œ์Šคํ…œ ํŠน์ง•:</h4>
485
+ <ul>
486
+ <li>๐Ÿ–ผ๏ธ ๊ณ ๊ธ‰ ์ด๋ฏธ์ง€/๋น„๋””์˜ค ๋ถ„์„ (Gemma3-4B VLM)</li>
487
+ <li>๐Ÿ“‹ ๋‹ค๋‹จ๊ณ„ ์ž‘์—… ๊ณ„ํš ๋ฐ ์ถ”๋ก </li>
488
+ <li>๐Ÿ“ ์ •๋ฐ€ํ•œ ๊ฐ์ฒด ์œ„์น˜ ํŒŒ์•… (Grounding)</li>
489
+ <li>๐Ÿค ๋กœ๋ด‡ ํŒŒ์ง€์  ๋ถ„์„ (Affordance)</li>
490
+ <li>๐Ÿ›ค๏ธ ๊ฒฝ๋กœ ๊ณ„ํš (Trajectory Planning)</li>
491
+ <li>๐Ÿ” ์‹ค์‹œ๊ฐ„ ์›น ๊ฒ€์ƒ‰ ํ†ตํ•ฉ</li>
492
+ <li>๐Ÿ“„ ๋ฌธ์„œ ๋ถ„์„ (PDF, CSV, TXT)</li>
493
+ </ul>
494
+ </div>
495
+ """)
496
+
497
+ with gr.Row():
498
+ # ์™ผ์ชฝ: ์›น์บ  ๋ฐ ์ž…๋ ฅ
499
+ with gr.Column(scale=1):
500
+ gr.Markdown("### ๐Ÿ“ท ์‹ค์‹œ๊ฐ„ ์›น์บ ")
501
+
502
+ with gr.Group(elem_classes="webcam-container"):
503
+ webcam = gr.Image(
504
+ sources=["webcam"],
505
+ streaming=True,
506
+ type="numpy",
507
+ label="์‹ค์‹œ๊ฐ„ ์ŠคํŠธ๋ฆฌ๋ฐ",
508
+ height=350
509
+ )
510
+
511
+ # ์บก์ฒ˜๋œ ์ด๋ฏธ์ง€ ํ‘œ์‹œ
512
+ captured_image = gr.Image(
513
+ label="์บก์ฒ˜๋œ ์ด๋ฏธ์ง€",
514
+ height=200,
515
+ visible=False
516
+ )
517
+
518
+ # ๋กœ๋ด‡ ์ž‘์—… ๋ฒ„ํŠผ๋“ค
519
+ gr.Markdown("### ๐ŸŽฏ ๋กœ๋ด‡ ์ž‘์—… ์„ ํƒ")
520
+ with gr.Row():
521
+ capture_btn = gr.Button("๐Ÿ“ธ ์บก์ฒ˜", variant="primary", elem_classes="task-button")
522
+ clear_capture_btn = gr.Button("๐Ÿ—‘๏ธ ์ดˆ๊ธฐํ™”", elem_classes="task-button")
523
+
524
+ with gr.Row():
525
+ planning_btn = gr.Button("๐Ÿ“‹ ์ž‘์—… ๊ณ„ํš", elem_classes="task-button")
526
+ grounding_btn = gr.Button("๐Ÿ“ ๊ฐ์ฒด ์œ„์น˜", elem_classes="task-button")
527
+
528
+ with gr.Row():
529
+ affordance_btn = gr.Button("๐Ÿค ํŒŒ์ง€์  ๋ถ„์„", elem_classes="task-button")
530
+ trajectory_btn = gr.Button("๐Ÿ›ค๏ธ ๊ฒฝ๋กœ ๊ณ„ํš", elem_classes="task-button")
531
+
532
+ # ์˜ค๋ฅธ์ชฝ: ๋ถ„์„ ์„ค์ • ๋ฐ ๊ฒฐ๊ณผ
533
+ with gr.Column(scale=2):
534
+ gr.Markdown("### โš™๏ธ ๋ถ„์„ ์„ค์ •")
535
+
536
+ with gr.Row():
537
+ with gr.Column():
538
+ task_prompt = gr.Textbox(
539
+ label="์ž‘์—… ์„ค๋ช… / ์งˆ๋ฌธ",
540
+ placeholder="์˜ˆ: ํ…Œ์ด๋ธ” ์œ„์˜ ์ปต์„ ์žก์•„์„œ ์‹ฑํฌ๋Œ€์— ๋†“๊ธฐ",
541
+ value="์ด ์žฅ๋ฉด์—์„œ ๋กœ๋ด‡์ด ์ˆ˜ํ–‰ํ•  ์ˆ˜ ์žˆ๋Š” ์ž‘์—…์„ ๋ถ„์„ํ•˜์„ธ์š”.",
542
+ lines=2
543
+ )
544
+
545
+ with gr.Row():
546
+ use_web_search = gr.Checkbox(
547
+ label="๐Ÿ” ์›น ๊ฒ€์ƒ‰ ์‚ฌ์šฉ",
548
+ value=False,
549
+ info="๊ด€๋ จ ์ •๋ณด๋ฅผ ์›น์—์„œ ๊ฒ€์ƒ‰ํ•ฉ๋‹ˆ๋‹ค"
550
+ )
551
+
552
+ enable_thinking = gr.Checkbox(
553
+ label="๐Ÿค” ์ถ”๋ก  ๊ณผ์ • ํ‘œ์‹œ",
554
+ value=True,
555
+ info="Chain-of-Thought ์ถ”๋ก  ๊ณผ์ •์„ ๋ณด์—ฌ์ค๋‹ˆ๋‹ค"
556
+ )
557
+
558
+ max_tokens = gr.Slider(
559
+ label="์ตœ๋Œ€ ํ† ํฐ ์ˆ˜",
560
+ minimum=256,
561
+ maximum=4096,
562
+ value=1024,
563
+ step=256
564
+ )
565
+
566
+ gr.Markdown("### ๐Ÿ“Š ๋ถ„์„ ๊ฒฐ๊ณผ")
567
+ result_output = gr.Textbox(
568
+ label="AI ๋ถ„์„ ๊ฒฐ๊ณผ",
569
+ lines=20,
570
+ max_lines=40,
571
+ show_copy_button=True,
572
+ elem_id="result"
573
+ )
574
+
575
+ status_display = gr.HTML(
576
+ '<div class="status-box" style="background:#d4edda; color:#155724;">๐ŸŽฎ ์‹œ์Šคํ…œ ์ค€๋น„ ์™„๋ฃŒ</div>'
577
+ )
578
+
579
+ # ๋ฌธ์„œ ๋ถ„์„ ํƒญ
580
+ with gr.Tab("๐Ÿ“„ ๋ฌธ์„œ ๋ถ„์„"):
581
+ with gr.Row():
582
+ with gr.Column():
583
+ doc_files = gr.File(
584
+ label="๋ฌธ์„œ ์—…๋กœ๋“œ",
585
+ file_count="multiple",
586
+ file_types=[".pdf", ".csv", ".txt"],
587
+ type="filepath"
588
+ )
589
+
590
+ doc_prompt = gr.Textbox(
591
+ label="๋ถ„์„ ์š”์ฒญ",
592
+ placeholder="์˜ˆ: ์ด ๋ฌธ์„œ๋“ค์˜ ํ•ต์‹ฌ ๋‚ด์šฉ์„ ์š”์•ฝํ•˜๊ณ  ๋น„๊ต ๋ถ„์„ํ•˜์„ธ์š”.",
593
+ lines=3
594
+ )
595
+
596
+ doc_web_search = gr.Checkbox(
597
+ label="๐Ÿ” ์›น ๊ฒ€์ƒ‰ ์‚ฌ์šฉ",
598
+ value=False
599
+ )
600
+
601
+ analyze_docs_btn = gr.Button("๐Ÿ“Š ๋ฌธ์„œ ๋ถ„์„", variant="primary")
602
+
603
+ with gr.Column():
604
+ doc_result = gr.Textbox(
605
+ label="๋ถ„์„ ๊ฒฐ๊ณผ",
606
+ lines=25,
607
+ max_lines=50
608
+ )
609
+
610
+ # ์ด๋ฒคํŠธ ํ•ธ๋“ค๋Ÿฌ
611
+ webcam_state = gr.State(None)
612
+
613
+ def capture_webcam(frame):
614
+ """์›น์บ  ํ”„๋ ˆ์ž„ ์บก์ฒ˜"""
615
+ if frame is None:
616
+ return None, None, '<div class="status-box" style="background:#f8d7da; color:#721c24;">โŒ ์›น์บ  ํ”„๋ ˆ์ž„ ์—†์Œ</div>'
617
+ return frame, gr.update(value=frame, visible=True), '<div class="status-box" style="background:#d4edda; color:#155724;">โœ… ์ด๋ฏธ์ง€ ์บก์ฒ˜ ์™„๋ฃŒ</div>'
618
+
619
+ def clear_capture():
620
+ """์บก์ฒ˜ ์ดˆ๊ธฐํ™”"""
621
+ return None, gr.update(visible=False), '<div class="status-box" style="background:#d4edda; color:#155724;">๐ŸŽฎ ์‹œ์Šคํ…œ ์ค€๋น„ ์™„๋ฃŒ</div>'
622
+
623
+ def analyze_with_task(image, prompt, task_type, use_search, thinking, tokens):
624
+ """ํŠน์ • ํƒœ์Šคํฌ๋กœ ์ด๋ฏธ์ง€ ๋ถ„์„"""
625
+ if image is None:
626
+ return "โŒ ๋จผ์ € ์ด๋ฏธ์ง€๋ฅผ ์บก์ฒ˜ํ•˜์„ธ์š”.", '<div class="status-box" style="background:#f8d7da; color:#721c24;">โŒ ์ด๋ฏธ์ง€ ์—†์Œ</div>'
627
+
628
+ status = f'<div class="status-box" style="background:#cce5ff; color:#004085;">๐Ÿš€ {task_type} ๋ถ„์„ ์ค‘...</div>'
629
+
630
+ result = analyze_image_for_robot(
631
+ image=image,
632
+ prompt=prompt,
633
+ task_type=task_type,
634
+ use_web_search=use_search,
635
+ enable_thinking=thinking,
636
+ max_new_tokens=tokens
637
+ )
638
+
639
+ # ๊ฒฐ๊ณผ ํฌ๋งทํŒ…
640
+ timestamp = time.strftime("%H:%M:%S")
641
+ task_names = {
642
+ "planning": "์ž‘์—… ๊ณ„ํš",
643
+ "grounding": "๊ฐ์ฒด ์œ„์น˜ ํŒŒ์•…",
644
+ "affordance": "ํŒŒ์ง€์  ๋ถ„์„",
645
+ "trajectory": "๊ฒฝ๋กœ ๊ณ„ํš"
646
+ }
647
+
648
+ formatted_result = f"""๐Ÿค– ๋กœ๋ด‡ {task_names.get(task_type, '๋ถ„์„')} ๊ฒฐ๊ณผ:
649
 
650
+ ๐Ÿ“ธ **์ž‘์—…**: {prompt}
651
 
652
+ ๐Ÿ“ **๋ถ„์„ ๊ฒฐ๊ณผ**:
653
+ {result}
654
 
655
+ โฐ ๋ถ„์„ ์‹œ๊ฐ„: {timestamp}
656
+ ๐ŸŽฏ ๋ชจ๋ธ: {model_name}
657
+ ๐Ÿ”ง ํƒœ์Šคํฌ: {task_type}"""
658
+
659
+ complete_status = '<div class="status-box" style="background:#d4edda; color:#155724;">โœ… ๋ถ„์„ ์™„๋ฃŒ!</div>'
660
+ return formatted_result, complete_status
661
+
662
+ # ์›น์บ  ์ŠคํŠธ๋ฆฌ๋ฐ
663
+ webcam.stream(
664
+ fn=lambda x: x,
665
+ inputs=[webcam],
666
+ outputs=[webcam_state]
667
  )
668
+
669
+ # ์บก์ฒ˜ ๋ฒ„ํŠผ
670
+ capture_btn.click(
671
+ fn=capture_webcam,
672
+ inputs=[webcam_state],
673
+ outputs=[webcam_state, captured_image, status_display]
674
  )
675
 
676
+ # ์ดˆ๊ธฐํ™” ๋ฒ„ํŠผ
677
+ clear_capture_btn.click(
678
+ fn=clear_capture,
679
+ outputs=[webcam_state, captured_image, status_display]
 
 
 
680
  )
681
 
682
+ # ์ž‘์—… ๋ฒ„ํŠผ๋“ค
683
+ planning_btn.click(
684
+ fn=lambda img, p, s, t, tk: analyze_with_task(img, p, "planning", s, t, tk),
685
+ inputs=[captured_image, task_prompt, use_web_search, enable_thinking, max_tokens],
686
+ outputs=[result_output, status_display]
687
  )
688
 
689
+ grounding_btn.click(
690
+ fn=lambda img, p, s, t, tk: analyze_with_task(img, p, "grounding", s, t, tk),
691
+ inputs=[captured_image, task_prompt, use_web_search, enable_thinking, max_tokens],
692
+ outputs=[result_output, status_display]
693
+ )
694
+
695
+ affordance_btn.click(
696
+ fn=lambda img, p, s, t, tk: analyze_with_task(img, p, "affordance", s, t, tk),
697
+ inputs=[captured_image, task_prompt, use_web_search, enable_thinking, max_tokens],
698
+ outputs=[result_output, status_display]
699
+ )
700
+
701
+ trajectory_btn.click(
702
+ fn=lambda img, p, s, t, tk: analyze_with_task(img, p, "trajectory", s, t, tk),
703
+ inputs=[captured_image, task_prompt, use_web_search, enable_thinking, max_tokens],
704
+ outputs=[result_output, status_display]
705
+ )
706
+
707
+ # ๋ฌธ์„œ ๋ถ„์„
708
+ def analyze_docs(files, prompt, use_search):
709
+ if not files:
710
+ return "โŒ ๋ฌธ์„œ๋ฅผ ์—…๋กœ๋“œํ•˜์„ธ์š”."
711
+
712
+ output = ""
713
+ for chunk in analyze_documents_streaming(files, prompt, use_search):
714
+ output = chunk
715
+ return output
716
+
717
+ analyze_docs_btn.click(
718
+ fn=analyze_docs,
719
+ inputs=[doc_files, doc_prompt, doc_web_search],
720
+ outputs=[doc_result]
721
+ )
722
+
723
+ # ์ดˆ๊ธฐ ๋ชจ๋ธ ๋กœ๋“œ
724
+ def initial_load():
725
+ load_model()
726
+ return "์‹œ์Šคํ…œ ์ค€๋น„ ์™„๋ฃŒ! ๐Ÿš€"
727
+
728
+ demo.load(
729
+ fn=initial_load,
730
+ outputs=None
731
  )
 
 
 
 
 
 
732
 
733
  if __name__ == "__main__":
734
+ print("๐Ÿš€ ๋กœ๋ด‡ ์‹œ๊ฐ ์‹œ์Šคํ…œ ์‹œ์ž‘ (Gemma3-R1984-4B)...")
735
+ demo.launch(
736
+ server_name="0.0.0.0",
737
+ server_port=7860,
738
+ share=False,
739
+ show_error=True,
740
+ debug=False
741
+ )