openfree committed on
Commit
02285b9
ยท
verified ยท
1 Parent(s): 59d9300

Create app-backup.py

Browse files
Files changed (1) hide show
  1. app-backup.py +741 -0
app-backup.py ADDED
@@ -0,0 +1,741 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #!/usr/bin/env python3
2
+
3
+ import os
4
+ import re
5
+ import tempfile
6
+ import gc
7
+ from collections.abc import Iterator
8
+ from threading import Thread
9
+ import json
10
+ import requests
11
+ import cv2
12
+ import gradio as gr
13
+ import spaces
14
+ import torch
15
+ import numpy as np
16
+ from loguru import logger
17
+ from PIL import Image
18
+ from transformers import AutoProcessor, Gemma3ForConditionalGeneration, TextIteratorStreamer
19
+ import time
20
+ import warnings
21
+ from typing import Dict, List, Optional, Union
22
+
23
+ # CSV/TXT ๋ถ„์„
24
+ import pandas as pd
25
+ # PDF ํ…์ŠคํŠธ ์ถ”์ถœ
26
+ import PyPDF2
27
+
28
+ warnings.filterwarnings('ignore')
29
+
30
# Startup banner (Korean, mojibake-encoded in this copy): "robot vision system initializing".
print("๐ŸŽฎ ๋กœ๋ด‡ ์‹œ๊ฐ ์‹œ์Šคํ…œ ์ดˆ๊ธฐํ™” (Gemma3-R1984-4B)...")

##############################################################################
# Constants
##############################################################################
MAX_CONTENT_CHARS = 2000   # max characters of extracted document text fed to the model
MAX_INPUT_LENGTH = 2096    # max prompt tokens kept after tokenization
                           # NOTE(review): 2096 looks like a typo for 2048 — confirm intended value
MAX_NUM_IMAGES = 5         # upper bound on images per request
                           # NOTE(review): not referenced in this file — presumably used elsewhere; verify
SERPHOUSE_API_KEY = os.getenv("SERPHOUSE_API_KEY", "")  # empty string if unset; web search will then fail auth

##############################################################################
# Global state (populated lazily by load_model)
##############################################################################
model = None          # Gemma3ForConditionalGeneration instance once loaded
processor = None      # AutoProcessor paired with the model
model_loaded = False  # guards against repeated loading
model_name = "Gemma3-R1984-4B"
47
+
48
+ ##############################################################################
49
+ # ๋ฉ”๋ชจ๋ฆฌ ๊ด€๋ฆฌ
50
+ ##############################################################################
51
def clear_cuda_cache():
    """Release cached GPU memory (when a CUDA device exists) and run the GC."""
    cuda_ready = torch.cuda.is_available()
    if cuda_ready:
        torch.cuda.empty_cache()
    # Always collect, even on CPU-only hosts, to drop dangling tensors.
    gc.collect()
56
+
57
+ ##############################################################################
58
+ # ํ‚ค์›Œ๋“œ ์ถ”์ถœ ํ•จ์ˆ˜
59
+ ##############################################################################
60
def extract_keywords(text: str, top_k: int = 5) -> str:
    """Extract up to *top_k* distinct keyword tokens from *text*.

    Strips everything except ASCII alphanumerics, Hangul syllables and
    whitespace, splits on whitespace, drops one-character tokens, and keeps
    the first occurrence of each remaining token in order.

    FIX: the character class previously contained a mojibake-corrupted copy
    of the Hangul range; decoded as-is it produced the invalid regex range
    U+20AC..U+0E4D ("bad character range", re.error at every call). Restored
    the intended range 가-힣.

    Args:
        text: free-form query/prompt text.
        top_k: maximum number of keywords to return.

    Returns:
        Space-joined keyword string (may be empty).
    """
    text = re.sub(r"[^a-zA-Z0-9가-힣\s]", "", text)
    tokens = text.split()

    seen = set()
    unique_tokens = []
    for token in tokens:
        # Keep only first occurrences of tokens longer than one character.
        if token not in seen and len(token) > 1:
            seen.add(token)
            unique_tokens.append(token)

    return " ".join(unique_tokens[:top_k])
74
+
75
+ ##############################################################################
76
+ # ์›น ๊ฒ€์ƒ‰ ํ•จ์ˆ˜
77
+ ##############################################################################
78
def do_web_search(query: str) -> str:
    """Search the web for *query* via the SerpHouse live-SERP API.

    Returns a markdown digest — an instructions header followed by up to 10
    organic results (title, snippet, source link) — or a "no results" /
    error message string. Never raises.
    """
    try:
        endpoint = "https://api.serphouse.com/serp/live"

        query_params = {
            "q": query,
            "domain": "google.com",
            "serp_type": "web",
            "device": "desktop",
            "lang": "ko",  # prefer Korean-language results
            "num": "10"  # cap the SERP at 10 entries
        }
        auth_headers = {
            "Authorization": f"Bearer {SERPHOUSE_API_KEY}"
        }

        logger.info(f"์›น ๊ฒ€์ƒ‰ ์ค‘... ๊ฒ€์ƒ‰์–ด: {query}")

        resp = requests.get(endpoint, headers=auth_headers, params=query_params, timeout=60)
        resp.raise_for_status()
        payload = resp.json()

        # Organic hits live under results.organic; guard against payloads
        # where "results" is not the expected dict.
        results = payload.get("results", {})
        organic = results.get("organic", []) if isinstance(results, dict) else []
        if not organic:
            return "๊ฒ€์ƒ‰ ๊ฒฐ๊ณผ๋ฅผ ์ฐพ์„ ์ˆ˜ ์—†์Šต๋‹ˆ๋‹ค."

        summary_lines = []
        for rank, hit in enumerate(organic[:10], start=1):
            title = hit.get("title", "์ œ๋ชฉ ์—†์Œ")
            link = hit.get("link", "#")
            snippet = hit.get("snippet", "์„ค๋ช… ์—†์Œ")
            shown_link = hit.get("displayed_link", link)
            summary_lines.append(
                f"### ๊ฒฐ๊ณผ {rank}: {title}\n\n"
                f"{snippet}\n\n"
                f"**์ถœ์ฒ˜**: [{shown_link}]({link})\n\n"
                f"---\n"
            )

        # LLM-facing preamble explaining how to use the results (Korean).
        instructions = """# ์›น ๊ฒ€์ƒ‰ ๊ฒฐ๊ณผ
์•„๋ž˜๋Š” ๊ฒ€์ƒ‰ ๊ฒฐ๊ณผ์ž…๋‹ˆ๋‹ค. ๋‹ต๋ณ€ ์‹œ ์ด ์ •๋ณด๋ฅผ ํ™œ์šฉํ•˜์„ธ์š”:
1. ๊ฐ ๊ฒฐ๊ณผ์˜ ์ œ๋ชฉ, ๋‚ด์šฉ, ์ถœ์ฒ˜ ๋งํฌ๋ฅผ ์ฐธ์กฐํ•˜์„ธ์š”
2. ๊ด€๋ จ ์ถœ์ฒ˜๋ฅผ ๋ช…์‹œ์ ์œผ๋กœ ์ธ์šฉํ•˜์„ธ์š”
3. ์—ฌ๋Ÿฌ ์ถœ์ฒ˜์˜ ์ •๋ณด๋ฅผ ์ข…ํ•ฉํ•˜์—ฌ ๋‹ต๋ณ€ํ•˜์„ธ์š”
"""

        return instructions + "\n".join(summary_lines)

    except Exception as e:
        logger.error(f"์›น ๊ฒ€์ƒ‰ ์‹คํŒจ: {e}")
        return f"์›น ๊ฒ€์ƒ‰ ์‹คํŒจ: {str(e)}"
139
+
140
+ ##############################################################################
141
+ # ๋ฌธ์„œ ์ฒ˜๋ฆฌ ํ•จ์ˆ˜
142
+ ##############################################################################
143
def analyze_csv_file(path: str) -> str:
    """Render a CSV file as a truncated text table for prompting.

    Caps at 50 rows x 10 columns and MAX_CONTENT_CHARS characters; returns
    an error message string on failure instead of raising.
    """
    try:
        frame = pd.read_csv(path)
        # iloc is a no-op on smaller frames, so slicing unconditionally is
        # equivalent to the old shape check.
        frame = frame.iloc[:50, :10]
        rendered = frame.to_string()
        if len(rendered) > MAX_CONTENT_CHARS:
            rendered = rendered[:MAX_CONTENT_CHARS] + "\n...(์ค‘๋žต)..."
        return f"**[CSV ํŒŒ์ผ: {os.path.basename(path)}]**\n\n{rendered}"
    except Exception as e:
        return f"CSV ์ฝ๊ธฐ ์‹คํŒจ ({os.path.basename(path)}): {str(e)}"
155
+
156
def analyze_txt_file(path: str) -> str:
    """Read a UTF-8 text file, truncated to MAX_CONTENT_CHARS characters.

    Returns an error message string on failure instead of raising.
    """
    try:
        with open(path, "r", encoding="utf-8") as handle:
            body = handle.read()
        if len(body) > MAX_CONTENT_CHARS:
            body = body[:MAX_CONTENT_CHARS] + "\n...(์ค‘๋žต)..."
        return f"**[TXT ํŒŒ์ผ: {os.path.basename(path)}]**\n\n{body}"
    except Exception as e:
        return f"TXT ์ฝ๊ธฐ ์‹คํŒจ ({os.path.basename(path)}): {str(e)}"
166
+
167
def pdf_to_markdown(pdf_path: str) -> str:
    """Extract up to the first 5 pages of a PDF as markdown-ish text.

    Each page gets a "## 페이지 n" header (Korean, mojibake in this copy);
    per-page and total output are truncated against MAX_CONTENT_CHARS.
    Returns an error message string on failure instead of raising.
    """
    chunks = []
    try:
        with open(pdf_path, "rb") as handle:
            reader = PyPDF2.PdfReader(handle)
            total = len(reader.pages)
            budget = min(5, total)
            for index in range(budget):
                body = (reader.pages[index].extract_text() or "").strip()
                if not body:
                    continue
                # Split the character budget evenly across rendered pages.
                per_page_limit = MAX_CONTENT_CHARS // budget
                if len(body) > per_page_limit:
                    body = body[:per_page_limit] + "...(์ค‘๋žต)"
                chunks.append(f"## ํŽ˜์ด์ง€ {index+1}\n\n{body}\n")
            if total > budget:
                chunks.append(f"\n...({budget}/{total} ํŽ˜์ด์ง€ ํ‘œ์‹œ)...")
    except Exception as e:
        return f"PDF ์ฝ๊ธฐ ์‹คํŒจ ({os.path.basename(pdf_path)}): {str(e)}"

    merged = "\n".join(chunks)
    if len(merged) > MAX_CONTENT_CHARS:
        merged = merged[:MAX_CONTENT_CHARS] + "\n...(์ค‘๋žต)..."

    return f"**[PDF ํŒŒ์ผ: {os.path.basename(pdf_path)}]**\n\n{merged}"
192
+
193
+ ##############################################################################
194
+ # ๋ชจ๋ธ ๋กœ๋“œ
195
+ ##############################################################################
196
@spaces.GPU(duration=120)
def load_model():
    """Lazily load the Gemma3 VLM and its processor into module globals.

    Runs on a ZeroGPU worker (up to 120 s) via the @spaces.GPU decorator.

    Returns:
        True on success or if the model is already loaded; False on failure.
    """
    global model, processor, model_loaded

    if model_loaded:
        logger.info("๋ชจ๋ธ์ด ์ด๋ฏธ ๋กœ๋“œ๋˜์–ด ์žˆ์Šต๋‹ˆ๋‹ค.")
        return True

    try:
        logger.info("Gemma3-R1984-4B ๋ชจ๋ธ ๋กœ๋”ฉ ์‹œ์ž‘...")
        clear_cuda_cache()

        # Checkpoint id is overridable via the MODEL_ID environment variable.
        model_id = os.getenv("MODEL_ID", "VIDraft/Gemma-3-R1984-4B")

        # padding_side="left" suits decoder-only generation batching.
        processor = AutoProcessor.from_pretrained(model_id, padding_side="left")
        model = Gemma3ForConditionalGeneration.from_pretrained(
            model_id,
            device_map="auto",
            torch_dtype=torch.bfloat16,
            # NOTE(review): eager attention is forced — presumably for Gemma3
            # numerical stability; confirm against the model card.
            attn_implementation="eager"
        )

        model_loaded = True
        logger.info(f"โœ… {model_name} ๋กœ๋”ฉ ์™„๋ฃŒ!")
        return True

    except Exception as e:
        # Swallow the error and report failure; callers surface a UI message.
        logger.error(f"๋ชจ๋ธ ๋กœ๋”ฉ ์‹คํŒจ: {e}")
        return False
225
+
226
+ ##############################################################################
227
+ # ์ด๋ฏธ์ง€ ๋ถ„์„ (๋กœ๋ด‡ ํƒœ์Šคํฌ ์ค‘์‹ฌ)
228
+ ##############################################################################
229
@spaces.GPU(duration=60)
def analyze_image_for_robot(
    image: Union[np.ndarray, Image.Image],
    prompt: str,
    task_type: str = "general",
    use_web_search: bool = False,
    enable_thinking: bool = True,
    max_new_tokens: int = 1024
) -> str:
    """Analyze an image for a robot task with the Gemma3 VLM.

    Args:
        image: webcam frame (numpy HxWxC — assumed RGB uint8, TODO confirm)
            or a PIL image.
        task_type: one of general/planning/grounding/affordance/trajectory/
            pointing; selects the system prompt. Unknown values fall back
            to "general".
        use_web_search: prepend SerpHouse search results to the system prompt.
        enable_thinking: ask the model for <thinking> chain-of-thought.
        max_new_tokens: generation budget.

    Returns:
        The model's decoded reply, or an error string (never raises).
    """
    global model, processor

    if not model_loaded:
        if not load_model():
            return "โŒ ๋ชจ๋ธ ๋กœ๋”ฉ ์‹คํŒจ"

    try:
        # Convert a numpy array to a PIL RGB image.
        if isinstance(image, np.ndarray):
            image = Image.fromarray(image).convert('RGB')

        # Task-specific system prompts (Korean; mojibake-encoded in this copy).
        system_prompts = {
            "general": "๋‹น์‹ ์€ ๋กœ๋ด‡ ์‹œ๊ฐ ์‹œ์Šคํ…œ์ž…๋‹ˆ๋‹ค. ์ด๋ฏธ์ง€๋ฅผ ์ž์„ธํžˆ ๋ถ„์„ํ•˜๊ณ  ์„ค๋ช…ํ•˜์„ธ์š”.",
            "planning": """๋‹น์‹ ์€ ๋กœ๋ด‡ ์ž‘์—… ๊ณ„ํš์„ ์ˆ˜๋ฆฝํ•˜๋Š” AI์ž…๋‹ˆ๋‹ค.
์ฃผ์–ด์ง„ ์ด๋ฏธ์ง€์™€ ์ž‘์—…์„ ๋ถ„์„ํ•˜์—ฌ ๋‹จ๊ณ„๋ณ„ ์‹คํ–‰ ๊ณ„ํš์„ ์ž‘์„ฑํ•˜์„ธ์š”.
ํ˜•์‹: Step_1: xxx\nStep_2: xxx\n...\nStep_n: xxx""",
            "grounding": "๋‹น์‹ ์€ ๊ฐ์ฒด ์œ„์น˜๋ฅผ ์ฐพ๋Š” ๋กœ๋ด‡ ์‹œ๊ฐ ์‹œ์Šคํ…œ์ž…๋‹ˆ๋‹ค. ์š”์ฒญ๋œ ๊ฐ์ฒด์˜ ์œ„์น˜๋ฅผ [x1, y1, x2, y2] ์ขŒํ‘œ๋กœ ๋ฐ˜ํ™˜ํ•˜์„ธ์š”.",
            "affordance": "๋‹น์‹ ์€ ๋กœ๋ด‡ ํŒŒ์ง€์ ์„ ๋ถ„์„ํ•˜๋Š” AI์ž…๋‹ˆ๋‹ค. ์ฃผ์–ด์ง„ ์ž‘์—…์„ ์œ„ํ•œ ์ตœ์ ์˜ ํŒŒ์ง€ ์˜์—ญ์„ [x1, y1, x2, y2] ์ขŒํ‘œ๋กœ ์˜ˆ์ธกํ•˜์„ธ์š”.",
            "trajectory": "๋‹น์‹ ์€ ๋กœ๋ด‡ ๊ฒฝ๋กœ๋ฅผ ๊ณ„ํšํ•˜๋Š” AI์ž…๋‹ˆ๋‹ค. ๋ชฉํ‘œ ์ง€์ ๊นŒ์ง€์˜ ๊ฒฝ๋กœ๋ฅผ [(x1,y1), (x2,y2), ...] ํ˜•์‹์œผ๋กœ ์ œ์‹œํ•˜์„ธ์š”.",
            "pointing": "๋‹น์‹ ์€ ๋‹ค์ค‘ ์ง€์ ์„ ์ง€์ •ํ•˜๋Š” ๋กœ๋ด‡ ์‹œ๊ฐ ์‹œ์Šคํ…œ์ž…๋‹ˆ๋‹ค. ์š”์ฒญ๋œ ์œ„์น˜๋“ค์„ [(x1,y1), (x2,y2), ...] ํ˜•์‹์œผ๋กœ ๋ฐ˜ํ™˜ํ•˜์„ธ์š”."
        }

        system_prompt = system_prompts.get(task_type, system_prompts["general"])

        # Optionally request explicit chain-of-thought in <thinking> tags.
        if enable_thinking:
            system_prompt += "\n\n์ถ”๋ก  ๊ณผ์ •์„ <thinking></thinking> ํƒœ๊ทธ ์•ˆ์— ์ƒ์„ธํžˆ ์ž‘์„ฑํ•œ ํ›„ ์ตœ์ข… ๋‹ต๋ณ€์„ ์ œ์‹œํ•˜์„ธ์š”."

        # Optionally prepend web-search results to the system prompt.
        combined_system = system_prompt
        if use_web_search:
            keywords = extract_keywords(prompt, top_k=5)
            if keywords:
                logger.info(f"์›น ๊ฒ€์ƒ‰ ํ‚ค์›Œ๋“œ: {keywords}")
                search_results = do_web_search(keywords)
                combined_system = f"{search_results}\n\n{system_prompt}"

        # Build the multimodal chat messages.
        messages = [
            {
                "role": "system",
                "content": [{"type": "text", "text": combined_system}]
            },
            {
                "role": "user",
                "content": [
                    # NOTE(review): a PIL image is passed under the "url" key —
                    # presumably accepted by this transformers version; confirm.
                    {"type": "image", "url": image},
                    {"type": "text", "text": prompt}
                ]
            }
        ]

        # Tokenize via the chat template and move to the model's device.
        inputs = processor.apply_chat_template(
            messages,
            add_generation_prompt=True,
            tokenize=True,
            return_dict=True,
            return_tensors="pt",
        ).to(device=model.device, dtype=torch.bfloat16)

        # Keep only the trailing MAX_INPUT_LENGTH tokens if the prompt is long.
        if inputs.input_ids.shape[1] > MAX_INPUT_LENGTH:
            inputs.input_ids = inputs.input_ids[:, -MAX_INPUT_LENGTH:]
            if 'attention_mask' in inputs:
                inputs.attention_mask = inputs.attention_mask[:, -MAX_INPUT_LENGTH:]

        # Sampled generation (temperature 0.7, nucleus 0.9).
        with torch.no_grad():
            outputs = model.generate(
                **inputs,
                max_new_tokens=max_new_tokens,
                do_sample=True,
                temperature=0.7,
                top_p=0.9,
            )

        # Decode the full sequence (prompt + completion).
        response = processor.decode(outputs[0], skip_special_tokens=True)

        # Strip everything up to the last "Assistant:" marker, if present.
        if "Assistant:" in response:
            response = response.split("Assistant:")[-1].strip()

        return response

    except Exception as e:
        logger.error(f"์ด๋ฏธ์ง€ ๋ถ„์„ ์˜ค๋ฅ˜: {e}")
        import traceback
        return f"โŒ ๋ถ„์„ ์˜ค๋ฅ˜: {str(e)}\n{traceback.format_exc()}"
    finally:
        clear_cuda_cache()
332
+
333
+ ##############################################################################
334
+ # ๋ฌธ์„œ ๋ถ„์„ (์ŠคํŠธ๋ฆฌ๋ฐ)
335
+ ##############################################################################
336
def _model_gen_with_oom_catch(**kwargs):
    """Drive model.generate in a worker thread, mapping CUDA OOM to RuntimeError."""
    global model
    try:
        model.generate(**kwargs)
    except torch.cuda.OutOfMemoryError:
        # User-facing hint (Korean): reduce Max Tokens.
        message = "GPU ๋ฉ”๋ชจ๋ฆฌ ๋ถ€์กฑ. Max Tokens๋ฅผ ์ค„์—ฌ์ฃผ์„ธ์š”."
        raise RuntimeError(message)
    finally:
        # Always release GPU memory, success or failure.
        clear_cuda_cache()
345
+
346
@spaces.GPU(duration=120)
def analyze_documents_streaming(
    files: List[str],
    prompt: str,
    use_web_search: bool = False,
    max_new_tokens: int = 2048
) -> Iterator[str]:
    """Analyze uploaded documents and stream the model's answer.

    Extracts text from each .csv/.txt/.pdf path (other extensions are
    skipped), prepends it to *prompt*, and yields the growing response
    text as tokens arrive.

    Yields:
        Progressively longer response strings (each yield is the full
        text so far), or a single error message.
    """
    global model, processor

    if not model_loaded:
        if not load_model():
            yield "โŒ ๋ชจ๋ธ ๋กœ๋”ฉ ์‹คํŒจ"
            return

    try:
        # System prompt (Korean): "You are an expert document analysis/summary AI."
        system_content = "๋‹น์‹ ์€ ๋ฌธ์„œ๋ฅผ ๋ถ„์„ํ•˜๊ณ  ์š”์•ฝํ•˜๋Š” ์ „๋ฌธ AI์ž…๋‹ˆ๋‹ค."

        # Optionally prepend web-search results.
        if use_web_search:
            keywords = extract_keywords(prompt, top_k=5)
            if keywords:
                search_results = do_web_search(keywords)
                system_content = f"{search_results}\n\n{system_content}"

        # Extract text from each supported document type.
        doc_contents = []
        for file_path in files:
            if file_path.lower().endswith('.csv'):
                content = analyze_csv_file(file_path)
            elif file_path.lower().endswith('.txt'):
                content = analyze_txt_file(file_path)
            elif file_path.lower().endswith('.pdf'):
                content = pdf_to_markdown(file_path)
            else:
                # Unsupported extensions are silently skipped.
                continue
            doc_contents.append(content)

        # Build the chat messages: documents first, then the user's request.
        messages = [
            {
                "role": "system",
                "content": [{"type": "text", "text": system_content}]
            },
            {
                "role": "user",
                "content": [
                    {"type": "text", "text": "\n\n".join(doc_contents) + f"\n\n{prompt}"}
                ]
            }
        ]

        # Tokenize via the chat template and move to the model's device.
        inputs = processor.apply_chat_template(
            messages,
            add_generation_prompt=True,
            tokenize=True,
            return_dict=True,
            return_tensors="pt",
        ).to(device=model.device, dtype=torch.bfloat16)

        # Stream decoded tokens; 30 s timeout guards against a stalled worker.
        streamer = TextIteratorStreamer(processor, timeout=30.0, skip_prompt=True, skip_special_tokens=True)
        # NOTE(review): temperature/top_p without do_sample=True are ignored
        # (greedy decoding) by transformers — confirm whether sampling was intended.
        gen_kwargs = dict(
            inputs,
            streamer=streamer,
            max_new_tokens=max_new_tokens,
            temperature=0.8,
            top_p=0.9,
        )

        # Generate on a separate thread so this generator can yield.
        t = Thread(target=_model_gen_with_oom_catch, kwargs=gen_kwargs)
        t.start()

        # Re-yield the accumulated text on every new chunk.
        output = ""
        for new_text in streamer:
            output += new_text
            yield output

    except Exception as e:
        logger.error(f"๋ฌธ์„œ ๋ถ„์„ ์˜ค๋ฅ˜: {e}")
        yield f"โŒ ์˜ค๋ฅ˜ ๋ฐœ์ƒ: {str(e)}"
    finally:
        clear_cuda_cache()
433
+
434
+ ##############################################################################
435
+ # Gradio UI (๋กœ๋ด‡ ์‹œ๊ฐํ™” ์ค‘์‹ฌ)
436
+ ##############################################################################
437
# Inline stylesheet for the Gradio app: gradient header card, status pill,
# info panel, enlarged task buttons, and a framed webcam container.
css = """
.robot-header {
    text-align: center;
    background: linear-gradient(135deg, #1e3c72 0%, #2a5298 50%, #667eea 100%);
    color: white;
    padding: 20px;
    border-radius: 10px;
    margin-bottom: 20px;
    box-shadow: 0 4px 6px rgba(0, 0, 0, 0.1);
}
.status-box {
    text-align: center;
    padding: 10px;
    border-radius: 5px;
    margin: 10px 0;
    font-weight: bold;
}
.info-box {
    background: #f0f0f0;
    padding: 15px;
    border-radius: 8px;
    margin: 10px 0;
    border-left: 4px solid #2a5298;
}
.task-button {
    min-height: 60px;
    font-size: 1.1em;
}
.webcam-container {
    border: 3px solid #2a5298;
    border-radius: 10px;
    padding: 10px;
    background: #f8f9fa;
}
"""
472
+
473
# Gradio UI. All user-visible labels/messages are Korean text (mojibake-encoded
# in this copy) and are deliberately left byte-for-byte unchanged.
with gr.Blocks(title="๐Ÿค– ๋กœ๋ด‡ ์‹œ๊ฐ ์‹œ์Šคํ…œ (Gemma3-4B)", css=css) as demo:
    # Header banner.
    gr.HTML("""
    <div class="robot-header">
        <h1>๐Ÿค– ๋กœ๋ด‡ ์‹œ๊ฐ ์‹œ์Šคํ…œ</h1>
        <h3>๐ŸŽฎ Gemma3-R1984-4B + ๐Ÿ“ท ์‹ค์‹œ๊ฐ„ ์›น์บ  + ๐Ÿ” ์›น ๊ฒ€์ƒ‰</h3>
        <p>โšก ์ตœ์‹  ๋ฉ€ํ‹ฐ๋ชจ๋‹ฌ AI๋กœ ๋กœ๋ด‡ ์ž‘์—… ๋ถ„์„ ๋ฐ ๊ณ„ํš ์ˆ˜๋ฆฝ!</p>
    </div>
    """)

    # Feature overview card.
    gr.HTML("""
    <div class="info-box">
        <h4>๐ŸŒŸ ์‹œ์Šคํ…œ ํŠน์ง•:</h4>
        <ul>
            <li>๐Ÿ–ผ๏ธ ๊ณ ๊ธ‰ ์ด๋ฏธ์ง€/๋น„๋””์˜ค ๋ถ„์„ (Gemma3-4B VLM)</li>
            <li>๐Ÿ“‹ ๋‹ค๋‹จ๊ณ„ ์ž‘์—… ๊ณ„ํš ๋ฐ ์ถ”๋ก </li>
            <li>๐Ÿ“ ์ •๋ฐ€ํ•œ ๊ฐ์ฒด ์œ„์น˜ ํŒŒ์•… (Grounding)</li>
            <li>๐Ÿค ๋กœ๋ด‡ ํŒŒ์ง€์  ๋ถ„์„ (Affordance)</li>
            <li>๐Ÿ›ค๏ธ ๊ฒฝ๋กœ ๊ณ„ํš (Trajectory Planning)</li>
            <li>๐Ÿ” ์‹ค์‹œ๊ฐ„ ์›น ๊ฒ€์ƒ‰ ํ†ตํ•ฉ</li>
            <li>๐Ÿ“„ ๋ฌธ์„œ ๋ถ„์„ (PDF, CSV, TXT)</li>
        </ul>
    </div>
    """)

    with gr.Row():
        # Left column: webcam and task buttons.
        with gr.Column(scale=1):
            gr.Markdown("### ๐Ÿ“ท ์‹ค์‹œ๊ฐ„ ์›น์บ ")

            with gr.Group(elem_classes="webcam-container"):
                webcam = gr.Image(
                    sources=["webcam"],
                    streaming=True,
                    type="numpy",
                    label="์‹ค์‹œ๊ฐ„ ์ŠคํŠธ๋ฆฌ๋ฐ",
                    height=350
                )

            # Snapshot display (hidden until a capture exists).
            captured_image = gr.Image(
                label="์บก์ฒ˜๋œ ์ด๋ฏธ์ง€",
                height=200,
                visible=False
            )

            # Robot task buttons.
            gr.Markdown("### ๐ŸŽฏ ๋กœ๋ด‡ ์ž‘์—… ์„ ํƒ")
            with gr.Row():
                capture_btn = gr.Button("๐Ÿ“ธ ์บก์ฒ˜", variant="primary", elem_classes="task-button")
                clear_capture_btn = gr.Button("๐Ÿ—‘๏ธ ์ดˆ๊ธฐํ™”", elem_classes="task-button")

            with gr.Row():
                planning_btn = gr.Button("๐Ÿ“‹ ์ž‘์—… ๊ณ„ํš", elem_classes="task-button")
                grounding_btn = gr.Button("๐Ÿ“ ๊ฐ์ฒด ์œ„์น˜", elem_classes="task-button")

            with gr.Row():
                affordance_btn = gr.Button("๐Ÿค ํŒŒ์ง€์  ๋ถ„์„", elem_classes="task-button")
                trajectory_btn = gr.Button("๐Ÿ›ค๏ธ ๊ฒฝ๋กœ ๊ณ„ํš", elem_classes="task-button")

        # Right column: analysis settings and results.
        with gr.Column(scale=2):
            gr.Markdown("### โš™๏ธ ๋ถ„์„ ์„ค์ •")

            with gr.Row():
                with gr.Column():
                    task_prompt = gr.Textbox(
                        label="์ž‘์—… ์„ค๋ช… / ์งˆ๋ฌธ",
                        placeholder="์˜ˆ: ํ…Œ์ด๋ธ” ์œ„์˜ ์ปต์„ ์žก์•„์„œ ์‹ฑํฌ๋Œ€์— ๋†“๊ธฐ",
                        value="์ด ์žฅ๋ฉด์—์„œ ๋กœ๋ด‡์ด ์ˆ˜ํ–‰ํ•  ์ˆ˜ ์žˆ๋Š” ์ž‘์—…์„ ๋ถ„์„ํ•˜์„ธ์š”.",
                        lines=2
                    )

                    with gr.Row():
                        use_web_search = gr.Checkbox(
                            label="๐Ÿ” ์›น ๊ฒ€์ƒ‰ ์‚ฌ์šฉ",
                            value=False,
                            info="๊ด€๋ จ ์ •๋ณด๋ฅผ ์›น์—์„œ ๊ฒ€์ƒ‰ํ•ฉ๋‹ˆ๋‹ค"
                        )

                        enable_thinking = gr.Checkbox(
                            label="๐Ÿค” ์ถ”๋ก  ๊ณผ์ • ํ‘œ์‹œ",
                            value=True,
                            info="Chain-of-Thought ์ถ”๋ก  ๊ณผ์ •์„ ๋ณด์—ฌ์ค๋‹ˆ๋‹ค"
                        )

                    max_tokens = gr.Slider(
                        label="์ตœ๋Œ€ ํ† ํฐ ์ˆ˜",
                        minimum=256,
                        maximum=4096,
                        value=1024,
                        step=256
                    )

            gr.Markdown("### ๐Ÿ“Š ๋ถ„์„ ๊ฒฐ๊ณผ")
            result_output = gr.Textbox(
                label="AI ๋ถ„์„ ๊ฒฐ๊ณผ",
                lines=20,
                max_lines=40,
                show_copy_button=True,
                elem_id="result"
            )

            status_display = gr.HTML(
                '<div class="status-box" style="background:#d4edda; color:#155724;">๐ŸŽฎ ์‹œ์Šคํ…œ ์ค€๋น„ ์™„๋ฃŒ</div>'
            )

    # Document analysis tab.
    with gr.Tab("๐Ÿ“„ ๋ฌธ์„œ ๋ถ„์„"):
        with gr.Row():
            with gr.Column():
                doc_files = gr.File(
                    label="๋ฌธ์„œ ์—…๋กœ๋“œ",
                    file_count="multiple",
                    file_types=[".pdf", ".csv", ".txt"],
                    type="filepath"
                )

                doc_prompt = gr.Textbox(
                    label="๋ถ„์„ ์š”์ฒญ",
                    placeholder="์˜ˆ: ์ด ๋ฌธ์„œ๋“ค์˜ ํ•ต์‹ฌ ๋‚ด์šฉ์„ ์š”์•ฝํ•˜๊ณ  ๋น„๊ต ๋ถ„์„ํ•˜์„ธ์š”.",
                    lines=3
                )

                doc_web_search = gr.Checkbox(
                    label="๐Ÿ” ์›น ๊ฒ€์ƒ‰ ์‚ฌ์šฉ",
                    value=False
                )

                analyze_docs_btn = gr.Button("๐Ÿ“Š ๋ฌธ์„œ ๋ถ„์„", variant="primary")

            with gr.Column():
                doc_result = gr.Textbox(
                    label="๋ถ„์„ ๊ฒฐ๊ณผ",
                    lines=25,
                    max_lines=50
                )

    # Event handlers.
    webcam_state = gr.State(None)

    def capture_webcam(frame):
        """Snapshot the latest webcam frame into the captured-image slot."""
        if frame is None:
            return None, None, '<div class="status-box" style="background:#f8d7da; color:#721c24;">โŒ ์›น์บ  ํ”„๋ ˆ์ž„ ์—†์Œ</div>'
        return frame, gr.update(value=frame, visible=True), '<div class="status-box" style="background:#d4edda; color:#155724;">โœ… ์ด๋ฏธ์ง€ ์บก์ฒ˜ ์™„๋ฃŒ</div>'

    def clear_capture():
        """Reset the captured frame and hide the capture preview."""
        return None, gr.update(visible=False), '<div class="status-box" style="background:#d4edda; color:#155724;">๐ŸŽฎ ์‹œ์Šคํ…œ ์ค€๋น„ ์™„๋ฃŒ</div>'

    def analyze_with_task(image, prompt, task_type, use_search, thinking, tokens):
        """Run analyze_image_for_robot for one task type and format the report."""
        if image is None:
            return "โŒ ๋จผ์ € ์ด๋ฏธ์ง€๋ฅผ ์บก์ฒ˜ํ•˜์„ธ์š”.", '<div class="status-box" style="background:#f8d7da; color:#721c24;">โŒ ์ด๋ฏธ์ง€ ์—†์Œ</div>'

        # NOTE(review): this "analyzing..." status is built but never displayed —
        # only the returned complete_status reaches status_display. Confirm.
        status = f'<div class="status-box" style="background:#cce5ff; color:#004085;">๐Ÿš€ {task_type} ๋ถ„์„ ์ค‘...</div>'

        result = analyze_image_for_robot(
            image=image,
            prompt=prompt,
            task_type=task_type,
            use_web_search=use_search,
            enable_thinking=thinking,
            max_new_tokens=tokens
        )

        # Format the final report with a timestamp and task metadata.
        timestamp = time.strftime("%H:%M:%S")
        task_names = {
            "planning": "์ž‘์—… ๊ณ„ํš",
            "grounding": "๊ฐ์ฒด ์œ„์น˜ ํŒŒ์•…",
            "affordance": "ํŒŒ์ง€์  ๋ถ„์„",
            "trajectory": "๊ฒฝ๋กœ ๊ณ„ํš"
        }

        formatted_result = f"""๐Ÿค– ๋กœ๋ด‡ {task_names.get(task_type, '๋ถ„์„')} ๊ฒฐ๊ณผ:

๐Ÿ“ธ **์ž‘์—…**: {prompt}

๐Ÿ“ **๋ถ„์„ ๊ฒฐ๊ณผ**:
{result}

โฐ ๋ถ„์„ ์‹œ๊ฐ„: {timestamp}
๐ŸŽฏ ๋ชจ๋ธ: {model_name}
๐Ÿ”ง ํƒœ์Šคํฌ: {task_type}"""

        complete_status = '<div class="status-box" style="background:#d4edda; color:#155724;">โœ… ๋ถ„์„ ์™„๋ฃŒ!</div>'
        return formatted_result, complete_status

    # Mirror each streamed webcam frame into webcam_state so the capture
    # button always sees the most recent frame.
    webcam.stream(
        fn=lambda x: x,
        inputs=[webcam],
        outputs=[webcam_state]
    )

    # Capture button.
    capture_btn.click(
        fn=capture_webcam,
        inputs=[webcam_state],
        outputs=[webcam_state, captured_image, status_display]
    )

    # Reset button.
    clear_capture_btn.click(
        fn=clear_capture,
        outputs=[webcam_state, captured_image, status_display]
    )

    # Task buttons: same pipeline, different task_type.
    planning_btn.click(
        fn=lambda img, p, s, t, tk: analyze_with_task(img, p, "planning", s, t, tk),
        inputs=[captured_image, task_prompt, use_web_search, enable_thinking, max_tokens],
        outputs=[result_output, status_display]
    )

    grounding_btn.click(
        fn=lambda img, p, s, t, tk: analyze_with_task(img, p, "grounding", s, t, tk),
        inputs=[captured_image, task_prompt, use_web_search, enable_thinking, max_tokens],
        outputs=[result_output, status_display]
    )

    affordance_btn.click(
        fn=lambda img, p, s, t, tk: analyze_with_task(img, p, "affordance", s, t, tk),
        inputs=[captured_image, task_prompt, use_web_search, enable_thinking, max_tokens],
        outputs=[result_output, status_display]
    )

    trajectory_btn.click(
        fn=lambda img, p, s, t, tk: analyze_with_task(img, p, "trajectory", s, t, tk),
        inputs=[captured_image, task_prompt, use_web_search, enable_thinking, max_tokens],
        outputs=[result_output, status_display]
    )

    # Document analysis: drain the streaming generator and return the final text
    # (intermediate chunks are overwritten, not streamed to the UI).
    def analyze_docs(files, prompt, use_search):
        if not files:
            return "โŒ ๋ฌธ์„œ๋ฅผ ์—…๋กœ๋“œํ•˜์„ธ์š”."

        output = ""
        for chunk in analyze_documents_streaming(files, prompt, use_search):
            output = chunk
        return output

    analyze_docs_btn.click(
        fn=analyze_docs,
        inputs=[doc_files, doc_prompt, doc_web_search],
        outputs=[doc_result]
    )

    # Warm-load the model when the page opens.
    def initial_load():
        load_model()
        # NOTE(review): this return value is discarded because demo.load is
        # wired with outputs=None — confirm that is intentional.
        return "์‹œ์Šคํ…œ ์ค€๋น„ ์™„๋ฃŒ! ๐Ÿš€"

    demo.load(
        fn=initial_load,
        outputs=None
    )
732
+
733
# Script entry point: launch the Gradio server on all interfaces at port 7860
# (the Hugging Face Spaces default).
if __name__ == "__main__":
    print("๐Ÿš€ ๋กœ๋ด‡ ์‹œ๊ฐ ์‹œ์Šคํ…œ ์‹œ์ž‘ (Gemma3-R1984-4B)...")
    demo.launch(
        server_name="0.0.0.0",  # listen on all interfaces
        server_port=7860,
        share=False,            # no public share link
        show_error=True,        # surface exceptions in the UI
        debug=False
    )