openfree committed on
Commit
fd897d3
·
verified ·
1 Parent(s): 0995092

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +546 -654
app.py CHANGED
@@ -1,9 +1,9 @@
1
- #!/usr/bin/env python
2
 
3
  import os
4
  import re
5
  import tempfile
6
- import gc # garbage collector ์ถ”๊ฐ€
7
  from collections.abc import Iterator
8
  from threading import Thread
9
  import json
@@ -12,17 +12,41 @@ import cv2
12
  import gradio as gr
13
  import spaces
14
  import torch
 
15
  from loguru import logger
16
  from PIL import Image
17
  from transformers import AutoProcessor, Gemma3ForConditionalGeneration, TextIteratorStreamer
 
 
 
18
 
19
  # CSV/TXT ๋ถ„์„
20
  import pandas as pd
21
  # PDF ํ…์ŠคํŠธ ์ถ”์ถœ
22
  import PyPDF2
23
 
 
 
 
 
 
 
 
 
 
 
 
 
24
  ##############################################################################
25
- # ๋ฉ”๋ชจ๋ฆฌ ์ •๋ฆฌ ํ•จ์ˆ˜ ์ถ”๊ฐ€
 
 
 
 
 
 
 
 
26
  ##############################################################################
27
  def clear_cuda_cache():
28
  """CUDA ์บ์‹œ๋ฅผ ๋ช…์‹œ์ ์œผ๋กœ ๋น„์›๋‹ˆ๋‹ค."""
@@ -31,177 +55,117 @@ def clear_cuda_cache():
31
  gc.collect()
32
 
33
  ##############################################################################
34
- # SERPHouse API key from environment variable
35
- ##############################################################################
36
- SERPHOUSE_API_KEY = os.getenv("SERPHOUSE_API_KEY", "")
37
-
38
- ##############################################################################
39
- # ๊ฐ„๋‹จํ•œ ํ‚ค์›Œ๋“œ ์ถ”์ถœ ํ•จ์ˆ˜ (ํ•œ๊ธ€ + ์•ŒํŒŒ๋ฒณ + ์ˆซ์ž + ๊ณต๋ฐฑ ๋ณด์กด)
40
  ##############################################################################
41
  def extract_keywords(text: str, top_k: int = 5) -> str:
42
- """
43
- 1) ํ•œ๊ธ€(๊ฐ€-ํžฃ), ์˜์–ด(a-zA-Z), ์ˆซ์ž(0-9), ๊ณต๋ฐฑ๋งŒ ๋‚จ๊น€
44
- 2) ๊ณต๋ฐฑ ๊ธฐ์ค€ ํ† ํฐ ๋ถ„๋ฆฌ
45
- 3) ์ตœ๋Œ€ top_k๊ฐœ๋งŒ
46
- """
47
  text = re.sub(r"[^a-zA-Z0-9๊ฐ€-ํžฃ\s]", "", text)
48
  tokens = text.split()
49
- key_tokens = tokens[:top_k]
 
 
 
 
 
 
 
 
50
  return " ".join(key_tokens)
51
 
52
  ##############################################################################
53
- # SerpHouse Live endpoint ํ˜ธ์ถœ
54
- # - ์ƒ์œ„ 20๊ฐœ ๊ฒฐ๊ณผ JSON์„ LLM์— ๋„˜๊ธธ ๋•Œ link, snippet ๋“ฑ ๋ชจ๋‘ ํฌํ•จ
55
  ##############################################################################
56
  def do_web_search(query: str) -> str:
57
- """
58
- ์ƒ์œ„ 20๊ฐœ 'organic' ๊ฒฐ๊ณผ item ์ „์ฒด(์ œ๋ชฉ, link, snippet ๋“ฑ)๋ฅผ
59
- JSON ๋ฌธ์ž์—ด ํ˜•ํƒœ๋กœ ๋ฐ˜ํ™˜
60
- """
61
  try:
62
  url = "https://api.serphouse.com/serp/live"
63
 
64
- # ๊ธฐ๋ณธ GET ๋ฐฉ์‹์œผ๋กœ ํŒŒ๋ผ๋ฏธํ„ฐ ๊ฐ„์†Œํ™”ํ•˜๊ณ  ๊ฒฐ๊ณผ ์ˆ˜๋ฅผ 20๊ฐœ๋กœ ์ œํ•œ
65
  params = {
66
  "q": query,
67
  "domain": "google.com",
68
- "serp_type": "web", # ๊ธฐ๋ณธ ์›น ๊ฒ€์ƒ‰
69
  "device": "desktop",
70
- "lang": "en",
71
- "num": "20" # ์ตœ๋Œ€ 20๊ฐœ ๊ฒฐ๊ณผ๋งŒ ์š”์ฒญ
72
  }
73
 
74
  headers = {
75
  "Authorization": f"Bearer {SERPHOUSE_API_KEY}"
76
  }
77
 
78
- logger.info(f"SerpHouse API ํ˜ธ์ถœ ์ค‘... ๊ฒ€์ƒ‰์–ด: {query}")
79
- logger.info(f"์š”์ฒญ URL: {url} - ํŒŒ๋ผ๋ฏธํ„ฐ: {params}")
80
 
81
- # GET ์š”์ฒญ ์ˆ˜ํ–‰
82
  response = requests.get(url, headers=headers, params=params, timeout=60)
83
  response.raise_for_status()
84
 
85
- logger.info(f"SerpHouse API ์‘๋‹ต ์ƒํƒœ ์ฝ”๋“œ: {response.status_code}")
86
  data = response.json()
87
 
88
- # ๋‹ค์–‘ํ•œ ์‘๋‹ต ๊ตฌ์กฐ ์ฒ˜๋ฆฌ
89
  results = data.get("results", {})
90
- organic = None
91
-
92
- # ๊ฐ€๋Šฅํ•œ ์‘๋‹ต ๊ตฌ์กฐ 1
93
- if isinstance(results, dict) and "organic" in results:
94
- organic = results["organic"]
95
-
96
- # ๊ฐ€๋Šฅํ•œ ์‘๋‹ต ๊ตฌ์กฐ 2 (์ค‘์ฒฉ๋œ results)
97
- elif isinstance(results, dict) and "results" in results:
98
- if isinstance(results["results"], dict) and "organic" in results["results"]:
99
- organic = results["results"]["organic"]
100
 
101
- # ๊ฐ€๋Šฅํ•œ ์‘๋‹ต ๊ตฌ์กฐ 3 (์ตœ์ƒ์œ„ organic)
102
- elif "organic" in data:
103
- organic = data["organic"]
104
-
105
  if not organic:
106
- logger.warning("์‘๋‹ต์—์„œ organic ๊ฒฐ๊ณผ๋ฅผ ์ฐพ์„ ์ˆ˜ ์—†์Šต๋‹ˆ๋‹ค.")
107
- logger.debug(f"์‘๋‹ต ๊ตฌ์กฐ: {list(data.keys())}")
108
- if isinstance(results, dict):
109
- logger.debug(f"results ๊ตฌ์กฐ: {list(results.keys())}")
110
- return "No web search results found or unexpected API response structure."
111
-
112
- # ๊ฒฐ๊ณผ ์ˆ˜ ์ œํ•œ ๋ฐ ์ปจํ…์ŠคํŠธ ๊ธธ์ด ์ตœ์ ํ™”
113
- max_results = min(20, len(organic))
114
  limited_organic = organic[:max_results]
115
 
116
- # ๊ฒฐ๊ณผ ํ˜•์‹ ๊ฐœ์„  - ๋งˆํฌ๋‹ค์šด ํ˜•์‹์œผ๋กœ ์ถœ๋ ฅํ•˜์—ฌ ๊ฐ€๋…์„ฑ ํ–ฅ์ƒ
117
  summary_lines = []
118
  for idx, item in enumerate(limited_organic, start=1):
119
- title = item.get("title", "No title")
120
  link = item.get("link", "#")
121
- snippet = item.get("snippet", "No description")
122
  displayed_link = item.get("displayed_link", link)
123
 
124
- # ๋งˆํฌ๋‹ค์šด ํ˜•์‹ (๋งํฌ ํด๋ฆญ ๊ฐ€๋Šฅ)
125
  summary_lines.append(
126
- f"### Result {idx}: {title}\n\n"
127
  f"{snippet}\n\n"
128
  f"**์ถœ์ฒ˜**: [{displayed_link}]({link})\n\n"
129
  f"---\n"
130
  )
131
 
132
- # ๋ชจ๋ธ์—๊ฒŒ ๋ช…ํ™•ํ•œ ์ง€์นจ ์ถ”๊ฐ€
133
- instructions = """
134
- # ์›น ๊ฒ€์ƒ‰ ๊ฒฐ๊ณผ
135
- ์•„๋ž˜๋Š” ๊ฒ€์ƒ‰ ๊ฒฐ๊ณผ์ž…๋‹ˆ๋‹ค. ์งˆ๋ฌธ์— ๋‹ต๋ณ€ํ•  ๋•Œ ์ด ์ •๋ณด๋ฅผ ํ™œ์šฉํ•˜์„ธ์š”:
136
- 1. ๊ฐ ๊ฒฐ๊ณผ์˜ ์ œ๋ชฉ, ๋‚ด์šฉ, ์ถœ์ฒ˜ ๋งํฌ๋ฅผ ์ฐธ๊ณ ํ•˜์„ธ์š”
137
- 2. ๋‹ต๋ณ€์— ๊ด€๋ จ ์ •๋ณด์˜ ์ถœ์ฒ˜๋ฅผ ๋ช…์‹œ์ ์œผ๋กœ ์ธ์šฉํ•˜์„ธ์š” (์˜ˆ: "X ์ถœ์ฒ˜์— ๋”ฐ๋ฅด๋ฉด...")
138
- 3. ์‘๋‹ต์— ์‹ค์ œ ์ถœ์ฒ˜ ๋งํฌ๋ฅผ ํฌํ•จํ•˜์„ธ์š”
139
- 4. ์—ฌ๋Ÿฌ ์ถœ์ฒ˜์˜ ์ •๋ณด๋ฅผ ์ข…ํ•ฉํ•˜์—ฌ ๋‹ต๋ณ€ํ•˜์„ธ์š”
140
  """
141
 
142
  search_results = instructions + "\n".join(summary_lines)
143
- logger.info(f"๊ฒ€์ƒ‰ ๊ฒฐ๊ณผ {len(limited_organic)}๊ฐœ ์ฒ˜๋ฆฌ ์™„๋ฃŒ")
144
  return search_results
145
 
146
  except Exception as e:
147
- logger.error(f"Web search failed: {e}")
148
- return f"Web search failed: {str(e)}"
149
-
150
-
151
- ##############################################################################
152
- # ๋ชจ๋ธ/ํ”„๋กœ์„ธ์„œ ๋กœ๋”ฉ
153
- ##############################################################################
154
- MAX_CONTENT_CHARS = 2000
155
- MAX_INPUT_LENGTH = 2096 # ์ตœ๋Œ€ ์ž…๋ ฅ ํ† ํฐ ์ˆ˜ ์ œํ•œ ์ถ”๊ฐ€
156
- model_id = os.getenv("MODEL_ID", "VIDraft/Gemma-3-R1984-4B")
157
-
158
- processor = AutoProcessor.from_pretrained(model_id, padding_side="left")
159
- model = Gemma3ForConditionalGeneration.from_pretrained(
160
- model_id,
161
- device_map="auto",
162
- torch_dtype=torch.bfloat16,
163
- attn_implementation="eager" # ๊ฐ€๋Šฅํ•˜๋‹ค๋ฉด "flash_attention_2"๋กœ ๋ณ€๊ฒฝ
164
- )
165
- MAX_NUM_IMAGES = int(os.getenv("MAX_NUM_IMAGES", "5"))
166
-
167
 
168
  ##############################################################################
169
- # CSV, TXT, PDF ๋ถ„์„ ํ•จ์ˆ˜
170
  ##############################################################################
171
  def analyze_csv_file(path: str) -> str:
172
- """
173
- CSV ํŒŒ์ผ์„ ์ „์ฒด ๋ฌธ์ž์—ด๋กœ ๋ณ€ํ™˜. ๋„ˆ๋ฌด ๊ธธ ๊ฒฝ์šฐ ์ผ๋ถ€๋งŒ ํ‘œ์‹œ.
174
- """
175
  try:
176
  df = pd.read_csv(path)
177
  if df.shape[0] > 50 or df.shape[1] > 10:
178
  df = df.iloc[:50, :10]
179
  df_str = df.to_string()
180
  if len(df_str) > MAX_CONTENT_CHARS:
181
- df_str = df_str[:MAX_CONTENT_CHARS] + "\n...(truncated)..."
182
- return f"**[CSV File: {os.path.basename(path)}]**\n\n{df_str}"
183
  except Exception as e:
184
- return f"Failed to read CSV ({os.path.basename(path)}): {str(e)}"
185
-
186
 
187
  def analyze_txt_file(path: str) -> str:
188
- """
189
- TXT ํŒŒ์ผ ์ „๋ฌธ ์ฝ๊ธฐ. ๋„ˆ๋ฌด ๊ธธ๋ฉด ์ผ๋ถ€๋งŒ ํ‘œ์‹œ.
190
- """
191
  try:
192
  with open(path, "r", encoding="utf-8") as f:
193
  text = f.read()
194
  if len(text) > MAX_CONTENT_CHARS:
195
- text = text[:MAX_CONTENT_CHARS] + "\n...(truncated)..."
196
- return f"**[TXT File: {os.path.basename(path)}]**\n\n{text}"
197
  except Exception as e:
198
- return f"Failed to read TXT ({os.path.basename(path)}): {str(e)}"
199
-
200
 
201
  def pdf_to_markdown(pdf_path: str) -> str:
202
- """
203
- PDF ํ…์ŠคํŠธ๋ฅผ Markdown์œผ๋กœ ๋ณ€ํ™˜. ํŽ˜์ด์ง€๋ณ„๋กœ ๊ฐ„๋‹จํžˆ ํ…์ŠคํŠธ ์ถ”์ถœ.
204
- """
205
  text_chunks = []
206
  try:
207
  with open(pdf_path, "rb") as f:
@@ -213,321 +177,226 @@ def pdf_to_markdown(pdf_path: str) -> str:
213
  page_text = page_text.strip()
214
  if page_text:
215
  if len(page_text) > MAX_CONTENT_CHARS // max_pages:
216
- page_text = page_text[:MAX_CONTENT_CHARS // max_pages] + "...(truncated)"
217
- text_chunks.append(f"## Page {page_num+1}\n\n{page_text}\n")
218
  if len(reader.pages) > max_pages:
219
- text_chunks.append(f"\n...(Showing {max_pages} of {len(reader.pages)} pages)...")
220
  except Exception as e:
221
- return f"Failed to read PDF ({os.path.basename(pdf_path)}): {str(e)}"
222
 
223
  full_text = "\n".join(text_chunks)
224
  if len(full_text) > MAX_CONTENT_CHARS:
225
- full_text = full_text[:MAX_CONTENT_CHARS] + "\n...(truncated)..."
226
-
227
- return f"**[PDF File: {os.path.basename(pdf_path)}]**\n\n{full_text}"
228
 
 
229
 
230
  ##############################################################################
231
- # ์ด๋ฏธ์ง€/๋น„๋””์˜ค ์—…๋กœ๋“œ ์ œํ•œ ๊ฒ€์‚ฌ
232
  ##############################################################################
233
- def count_files_in_new_message(paths: list[str]) -> tuple[int, int]:
234
- image_count = 0
235
- video_count = 0
236
- for path in paths:
237
- if path.endswith(".mp4"):
238
- video_count += 1
239
- elif re.search(r"\.(png|jpg|jpeg|gif|webp)$", path, re.IGNORECASE):
240
- image_count += 1
241
- return image_count, video_count
242
-
243
-
244
- def count_files_in_history(history: list[dict]) -> tuple[int, int]:
245
- image_count = 0
246
- video_count = 0
247
- for item in history:
248
- if item["role"] != "user" or isinstance(item["content"], str):
249
- continue
250
- if isinstance(item["content"], list) and len(item["content"]) > 0:
251
- file_path = item["content"][0]
252
- if isinstance(file_path, str):
253
- if file_path.endswith(".mp4"):
254
- video_count += 1
255
- elif re.search(r"\.(png|jpg|jpeg|gif|webp)$", file_path, re.IGNORECASE):
256
- image_count += 1
257
- return image_count, video_count
258
-
259
-
260
- def validate_media_constraints(message: dict, history: list[dict]) -> bool:
261
- media_files = []
262
- for f in message["files"]:
263
- if re.search(r"\.(png|jpg|jpeg|gif|webp)$", f, re.IGNORECASE) or f.endswith(".mp4"):
264
- media_files.append(f)
265
-
266
- new_image_count, new_video_count = count_files_in_new_message(media_files)
267
- history_image_count, history_video_count = count_files_in_history(history)
268
- image_count = history_image_count + new_image_count
269
- video_count = history_video_count + new_video_count
270
-
271
- if video_count > 1:
272
- gr.Warning("Only one video is supported.")
273
- return False
274
- if video_count == 1:
275
- if image_count > 0:
276
- gr.Warning("Mixing images and videos is not allowed.")
277
- return False
278
- if "<image>" in message["text"]:
279
- gr.Warning("Using <image> tags with video files is not supported.")
280
- return False
281
- if video_count == 0 and image_count > MAX_NUM_IMAGES:
282
- gr.Warning(f"You can upload up to {MAX_NUM_IMAGES} images.")
283
- return False
284
-
285
- if "<image>" in message["text"]:
286
- image_files = [f for f in message["files"] if re.search(r"\.(png|jpg|jpeg|gif|webp)$", f, re.IGNORECASE)]
287
- image_tag_count = message["text"].count("<image>")
288
- if image_tag_count != len(image_files):
289
- gr.Warning("The number of <image> tags in the text does not match the number of image files.")
290
- return False
291
-
292
- return True
293
-
294
-
295
- ##############################################################################
296
- # ๋น„๋””์˜ค ์ฒ˜๋ฆฌ - ์ž„์‹œ ํŒŒ์ผ ์ถ”์  ์ฝ”๋“œ ์ถ”๊ฐ€
297
- ##############################################################################
298
- def downsample_video(video_path: str) -> list[tuple[Image.Image, float]]:
299
- vidcap = cv2.VideoCapture(video_path)
300
- fps = vidcap.get(cv2.CAP_PROP_FPS)
301
- total_frames = int(vidcap.get(cv2.CAP_PROP_FRAME_COUNT))
302
- frame_interval = max(int(fps), int(total_frames / 10))
303
- frames = []
304
-
305
- for i in range(0, total_frames, frame_interval):
306
- vidcap.set(cv2.CAP_PROP_POS_FRAMES, i)
307
- success, image = vidcap.read()
308
- if success:
309
- image = cv2.cvtColor(image, cv2.COLOR_BGR2RGB)
310
- # ์ด๋ฏธ์ง€ ํฌ๊ธฐ ์ค„์ด๊ธฐ ์ถ”๊ฐ€
311
- image = cv2.resize(image, (0, 0), fx=0.5, fy=0.5)
312
- pil_image = Image.fromarray(image)
313
- timestamp = round(i / fps, 2)
314
- frames.append((pil_image, timestamp))
315
- if len(frames) >= 5:
316
- break
317
-
318
- vidcap.release()
319
- return frames
320
-
321
-
322
- def process_video(video_path: str) -> tuple[list[dict], list[str]]:
323
- content = []
324
- temp_files = [] # ์ž„์‹œ ํŒŒ์ผ ์ถ”์ ์„ ์œ„ํ•œ ๋ฆฌ์ŠคํŠธ
325
 
326
- frames = downsample_video(video_path)
327
- for frame in frames:
328
- pil_image, timestamp = frame
329
- with tempfile.NamedTemporaryFile(delete=False, suffix=".png") as temp_file:
330
- pil_image.save(temp_file.name)
331
- temp_files.append(temp_file.name) # ์ถ”์ ์„ ์œ„ํ•ด ๊ฒฝ๋กœ ์ €์žฅ
332
- content.append({"type": "text", "text": f"Frame {timestamp}:"})
333
- content.append({"type": "image", "url": temp_file.name})
334
 
335
- return content, temp_files
336
-
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
337
 
338
  ##############################################################################
339
- # interleaved <image> ์ฒ˜๋ฆฌ
340
  ##############################################################################
341
- def process_interleaved_images(message: dict) -> list[dict]:
342
- parts = re.split(r"(<image>)", message["text"])
343
- content = []
344
- image_index = 0
345
-
346
- image_files = [f for f in message["files"] if re.search(r"\.(png|jpg|jpeg|gif|webp)$", f, re.IGNORECASE)]
 
 
 
 
 
347
 
348
- for part in parts:
349
- if part == "<image>" and image_index < len(image_files):
350
- content.append({"type": "image", "url": image_files[image_index]})
351
- image_index += 1
352
- elif part.strip():
353
- content.append({"type": "text", "text": part.strip()})
354
- else:
355
- if isinstance(part, str) and part != "<image>":
356
- content.append({"type": "text", "text": part})
357
- return content
358
-
359
-
360
- ##############################################################################
361
- # PDF + CSV + TXT + ์ด๋ฏธ์ง€/๋น„๋””์˜ค
362
- ##############################################################################
363
- def is_image_file(file_path: str) -> bool:
364
- return bool(re.search(r"\.(png|jpg|jpeg|gif|webp)$", file_path, re.IGNORECASE))
365
-
366
- def is_video_file(file_path: str) -> bool:
367
- return file_path.endswith(".mp4")
368
-
369
- def is_document_file(file_path: str) -> bool:
370
- return (
371
- file_path.lower().endswith(".pdf")
372
- or file_path.lower().endswith(".csv")
373
- or file_path.lower().endswith(".txt")
374
- )
375
-
376
-
377
- def process_new_user_message(message: dict) -> tuple[list[dict], list[str]]:
378
- temp_files = [] # ์ž„์‹œ ํŒŒ์ผ ์ถ”์ ์šฉ ๋ฆฌ์ŠคํŠธ
379
 
380
- if not message["files"]:
381
- return [{"type": "text", "text": message["text"]}], temp_files
382
-
383
- video_files = [f for f in message["files"] if is_video_file(f)]
384
- image_files = [f for f in message["files"] if is_image_file(f)]
385
- csv_files = [f for f in message["files"] if f.lower().endswith(".csv")]
386
- txt_files = [f for f in message["files"] if f.lower().endswith(".txt")]
387
- pdf_files = [f for f in message["files"] if f.lower().endswith(".pdf")]
388
-
389
- content_list = [{"type": "text", "text": message["text"]}]
390
-
391
- for csv_path in csv_files:
392
- csv_analysis = analyze_csv_file(csv_path)
393
- content_list.append({"type": "text", "text": csv_analysis})
394
-
395
- for txt_path in txt_files:
396
- txt_analysis = analyze_txt_file(txt_path)
397
- content_list.append({"type": "text", "text": txt_analysis})
398
-
399
- for pdf_path in pdf_files:
400
- pdf_markdown = pdf_to_markdown(pdf_path)
401
- content_list.append({"type": "text", "text": pdf_markdown})
402
-
403
- if video_files:
404
- video_content, video_temp_files = process_video(video_files[0])
405
- content_list += video_content
406
- temp_files.extend(video_temp_files)
407
- return content_list, temp_files
408
-
409
- if "<image>" in message["text"] and image_files:
410
- interleaved_content = process_interleaved_images({"text": message["text"], "files": image_files})
411
- if content_list and content_list[0]["type"] == "text":
412
- content_list = content_list[1:]
413
- return interleaved_content + content_list, temp_files
414
- else:
415
- for img_path in image_files:
416
- content_list.append({"type": "image", "url": img_path})
417
-
418
- return content_list, temp_files
419
-
420
-
421
- ##############################################################################
422
- # history -> LLM ๋ฉ”์‹œ์ง€ ๋ณ€ํ™˜
423
- ##############################################################################
424
- def process_history(history: list[dict]) -> list[dict]:
425
- messages = []
426
- current_user_content: list[dict] = []
427
- for item in history:
428
- if item["role"] == "assistant":
429
- if current_user_content:
430
- messages.append({"role": "user", "content": current_user_content})
431
- current_user_content = []
432
- messages.append({"role": "assistant", "content": [{"type": "text", "text": item["content"]}]})
433
- else:
434
- content = item["content"]
435
- if isinstance(content, str):
436
- current_user_content.append({"type": "text", "text": content})
437
- elif isinstance(content, list) and len(content) > 0:
438
- file_path = content[0]
439
- if is_image_file(file_path):
440
- current_user_content.append({"type": "image", "url": file_path})
441
- else:
442
- current_user_content.append({"type": "text", "text": f"[File: {os.path.basename(file_path)}]"})
443
-
444
- if current_user_content:
445
- messages.append({"role": "user", "content": current_user_content})
446
 
447
- return messages
448
-
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
449
 
450
  ##############################################################################
451
- # ๋ชจ๋ธ ์ƒ์„ฑ ํ•จ์ˆ˜์—์„œ OOM ์บ์น˜
452
  ##############################################################################
453
  def _model_gen_with_oom_catch(**kwargs):
454
- """
455
- ๋ณ„๋„ ์Šค๋ ˆ๋“œ์—์„œ OutOfMemoryError๋ฅผ ์žก์•„์ฃผ๊ธฐ ์œ„ํ•ด
456
- """
457
  try:
458
  model.generate(**kwargs)
459
  except torch.cuda.OutOfMemoryError:
460
- raise RuntimeError(
461
- "[OutOfMemoryError] GPU ๋ฉ”๋ชจ๋ฆฌ๊ฐ€ ๋ถ€์กฑํ•ฉ๋‹ˆ๋‹ค. "
462
- "Max New Tokens์„ ์ค„์ด๊ฑฐ๋‚˜, ํ”„๋กฌํ”„ํŠธ ๊ธธ์ด๋ฅผ ์ค„์—ฌ์ฃผ์„ธ์š”."
463
- )
464
  finally:
465
- # ์ƒ์„ฑ ์™„๋ฃŒ ํ›„ ํ•œ๋ฒˆ ๋” ์บ์‹œ ๋น„์šฐ๊ธฐ
466
  clear_cuda_cache()
467
 
468
-
469
- ##############################################################################
470
- # ๋ฉ”์ธ ์ถ”๋ก  ํ•จ์ˆ˜ (web search ์ฒดํฌ ์‹œ ์ž๋™ ํ‚ค์›Œ๋“œ์ถ”์ถœ->๊ฒ€์ƒ‰->๊ฒฐ๊ณผ system msg)
471
- ##############################################################################
472
  @spaces.GPU(duration=120)
473
- def run(
474
- message: dict,
475
- history: list[dict],
476
- system_prompt: str = "",
477
- max_new_tokens: int = 512,
478
  use_web_search: bool = False,
479
- web_search_query: str = "",
480
  ) -> Iterator[str]:
481
-
482
- if not validate_media_constraints(message, history):
483
- yield ""
484
- return
485
-
486
- temp_files = [] # ์ž„์‹œ ํŒŒ์ผ ์ถ”์ ์šฉ
 
487
 
488
  try:
489
- combined_system_msg = ""
490
-
491
- # ๋‚ด๋ถ€์ ์œผ๋กœ๋งŒ ์‚ฌ์šฉ (UI์—์„œ๋Š” ๋ณด์ด์ง€ ์•Š์Œ)
492
- if system_prompt.strip():
493
- combined_system_msg += f"[System Prompt]\n{system_prompt.strip()}\n\n"
494
-
495
  if use_web_search:
496
- user_text = message["text"]
497
- ws_query = extract_keywords(user_text, top_k=5)
498
- if ws_query.strip():
499
- logger.info(f"[Auto WebSearch Keyword] {ws_query!r}")
500
- ws_result = do_web_search(ws_query)
501
- combined_system_msg += f"[Search top-20 Full Items Based on user prompt]\n{ws_result}\n\n"
502
- # >>> ์ถ”๊ฐ€๋œ ์•ˆ๋‚ด ๋ฌธ๊ตฌ (๊ฒ€์ƒ‰ ๊ฒฐ๊ณผ์˜ link ๋“ฑ ์ถœ์ฒ˜๋ฅผ ํ™œ์šฉ)
503
- combined_system_msg += "[์ฐธ๊ณ : ์œ„ ๊ฒ€์ƒ‰๊ฒฐ๊ณผ ๋‚ด์šฉ๊ณผ link๋ฅผ ์ถœ์ฒ˜๋กœ ์ธ์šฉํ•˜์—ฌ ๋‹ต๋ณ€ํ•ด ์ฃผ์„ธ์š”.]\n\n"
504
- combined_system_msg += """
505
- [์ค‘์š” ์ง€์‹œ์‚ฌํ•ญ]
506
- 1. ๋‹ต๋ณ€์— ๊ฒ€์ƒ‰ ๊ฒฐ๊ณผ์—์„œ ์ฐพ์€ ์ •๋ณด์˜ ์ถœ์ฒ˜๋ฅผ ๋ฐ˜๋“œ์‹œ ์ธ์šฉํ•˜์„ธ์š”.
507
- 2. ์ถœ์ฒ˜ ์ธ์šฉ ์‹œ "[์ถœ์ฒ˜ ์ œ๋ชฉ](๋งํฌ)" ํ˜•์‹์˜ ๋งˆํฌ๋‹ค์šด ๋งํฌ๋ฅผ ์‚ฌ์šฉํ•˜์„ธ์š”.
508
- 3. ์—ฌ๋Ÿฌ ์ถœ์ฒ˜์˜ ์ •๋ณด๋ฅผ ์ข…ํ•ฉํ•˜์—ฌ ๋‹ต๋ณ€ํ•˜์„ธ์š”.
509
- 4. ๋‹ต๋ณ€ ๋งˆ์ง€๋ง‰์— "์ฐธ๊ณ  ์ž๋ฃŒ:" ์„น์…˜์„ ์ถ”๊ฐ€ํ•˜๊ณ  ์‚ฌ์šฉํ•œ ์ฃผ์š” ์ถœ์ฒ˜ ๋งํฌ๋ฅผ ๋‚˜์—ดํ•˜์„ธ์š”.
510
- """
511
  else:
512
- combined_system_msg += "[No valid keywords found, skipping WebSearch]\n\n"
513
-
514
- messages = []
515
- if combined_system_msg.strip():
516
- messages.append({
 
517
  "role": "system",
518
- "content": [{"type": "text", "text": combined_system_msg.strip()}],
519
- })
520
-
521
- messages.extend(process_history(history))
522
-
523
- user_content, user_temp_files = process_new_user_message(message)
524
- temp_files.extend(user_temp_files) # ์ž„์‹œ ํŒŒ์ผ ์ถ”์ 
 
 
525
 
526
- for item in user_content:
527
- if item["type"] == "text" and len(item["text"]) > MAX_CONTENT_CHARS:
528
- item["text"] = item["text"][:MAX_CONTENT_CHARS] + "\n...(truncated)..."
529
- messages.append({"role": "user", "content": user_content})
530
-
531
  inputs = processor.apply_chat_template(
532
  messages,
533
  add_generation_prompt=True,
@@ -536,314 +405,337 @@ def run(
536
  return_tensors="pt",
537
  ).to(device=model.device, dtype=torch.bfloat16)
538
 
539
- # ์ž…๋ ฅ ํ† ํฐ ์ˆ˜ ์ œํ•œ ์ถ”๊ฐ€
540
- if inputs.input_ids.shape[1] > MAX_INPUT_LENGTH:
541
- inputs.input_ids = inputs.input_ids[:, -MAX_INPUT_LENGTH:]
542
- if 'attention_mask' in inputs:
543
- inputs.attention_mask = inputs.attention_mask[:, -MAX_INPUT_LENGTH:]
544
-
545
  streamer = TextIteratorStreamer(processor, timeout=30.0, skip_prompt=True, skip_special_tokens=True)
546
  gen_kwargs = dict(
547
  inputs,
548
  streamer=streamer,
549
  max_new_tokens=max_new_tokens,
 
 
550
  )
551
-
 
552
  t = Thread(target=_model_gen_with_oom_catch, kwargs=gen_kwargs)
553
  t.start()
554
-
 
555
  output = ""
556
  for new_text in streamer:
557
  output += new_text
558
  yield output
559
-
560
  except Exception as e:
561
- logger.error(f"Error in run: {str(e)}")
562
- yield f"์ฃ„์†กํ•ฉ๋‹ˆ๋‹ค. ์˜ค๋ฅ˜๊ฐ€ ๋ฐœ์ƒํ–ˆ์Šต๋‹ˆ๋‹ค: {str(e)}"
563
-
564
  finally:
565
- # ์ž„์‹œ ํŒŒ์ผ ์‚ญ์ œ
566
- for temp_file in temp_files:
567
- try:
568
- if os.path.exists(temp_file):
569
- os.unlink(temp_file)
570
- logger.info(f"Deleted temp file: {temp_file}")
571
- except Exception as e:
572
- logger.warning(f"Failed to delete temp file {temp_file}: {e}")
573
-
574
- # ๋ช…์‹œ์  ๋ฉ”๋ชจ๋ฆฌ ์ •๋ฆฌ
575
- try:
576
- del inputs, streamer
577
- except:
578
- pass
579
-
580
  clear_cuda_cache()
581
 
582
-
583
-
584
  ##############################################################################
585
- # ์˜ˆ์‹œ๋“ค (๋ชจ๋‘ ์˜์–ด๋กœ)
586
- ##############################################################################
587
- examples = [
588
- [
589
- {
590
- "text": "Compare the contents of the two PDF files.",
591
- "files": [
592
- "assets/additional-examples/before.pdf",
593
- "assets/additional-examples/after.pdf",
594
- ],
595
- }
596
- ],
597
- [
598
- {
599
- "text": "Summarize and analyze the contents of the CSV file.",
600
- "files": ["assets/additional-examples/sample-csv.csv"],
601
- }
602
- ],
603
- [
604
- {
605
- "text": "Assume the role of a friendly and understanding girlfriend. Describe this video.",
606
- "files": ["assets/additional-examples/tmp.mp4"],
607
- }
608
- ],
609
- [
610
- {
611
- "text": "Describe the cover and read the text on it.",
612
- "files": ["assets/additional-examples/maz.jpg"],
613
- }
614
- ],
615
- [
616
- {
617
- "text": "I already have this supplement <image> and I plan to buy this product <image>. Are there any precautions when taking them together?",
618
- "files": ["assets/additional-examples/pill1.png", "assets/additional-examples/pill2.png"],
619
- }
620
- ],
621
- [
622
- {
623
- "text": "Solve this integral.",
624
- "files": ["assets/additional-examples/4.png"],
625
- }
626
- ],
627
- [
628
- {
629
- "text": "When was this ticket issued, and what is its price?",
630
- "files": ["assets/additional-examples/2.png"],
631
- }
632
- ],
633
- [
634
- {
635
- "text": "Based on the sequence of these images, create a short story.",
636
- "files": [
637
- "assets/sample-images/09-1.png",
638
- "assets/sample-images/09-2.png",
639
- "assets/sample-images/09-3.png",
640
- "assets/sample-images/09-4.png",
641
- "assets/sample-images/09-5.png",
642
- ],
643
- }
644
- ],
645
- [
646
- {
647
- "text": "Write Python code using matplotlib to plot a bar chart that matches this image.",
648
- "files": ["assets/additional-examples/barchart.png"],
649
- }
650
- ],
651
- [
652
- {
653
- "text": "Read the text in the image and write it out in Markdown format.",
654
- "files": ["assets/additional-examples/3.png"],
655
- }
656
- ],
657
- [
658
- {
659
- "text": "What does this sign say?",
660
- "files": ["assets/sample-images/02.png"],
661
- }
662
- ],
663
- [
664
- {
665
- "text": "Compare the two images and describe their similarities and differences.",
666
- "files": ["assets/sample-images/03.png"],
667
- }
668
- ],
669
- ]
670
-
671
- ##############################################################################
672
- # Gradio UI (Blocks) ๊ตฌ์„ฑ (์ขŒ์ธก ์‚ฌ์ด๋“œ ๋ฉ”๋‰ด ์—†์ด ์ „์ฒดํ™”๋ฉด ์ฑ„ํŒ…)
673
  ##############################################################################
674
  css = """
675
- /* 1) UI๋ฅผ ์ฒ˜์Œ๋ถ€ํ„ฐ ๊ฐ€์žฅ ๋„“๊ฒŒ (width 100%) ๊ณ ์ •ํ•˜์—ฌ ํ‘œ์‹œ */
676
- .gradio-container {
677
- background: rgba(255, 255, 255, 0.7); /* ๋ฐฐ๊ฒฝ ํˆฌ๋ช…๋„ ์ฆ๊ฐ€ */
678
- padding: 30px 40px;
679
- margin: 20px auto; /* ์œ„์•„๋ž˜ ์—ฌ๋ฐฑ๋งŒ ์œ ์ง€ */
680
- width: 100% !important;
681
- max-width: none !important; /* 1200px ์ œํ•œ ์ œ๊ฑฐ */
 
682
  }
683
- .fillable {
684
- width: 100% !important;
685
- max-width: 100% !important;
686
- }
687
- /* 2) ๋ฐฐ๊ฒฝ์„ ์™„์ „ํžˆ ํˆฌ๋ช…ํ•˜๊ฒŒ ๋ณ€๊ฒฝ */
688
- body {
689
- background: transparent; /* ์™„์ „ ํˆฌ๋ช… ๋ฐฐ๊ฒฝ */
690
- margin: 0;
691
- padding: 0;
692
- font-family: 'Helvetica Neue', Helvetica, Arial, sans-serif;
693
- color: #333;
694
- }
695
- /* ๋ฒ„ํŠผ ์ƒ‰์ƒ ์™„์ „ํžˆ ์ œ๊ฑฐํ•˜๊ณ  ํˆฌ๋ช…ํ•˜๊ฒŒ */
696
- button, .btn {
697
- background: transparent !important; /* ์ƒ‰์ƒ ์™„์ „ํžˆ ์ œ๊ฑฐ */
698
- border: 1px solid #ddd; /* ๊ฒฝ๊ณ„์„ ๋งŒ ์‚ด์ง ์ถ”๊ฐ€ */
699
- color: #333;
700
- padding: 12px 24px;
701
- text-transform: uppercase;
702
  font-weight: bold;
703
- letter-spacing: 1px;
704
- cursor: pointer;
705
- }
706
- button:hover, .btn:hover {
707
- background: rgba(0, 0, 0, 0.05) !important; /* ํ˜ธ๋ฒ„ ์‹œ ์•„์ฃผ ์‚ด์ง ์–ด๋‘ก๊ฒŒ๋งŒ */
708
  }
709
-
710
- /* examples ๊ด€๋ จ ๋ชจ๋“  ์ƒ‰์ƒ ์ œ๊ฑฐ */
711
- #examples_container, .examples-container {
712
- margin: auto;
713
- width: 90%;
714
- background: transparent !important;
715
- }
716
- #examples_row, .examples-row {
717
- justify-content: center;
718
- background: transparent !important;
719
- }
720
-
721
- /* examples ๋ฒ„ํŠผ ๋‚ด๋ถ€์˜ ๋ชจ๋“  ์ƒ‰์ƒ ์ œ๊ฑฐ */
722
- .gr-samples-table button,
723
- .gr-samples-table .gr-button,
724
- .gr-samples-table .gr-sample-btn,
725
- .gr-examples button,
726
- .gr-examples .gr-button,
727
- .gr-examples .gr-sample-btn,
728
- .examples button,
729
- .examples .gr-button,
730
- .examples .gr-sample-btn {
731
- background: transparent !important;
732
- border: 1px solid #ddd;
733
- color: #333;
734
- }
735
-
736
- /* examples ๋ฒ„ํŠผ ํ˜ธ๋ฒ„ ์‹œ์—๋„ ์ƒ‰์ƒ ์—†๊ฒŒ */
737
- .gr-samples-table button:hover,
738
- .gr-samples-table .gr-button:hover,
739
- .gr-samples-table .gr-sample-btn:hover,
740
- .gr-examples button:hover,
741
- .gr-examples .gr-button:hover,
742
- .gr-examples .gr-sample-btn:hover,
743
- .examples button:hover,
744
- .examples .gr-button:hover,
745
- .examples .gr-sample-btn:hover {
746
- background: rgba(0, 0, 0, 0.05) !important;
747
- }
748
-
749
- /* ์ฑ„ํŒ… ์ธํ„ฐํŽ˜์ด์Šค ์š”์†Œ๋“ค๋„ ํˆฌ๋ช…ํ•˜๊ฒŒ */
750
- .chatbox, .chatbot, .message {
751
- background: transparent !important;
752
  }
753
-
754
- /* ์ž…๋ ฅ์ฐฝ ํˆฌ๋ช…๋„ ์กฐ์ • */
755
- .multimodal-textbox, textarea, input {
756
- background: rgba(255, 255, 255, 0.5) !important;
757
  }
758
-
759
- /* ๋ชจ๋“  ์ปจํ…Œ์ด๋„ˆ ์š”์†Œ์— ๋ฐฐ๊ฒฝ์ƒ‰ ์ œ๊ฑฐ */
760
- .container, .wrap, .box, .panel, .gr-panel {
761
- background: transparent !important;
762
- }
763
-
764
- /* ์˜ˆ์ œ ์„น์…˜์˜ ๋ชจ๋“  ์š”์†Œ์—์„œ ๋ฐฐ๊ฒฝ์ƒ‰ ์ œ๊ฑฐ */
765
- .gr-examples-container, .gr-examples, .gr-sample, .gr-sample-row, .gr-sample-cell {
766
- background: transparent !important;
767
  }
768
  """
769
 
770
- title_html = """
771
- <h1 align="center" style="margin-bottom: 0.2em; font-size: 1.6em;"> ๐Ÿค— Gemma3-R1984-4B </h1>
772
- <p align="center" style="font-size:1.1em; color:#555;">
773
- โœ…Agentic AI Platform โœ…Reasoning & Uncensored โœ…Multimodal & VLM โœ…Deep-Research & RAG <br>
774
- Operates on an โœ…'NVIDIA L40s / A100(ZeroGPU) GPU' as an independent local server, enhancing security and preventing information leakage.<br>
775
- @Model Rpository: VIDraft/Gemma-3-R1984-4B, @Based by 'Google Gemma-3-4b', @Powered by 'MOUSE-II'(VIDRAFT)
776
- </p>
777
- """
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
778
 
 
779
 
780
- with gr.Blocks(css=css, title="Gemma3-R1984-4B") as demo:
781
- gr.Markdown(title_html)
782
 
783
- # Display the web search option (while the system prompt and token slider remain hidden)
784
- web_search_checkbox = gr.Checkbox(
785
- label="Deep Research",
786
- value=False
 
 
 
 
 
 
 
 
787
  )
788
-
789
- # Used internally but not visible to the user
790
- system_prompt_box = gr.Textbox(
791
- lines=3,
792
- value="You are a deep thinking AI that may use extremely long chains of thought to thoroughly analyze the problem and deliberate using systematic reasoning processes to arrive at a correct solution before answering.",
793
- visible=False # hidden from view
794
  )
795
 
796
- max_tokens_slider = gr.Slider(
797
- label="Max New Tokens",
798
- minimum=100,
799
- maximum=8000,
800
- step=50,
801
- value=1000,
802
- visible=False # hidden from view
803
  )
804
 
805
- web_search_text = gr.Textbox(
806
- lines=1,
807
- label="(Unused) Web Search Query",
808
- placeholder="No direct input needed",
809
- visible=False # hidden from view
810
  )
811
 
812
- # Configure the chat interface
813
- chat = gr.ChatInterface(
814
- fn=run,
815
- type="messages",
816
- chatbot=gr.Chatbot(type="messages", scale=1, allow_tags=["image"]),
817
- textbox=gr.MultimodalTextbox(
818
- file_types=[
819
- ".webp", ".png", ".jpg", ".jpeg", ".gif",
820
- ".mp4", ".csv", ".txt", ".pdf"
821
- ],
822
- file_count="multiple",
823
- autofocus=True
824
- ),
825
- multimodal=True,
826
- additional_inputs=[
827
- system_prompt_box,
828
- max_tokens_slider,
829
- web_search_checkbox,
830
- web_search_text,
831
- ],
832
- stop_btn=False,
833
- title='<a href="https://discord.gg/openfreeai" target="_blank">https://discord.gg/openfreeai</a>',
834
- examples=examples,
835
- run_examples_on_click=False,
836
- cache_examples=False,
837
- css_paths=None,
838
- delete_cache=(1800, 1800),
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
839
  )
840
-
841
- # Example section - since examples are already set in ChatInterface, this is for display only
842
- with gr.Row(elem_id="examples_row"):
843
- with gr.Column(scale=12, elem_id="examples_container"):
844
- gr.Markdown("### Example Inputs (click to load)")
845
-
846
 
847
  if __name__ == "__main__":
848
- # Run locally
849
- demo.launch()
 
 
 
 
 
 
 
1
+ #!/usr/bin/env python3
2
 
3
  import os
4
  import re
5
  import tempfile
6
+ import gc
7
  from collections.abc import Iterator
8
  from threading import Thread
9
  import json
 
12
  import gradio as gr
13
  import spaces
14
  import torch
15
+ import numpy as np
16
  from loguru import logger
17
  from PIL import Image
18
  from transformers import AutoProcessor, Gemma3ForConditionalGeneration, TextIteratorStreamer
19
+ import time
20
+ import warnings
21
+ from typing import Dict, List, Optional, Union
22
 
23
  # CSV/TXT ๋ถ„์„
24
  import pandas as pd
25
  # PDF ํ…์ŠคํŠธ ์ถ”์ถœ
26
  import PyPDF2
27
 
28
+ warnings.filterwarnings('ignore')
29
+
30
+ print("๐ŸŽฎ ๋กœ๋ด‡ ์‹œ๊ฐ ์‹œ์Šคํ…œ ์ดˆ๊ธฐํ™” (Gemma3-R1984-4B)...")
31
+
32
+ ##############################################################################
33
+ # ์ƒ์ˆ˜ ์ •์˜
34
+ ##############################################################################
35
+ MAX_CONTENT_CHARS = 2000
36
+ MAX_INPUT_LENGTH = 2096
37
+ MAX_NUM_IMAGES = 5
38
+ SERPHOUSE_API_KEY = os.getenv("SERPHOUSE_API_KEY", "")
39
+
40
  ##############################################################################
41
+ # ์ „์—ญ ๋ณ€์ˆ˜
42
+ ##############################################################################
43
+ model = None
44
+ processor = None
45
+ model_loaded = False
46
+ model_name = "Gemma3-R1984-4B"
47
+
48
+ ##############################################################################
49
+ # ๋ฉ”๋ชจ๋ฆฌ ๊ด€๋ฆฌ
50
  ##############################################################################
51
  def clear_cuda_cache():
52
  """CUDA ์บ์‹œ๋ฅผ ๋ช…์‹œ์ ์œผ๋กœ ๋น„์›๋‹ˆ๋‹ค."""
 
55
  gc.collect()
56
 
57
  ##############################################################################
58
+ # ํ‚ค์›Œ๋“œ ์ถ”์ถœ ํ•จ์ˆ˜
 
 
 
 
 
59
  ##############################################################################
60
  def extract_keywords(text: str, top_k: int = 5) -> str:
61
+ """ํ‚ค์›Œ๋“œ ์ถ”์ถœ"""
 
 
 
 
62
  text = re.sub(r"[^a-zA-Z0-9๊ฐ€-ํžฃ\s]", "", text)
63
  tokens = text.split()
64
+
65
+ seen = set()
66
+ unique_tokens = []
67
+ for token in tokens:
68
+ if token not in seen and len(token) > 1:
69
+ seen.add(token)
70
+ unique_tokens.append(token)
71
+
72
+ key_tokens = unique_tokens[:top_k]
73
  return " ".join(key_tokens)
74
 
75
  ##############################################################################
76
+ # ์›น ๊ฒ€์ƒ‰ ํ•จ์ˆ˜
 
77
  ##############################################################################
78
  def do_web_search(query: str) -> str:
79
+ """SerpHouse API๋ฅผ ์‚ฌ์šฉํ•œ ์›น ๊ฒ€์ƒ‰"""
 
 
 
80
  try:
81
  url = "https://api.serphouse.com/serp/live"
82
 
 
83
  params = {
84
  "q": query,
85
  "domain": "google.com",
86
+ "serp_type": "web",
87
  "device": "desktop",
88
+ "lang": "ko", # ํ•œ๊ตญ์–ด ์šฐ์„ 
89
+ "num": "10" # 10๊ฐœ๋กœ ์ œํ•œ
90
  }
91
 
92
  headers = {
93
  "Authorization": f"Bearer {SERPHOUSE_API_KEY}"
94
  }
95
 
96
+ logger.info(f"์›น ๊ฒ€์ƒ‰ ์ค‘... ๊ฒ€์ƒ‰์–ด: {query}")
 
97
 
 
98
  response = requests.get(url, headers=headers, params=params, timeout=60)
99
  response.raise_for_status()
100
 
 
101
  data = response.json()
102
 
 
103
  results = data.get("results", {})
104
+ organic = results.get("organic", []) if isinstance(results, dict) else []
 
 
 
 
 
 
 
 
 
105
 
 
 
 
 
106
  if not organic:
107
+ return "๊ฒ€์ƒ‰ ๊ฒฐ๊ณผ๋ฅผ ์ฐพ์„ ์ˆ˜ ์—†์Šต๋‹ˆ๋‹ค."
108
+
109
+ max_results = min(10, len(organic))
 
 
 
 
 
110
  limited_organic = organic[:max_results]
111
 
 
112
  summary_lines = []
113
  for idx, item in enumerate(limited_organic, start=1):
114
+ title = item.get("title", "์ œ๋ชฉ ์—†์Œ")
115
  link = item.get("link", "#")
116
+ snippet = item.get("snippet", "์„ค๋ช… ์—†์Œ")
117
  displayed_link = item.get("displayed_link", link)
118
 
 
119
  summary_lines.append(
120
+ f"### ๊ฒฐ๊ณผ {idx}: {title}\n\n"
121
  f"{snippet}\n\n"
122
  f"**์ถœ์ฒ˜**: [{displayed_link}]({link})\n\n"
123
  f"---\n"
124
  )
125
 
126
+ instructions = """# ์›น ๊ฒ€์ƒ‰ ๊ฒฐ๊ณผ
127
+ ์•„๋ž˜๋Š” ๊ฒ€์ƒ‰ ๊ฒฐ๊ณผ์ž…๋‹ˆ๋‹ค. ๋‹ต๋ณ€ ์‹œ ์ด ์ •๋ณด๋ฅผ ํ™œ์šฉํ•˜์„ธ์š”:
128
+ 1. ๊ฐ ๊ฒฐ๊ณผ์˜ ์ œ๋ชฉ, ๋‚ด์šฉ, ์ถœ์ฒ˜ ๋งํฌ๋ฅผ ์ฐธ์กฐํ•˜์„ธ์š”
129
+ 2. ๊ด€๋ จ ์ถœ์ฒ˜๋ฅผ ๋ช…์‹œ์ ์œผ๋กœ ์ธ์šฉํ•˜์„ธ์š”
130
+ 3. ์—ฌ๋Ÿฌ ์ถœ์ฒ˜์˜ ์ •๋ณด๋ฅผ ์ข…ํ•ฉํ•˜์—ฌ ๋‹ต๋ณ€ํ•˜์„ธ์š”
 
 
 
131
  """
132
 
133
  search_results = instructions + "\n".join(summary_lines)
 
134
  return search_results
135
 
136
  except Exception as e:
137
+ logger.error(f"์›น ๊ฒ€์ƒ‰ ์‹คํŒจ: {e}")
138
+ return f"์›น ๊ฒ€์ƒ‰ ์‹คํŒจ: {str(e)}"
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
139
 
140
  ##############################################################################
141
+ # ๋ฌธ์„œ ์ฒ˜๋ฆฌ ํ•จ์ˆ˜
142
  ##############################################################################
143
  def analyze_csv_file(path: str) -> str:
144
+ """CSV ํŒŒ์ผ ๋ถ„์„"""
 
 
145
  try:
146
  df = pd.read_csv(path)
147
  if df.shape[0] > 50 or df.shape[1] > 10:
148
  df = df.iloc[:50, :10]
149
  df_str = df.to_string()
150
  if len(df_str) > MAX_CONTENT_CHARS:
151
+ df_str = df_str[:MAX_CONTENT_CHARS] + "\n...(์ค‘๋žต)..."
152
+ return f"**[CSV ํŒŒ์ผ: {os.path.basename(path)}]**\n\n{df_str}"
153
  except Exception as e:
154
+ return f"CSV ์ฝ๊ธฐ ์‹คํŒจ ({os.path.basename(path)}): {str(e)}"
 
155
 
156
  def analyze_txt_file(path: str) -> str:
157
+ """TXT ํŒŒ์ผ ๋ถ„์„"""
 
 
158
  try:
159
  with open(path, "r", encoding="utf-8") as f:
160
  text = f.read()
161
  if len(text) > MAX_CONTENT_CHARS:
162
+ text = text[:MAX_CONTENT_CHARS] + "\n...(์ค‘๋žต)..."
163
+ return f"**[TXT ํŒŒ์ผ: {os.path.basename(path)}]**\n\n{text}"
164
  except Exception as e:
165
+ return f"TXT ์ฝ๊ธฐ ์‹คํŒจ ({os.path.basename(path)}): {str(e)}"
 
166
 
167
  def pdf_to_markdown(pdf_path: str) -> str:
168
+ """PDF๋ฅผ ๋งˆํฌ๋‹ค์šด์œผ๋กœ ๋ณ€ํ™˜"""
 
 
169
  text_chunks = []
170
  try:
171
  with open(pdf_path, "rb") as f:
 
177
  page_text = page_text.strip()
178
  if page_text:
179
  if len(page_text) > MAX_CONTENT_CHARS // max_pages:
180
+ page_text = page_text[:MAX_CONTENT_CHARS // max_pages] + "...(์ค‘๋žต)"
181
+ text_chunks.append(f"## ํŽ˜์ด์ง€ {page_num+1}\n\n{page_text}\n")
182
  if len(reader.pages) > max_pages:
183
+ text_chunks.append(f"\n...({max_pages}/{len(reader.pages)} ํŽ˜์ด์ง€ ํ‘œ์‹œ)...")
184
  except Exception as e:
185
+ return f"PDF ์ฝ๊ธฐ ์‹คํŒจ ({os.path.basename(pdf_path)}): {str(e)}"
186
 
187
  full_text = "\n".join(text_chunks)
188
  if len(full_text) > MAX_CONTENT_CHARS:
189
+ full_text = full_text[:MAX_CONTENT_CHARS] + "\n...(์ค‘๋žต)..."
 
 
190
 
191
+ return f"**[PDF ํŒŒ์ผ: {os.path.basename(pdf_path)}]**\n\n{full_text}"
192
 
193
  ##############################################################################
194
+ # ๋ชจ๋ธ ๋กœ๋“œ
195
  ##############################################################################
196
+ @spaces.GPU(duration=120)
197
+ def load_model():
198
+ global model, processor, model_loaded
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
199
 
200
+ if model_loaded:
201
+ logger.info("๋ชจ๋ธ์ด ์ด๋ฏธ ๋กœ๋“œ๋˜์–ด ์žˆ์Šต๋‹ˆ๋‹ค.")
202
+ return True
 
 
 
 
 
203
 
204
+ try:
205
+ logger.info("Gemma3-R1984-4B ๋ชจ๋ธ ๋กœ๋”ฉ ์‹œ์ž‘...")
206
+ clear_cuda_cache()
207
+
208
+ model_id = os.getenv("MODEL_ID", "VIDraft/Gemma-3-R1984-4B")
209
+
210
+ processor = AutoProcessor.from_pretrained(model_id, padding_side="left")
211
+ model = Gemma3ForConditionalGeneration.from_pretrained(
212
+ model_id,
213
+ device_map="auto",
214
+ torch_dtype=torch.bfloat16,
215
+ attn_implementation="eager"
216
+ )
217
+
218
+ model_loaded = True
219
+ logger.info(f"โœ… {model_name} ๋กœ๋”ฉ ์™„๋ฃŒ!")
220
+ return True
221
+
222
+ except Exception as e:
223
+ logger.error(f"๋ชจ๋ธ ๋กœ๋”ฉ ์‹คํŒจ: {e}")
224
+ return False
225
 
226
  ##############################################################################
227
+ # ์ด๋ฏธ์ง€ ๋ถ„์„ (๋กœ๋ด‡ ํƒœ์Šคํฌ ์ค‘์‹ฌ)
228
  ##############################################################################
229
+ @spaces.GPU(duration=60)
230
+ def analyze_image_for_robot(
231
+ image: Union[np.ndarray, Image.Image],
232
+ prompt: str,
233
+ task_type: str = "general",
234
+ use_web_search: bool = False,
235
+ enable_thinking: bool = True,
236
+ max_new_tokens: int = 1024
237
+ ) -> str:
238
+ """๋กœ๋ด‡ ์ž‘์—…์„ ์œ„ํ•œ ์ด๋ฏธ์ง€ ๋ถ„์„"""
239
+ global model, processor
240
 
241
+ if not model_loaded:
242
+ if not load_model():
243
+ return "โŒ ๋ชจ๋ธ ๋กœ๋”ฉ ์‹คํŒจ"
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
244
 
245
+ try:
246
+ # numpy ๋ฐฐ์—ด์„ PIL ์ด๋ฏธ์ง€๋กœ ๋ณ€ํ™˜
247
+ if isinstance(image, np.ndarray):
248
+ image = Image.fromarray(image).convert('RGB')
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
249
 
250
+ # ํƒœ์Šคํฌ๋ณ„ ์‹œ์Šคํ…œ ํ”„๋กฌํ”„ํŠธ ๊ตฌ์„ฑ
251
+ system_prompts = {
252
+ "general": "๋‹น์‹ ์€ ๋กœ๋ด‡ ์‹œ๊ฐ ์‹œ์Šคํ…œ์ž…๋‹ˆ๋‹ค. ์ด๋ฏธ์ง€๋ฅผ ์ž์„ธํžˆ ๋ถ„์„ํ•˜๊ณ  ์„ค๋ช…ํ•˜์„ธ์š”.",
253
+ "planning": """๋‹น์‹ ์€ ๋กœ๋ด‡ ์ž‘์—… ๊ณ„ํš์„ ์ˆ˜๋ฆฝํ•˜๋Š” AI์ž…๋‹ˆ๋‹ค.
254
+ ์ฃผ์–ด์ง„ ์ด๋ฏธ์ง€์™€ ์ž‘์—…์„ ๋ถ„์„ํ•˜์—ฌ ๋‹จ๊ณ„๋ณ„ ์‹คํ–‰ ๊ณ„ํš์„ ์ž‘์„ฑํ•˜์„ธ์š”.
255
+ ํ˜•์‹: Step_1: xxx\nStep_2: xxx\n...\nStep_n: xxx""",
256
+ "grounding": "๋‹น์‹ ์€ ๊ฐ์ฒด ์œ„์น˜๋ฅผ ์ฐพ๋Š” ๋กœ๋ด‡ ์‹œ๊ฐ ์‹œ์Šคํ…œ์ž…๋‹ˆ๋‹ค. ์š”์ฒญ๋œ ๊ฐ์ฒด์˜ ์œ„์น˜๋ฅผ [x1, y1, x2, y2] ์ขŒํ‘œ๋กœ ๋ฐ˜ํ™˜ํ•˜์„ธ์š”.",
257
+ "affordance": "๋‹น์‹ ์€ ๋กœ๋ด‡ ํŒŒ์ง€์ ์„ ๋ถ„์„ํ•˜๋Š” AI์ž…๋‹ˆ๋‹ค. ์ฃผ์–ด์ง„ ์ž‘์—…์„ ์œ„ํ•œ ์ตœ์ ์˜ ํŒŒ์ง€ ์˜์—ญ์„ [x1, y1, x2, y2] ์ขŒํ‘œ๋กœ ์˜ˆ์ธกํ•˜์„ธ์š”.",
258
+ "trajectory": "๋‹น์‹ ์€ ๋กœ๋ด‡ ๊ฒฝ๋กœ๋ฅผ ๊ณ„ํšํ•˜๋Š” AI์ž…๋‹ˆ๋‹ค. ๋ชฉํ‘œ ์ง€์ ๊นŒ์ง€์˜ ๊ฒฝ๋กœ๋ฅผ [(x1,y1), (x2,y2), ...] ํ˜•์‹์œผ๋กœ ์ œ์‹œํ•˜์„ธ์š”.",
259
+ "pointing": "๋‹น์‹ ์€ ๋‹ค์ค‘ ์ง€์ ์„ ์ง€์ •ํ•˜๋Š” ๋กœ๋ด‡ ์‹œ๊ฐ ์‹œ์Šคํ…œ์ž…๋‹ˆ๋‹ค. ์š”์ฒญ๋œ ์œ„์น˜๋“ค์„ [(x1,y1), (x2,y2), ...] ํ˜•์‹์œผ๋กœ ๋ฐ˜ํ™˜ํ•˜์„ธ์š”."
260
+ }
261
+
262
+ system_prompt = system_prompts.get(task_type, system_prompts["general"])
263
+
264
+ # Chain-of-Thought ์ถ”๊ฐ€
265
+ if enable_thinking:
266
+ system_prompt += "\n\n์ถ”๋ก  ๊ณผ์ •์„ <thinking></thinking> ํƒœ๊ทธ ์•ˆ์— ์ƒ์„ธํžˆ ์ž‘์„ฑํ•œ ํ›„ ์ตœ์ข… ๋‹ต๋ณ€์„ ์ œ์‹œํ•˜์„ธ์š”."
267
+
268
+ # ์›น ๊ฒ€์ƒ‰ ์ˆ˜ํ–‰
269
+ combined_system = system_prompt
270
+ if use_web_search:
271
+ keywords = extract_keywords(prompt, top_k=5)
272
+ if keywords:
273
+ logger.info(f"์›น ๊ฒ€์ƒ‰ ํ‚ค์›Œ๋“œ: {keywords}")
274
+ search_results = do_web_search(keywords)
275
+ combined_system = f"{search_results}\n\n{system_prompt}"
276
+
277
+ # ๋ฉ”์‹œ์ง€ ๊ตฌ์„ฑ
278
+ messages = [
279
+ {
280
+ "role": "system",
281
+ "content": [{"type": "text", "text": combined_system}]
282
+ },
283
+ {
284
+ "role": "user",
285
+ "content": [
286
+ {"type": "image", "url": image},
287
+ {"type": "text", "text": prompt}
288
+ ]
289
+ }
290
+ ]
291
+
292
+ # ์ž…๋ ฅ ์ฒ˜๋ฆฌ
293
+ inputs = processor.apply_chat_template(
294
+ messages,
295
+ add_generation_prompt=True,
296
+ tokenize=True,
297
+ return_dict=True,
298
+ return_tensors="pt",
299
+ ).to(device=model.device, dtype=torch.bfloat16)
300
+
301
+ # ์ž…๋ ฅ ํ† ํฐ ์ˆ˜ ์ œํ•œ
302
+ if inputs.input_ids.shape[1] > MAX_INPUT_LENGTH:
303
+ inputs.input_ids = inputs.input_ids[:, -MAX_INPUT_LENGTH:]
304
+ if 'attention_mask' in inputs:
305
+ inputs.attention_mask = inputs.attention_mask[:, -MAX_INPUT_LENGTH:]
306
+
307
+ # ์ƒ์„ฑ
308
+ with torch.no_grad():
309
+ outputs = model.generate(
310
+ **inputs,
311
+ max_new_tokens=max_new_tokens,
312
+ do_sample=True,
313
+ temperature=0.7,
314
+ top_p=0.9,
315
+ )
316
+
317
+ # ๋””์ฝ”๋”ฉ
318
+ response = processor.decode(outputs[0], skip_special_tokens=True)
319
+
320
+ # ํ”„๋กฌํ”„ํŠธ ์ œ๊ฑฐ
321
+ if "Assistant:" in response:
322
+ response = response.split("Assistant:")[-1].strip()
323
+
324
+ return response
325
+
326
+ except Exception as e:
327
+ logger.error(f"์ด๋ฏธ์ง€ ๋ถ„์„ ์˜ค๋ฅ˜: {e}")
328
+ import traceback
329
+ return f"โŒ ๋ถ„์„ ์˜ค๋ฅ˜: {str(e)}\n{traceback.format_exc()}"
330
+ finally:
331
+ clear_cuda_cache()
332
 
333
  ##############################################################################
334
+ # ๋ฌธ์„œ ๋ถ„์„ (์ŠคํŠธ๋ฆฌ๋ฐ)
335
  ##############################################################################
336
  def _model_gen_with_oom_catch(**kwargs):
337
+ """OOM ์ฒ˜๋ฆฌ๋ฅผ ์œ„ํ•œ ์ƒ์„ฑ ํ•จ์ˆ˜"""
338
+ global model
 
339
  try:
340
  model.generate(**kwargs)
341
  except torch.cuda.OutOfMemoryError:
342
+ raise RuntimeError("GPU ๋ฉ”๋ชจ๋ฆฌ ๋ถ€์กฑ. Max Tokens๋ฅผ ์ค„์—ฌ์ฃผ์„ธ์š”.")
 
 
 
343
  finally:
 
344
  clear_cuda_cache()
345
 
 
 
 
 
346
  @spaces.GPU(duration=120)
347
+ def analyze_documents_streaming(
348
+ files: List[str],
349
+ prompt: str,
 
 
350
  use_web_search: bool = False,
351
+ max_new_tokens: int = 2048
352
  ) -> Iterator[str]:
353
+ """๋ฌธ์„œ ๋ถ„์„ (์ŠคํŠธ๋ฆฌ๋ฐ)"""
354
+ global model, processor
355
+
356
+ if not model_loaded:
357
+ if not load_model():
358
+ yield "โŒ ๋ชจ๋ธ ๋กœ๋”ฉ ์‹คํŒจ"
359
+ return
360
 
361
  try:
362
+ # ์‹œ์Šคํ…œ ํ”„๋กฌํ”„ํŠธ
363
+ system_content = "๋‹น์‹ ์€ ๋ฌธ์„œ๋ฅผ ๋ถ„์„ํ•˜๊ณ  ์š”์•ฝํ•˜๋Š” ์ „๋ฌธ AI์ž…๋‹ˆ๋‹ค."
364
+
365
+ # ์›น ๊ฒ€์ƒ‰
 
 
366
  if use_web_search:
367
+ keywords = extract_keywords(prompt, top_k=5)
368
+ if keywords:
369
+ search_results = do_web_search(keywords)
370
+ system_content = f"{search_results}\n\n{system_content}"
371
+
372
+ # ๋ฌธ์„œ ๋‚ด์šฉ ์ฒ˜๋ฆฌ
373
+ doc_contents = []
374
+ for file_path in files:
375
+ if file_path.lower().endswith('.csv'):
376
+ content = analyze_csv_file(file_path)
377
+ elif file_path.lower().endswith('.txt'):
378
+ content = analyze_txt_file(file_path)
379
+ elif file_path.lower().endswith('.pdf'):
380
+ content = pdf_to_markdown(file_path)
 
381
  else:
382
+ continue
383
+ doc_contents.append(content)
384
+
385
+ # ๋ฉ”์‹œ์ง€ ๊ตฌ์„ฑ
386
+ messages = [
387
+ {
388
  "role": "system",
389
+ "content": [{"type": "text", "text": system_content}]
390
+ },
391
+ {
392
+ "role": "user",
393
+ "content": [
394
+ {"type": "text", "text": "\n\n".join(doc_contents) + f"\n\n{prompt}"}
395
+ ]
396
+ }
397
+ ]
398
 
399
+ # ์ž…๋ ฅ ์ฒ˜๋ฆฌ
 
 
 
 
400
  inputs = processor.apply_chat_template(
401
  messages,
402
  add_generation_prompt=True,
 
405
  return_tensors="pt",
406
  ).to(device=model.device, dtype=torch.bfloat16)
407
 
408
+ # ์ŠคํŠธ๋ฆฌ๋ฐ ์„ค์ •
 
 
 
 
 
409
  streamer = TextIteratorStreamer(processor, timeout=30.0, skip_prompt=True, skip_special_tokens=True)
410
  gen_kwargs = dict(
411
  inputs,
412
  streamer=streamer,
413
  max_new_tokens=max_new_tokens,
414
+ temperature=0.8,
415
+ top_p=0.9,
416
  )
417
+
418
+ # ๋ณ„๋„ ์Šค๋ ˆ๋“œ์—์„œ ์ƒ์„ฑ
419
  t = Thread(target=_model_gen_with_oom_catch, kwargs=gen_kwargs)
420
  t.start()
421
+
422
+ # ์ŠคํŠธ๋ฆฌ๋ฐ ์ถœ๋ ฅ
423
  output = ""
424
  for new_text in streamer:
425
  output += new_text
426
  yield output
427
+
428
  except Exception as e:
429
+ logger.error(f"๋ฌธ์„œ ๋ถ„์„ ์˜ค๋ฅ˜: {e}")
430
+ yield f"โŒ ์˜ค๋ฅ˜ ๋ฐœ์ƒ: {str(e)}"
 
431
  finally:
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
432
  clear_cuda_cache()
433
 
 
 
434
  ##############################################################################
435
+ # Gradio UI (๋กœ๋ด‡ ์‹œ๊ฐํ™” ์ค‘์‹ฌ)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
436
  ##############################################################################
437
  css = """
438
+ .robot-header {
439
+ text-align: center;
440
+ background: linear-gradient(135deg, #1e3c72 0%, #2a5298 50%, #667eea 100%);
441
+ color: white;
442
+ padding: 20px;
443
+ border-radius: 10px;
444
+ margin-bottom: 20px;
445
+ box-shadow: 0 4px 6px rgba(0, 0, 0, 0.1);
446
  }
447
+ .status-box {
448
+ text-align: center;
449
+ padding: 10px;
450
+ border-radius: 5px;
451
+ margin: 10px 0;
 
 
 
 
 
 
 
 
 
 
 
 
 
 
452
  font-weight: bold;
 
 
 
 
 
453
  }
454
+ .info-box {
455
+ background: #f0f0f0;
456
+ padding: 15px;
457
+ border-radius: 8px;
458
+ margin: 10px 0;
459
+ border-left: 4px solid #2a5298;
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
460
  }
461
+ .task-button {
462
+ min-height: 60px;
463
+ font-size: 1.1em;
 
464
  }
465
+ .webcam-container {
466
+ border: 3px solid #2a5298;
467
+ border-radius: 10px;
468
+ padding: 10px;
469
+ background: #f8f9fa;
 
 
 
 
470
  }
471
  """
472
 
473
+ with gr.Blocks(title="๐Ÿค– ๋กœ๋ด‡ ์‹œ๊ฐ ์‹œ์Šคํ…œ (Gemma3-4B)", css=css) as demo:
474
+ gr.HTML("""
475
+ <div class="robot-header">
476
+ <h1>๐Ÿค– ๋กœ๋ด‡ ์‹œ๊ฐ ์‹œ์Šคํ…œ</h1>
477
+ <h3>๐ŸŽฎ Gemma3-R1984-4B + ๐Ÿ“ท ์‹ค์‹œ๊ฐ„ ์›น์บ  + ๐Ÿ” ์›น ๊ฒ€์ƒ‰</h3>
478
+ <p>โšก ์ตœ์‹  ๋ฉ€ํ‹ฐ๋ชจ๋‹ฌ AI๋กœ ๋กœ๋ด‡ ์ž‘์—… ๋ถ„์„ ๋ฐ ๊ณ„ํš ์ˆ˜๋ฆฝ!</p>
479
+ </div>
480
+ """)
481
+
482
+ gr.HTML("""
483
+ <div class="info-box">
484
+ <h4>๐ŸŒŸ ์‹œ์Šคํ…œ ํŠน์ง•:</h4>
485
+ <ul>
486
+ <li>๐Ÿ–ผ๏ธ ๊ณ ๊ธ‰ ์ด๋ฏธ์ง€/๋น„๋””์˜ค ๋ถ„์„ (Gemma3-4B VLM)</li>
487
+ <li>๐Ÿ“‹ ๋‹ค๋‹จ๊ณ„ ์ž‘์—… ๊ณ„ํš ๋ฐ ์ถ”๋ก </li>
488
+ <li>๐Ÿ“ ์ •๋ฐ€ํ•œ ๊ฐ์ฒด ์œ„์น˜ ํŒŒ์•… (Grounding)</li>
489
+ <li>๐Ÿค ๋กœ๋ด‡ ํŒŒ์ง€์  ๋ถ„์„ (Affordance)</li>
490
+ <li>๐Ÿ›ค๏ธ ๊ฒฝ๋กœ ๊ณ„ํš (Trajectory Planning)</li>
491
+ <li>๐Ÿ” ์‹ค์‹œ๊ฐ„ ์›น ๊ฒ€์ƒ‰ ํ†ตํ•ฉ</li>
492
+ <li>๐Ÿ“„ ๋ฌธ์„œ ๋ถ„์„ (PDF, CSV, TXT)</li>
493
+ </ul>
494
+ </div>
495
+ """)
496
+
497
+ with gr.Row():
498
+ # ์™ผ์ชฝ: ์›น์บ  ๋ฐ ์ž…๋ ฅ
499
+ with gr.Column(scale=1):
500
+ gr.Markdown("### ๐Ÿ“ท ์‹ค์‹œ๊ฐ„ ์›น์บ ")
501
+
502
+ with gr.Group(elem_classes="webcam-container"):
503
+ webcam = gr.Image(
504
+ sources=["webcam"],
505
+ streaming=True,
506
+ type="numpy",
507
+ label="์‹ค์‹œ๊ฐ„ ์ŠคํŠธ๋ฆฌ๋ฐ",
508
+ height=350
509
+ )
510
+
511
+ # ์บก์ฒ˜๋œ ์ด๋ฏธ์ง€ ํ‘œ์‹œ
512
+ captured_image = gr.Image(
513
+ label="์บก์ฒ˜๋œ ์ด๋ฏธ์ง€",
514
+ height=200,
515
+ visible=False
516
+ )
517
+
518
+ # ๋กœ๋ด‡ ์ž‘์—… ๋ฒ„ํŠผ๋“ค
519
+ gr.Markdown("### ๐ŸŽฏ ๋กœ๋ด‡ ์ž‘์—… ์„ ํƒ")
520
+ with gr.Row():
521
+ capture_btn = gr.Button("๐Ÿ“ธ ์บก์ฒ˜", variant="primary", elem_classes="task-button")
522
+ clear_capture_btn = gr.Button("๐Ÿ—‘๏ธ ์ดˆ๊ธฐํ™”", elem_classes="task-button")
523
+
524
+ with gr.Row():
525
+ planning_btn = gr.Button("๐Ÿ“‹ ์ž‘์—… ๊ณ„ํš", elem_classes="task-button")
526
+ grounding_btn = gr.Button("๐Ÿ“ ๊ฐ์ฒด ์œ„์น˜", elem_classes="task-button")
527
+
528
+ with gr.Row():
529
+ affordance_btn = gr.Button("๐Ÿค ํŒŒ์ง€์  ๋ถ„์„", elem_classes="task-button")
530
+ trajectory_btn = gr.Button("๐Ÿ›ค๏ธ ๊ฒฝ๋กœ ๊ณ„ํš", elem_classes="task-button")
531
+
532
+ # ์˜ค๋ฅธ์ชฝ: ๋ถ„์„ ์„ค์ • ๋ฐ ๊ฒฐ๊ณผ
533
+ with gr.Column(scale=2):
534
+ gr.Markdown("### โš™๏ธ ๋ถ„์„ ์„ค์ •")
535
+
536
+ with gr.Row():
537
+ with gr.Column():
538
+ task_prompt = gr.Textbox(
539
+ label="์ž‘์—… ์„ค๋ช… / ์งˆ๋ฌธ",
540
+ placeholder="์˜ˆ: ํ…Œ์ด๋ธ” ์œ„์˜ ์ปต์„ ์žก์•„์„œ ์‹ฑํฌ๋Œ€์— ๋†“๊ธฐ",
541
+ value="์ด ์žฅ๋ฉด์—์„œ ๋กœ๋ด‡์ด ์ˆ˜ํ–‰ํ•  ์ˆ˜ ์žˆ๋Š” ์ž‘์—…์„ ๋ถ„์„ํ•˜์„ธ์š”.",
542
+ lines=2
543
+ )
544
+
545
+ with gr.Row():
546
+ use_web_search = gr.Checkbox(
547
+ label="๐Ÿ” ์›น ๊ฒ€์ƒ‰ ์‚ฌ์šฉ",
548
+ value=False,
549
+ info="๊ด€๋ จ ์ •๋ณด๋ฅผ ์›น์—์„œ ๊ฒ€์ƒ‰ํ•ฉ๋‹ˆ๋‹ค"
550
+ )
551
+
552
+ enable_thinking = gr.Checkbox(
553
+ label="๐Ÿค” ์ถ”๋ก  ๊ณผ์ • ํ‘œ์‹œ",
554
+ value=True,
555
+ info="Chain-of-Thought ์ถ”๋ก  ๊ณผ์ •์„ ๋ณด์—ฌ์ค๋‹ˆ๋‹ค"
556
+ )
557
+
558
+ max_tokens = gr.Slider(
559
+ label="์ตœ๋Œ€ ํ† ํฐ ์ˆ˜",
560
+ minimum=256,
561
+ maximum=4096,
562
+ value=1024,
563
+ step=256
564
+ )
565
+
566
+ gr.Markdown("### ๐Ÿ“Š ๋ถ„์„ ๊ฒฐ๊ณผ")
567
+ result_output = gr.Textbox(
568
+ label="AI ๋ถ„์„ ๊ฒฐ๊ณผ",
569
+ lines=20,
570
+ max_lines=40,
571
+ show_copy_button=True,
572
+ elem_id="result"
573
+ )
574
+
575
+ status_display = gr.HTML(
576
+ '<div class="status-box" style="background:#d4edda; color:#155724;">๐ŸŽฎ ์‹œ์Šคํ…œ ์ค€๋น„ ์™„๋ฃŒ</div>'
577
+ )
578
+
579
+ # ๋ฌธ์„œ ๋ถ„์„ ํƒญ
580
+ with gr.Tab("๐Ÿ“„ ๋ฌธ์„œ ๋ถ„์„"):
581
+ with gr.Row():
582
+ with gr.Column():
583
+ doc_files = gr.File(
584
+ label="๋ฌธ์„œ ์—…๋กœ๋“œ",
585
+ file_count="multiple",
586
+ file_types=[".pdf", ".csv", ".txt"],
587
+ type="filepath"
588
+ )
589
+
590
+ doc_prompt = gr.Textbox(
591
+ label="๋ถ„์„ ์š”์ฒญ",
592
+ placeholder="์˜ˆ: ์ด ๋ฌธ์„œ๋“ค์˜ ํ•ต์‹ฌ ๋‚ด์šฉ์„ ์š”์•ฝํ•˜๊ณ  ๋น„๊ต ๋ถ„์„ํ•˜์„ธ์š”.",
593
+ lines=3
594
+ )
595
+
596
+ doc_web_search = gr.Checkbox(
597
+ label="๐Ÿ” ์›น ๊ฒ€์ƒ‰ ์‚ฌ์šฉ",
598
+ value=False
599
+ )
600
+
601
+ analyze_docs_btn = gr.Button("๐Ÿ“Š ๋ฌธ์„œ ๋ถ„์„", variant="primary")
602
+
603
+ with gr.Column():
604
+ doc_result = gr.Textbox(
605
+ label="๋ถ„์„ ๊ฒฐ๊ณผ",
606
+ lines=25,
607
+ max_lines=50
608
+ )
609
+
610
+ # ์ด๋ฒคํŠธ ํ•ธ๋“ค๋Ÿฌ
611
+ webcam_state = gr.State(None)
612
+
613
+ def capture_webcam(frame):
614
+ """์›น์บ  ํ”„๋ ˆ์ž„ ์บก์ฒ˜"""
615
+ if frame is None:
616
+ return None, None, '<div class="status-box" style="background:#f8d7da; color:#721c24;">โŒ ์›น์บ  ํ”„๋ ˆ์ž„ ์—†์Œ</div>'
617
+ return frame, gr.update(value=frame, visible=True), '<div class="status-box" style="background:#d4edda; color:#155724;">โœ… ์ด๋ฏธ์ง€ ์บก์ฒ˜ ์™„๋ฃŒ</div>'
618
+
619
+ def clear_capture():
620
+ """์บก์ฒ˜ ์ดˆ๊ธฐํ™”"""
621
+ return None, gr.update(visible=False), '<div class="status-box" style="background:#d4edda; color:#155724;">๐ŸŽฎ ์‹œ์Šคํ…œ ์ค€๋น„ ์™„๋ฃŒ</div>'
622
+
623
+ def analyze_with_task(image, prompt, task_type, use_search, thinking, tokens):
624
+ """ํŠน์ • ํƒœ์Šคํฌ๋กœ ์ด๋ฏธ์ง€ ๋ถ„์„"""
625
+ if image is None:
626
+ return "โŒ ๋จผ์ € ์ด๋ฏธ์ง€๋ฅผ ์บก์ฒ˜ํ•˜์„ธ์š”.", '<div class="status-box" style="background:#f8d7da; color:#721c24;">โŒ ์ด๋ฏธ์ง€ ์—†์Œ</div>'
627
+
628
+ status = f'<div class="status-box" style="background:#cce5ff; color:#004085;">๐Ÿš€ {task_type} ๋ถ„์„ ์ค‘...</div>'
629
+
630
+ result = analyze_image_for_robot(
631
+ image=image,
632
+ prompt=prompt,
633
+ task_type=task_type,
634
+ use_web_search=use_search,
635
+ enable_thinking=thinking,
636
+ max_new_tokens=tokens
637
+ )
638
+
639
+ # ๊ฒฐ๊ณผ ํฌ๋งทํŒ…
640
+ timestamp = time.strftime("%H:%M:%S")
641
+ task_names = {
642
+ "planning": "์ž‘์—… ๊ณ„ํš",
643
+ "grounding": "๊ฐ์ฒด ์œ„์น˜ ํŒŒ์•…",
644
+ "affordance": "ํŒŒ์ง€์  ๋ถ„์„",
645
+ "trajectory": "๊ฒฝ๋กœ ๊ณ„ํš"
646
+ }
647
+
648
+ formatted_result = f"""๐Ÿค– ๋กœ๋ด‡ {task_names.get(task_type, '๋ถ„์„')} ๊ฒฐ๊ณผ:
649
 
650
+ ๐Ÿ“ธ **์ž‘์—…**: {prompt}
651
 
652
+ ๐Ÿ“ **๋ถ„์„ ๊ฒฐ๊ณผ**:
653
+ {result}
654
 
655
+ โฐ ๋ถ„์„ ์‹œ๊ฐ„: {timestamp}
656
+ ๐ŸŽฏ ๋ชจ๋ธ: {model_name}
657
+ ๐Ÿ”ง ํƒœ์Šคํฌ: {task_type}"""
658
+
659
+ complete_status = '<div class="status-box" style="background:#d4edda; color:#155724;">โœ… ๋ถ„์„ ์™„๋ฃŒ!</div>'
660
+ return formatted_result, complete_status
661
+
662
+ # ์›น์บ  ์ŠคํŠธ๋ฆฌ๋ฐ
663
+ webcam.stream(
664
+ fn=lambda x: x,
665
+ inputs=[webcam],
666
+ outputs=[webcam_state]
667
  )
668
+
669
+ # ์บก์ฒ˜ ๋ฒ„ํŠผ
670
+ capture_btn.click(
671
+ fn=capture_webcam,
672
+ inputs=[webcam_state],
673
+ outputs=[webcam_state, captured_image, status_display]
674
  )
675
 
676
+ # ์ดˆ๊ธฐํ™” ๋ฒ„ํŠผ
677
+ clear_capture_btn.click(
678
+ fn=clear_capture,
679
+ outputs=[webcam_state, captured_image, status_display]
 
 
 
680
  )
681
 
682
+ # ์ž‘์—… ๋ฒ„ํŠผ๋“ค
683
+ planning_btn.click(
684
+ fn=lambda img, p, s, t, tk: analyze_with_task(img, p, "planning", s, t, tk),
685
+ inputs=[captured_image, task_prompt, use_web_search, enable_thinking, max_tokens],
686
+ outputs=[result_output, status_display]
687
  )
688
 
689
+ grounding_btn.click(
690
+ fn=lambda img, p, s, t, tk: analyze_with_task(img, p, "grounding", s, t, tk),
691
+ inputs=[captured_image, task_prompt, use_web_search, enable_thinking, max_tokens],
692
+ outputs=[result_output, status_display]
693
+ )
694
+
695
+ affordance_btn.click(
696
+ fn=lambda img, p, s, t, tk: analyze_with_task(img, p, "affordance", s, t, tk),
697
+ inputs=[captured_image, task_prompt, use_web_search, enable_thinking, max_tokens],
698
+ outputs=[result_output, status_display]
699
+ )
700
+
701
+ trajectory_btn.click(
702
+ fn=lambda img, p, s, t, tk: analyze_with_task(img, p, "trajectory", s, t, tk),
703
+ inputs=[captured_image, task_prompt, use_web_search, enable_thinking, max_tokens],
704
+ outputs=[result_output, status_display]
705
+ )
706
+
707
+ # ๋ฌธ์„œ ๋ถ„์„
708
+ def analyze_docs(files, prompt, use_search):
709
+ if not files:
710
+ return "โŒ ๋ฌธ์„œ๋ฅผ ์—…๋กœ๋“œํ•˜์„ธ์š”."
711
+
712
+ output = ""
713
+ for chunk in analyze_documents_streaming(files, prompt, use_search):
714
+ output = chunk
715
+ return output
716
+
717
+ analyze_docs_btn.click(
718
+ fn=analyze_docs,
719
+ inputs=[doc_files, doc_prompt, doc_web_search],
720
+ outputs=[doc_result]
721
+ )
722
+
723
+ # ์ดˆ๊ธฐ ๋ชจ๋ธ ๋กœ๋“œ
724
+ def initial_load():
725
+ load_model()
726
+ return "์‹œ์Šคํ…œ ์ค€๋น„ ์™„๋ฃŒ! ๐Ÿš€"
727
+
728
+ demo.load(
729
+ fn=initial_load,
730
+ outputs=None
731
  )
 
 
 
 
 
 
732
 
733
  if __name__ == "__main__":
734
+ print("๐Ÿš€ ๋กœ๋ด‡ ์‹œ๊ฐ ์‹œ์Šคํ…œ ์‹œ์ž‘ (Gemma3-R1984-4B)...")
735
+ demo.launch(
736
+ server_name="0.0.0.0",
737
+ server_port=7860,
738
+ share=False,
739
+ show_error=True,
740
+ debug=False
741
+ )