ROBO-R1984

Build error

App Files Community

openfree committed on Jun 16, 2025

Commit

3116318

verified ·

1 Parent(s): 59be132

Update app.py

Browse files

Files changed (1) hide show

app.py +138 -131

app.py CHANGED Viewed

@@ -15,10 +15,15 @@ import torch
 import numpy as np
 from loguru import logger
 from PIL import Image
-from transformers import AutoProcessor, Gemma3ForConditionalGeneration, TextIteratorStreamer
 import time
 import warnings
 from typing import Dict, List, Optional, Union
 # CSV/TXT 분석
 import pandas as pd
@@ -27,7 +32,7 @@ import PyPDF2
 warnings.filterwarnings('ignore')
-print("🎮 로봇 시각 시스템 초기화 (Gemma3-R1984-4B)...")
 ##############################################################################
 # 상수 정의
@@ -40,10 +45,9 @@ SERPHOUSE_API_KEY = os.getenv("SERPHOUSE_API_KEY", "")
 ##############################################################################
 # 전역 변수
 ##############################################################################
-model = None
-processor = None
 model_loaded = False
-model_name = "Gemma3-R1984-4B"
 ##############################################################################
 # 메모리 관리
@@ -85,8 +89,8 @@ def do_web_search(query: str) -> str:
             "domain": "google.com",
             "serp_type": "web",
             "device": "desktop",
-            "lang": "ko",  # 한국어 우선
-            "num": "10"   # 10개로 제한
         }
         headers = {
@@ -190,29 +194,57 @@ def pdf_to_markdown(pdf_path: str) -> str:
     return f"**[PDF 파일: {os.path.basename(pdf_path)}]**\n\n{full_text}"
 ##############################################################################
 # 모델 로드
 ##############################################################################
 @spaces.GPU(duration=120)
 def load_model():
-    global model, processor, model_loaded
     if model_loaded:
         logger.info("모델이 이미 로드되어 있습니다.")
         return True
     try:
-        logger.info("Gemma3-R1984-4B 모델 로딩 시작...")
         clear_cuda_cache()
-        model_id = os.getenv("MODEL_ID", "VIDraft/Gemma-3-R1984-4B")
-        processor = AutoProcessor.from_pretrained(model_id, padding_side="left")
-        model = Gemma3ForConditionalGeneration.from_pretrained(
-            model_id,
-            device_map="auto",
-            torch_dtype=torch.bfloat16,
-            attn_implementation="eager"
         )
         model_loaded = True
@@ -223,6 +255,38 @@ def load_model():
         logger.error(f"모델 로딩 실패: {e}")
         return False
 ##############################################################################
 # 이미지 분석 (로봇 태스크 중심)
 ##############################################################################
@@ -232,22 +296,21 @@ def analyze_image_for_robot(
     prompt: str,
     task_type: str = "general",
     use_web_search: bool = False,
-    enable_thinking: bool = False,  # 기본값 False로 변경
-    max_new_tokens: int = 300  # 장면 설명을 위해 300으로 증가
 ) -> str:
     """로봇 작업을 위한 이미지 분석"""
-    global model, processor
     if not model_loaded:
         if not load_model():
             return "❌ 모델 로딩 실패"
     try:
-        # numpy 배열을 PIL 이미지로 변환
-        if isinstance(image, np.ndarray):
-            image = Image.fromarray(image).convert('RGB')
-        # 태스크별 시스템 프롬프트 구성 (더 간결하게)
         system_prompts = {
             "general": "당신은 로봇 시각 시스템입니다. 먼저 장면을 1-2줄로 설명하고, 핵심 내용을 간결하게 분석하세요.",
             "planning": """당신은 로봇 작업 계획 AI입니다.
@@ -281,64 +344,21 @@ Step_n: xxx""",
                 combined_system = f"{search_results}\n\n{system_prompt}"
         # 메시지 구성
-        messages = [
-            {
-                "role": "system",
-                "content": [{"type": "text", "text": combined_system}]
-            },
-            {
-                "role": "user",
-                "content": [
-                    {"type": "image", "url": image},
-                    {"type": "text", "text": prompt}
-                ]
-            }
-        ]
-        # 입력 처리
-        inputs = processor.apply_chat_template(
-            messages,
-            add_generation_prompt=True,
-            tokenize=True,
-            return_dict=True,
-            return_tensors="pt",
-        ).to(device=model.device, dtype=torch.bfloat16)
-        # 입력 토큰 수 제한
-        if inputs.input_ids.shape[1] > MAX_INPUT_LENGTH:
-            inputs.input_ids = inputs.input_ids[:, -MAX_INPUT_LENGTH:]
-            if 'attention_mask' in inputs:
-                inputs.attention_mask = inputs.attention_mask[:, -MAX_INPUT_LENGTH:]
         # 생성
-        with torch.no_grad():
-            outputs = model.generate(
-                **inputs,
-                max_new_tokens=max_new_tokens,
-                do_sample=True,
-                temperature=0.7,
-                top_p=0.9,
-                pad_token_id=processor.tokenizer.pad_token_id,
-                eos_token_id=processor.tokenizer.eos_token_id,
-            )
-        # 입력 토큰 제거하여 출력만 추출
-        generated_tokens = outputs[0][inputs.input_ids.shape[1]:]
-        # 디코딩
-        response = processor.decode(generated_tokens, skip_special_tokens=True).strip()
-        # 프롬프트 제거 및 정리
-        # 이미 입력 토큰을 제거했으므로 추가 정리만 수행
-        response = response.strip()
-        # 혹시 남아있는 불필요한 텍스트 제거
-        if response.startswith("model\n"):
-            response = response[6:].strip()
-        elif response.startswith("model"):
-            response = response[5:].strip()
-        return response
     except Exception as e:
         logger.error(f"이미지 분석 오류: {e}")
@@ -350,16 +370,6 @@ Step_n: xxx""",
 ##############################################################################
 # 문서 분석 (스트리밍)
 ##############################################################################
-def _model_gen_with_oom_catch(**kwargs):
-    """OOM 처리를 위한 생성 함수"""
-    global model
-    try:
-        model.generate(**kwargs)
-    except torch.cuda.OutOfMemoryError:
-        raise RuntimeError("GPU 메모리 부족. Max Tokens를 줄여주세요.")
-    finally:
-        clear_cuda_cache()
 @spaces.GPU(duration=120)
 def analyze_documents_streaming(
     files: List[str],
@@ -368,7 +378,7 @@ def analyze_documents_streaming(
     max_new_tokens: int = 2048
 ) -> Iterator[str]:
     """문서 분석 (스트리밍)"""
-    global model, processor
     if not model_loaded:
         if not load_model():
@@ -399,48 +409,32 @@ def analyze_documents_streaming(
                 continue
             doc_contents.append(content)
         # 메시지 구성
         messages = [
-            {
-                "role": "system",
-                "content": [{"type": "text", "text": system_content}]
-            },
-            {
-                "role": "user",
-                "content": [
-                    {"type": "text", "text": "\n\n".join(doc_contents) + f"\n\n{prompt}"}
-                ]
-            }
         ]
-        # 입력 처리
-        inputs = processor.apply_chat_template(
-            messages,
-            add_generation_prompt=True,
-            tokenize=True,
-            return_dict=True,
-            return_tensors="pt",
-        ).to(device=model.device, dtype=torch.bfloat16)
-        # 스트리밍 설정
-        streamer = TextIteratorStreamer(processor, timeout=30.0, skip_prompt=True, skip_special_tokens=True)
-        gen_kwargs = dict(
-            inputs,
-            streamer=streamer,
-            max_new_tokens=max_new_tokens,
             temperature=0.8,
             top_p=0.9,
         )
-        # 별도 스레드에서 생성
-        t = Thread(target=_model_gen_with_oom_catch, kwargs=gen_kwargs)
-        t.start()
         # 스트리밍 출력
         output = ""
-        for new_text in streamer:
-            output += new_text
-            yield output
     except Exception as e:
         logger.error(f"문서 분석 오류: {e}")
@@ -494,17 +488,30 @@ css = """
     background: #e8f5e9;
     color: #2e7d32;
 }
 """
-with gr.Blocks(title="🤖 로봇 시각 시스템 (Gemma3-4B)", css=css) as demo:
     gr.HTML("""
     <div class="robot-header">
         <h1>🤖 로봇 시각 시스템</h1>
-        <h3>🎮 Gemma3-R1984-4B + 📷 실시간 웹캠 + 🔍 웹 검색</h3>
-        <p>⚡ 최신 멀티모달 AI로 로봇 작업 분석 및 계획 수립!</p>
     </div>
     """)
     with gr.Row():
         # 왼쪽: 웹캠 및 입력
@@ -575,15 +582,15 @@ with gr.Blocks(title="🤖 로봇 시각 시스템 (Gemma3-4B)", css=css) as dem
                         enable_thinking = gr.Checkbox(
                             label="🤔 추론 과정 표시",
-                            value=False,  # 기본값 False로 변경
                             info="Chain-of-Thought 추론 과정을 보여줍니다"
                         )
                     max_tokens = gr.Slider(
                         label="최대 토큰 수",
                         minimum=100,
-                        maximum=4096,
-                        value=300,  # 장면 설명을 위해 300으로 증가
                         step=50
                     )
@@ -600,8 +607,8 @@ with gr.Blocks(title="🤖 로봇 시각 시스템 (Gemma3-4B)", css=css) as dem
                 '<div class="status-box" style="background:#d4edda; color:#155724;">🎮 시스템 준비 완료</div>'
             )
-    # 문서 분석 탭 (숨김 처리)
-    with gr.Tab("📄 문서 분석", visible=False):  # visible=False로 숨김
         with gr.Row():
             with gr.Column():
                 doc_files = gr.File(
@@ -661,7 +668,7 @@ with gr.Blocks(title="🤖 로봇 시각 시스템 (Gemma3-4B)", css=css) as dem
             max_new_tokens=tokens
         )
-        # 결과 포맷팅 (더 간결하게)
         timestamp = time.strftime("%H:%M:%S")
         task_names = {
             "planning": "작업 계획",
@@ -776,7 +783,7 @@ with gr.Blocks(title="🤖 로봇 시각 시스템 (Gemma3-4B)", css=css) as dem
     )
     # 자동 캡처 타이머 (10초마다)
-    timer = gr.Timer(10.0, active=False)  # 10초 타이머, 초기에는 비활성화
     # 자동 캡처 토글 이벤트
     def toggle_auto_capture(enabled):
@@ -809,7 +816,7 @@ with gr.Blocks(title="🤖 로봇 시각 시스템 (Gemma3-4B)", css=css) as dem
     )
 if __name__ == "__main__":
-    print("🚀 로봇 시각 시스템 시작 (Gemma3-R1984-4B)...")
     demo.launch(
         server_name="0.0.0.0",
         server_port=7860,

 import numpy as np
 from loguru import logger
 from PIL import Image
 import time
 import warnings
 from typing import Dict, List, Optional, Union
+import base64
+from io import BytesIO
+# llama-cpp-python for GGUF
+from llama_cpp import Llama
+from llama_cpp.llama_chat_format import Llava16ChatHandler
 # CSV/TXT 분석
 import pandas as pd
 warnings.filterwarnings('ignore')
+print("🎮 로봇 시각 시스템 초기화 (Gemma3-R1984-4B GGUF Q4_K_M)...")
 ##############################################################################
 # 상수 정의
 ##############################################################################
 # 전역 변수
 ##############################################################################
+llm = None
 model_loaded = False
+model_name = "Gemma3-R1984-4B-Q4_K_M"
 ##############################################################################
 # 메모리 관리
             "domain": "google.com",
             "serp_type": "web",
             "device": "desktop",
+            "lang": "ko",
+            "num": "10"
         }
         headers = {
     return f"**[PDF 파일: {os.path.basename(pdf_path)}]**\n\n{full_text}"
+##############################################################################
+# 이미지를 base64로 변환
+##############################################################################
+def image_to_base64_data_uri(image: Union[np.ndarray, Image.Image]) -> str:
+    """이미지를 base64 data URI로 변환"""
+    if isinstance(image, np.ndarray):
+        image = Image.fromarray(image).convert('RGB')
+    buffered = BytesIO()
+    image.save(buffered, format="JPEG", quality=85)
+    img_str = base64.b64encode(buffered.getvalue()).decode()
+    return f"data:image/jpeg;base64,{img_str}"
 ##############################################################################
 # 모델 로드
 ##############################################################################
 @spaces.GPU(duration=120)
 def load_model():
+    global llm, model_loaded
     if model_loaded:
         logger.info("모델이 이미 로드되어 있습니다.")
         return True
     try:
+        logger.info("Gemma3-R1984-4B GGUF Q4_K_M 모델 로딩 시작...")
         clear_cuda_cache()
+        # 모델 경로 설정
+        model_path = os.getenv("MODEL_PATH", "VIDraft/Gemma-3-R1984-4B-GGUF/Gemma-3-R1984-4B.Q4_K_M.gguf")
+        mmproj_path = os.getenv("MMPROJ_PATH", "VIDraft/Gemma-3-R1984-4B-GGUF/Gemma-3-R1984-4B.mmproj-Q8_0.gguf")
+        # GPU 사용 가능 여부 확인
+        n_gpu_layers = -1 if torch.cuda.is_available() else 0
+        # 채팅 핸들러 생성 (비전 지원)
+        chat_handler = Llava16ChatHandler(
+            clip_model_path=mmproj_path,
+            verbose=False
+        )
+        # 모델 로드
+        llm = Llama(
+            model_path=model_path,
+            chat_handler=chat_handler,
+            n_ctx=4096,  # 컨텍스트 크기
+            n_gpu_layers=n_gpu_layers,  # GPU 레이어
+            n_threads=8,  # CPU 스레드
+            verbose=False,
+            seed=42,
+            logits_all=True,  # 비전 모델에 필요
         )
         model_loaded = True
         logger.error(f"모델 로딩 실패: {e}")
         return False
+##############################################################################
+# 채팅 템플릿 포맷팅
+##############################################################################
+def format_chat_prompt(system_prompt: str, user_prompt: str, image_uri: Optional[str] = None) -> List[Dict]:
+    """Gemma 스타일 채팅 프롬프트 생성"""
+    messages = []
+    # 시스템 메시지
+    messages.append({
+        "role": "system",
+        "content": system_prompt
+    })
+    # 사용자 메시지
+    user_content = []
+    if image_uri:
+        user_content.append({
+            "type": "image_url",
+            "image_url": {"url": image_uri}
+        })
+    user_content.append({
+        "type": "text",
+        "text": user_prompt
+    })
+    messages.append({
+        "role": "user",
+        "content": user_content
+    })
+    return messages
 ##############################################################################
 # 이미지 분석 (로봇 태스크 중심)
 ##############################################################################
     prompt: str,
     task_type: str = "general",
     use_web_search: bool = False,
+    enable_thinking: bool = False,
+    max_new_tokens: int = 300
 ) -> str:
     """로봇 작업을 위한 이미지 분석"""
+    global llm
     if not model_loaded:
         if not load_model():
             return "❌ 모델 로딩 실패"
     try:
+        # 이미지를 base64로 변환
+        image_uri = image_to_base64_data_uri(image)
+        # 태스크별 시스템 프롬프트 구성
         system_prompts = {
             "general": "당신은 로봇 시각 시스템입니다. 먼저 장면을 1-2줄로 설명하고, 핵심 내용을 간결하게 분석하세요.",
             "planning": """당신은 로봇 작업 계획 AI입니다.
                 combined_system = f"{search_results}\n\n{system_prompt}"
         # 메시지 구성
+        messages = format_chat_prompt(combined_system, prompt, image_uri)
         # 생성
+        response = llm.create_chat_completion(
+            messages=messages,
+            max_tokens=max_new_tokens,
+            temperature=0.7,
+            top_p=0.9,
+            stream=False
+        )
+        # 응답 추출
+        result = response['choices'][0]['message']['content'].strip()
+        return result
     except Exception as e:
         logger.error(f"이미지 분석 오류: {e}")
 ##############################################################################
 # 문서 분석 (스트리밍)
 ##############################################################################
 @spaces.GPU(duration=120)
 def analyze_documents_streaming(
     files: List[str],
     max_new_tokens: int = 2048
 ) -> Iterator[str]:
     """문서 분석 (스트리밍)"""
+    global llm
     if not model_loaded:
         if not load_model():
                 continue
             doc_contents.append(content)
+        # 전체 프롬프트 구성
+        full_prompt = "\n\n".join(doc_contents) + f"\n\n{prompt}"
         # 메시지 구성
         messages = [
+            {"role": "system", "content": system_content},
+            {"role": "user", "content": full_prompt}
         ]
+        # 스트리밍 생성
+        stream = llm.create_chat_completion(
+            messages=messages,
+            max_tokens=max_new_tokens,
             temperature=0.8,
             top_p=0.9,
+            stream=True
         )
         # 스트리밍 출력
         output = ""
+        for chunk in stream:
+            if 'choices' in chunk and len(chunk['choices']) > 0:
+                delta = chunk['choices'][0].get('delta', {})
+                if 'content' in delta:
+                    output += delta['content']
+                    yield output
     except Exception as e:
         logger.error(f"문서 분석 오류: {e}")
     background: #e8f5e9;
     color: #2e7d32;
 }
+.model-info {
+    background: #fff3cd;
+    color: #856404;
+    padding: 10px;
+    border-radius: 5px;
+    margin: 10px 0;
+    text-align: center;
+}
 """
+with gr.Blocks(title="🤖 로봇 시각 시스템 (Gemma3-4B GGUF)", css=css) as demo:
     gr.HTML("""
     <div class="robot-header">
         <h1>🤖 로봇 시각 시스템</h1>
+        <h3>🎮 Gemma3-R1984-4B GGUF Q4_K_M + 📷 실시간 웹캠 + 🔍 웹 검색</h3>
+        <p>⚡ 양자화 모델로 더 빠르고 효율적인 로봇 작업 분석!</p>
     </div>
     """)
+    gr.HTML("""
+    <div class="model-info">
+        <strong>모델:</strong> Gemma3-R1984-4B Q4_K_M (2.49GB) | <strong>메모리 사용:</strong> ~3-4GB VRAM
+    </div>
+    """)
     with gr.Row():
         # 왼쪽: 웹캠 및 입력
                         enable_thinking = gr.Checkbox(
                             label="🤔 추론 과정 표시",
+                            value=False,
                             info="Chain-of-Thought 추론 과정을 보여줍니다"
                         )
                     max_tokens = gr.Slider(
                         label="최대 토큰 수",
                         minimum=100,
+                        maximum=2048,
+                        value=300,
                         step=50
                     )
                 '<div class="status-box" style="background:#d4edda; color:#155724;">🎮 시스템 준비 완료</div>'
             )
+    # 문서 분석 탭
+    with gr.Tab("📄 문서 분석", visible=False):
         with gr.Row():
             with gr.Column():
                 doc_files = gr.File(
             max_new_tokens=tokens
         )
+        # 결과 포맷팅
         timestamp = time.strftime("%H:%M:%S")
         task_names = {
             "planning": "작업 계획",
     )
     # 자동 캡처 타이머 (10초마다)
+    timer = gr.Timer(10.0, active=False)
     # 자동 캡처 토글 이벤트
     def toggle_auto_capture(enabled):
     )
 if __name__ == "__main__":
+    print("🚀 로봇 시각 시스템 시작 (Gemma3-R1984-4B GGUF Q4_K_M)...")
     demo.launch(
         server_name="0.0.0.0",
         server_port=7860,