Dongjin1203 committed on
Commit
03c7f28
ยท
1 Parent(s): 05fc904

Test GGUF with lightweight build

Browse files
Dockerfile CHANGED
@@ -1,16 +1,29 @@
1
- # ===== Python 3.12 Dockerfile =====
2
- FROM python:3.12-slim
3
 
4
- # ์‹œ์Šคํ…œ ํŒจํ‚ค์ง€ ์„ค์น˜
 
 
 
5
  RUN apt-get update && apt-get install -y \
 
 
 
 
 
 
6
  git \
7
  curl \
8
  && rm -rf /var/lib/apt/lists/*
9
 
10
- # ์ž‘์—… ๋””๋ ‰ํ† ๋ฆฌ
 
 
 
 
11
  WORKDIR /app
12
 
13
- # ===== ํ™˜๊ฒฝ๋ณ€์ˆ˜ ์„ค์ • =====
14
  ENV HOME=/app
15
  ENV STREAMLIT_SERVER_FILE_WATCHER_TYPE=none
16
  ENV STREAMLIT_BROWSER_GATHER_USAGE_STATS=false
@@ -22,25 +35,26 @@ ENV MKL_NUM_THREADS=1
22
  ENV NUMEXPR_NUM_THREADS=1
23
  ENV CHROMA_DB_PATH=/app/.cache/chroma_db
24
 
25
- # ์บ์‹œ ๋””๋ ‰ํ† ๋ฆฌ ์ƒ์„ฑ ๋ฐ ๊ถŒํ•œ ์„ค์ •
26
  RUN mkdir -p /app/.cache/huggingface /app/.streamlit && \
27
  chmod -R 777 /app/.cache /app/.streamlit
28
 
29
- # ์˜์กด์„ฑ ๋ณต์‚ฌ
30
  COPY requirements.txt .
31
 
32
- # pip ์—…๊ทธ๋ ˆ์ด๋“œ & ์˜์กด์„ฑ ์„ค์น˜
33
- RUN pip install --no-cache-dir --upgrade pip setuptools wheel && \
34
- pip install --no-cache-dir -r requirements.txt
35
 
36
- # ํ”„๋กœ์ ํŠธ ํŒŒ์ผ ๋ณต์‚ฌ
37
- COPY . .
 
 
 
 
38
 
39
- # ์‹œ์ž‘ ์Šคํฌ๋ฆฝํŠธ ์‹คํ–‰ ๊ถŒํ•œ ๋ถ€์—ฌ
 
 
 
40
  RUN chmod +x /app/start.sh
41
 
42
- # Streamlit ํฌํŠธ
43
  EXPOSE 7860
44
-
45
- # ์‹œ์ž‘ ์Šคํฌ๋ฆฝํŠธ ์‹คํ–‰
46
  CMD ["/app/start.sh"]
 
1
+ # ===== ๊ฒฝ๋Ÿ‰ Dockerfile (์‚ฌ์ „ ๋นŒ๋“œ llama-cpp-python ์‚ฌ์šฉ) =====
2
+ FROM nvidia/cuda:12.1.0-runtime-ubuntu22.04
3
 
4
+ ENV DEBIAN_FRONTEND=noninteractive
5
+ ENV PYTHONUNBUFFERED=1
6
+
7
+ # Python 3.12 ์„ค์น˜
8
  RUN apt-get update && apt-get install -y \
9
+ software-properties-common \
10
+ && add-apt-repository ppa:deadsnakes/ppa \
11
+ && apt-get update && apt-get install -y \
12
+ python3.12 \
13
+ python3.12-dev \
14
+ python3-pip \
15
  git \
16
  curl \
17
  && rm -rf /var/lib/apt/lists/*
18
 
19
+ # Python ๊ธฐ๋ณธ ์„ค์ •
20
+ RUN update-alternatives --install /usr/bin/python3 python3 /usr/bin/python3.12 1 \
21
+ && update-alternatives --install /usr/bin/python python /usr/bin/python3.12 1
22
+ RUN curl -sS https://bootstrap.pypa.io/get-pip.py | python3.12
23
+
24
  WORKDIR /app
25
 
26
+ # ํ™˜๊ฒฝ๋ณ€์ˆ˜
27
  ENV HOME=/app
28
  ENV STREAMLIT_SERVER_FILE_WATCHER_TYPE=none
29
  ENV STREAMLIT_BROWSER_GATHER_USAGE_STATS=false
 
35
  ENV NUMEXPR_NUM_THREADS=1
36
  ENV CHROMA_DB_PATH=/app/.cache/chroma_db
37
 
 
38
  RUN mkdir -p /app/.cache/huggingface /app/.streamlit && \
39
  chmod -R 777 /app/.cache /app/.streamlit
40
 
 
41
  COPY requirements.txt .
42
 
43
+ # pip ์—…๊ทธ๋ ˆ์ด๋“œ
44
+ RUN pip install --no-cache-dir --upgrade pip setuptools wheel
 
45
 
46
+ # PyTorch CUDA ๋ฒ„์ „ ์„ค์น˜
47
+ RUN pip install --no-cache-dir torch torchvision torchaudio --index-url https://download.pytorch.org/whl/cu121
48
+
49
+ # ์‚ฌ์ „ ๋นŒ๋“œ๋œ llama-cpp-python (CUDA) ์„ค์น˜
50
+ RUN pip install --no-cache-dir llama-cpp-python \
51
+ --extra-index-url https://abetlen.github.io/llama-cpp-python/whl/cu121
52
 
53
+ # ๋‚˜๋จธ์ง€ ์˜์กด์„ฑ
54
+ RUN pip install --no-cache-dir -r requirements.txt
55
+
56
+ COPY . .
57
  RUN chmod +x /app/start.sh
58
 
 
59
  EXPOSE 7860
 
 
60
  CMD ["/app/start.sh"]
requirements.txt CHANGED
@@ -28,6 +28,9 @@ transformers>=4.44.0
28
  sentence-transformers>=3.0.0
29
  rapidfuzz>=3.9.0
30
 
 
 
 
31
  # ===== Hugging Face Hub =====
32
  huggingface-hub>=0.25.0
33
 
 
28
  sentence-transformers>=3.0.0
29
  rapidfuzz>=3.9.0
30
 
31
+ # ===== GGUF ๋กœ์ปฌ ๋ชจ๋ธ (์ถ”๊ฐ€!) =====
32
+ llama-cpp-python>=0.2.90
33
+
34
  # ===== Hugging Face Hub =====
35
  huggingface-hub>=0.25.0
36
 
src/generator/generator.py.old ADDED
@@ -0,0 +1,335 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from langchain_openai import ChatOpenAI
2
+ from langchain_core.prompts import ChatPromptTemplate, MessagesPlaceholder
3
+ from langchain_core.output_parsers import StrOutputParser
4
+ from langchain_core.runnables import RunnablePassthrough, RunnableLambda
5
+ from langchain_core.messages import HumanMessage, AIMessage
6
+ from langsmith import traceable
7
+ import time
8
+ from typing import List, Dict
9
+
10
+ from src.utils.config import RAGConfig
11
+ from src.retriever.retriever import RAGRetriever
12
+
13
+
14
class RAGPipeline:
    """Conversational RAG pipeline built on a LangChain chain.

    Wires a retriever, a chat prompt (with rolling chat history) and a
    ChatOpenAI model into a single runnable chain, and keeps the last
    retrieval result around so answers can be returned with sources.
    """

    def __init__(self, config: RAGConfig = None, model: str = None, top_k: int = None):
        """Initialize the pipeline.

        Args:
            config: RAG configuration; a default RAGConfig() is built when omitted.
            model: LLM model name; falls back to config.LLM_MODEL_NAME.
            top_k: default number of documents to retrieve; falls back to
                config.DEFAULT_TOP_K.
        """
        self.config = config or RAGConfig()
        self.model = model or self.config.LLM_MODEL_NAME
        self.top_k = top_k or self.config.DEFAULT_TOP_K

        # Retrieval settings (mutable via set_search_config / generate_answer).
        self.search_mode = self.config.DEFAULT_SEARCH_MODE
        self.alpha = self.config.DEFAULT_ALPHA

        # LangChain ChatOpenAI client.
        self.llm = ChatOpenAI(
            model=self.model,
            openai_api_key=self.config.OPENAI_API_KEY,
            timeout=60.0,
            max_retries=3,
        )

        # Document retriever.
        self.retriever = RAGRetriever(config=self.config)

        # Conversation history as [{"role": ..., "content": ...}, ...].
        self.chat_history: List[Dict] = []

        # Last retrieval result, kept so generate_answer can expose sources.
        self._last_retrieved_docs = []

        # Prompt template: system rules, prior turns, then the current
        # question together with the retrieved context.
        self.prompt = ChatPromptTemplate.from_messages([
            ("system", """당신은 공공입찰 RFP를 분석하는 입찰메이트 사내 분석가입니다. 제공된 컨텍스트만으로 요구사항·예산·대상 기관·제출 방식 등을 구조화해 의사결정을 지원하세요.

# 규칙
- 답변은 한국어로 작성합니다.
- 컨텍스트 밖 내용을 추측하지 않습니다.
- 정보가 없으면 "문서에서 해당 정보를 찾을 수 없습니다."라고 밝힙니다.
- 여러 문서를 비교할 때는 문서별 차이를 표 또는 목록으로 정리합니다.
- 숫자에는 가능한 단위를 포함합니다.
- 직전 대화 맥락을 반영합니다.

# 답변 형식
1. 한 줄 요약: 질문 핵심을 한두 문장으로 작성합니다.
2. 상세 답변: [요구사항], [대상 기관], [예산], [제출 형식/방법], [평가 기준] 등 문서에서 확인된 항목만 정리합니다.
3. 근거 정보: 위 답변의 근거가 된 문장이나 문단을 요약합니다.
4. 부족한 정보: 문서에서 찾을 수 없는 항목은 "문서에서 확인 불가"로 표기합니다."""),
            MessagesPlaceholder(variable_name="chat_history"),
            ("user", """# 컨텍스트
{context}

# 질문
{question}

위 규칙에 따라 답변하세요."""),
        ])

        # Chain: fan the query out into context / question / history, then
        # prompt -> LLM -> plain-string output.
        self.chain = (
            {
                "context": RunnableLambda(self._retrieve_and_format),
                "question": RunnablePassthrough(),
                "chat_history": RunnableLambda(lambda _: self._get_chat_history()),
            }
            | self.prompt
            | self.llm
            | StrOutputParser()
        )

        print(f"✅ RAG 파이프라인 초기화 완료")
        print(f"   - 모델: {self.model}")
        print(f"   - 기본 top_k: {self.top_k}")
        print(f"   - 검색 모드: {self.search_mode}")

    def _get_chat_history(self) -> List:
        """Convert the stored history dicts into LangChain message objects."""
        return [
            HumanMessage(content=turn["content"]) if turn["role"] == "user"
            else AIMessage(content=turn["content"])
            for turn in self.chat_history
        ]

    def _retrieve_and_format(self, query: str) -> str:
        """Run retrieval for `query` and return the formatted context string.

        Also caches the raw hits in `_last_retrieved_docs` for source display.
        """
        if self.search_mode == "hybrid":
            hits = self.retriever.hybrid_search(query, top_k=self.top_k, alpha=self.alpha)
        elif self.search_mode == "hybrid_rerank":
            hits = self.retriever.hybrid_search_with_rerank(
                query, top_k=self.top_k, alpha=self.alpha
            )
        else:
            # "embedding" and any unrecognized mode fall back to plain
            # embedding search, matching the original behavior.
            hits = self.retriever.search(query, top_k=self.top_k)

        self._last_retrieved_docs = hits
        return self._format_context(hits)

    def _format_context(self, retrieved_docs: list) -> str:
        """Render retrieved documents as a numbered context block."""
        if not retrieved_docs:
            return "관련 문서를 찾을 수 없습니다."

        return "\n".join(
            f"[문서 {idx}]\n{doc['content']}\n"
            for idx, doc in enumerate(retrieved_docs, 1)
        )

    def _format_sources(self, retrieved_docs: list) -> list:
        """Convert retrieved docs into the `sources` structure for callers.

        Picks the score field by priority (rerank > hybrid > embedding) and
        tags each entry with the matching score_type.
        """
        sources = []
        for doc in retrieved_docs:
            entry = {
                'content': doc['content'],
                'metadata': doc['metadata'],
                'filename': doc.get('filename', 'N/A'),
                'organization': doc.get('organization', 'N/A'),
            }

            # Score field differs by search mode; take the first present.
            for key, label in (
                ('rerank_score', 'rerank'),
                ('hybrid_score', 'hybrid'),
                ('relevance_score', 'embedding'),
            ):
                if key in doc:
                    entry['score'] = doc[key]
                    entry['score_type'] = label
                    break
            else:
                entry['score'] = 0
                entry['score_type'] = 'unknown'

            sources.append(entry)
        return sources

    @traceable(
        name="RAG_Generate_Answer",
        metadata={"component": "generator", "version": "2.0"}
    )
    def generate_answer(
        self,
        query: str,
        top_k: int = None,
        search_mode: str = None,
        alpha: float = None
    ) -> dict:
        """Generate an answer via the chain.

        Args:
            query: user question.
            top_k: number of documents to retrieve (overrides and persists).
            search_mode: "embedding", "hybrid" or "hybrid_rerank"
                (overrides and persists).
            alpha: embedding weight in [0, 1] (overrides and persists).

        Returns:
            dict with answer, sources, search_mode, elapsed_time and usage.

        Raises:
            RuntimeError: wraps any failure during generation.
        """
        try:
            started = time.time()

            # NOTE(review): per-call overrides mutate instance state, so they
            # persist into later calls — kept as-is to preserve behavior.
            if top_k is not None:
                self.top_k = top_k
            if search_mode is not None:
                self.search_mode = search_mode
            if alpha is not None:
                self.alpha = alpha

            answer = self.chain.invoke(query)

            elapsed = time.time() - started

            # Record the turn in the conversation history.
            self.chat_history.append({"role": "user", "content": query})
            self.chat_history.append({"role": "assistant", "content": answer})

            # Rough token estimate only; exact usage is not exposed here.
            estimated_tokens = len(query.split()) + len(answer.split()) * 2

            return {
                'answer': answer,
                'sources': self._format_sources(self._last_retrieved_docs),
                'search_mode': self.search_mode,
                'elapsed_time': elapsed,
                'usage': {
                    'total_tokens': estimated_tokens,
                    'prompt_tokens': 0,
                    'completion_tokens': 0
                }
            }

        except Exception as e:
            print(f"❌ 답변 생성 실패: {e}")
            import traceback
            traceback.print_exc()
            raise RuntimeError(f"답변 생성 실패: {str(e)}") from e

    def chat(self, query: str) -> str:
        """Minimal conversational interface: return the answer text only."""
        return self.generate_answer(query)['answer']

    def clear_history(self):
        """Reset the conversation history."""
        self.chat_history = []
        print("🗑️ 대화 히스토리가 초기화되었습니다.")

    def get_history(self) -> List[Dict]:
        """Return a shallow copy of the conversation history."""
        return self.chat_history.copy()

    def set_search_config(self, search_mode: str = None, top_k: int = None, alpha: float = None):
        """Update retrieval settings; only non-None values are applied."""
        if search_mode is not None:
            self.search_mode = search_mode
        if top_k is not None:
            self.top_k = top_k
        if alpha is not None:
            self.alpha = alpha

        print(f"🔧 검색 설정 변경: mode={self.search_mode}, top_k={self.top_k}, alpha={self.alpha}")

    def print_result(self, result: dict, query: str = None):
        """Pretty-print a generate_answer() result to stdout."""
        print("\n" + "=" * 60)
        if query:
            print(f"질문: {query}")
        print(f"검색 모드: {result.get('search_mode', 'N/A')}")
        if 'elapsed_time' in result:
            print(f"소요 시간: {result['elapsed_time']:.2f}초")
        print("=" * 60)
        print(f"\n💬 답변:\n{result['answer']}")
        print(f"\n📚 참고 문서 ({len(result['sources'])}개):")
        for idx, source in enumerate(result['sources'], 1):
            score = source.get('score', 0)
            score_type = source.get('score_type', '')
            print(f"  [{idx}] {source['filename']}")
            print(f"      점수: {score:.3f} ({score_type})")
        print("=" * 60)
274
+
275
+
276
+ # ๋Œ€ํ™”ํ˜• ์‹คํ–‰
277
def interactive_mode():
    """Run an interactive console loop around RAGPipeline.

    Commands: 'quit' (exit), 'clear' (reset history), 'mode' (switch
    search mode). Any other input is treated as a question.
    """
    print("=" * 60)
    print("대화형 RAG 시스템 초기화 중...")
    print("=" * 60)

    pipeline = RAGPipeline(config=RAGConfig())

    print("\n" + "=" * 60)
    print("대화형 모드 시작")
    print("명령어: 'quit' (종료), 'clear' (히스토리 초기화), 'mode' (검색모드 변경)")
    print("=" * 60)

    while True:
        user_query = input("\n질문: ").strip()

        # Skip empty input.
        if not user_query:
            continue

        command = user_query.lower()

        if command in ('quit', 'exit', '종료', 'q'):
            print("시스템을 종료합니다.")
            break

        if command == 'clear':
            pipeline.clear_history()
            continue

        if command == 'mode':
            print("\n검색 모드 선택:")
            print("1. embedding - 임베딩 검색")
            print("2. hybrid - BM25 + 임베딩")
            print("3. hybrid_rerank - Hybrid + Re-ranker (권장)")
            choice = input("선택 (1/2/3): ").strip()
            mode_by_choice = {'1': 'embedding', '2': 'hybrid', '3': 'hybrid_rerank'}
            if choice in mode_by_choice:
                pipeline.set_search_config(search_mode=mode_by_choice[choice])
            continue

        try:
            result = pipeline.generate_answer(query=user_query)
            pipeline.print_result(result, user_query)

            # Optionally dump the referenced documents.
            if input("\n참조 문서 상세 보기? (y/n): ").strip().lower() == 'y':
                for idx, source in enumerate(result['sources'], 1):
                    print(f"\n{'='*40}")
                    print(f"[문서 {idx}] {source['filename']}")
                    print(f"발주기관: {source['organization']}")
                    print(f"내용:\n{source['content'][:500]}...")

        except Exception as e:
            print(f"❌ 오류 발생: {e}")


if __name__ == "__main__":
    interactive_mode()
src/generator/generator_gguf.py CHANGED
@@ -1,4 +1,4 @@
1
- # from llama_cpp import Llama
2
  from typing import Optional, Dict, Any, List
3
  import logging
4
  import time
 
1
+ from llama_cpp import Llama
2
  from typing import Optional, Dict, Any, List
3
  import logging
4
  import time
src/visualization/chatbot_app.py CHANGED
@@ -2,7 +2,7 @@
2
  ๊ณต๊ณต๊ธฐ๊ด€ ์‚ฌ์—…์ œ์•ˆ์„œ RAG ์ฑ—๋ด‡
3
 
4
  ๊ธฐ๋Šฅ:
5
- - ๋ชจ๋ธ ์„ ํƒ (API/๋กœ์ปฌ)
6
  - Query Router (๊ฒ€์ƒ‰ vs ์ง์ ‘ ๋‹ต๋ณ€)
7
  - RAG ๊ธฐ๋ฐ˜ ์งˆ์˜์‘๋‹ต (Hybrid Search + Re-ranker)
8
  - ์กฐ๊ฑด๋ถ€ ์ฐธ๊ณ  ๋ฌธ์„œ ํ‘œ์‹œ
@@ -111,6 +111,14 @@ st.markdown("""
111
  margin-top: 0.5rem;
112
  border-left: 3px solid #ff9800;
113
  }
 
 
 
 
 
 
 
 
114
  </style>
115
  """, unsafe_allow_html=True)
116
 
@@ -132,15 +140,47 @@ if 'show_routing_info' not in st.session_state:
132
  # ===== RAG ํŒŒ์ดํ”„๋ผ์ธ ์ดˆ๊ธฐํ™” =====
133
  @st.cache_resource
134
  def initialize_rag(model_type):
135
- """RAG ํŒŒ์ดํ”„๋ผ์ธ ์ดˆ๊ธฐํ™” (API ๋ชจ๋ธ ์ „์šฉ)"""
 
 
 
 
 
 
 
 
136
  try:
137
  config = RAGConfig()
138
- from src.generator.generator import RAGPipeline
139
- rag = RAGPipeline(config=config)
140
- return rag, None, "API"
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
141
 
142
  except Exception as e:
143
- return None, str(e), None
 
 
144
 
145
 
146
  # ===== ๋‹ต๋ณ€ ์ƒ์„ฑ =====
@@ -156,12 +196,14 @@ def generate_answer(query: str, top_k: int = 10, search_mode: str = "hybrid_rera
156
  return result
157
 
158
  except Exception as e:
 
 
159
  return {
160
- 'answer': f"โŒ ์˜ค๋ฅ˜๊ฐ€ ๋ฐœ์ƒํ–ˆ์Šต๋‹ˆ๋‹ค: {str(e)}",
161
  'sources': [],
162
- 'used_retrieval': False, # โ† ์ถ”๊ฐ€
163
  'search_mode': search_mode,
164
- 'routing_info': None, # โ† ์ถ”๊ฐ€
165
  'usage': {'total_tokens': 0, 'prompt_tokens': 0, 'completion_tokens': 0}
166
  }
167
 
@@ -173,8 +215,8 @@ def display_message(
173
  sources: list = None,
174
  usage: dict = None,
175
  search_mode: str = None,
176
- used_retrieval: bool = None, # โ† ์‹ ๊ทœ
177
- routing_info: dict = None # โ† ์‹ ๊ทœ
178
  ):
179
  """
180
  ๋ฉ”์‹œ์ง€๋ฅผ ํ™”๋ฉด์— ํ‘œ์‹œ
@@ -231,7 +273,7 @@ def display_message(
231
  'hybrid': '๐Ÿ”€ Hybrid Search',
232
  'embedding_rerank': '๐Ÿ“Š ์ž„๋ฒ ๋”ฉ + Re-ranker',
233
  'embedding': '๐Ÿ“Š ์ž„๋ฒ ๋”ฉ ๊ฒ€์ƒ‰',
234
- 'direct': '๐Ÿ’ฌ Direct (๊ฒ€์ƒ‰ ์—†์Œ)' # โ† ์ถ”๊ฐ€
235
  }
236
  st.markdown(f"""
237
  <div class="search-mode-info">
@@ -299,14 +341,33 @@ def main():
299
  model_type = st.selectbox(
300
  "์ƒ์„ฑ ๋ชจ๋ธ ์„ ํƒ",
301
  options=[
302
- "API ๋ชจ๋ธ (GPT)"
 
303
  ],
304
  index=0,
305
- help="OpenAI API ์‚ฌ์šฉ (๋น ๋ฅด๊ณ  ์•ˆ์ •์ )"
306
  )
307
 
308
- # ๋ชจ๋ธ ์ •๋ณด ํ‘œ์‹œ
309
- st.info("๐ŸŒ OpenAI GPT ๋ชจ๋ธ ์‚ฌ์šฉ ์ค‘")
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
310
 
311
  st.markdown("---")
312
 
@@ -344,7 +405,7 @@ def main():
344
  "๊ฒ€์ƒ‰ํ•  ๋ฌธ์„œ ๊ฐœ์ˆ˜ (Top-K)",
345
  min_value=1,
346
  max_value=20,
347
- value=10, # ๊ธฐ๋ณธ๊ฐ’
348
  help="๊ฒ€์ƒ‰ํ•  ๋ฌธ์„œ ๊ฐœ์ˆ˜"
349
  )
350
 
@@ -380,7 +441,7 @@ def main():
380
  st.rerun()
381
 
382
  if st.button("๐Ÿ’พ ๋Œ€ํ™” ๋‹ค์šด๋กœ๋“œ", use_container_width=True):
383
- if len(st.session_state.conv_manager) > 0: # โœ… conv_manager ์‚ฌ์šฉ
384
  json_str = st.session_state.conv_manager.export_to_json()
385
 
386
  st.download_button(
@@ -416,20 +477,38 @@ def main():
416
  if (st.session_state.rag_pipeline is None or
417
  st.session_state.model_type != model_type):
418
 
419
- with st.spinner(f"๐Ÿ”„ {model_type} ์ดˆ๊ธฐํ™” ์ค‘..."):
420
  rag, error, rag_type = initialize_rag(model_type)
421
 
422
  if error:
423
- st.error(f"โŒ RAG ํŒŒ์ดํ”„๋ผ์ธ ์ดˆ๊ธฐํ™” ์‹คํŒจ: {error}")
 
 
 
 
424
  st.info("""
425
  ### ๐Ÿ’ก ํ•ด๊ฒฐ ๋ฐฉ๋ฒ•
426
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
427
  1. ChromaDB๊ฐ€ ์ƒ์„ฑ๋˜์—ˆ๋Š”์ง€ ํ™•์ธ:
428
  ```bash
429
  python main.py --step embed
430
  ```
431
 
432
- 2. OpenAI API ํ‚ค๊ฐ€ ์„ค์ •๋˜์—ˆ๋Š”์ง€ ํ™•์ธ:
433
  ```bash
434
  # .env ํŒŒ์ผ
435
  OPENAI_API_KEY=your-key-here
@@ -449,7 +528,7 @@ pip install rank-bm25 sentence-transformers
449
  # ===== ๋Œ€ํ™” ํžˆ์Šคํ† ๋ฆฌ ํ‘œ์‹œ =====
450
  st.markdown("---")
451
 
452
- if len(st.session_state.conv_manager) == 0: # โœ… conv_manager ์‚ฌ์šฉ
453
  st.info("""
454
  ### ๐Ÿ‘‹ ํ™˜์˜ํ•ฉ๋‹ˆ๋‹ค!
455
 
@@ -470,8 +549,8 @@ pip install rank-bm25 sentence-transformers
470
  sources=msg.get('sources'),
471
  usage=msg.get('usage'),
472
  search_mode=msg.get('search_mode'),
473
- used_retrieval=msg.get('used_retrieval'), # โ† ์‹ ๊ทœ
474
- routing_info=msg.get('routing_info') # โ† ์‹ ๊ทœ
475
  )
476
 
477
  # ===== ์งˆ๋ฌธ ์ž…๋ ฅ =====
 
2
  ๊ณต๊ณต๊ธฐ๊ด€ ์‚ฌ์—…์ œ์•ˆ์„œ RAG ์ฑ—๋ด‡
3
 
4
  ๊ธฐ๋Šฅ:
5
+ - ๋ชจ๋ธ ์„ ํƒ (API/๋กœ์ปฌ GGUF)
6
  - Query Router (๊ฒ€์ƒ‰ vs ์ง์ ‘ ๋‹ต๋ณ€)
7
  - RAG ๊ธฐ๋ฐ˜ ์งˆ์˜์‘๋‹ต (Hybrid Search + Re-ranker)
8
  - ์กฐ๊ฑด๋ถ€ ์ฐธ๊ณ  ๋ฌธ์„œ ํ‘œ์‹œ
 
111
  margin-top: 0.5rem;
112
  border-left: 3px solid #ff9800;
113
  }
114
+ .model-info {
115
+ background-color: #f3e5f5;
116
+ padding: 0.8rem 1rem;
117
+ border-radius: 0.3rem;
118
+ font-size: 0.9rem;
119
+ margin: 0.5rem 0;
120
+ border-left: 3px solid #9c27b0;
121
+ }
122
  </style>
123
  """, unsafe_allow_html=True)
124
 
 
140
  # ===== RAG ํŒŒ์ดํ”„๋ผ์ธ ์ดˆ๊ธฐํ™” =====
141
@st.cache_resource
def initialize_rag(model_type):
    """Initialize the RAG pipeline for the selected model backend.

    Args:
        model_type: "API 모델 (GPT)" or "로컬 모델 (GGUF)".

    Returns:
        Tuple of (rag_pipeline, error_message, model_name); on failure the
        pipeline and model name are None and error_message carries the
        exception plus its traceback.
    """
    try:
        config = RAGConfig()

        if model_type == "API 모델 (GPT)":
            # OpenAI-API-backed pipeline.
            from src.generator.generator import RAGPipeline
            return RAGPipeline(config=config), None, "OpenAI GPT"

        if model_type == "로컬 모델 (GGUF)":
            # Local GGUF pipeline via llama-cpp-python.
            from src.generator.generator_gguf import GGUFRAGPipeline

            # Settings tuned for a T4 GPU.
            pipeline = GGUFRAGPipeline(
                config=config,
                n_gpu_layers=35,      # offload all layers on a T4 — TODO confirm layer count for the model
                n_ctx=2048,           # context window
                n_threads=4,          # keep CPU threads low when the GPU does the work
                max_new_tokens=512,   # generation cap
                temperature=0.7,
                top_p=0.9,
            )
            return pipeline, None, "Llama-3-Ko-8B (GGUF)"

        return None, f"알 수 없는 모델 타입: {model_type}", None

    except Exception as e:
        import traceback
        error_detail = traceback.format_exc()
        return None, f"{str(e)}\n\n{error_detail}", None
184
 
185
 
186
  # ===== ๋‹ต๋ณ€ ์ƒ์„ฑ =====
 
196
  return result
197
 
198
  except Exception as e:
199
+ import traceback
200
+ error_detail = traceback.format_exc()
201
  return {
202
+ 'answer': f"โŒ ์˜ค๋ฅ˜๊ฐ€ ๋ฐœ์ƒํ–ˆ์Šต๋‹ˆ๋‹ค: {str(e)}\n\n{error_detail}",
203
  'sources': [],
204
+ 'used_retrieval': False,
205
  'search_mode': search_mode,
206
+ 'routing_info': None,
207
  'usage': {'total_tokens': 0, 'prompt_tokens': 0, 'completion_tokens': 0}
208
  }
209
 
 
215
  sources: list = None,
216
  usage: dict = None,
217
  search_mode: str = None,
218
+ used_retrieval: bool = None,
219
+ routing_info: dict = None
220
  ):
221
  """
222
  ๋ฉ”์‹œ์ง€๋ฅผ ํ™”๋ฉด์— ํ‘œ์‹œ
 
273
  'hybrid': '๐Ÿ”€ Hybrid Search',
274
  'embedding_rerank': '๐Ÿ“Š ์ž„๋ฒ ๋”ฉ + Re-ranker',
275
  'embedding': '๐Ÿ“Š ์ž„๋ฒ ๋”ฉ ๊ฒ€์ƒ‰',
276
+ 'direct': '๐Ÿ’ฌ Direct (๊ฒ€์ƒ‰ ์—†์Œ)'
277
  }
278
  st.markdown(f"""
279
  <div class="search-mode-info">
 
341
  model_type = st.selectbox(
342
  "์ƒ์„ฑ ๋ชจ๋ธ ์„ ํƒ",
343
  options=[
344
+ "API ๋ชจ๋ธ (GPT)",
345
+ "๋กœ์ปฌ ๋ชจ๋ธ (GGUF)"
346
  ],
347
  index=0,
348
+ help="OpenAI API ๋˜๋Š” ๋กœ์ปฌ GGUF ๋ชจ๋ธ ์„ ํƒ"
349
  )
350
 
351
+ # ๋ชจ๋ธ๋ณ„ ์ •๋ณด ํ‘œ์‹œ
352
+ if model_type == "API ๋ชจ๋ธ (GPT)":
353
+ st.markdown("""
354
+ <div class="model-info">
355
+ ๐ŸŒ <b>OpenAI GPT ๋ชจ๋ธ</b><br>
356
+ โ€ข ๋น ๋ฅด๊ณ  ์•ˆ์ •์ <br>
357
+ โ€ข API ํ‚ค ํ•„์š”<br>
358
+ โ€ข ๋น„์šฉ ๋ฐœ์ƒ (ํ† ํฐ๋‹น)
359
+ </div>
360
+ """, unsafe_allow_html=True)
361
+ else:
362
+ st.markdown("""
363
+ <div class="model-info">
364
+ ๐Ÿ–ฅ๏ธ <b>Llama-3-Ko-8B (GGUF)</b><br>
365
+ โ€ข T4 GPU ๊ฐ€์†<br>
366
+ โ€ข ๋กœ์ปฌ ์‹คํ–‰ (๋ฌด๋ฃŒ)<br>
367
+ โ€ข ์ดˆ๊ธฐ ๋กœ๋”ฉ ์‹œ๊ฐ„ ์†Œ์š”<br>
368
+ โ€ข 35๊ฐœ ๋ ˆ์ด์–ด GPU ์‚ฌ์šฉ
369
+ </div>
370
+ """, unsafe_allow_html=True)
371
 
372
  st.markdown("---")
373
 
 
405
  "๊ฒ€์ƒ‰ํ•  ๋ฌธ์„œ ๊ฐœ์ˆ˜ (Top-K)",
406
  min_value=1,
407
  max_value=20,
408
+ value=10,
409
  help="๊ฒ€์ƒ‰ํ•  ๋ฌธ์„œ ๊ฐœ์ˆ˜"
410
  )
411
 
 
441
  st.rerun()
442
 
443
  if st.button("๐Ÿ’พ ๋Œ€ํ™” ๋‹ค์šด๋กœ๋“œ", use_container_width=True):
444
+ if len(st.session_state.conv_manager) > 0:
445
  json_str = st.session_state.conv_manager.export_to_json()
446
 
447
  st.download_button(
 
477
  if (st.session_state.rag_pipeline is None or
478
  st.session_state.model_type != model_type):
479
 
480
+ with st.spinner(f"๐Ÿ”„ {model_type} ์ดˆ๊ธฐํ™” ์ค‘... (GGUF ๋ชจ๋ธ์€ 1~2๋ถ„ ์†Œ์š”๋  ์ˆ˜ ์žˆ์Šต๋‹ˆ๋‹ค)"):
481
  rag, error, rag_type = initialize_rag(model_type)
482
 
483
  if error:
484
+ st.error(f"โŒ RAG ํŒŒ์ดํ”„๋ผ์ธ ์ดˆ๊ธฐํ™” ์‹คํŒจ")
485
+
486
+ with st.expander("๐Ÿ” ์—๋Ÿฌ ์ƒ์„ธ ์ •๋ณด"):
487
+ st.code(error)
488
+
489
  st.info("""
490
  ### ๐Ÿ’ก ํ•ด๊ฒฐ ๋ฐฉ๋ฒ•
491
 
492
+ **GGUF ๋ชจ๋ธ ์‹คํŒจ ์‹œ:**
493
+ 1. llama-cpp-python ์„ค์น˜ ํ™•์ธ:
494
+ ```bash
495
+ pip install llama-cpp-python
496
+ ```
497
+
498
+ 2. GGUF ๋ชจ๋ธ ํŒŒ์ผ ํ™•์ธ:
499
+ - config.yaml์˜ GGUF_MODEL_PATH ๋˜๋Š”
500
+ - MODEL_HUB_REPO ์„ค์ • ํ™•์ธ
501
+
502
+ 3. GPU ๋ฉ”๋ชจ๋ฆฌ ๋ถ€์กฑ ์‹œ:
503
+ - n_gpu_layers ๊ฐ’ ๊ฐ์†Œ (35 โ†’ 20)
504
+
505
+ **API ๋ชจ๋ธ ์‹คํŒจ ์‹œ:**
506
  1. ChromaDB๊ฐ€ ์ƒ์„ฑ๋˜์—ˆ๋Š”์ง€ ํ™•์ธ:
507
  ```bash
508
  python main.py --step embed
509
  ```
510
 
511
+ 2. OpenAI API ํ‚ค ํ™•์ธ:
512
  ```bash
513
  # .env ํŒŒ์ผ
514
  OPENAI_API_KEY=your-key-here
 
528
  # ===== ๋Œ€ํ™” ํžˆ์Šคํ† ๋ฆฌ ํ‘œ์‹œ =====
529
  st.markdown("---")
530
 
531
+ if len(st.session_state.conv_manager) == 0:
532
  st.info("""
533
  ### ๐Ÿ‘‹ ํ™˜์˜ํ•ฉ๋‹ˆ๋‹ค!
534
 
 
549
  sources=msg.get('sources'),
550
  usage=msg.get('usage'),
551
  search_mode=msg.get('search_mode'),
552
+ used_retrieval=msg.get('used_retrieval'),
553
+ routing_info=msg.get('routing_info')
554
  )
555
 
556
  # ===== ์งˆ๋ฌธ ์ž…๋ ฅ =====