Spaces:
Paused
Paused
| """ | |
| λλ²κΉ μ μν μ½λ μΆκ° - κ²½λ‘ κ΄λ ¨ λ¬Έμ ν΄κ²° | |
| """ | |
| import os | |
| import time | |
| import hashlib | |
| import pickle | |
| import json | |
| import logging | |
| import glob | |
| from typing import List, Dict, Tuple, Any, Optional | |
| from logging.handlers import RotatingFileHandler | |
| from pathlib import Path | |
| from langchain.schema import Document | |
| from config import ( | |
| PDF_DIRECTORY, CACHE_DIRECTORY, CHUNK_SIZE, CHUNK_OVERLAP, | |
| LLM_MODEL, LOG_LEVEL, LOG_FILE, print_config, validate_config | |
| ) | |
| from optimized_document_processor import OptimizedDocumentProcessor | |
| from vector_store import VectorStore | |
| import sys | |
| print("===== Script starting =====") | |
| sys.stdout.flush() # μ¦μ μΆλ ₯ κ°μ | |
| # μ£Όμ ν¨μ/λ©μλ νΈμΆ μ νμλ λλ²κΉ μΆλ ₯ μΆκ° | |
| print("Loading config...") | |
| sys.stdout.flush() | |
| # from config import ... λ±μ μ½λ | |
| print("Config loaded!") | |
| sys.stdout.flush() | |
| # λ‘κΉ μ€μ κ°μ | |
def setup_logging():
    """Configure application-wide logging.

    Sets the root logger to LOG_LEVEL, attaches a console handler and, when
    possible, a rotating file handler (10 MB per file, 5 backups). When the
    file handler cannot be created, logging degrades to console-only.

    Returns:
        The "AutoRAG" logger used by this module.
    """
    # Fall back to INFO when LOG_LEVEL is not a valid level name.
    log_level = getattr(logging, LOG_LEVEL.upper(), logging.INFO)
    log_format = '%(asctime)s - %(name)s - %(levelname)s - %(message)s'
    formatter = logging.Formatter(log_format)
    # Root logger configuration.
    root_logger = logging.getLogger()
    root_logger.setLevel(log_level)
    # Console handler.
    console_handler = logging.StreamHandler()
    console_handler.setFormatter(formatter)
    root_logger.addHandler(console_handler)
    # File handler (optional; best-effort).
    try:
        file_handler = RotatingFileHandler(
            LOG_FILE,
            maxBytes=10 * 1024 * 1024,  # 10 MB
            backupCount=5
        )
        file_handler.setFormatter(formatter)
        root_logger.addHandler(file_handler)
    except Exception as e:
        # BUG FIX: the original called console_handler.warning(), but
        # logging.Handler objects have no warning() method, so a failed file
        # setup raised AttributeError instead of degrading gracefully.
        root_logger.warning(f"λ‘κ·Έ νμΌ μ€μ μ€ν¨: {e}, μ½μ λ‘κΉ λ§ μ¬μ©ν©λλ€.")
    return logging.getLogger("AutoRAG")
| # λ‘κ±° μ€μ | |
| logger = setup_logging() | |
| # νμ¬ μμ λλ ν 리 νμΈμ μν λλ²κΉ μ½λ | |
| current_dir = os.getcwd() | |
| logger.info(f"νμ¬ μμ λλ ν 리: {current_dir}") | |
| # μ€μ λ PDF λλ ν 리 νμΈ | |
| abs_pdf_dir = os.path.abspath(PDF_DIRECTORY) | |
| logger.info(f"μ€μ λ PDF λλ ν 리: {PDF_DIRECTORY}") | |
| logger.info(f"μ λ κ²½λ‘λ‘ λ³νλ PDF λλ ν 리: {abs_pdf_dir}") | |
| # PDF λλ ν 리 μ‘΄μ¬ νμΈ | |
| if os.path.exists(abs_pdf_dir): | |
| logger.info(f"PDF λλ ν λ¦¬κ° μ‘΄μ¬ν©λλ€: {abs_pdf_dir}") | |
| # λλ ν 리 λ΄μ© νμΈ | |
| pdf_files = glob.glob(os.path.join(abs_pdf_dir, "*.pdf")) | |
| logger.info(f"λλ ν 리 λ΄ PDF νμΌ λͺ©λ‘: {pdf_files}") | |
| else: | |
| logger.error(f"PDF λλ ν λ¦¬κ° μ‘΄μ¬νμ§ μμ΅λλ€: {abs_pdf_dir}") | |
| # μμ λλ ν 리 λ΄μ© νμΈ | |
| parent_dir = os.path.dirname(abs_pdf_dir) | |
| logger.info(f"μμ λλ ν 리: {parent_dir}") | |
| if os.path.exists(parent_dir): | |
| dir_contents = os.listdir(parent_dir) | |
| logger.info(f"μμ λλ ν 리 λ΄μ©: {dir_contents}") | |
| # μ€μ μν νμΈ | |
| logger.info("μ ν리μΌμ΄μ μ€μ κ²μ¦ μ€...") | |
| config_status = validate_config() | |
| if config_status["status"] != "valid": | |
| for warning in config_status["warnings"]: | |
| logger.warning(f"μ€μ κ²½κ³ : {warning}") | |
# Guarded imports: the app can still start (with degraded features) when the
# optional RAG chain modules are missing or fail to import. Availability is
# recorded in module-level flags checked later by auto_process_documents().
try:
    from rag_chain import RAGChain
    RAG_CHAIN_AVAILABLE = True
    print("RAG μ²΄μΈ λͺ¨λ λ‘λ μ±κ³΅!")
except ImportError as e:
    logger.warning(f"RAG μ²΄μΈ λͺ¨λμ λ‘λν μ μμ΅λλ€: {e}")
    RAG_CHAIN_AVAILABLE = False
except Exception as e:
    logger.warning(f"RAG μ²΄μΈ λͺ¨λ λ‘λ μ€ μμμΉ λͺ»ν μ€λ₯: {e}")
    RAG_CHAIN_AVAILABLE = False
# Pre-check the fallback RAG modules as well, so the availability flags are
# known before any chain initialization is attempted.
try:
    from fallback_rag_chain import FallbackRAGChain
    FALLBACK_AVAILABLE = True
    print("ν΄λ°± RAG μ²΄μΈ λͺ¨λ λ‘λ μ±κ³΅!")
except ImportError as e:
    logger.warning(f"ν΄λ°± RAG μ²΄μΈ λͺ¨λμ λ‘λν μ μμ΅λλ€: {e}")
    FALLBACK_AVAILABLE = False
try:
    from offline_fallback_rag import OfflineFallbackRAG
    OFFLINE_FALLBACK_AVAILABLE = True
    print("μ€νλΌμΈ ν΄λ°± RAG λͺ¨λ λ‘λ μ±κ³΅!")
except ImportError as e:
    logger.warning(f"μ€νλΌμΈ ν΄λ°± RAG λͺ¨λμ λ‘λν μ μμ΅λλ€: {e}")
    OFFLINE_FALLBACK_AVAILABLE = False
class DocumentProcessingError(Exception):
    """Raised when document processing fails."""
    pass
class VectorStoreError(Exception):
    """Raised when a vector-store operation fails."""
    pass
class RAGInitializationError(Exception):
    """Raised when RAG chain initialization fails."""
    pass
class ConfigurationError(Exception):
    """Raised for configuration-related errors."""
    pass
class AutoRAGChatApp:
    """
    RAG chatbot that automatically processes the PDF files in the
    documents folder.
    """
    def __init__(self):
        """
        Initialize the RAG chatbot application.

        Resolves all data/cache paths to absolute paths, wires up the document
        processor and the vector store, loads the file index and immediately
        triggers automatic document processing. On any fatal error a minimal,
        uninitialized state is installed instead of propagating the exception.
        """
        try:
            logger.info("AutoRAGChatApp μ΄κΈ°ν μμ")
            # Data directories come from config; use absolute paths so behavior
            # does not depend on the current working directory.
            self.pdf_directory = os.path.abspath(PDF_DIRECTORY)
            self.cache_directory = os.path.abspath(CACHE_DIRECTORY)
            self.index_file = os.path.join(self.cache_directory, "file_index.json")
            self.chunks_dir = os.path.join(self.cache_directory, "chunks")
            self.vector_index_dir = os.path.join(self.cache_directory, "vector_index")
            logger.info(f"μ€μ λ PDF λλ ν 리 (μ λ κ²½λ‘): {self.pdf_directory}")
            # Validate the PDF directory (may switch self.pdf_directory to an
            # alternative path when the configured one holds no PDFs).
            self._verify_pdf_directory()
            # Make sure every required directory exists.
            self._ensure_directories_exist()
            logger.info(f"PDF λ¬Έμ λλ ν 리: '{self.pdf_directory}'")
            logger.info(f"μΊμ λλ ν 리: '{self.cache_directory}'")
            # Document processor initialization.
            try:
                self.document_processor = OptimizedDocumentProcessor(
                    chunk_size=CHUNK_SIZE,
                    chunk_overlap=CHUNK_OVERLAP
                )
            except Exception as e:
                logger.error(f"λ¬Έμ μ²λ¦¬κΈ° μ΄κΈ°ν μ€ν¨: {e}")
                raise DocumentProcessingError(f"λ¬Έμ μ²λ¦¬κΈ° μ΄κΈ°ν μ€ν¨: {str(e)}")
            # Vector store initialization (local mode, use_milvus=False).
            try:
                self.vector_store = VectorStore(use_milvus=False)
            except Exception as e:
                logger.error(f"λ²‘ν° μ μ₯μ μ΄κΈ°ν μ€ν¨: {e}")
                raise VectorStoreError(f"λ²‘ν° μ μ₯μ μ΄κΈ°ν μ€ν¨: {str(e)}")
            # Load the persisted file index (path -> metadata mapping).
            self.file_index = self._load_file_index()
            # Base state before any processing has happened.
            self.documents = []
            self.processed_files = []
            self.is_initialized = False
            # Automatically load and process documents on startup.
            logger.info("λ¬Έμ μλ λ‘λ λ° μ²λ¦¬ μμ...")
            self.auto_process_documents()
            logger.info("AutoRAGChatApp μ΄κΈ°ν μλ£")
        except Exception as e:
            logger.critical(f"μ ν리μΌμ΄μ μ΄κΈ°ν μ€ μ¬κ°ν μ€λ₯: {e}", exc_info=True)
            # Fall back to a minimal usable state instead of crashing.
            self.pdf_directory = os.path.abspath(PDF_DIRECTORY)
            self.documents = []
            self.processed_files = []
            self.is_initialized = False
            self.file_index = {}
| def _ensure_directories_exist(self) -> None: | |
| """ | |
| νμν λλ ν λ¦¬κ° μ‘΄μ¬νλμ§ νμΈνκ³ μμ± | |
| """ | |
| directories = [ | |
| self.pdf_directory, | |
| self.cache_directory, | |
| self.chunks_dir, | |
| self.vector_index_dir | |
| ] | |
| for directory in directories: | |
| try: | |
| os.makedirs(directory, exist_ok=True) | |
| except Exception as e: | |
| logger.error(f"λλ ν 리 μμ± μ€ν¨ '{directory}': {e}") | |
| raise OSError(f"λλ ν 리 μμ± μ€ν¨ '{directory}': {str(e)}") | |
| def _process_pdf_file(self, file_path: str) -> List[Document]: | |
| """ | |
| PDF νμΌ μ²λ¦¬ - docling μ€ν¨ μ PyPDFLoader μ¬μ© | |
| Args: | |
| file_path: μ²λ¦¬ν PDF νμΌ κ²½λ‘ | |
| Returns: | |
| μ²λ¦¬λ λ¬Έμ μ²ν¬ 리μ€νΈ | |
| """ | |
| if not os.path.exists(file_path): | |
| logger.error(f"νμΌμ΄ μ‘΄μ¬νμ§ μμ: {file_path}") | |
| raise FileNotFoundError(f"νμΌμ΄ μ‘΄μ¬νμ§ μμ: {file_path}") | |
| try: | |
| logger.info(f"doclingμΌλ‘ μ²λ¦¬ μλ: {file_path}") | |
| # docling μ¬μ© μλ | |
| try: | |
| # 10μ΄ νμμμ μ€μ (μ΅μ ) | |
| import signal | |
| def timeout_handler(signum, frame): | |
| raise TimeoutError("docling μ²λ¦¬ μκ° μ΄κ³Ό (60μ΄)") | |
| # 리λ μ€/λ§₯μμλ§ μλ (μλμ°μμλ 무μλ¨) | |
| try: | |
| signal.signal(signal.SIGALRM, timeout_handler) | |
| signal.alarm(60) # 60μ΄ νμμμ | |
| except (AttributeError, ValueError) as se: | |
| logger.warning(f"μκ·Έλ μ€μ μ€ν¨ (μλμ° νκ²½μΌ μ μμ): {se}") | |
| # doclingμΌλ‘ μ²λ¦¬ μλ | |
| chunks = self.document_processor.process_pdf(file_path, use_docling=True) | |
| # νμμμ μ·¨μ | |
| try: | |
| signal.alarm(0) | |
| except (AttributeError, ValueError): | |
| pass | |
| return chunks | |
| except TimeoutError as te: | |
| logger.warning(f"docling μ²λ¦¬ μκ° μ΄κ³Ό: {te}") | |
| logger.info("PyPDFLoaderλ‘ λ체ν©λλ€.") | |
| # PyPDFLoaderλ‘ λ체 | |
| try: | |
| return self.document_processor.process_pdf(file_path, use_docling=False) | |
| except Exception as inner_e: | |
| logger.error(f"PyPDFLoader μ²λ¦¬ μ€λ₯: {inner_e}", exc_info=True) | |
| raise DocumentProcessingError(f"PDF λ‘λ© μ€ν¨ (PyPDFLoader): {str(inner_e)}") | |
| except Exception as e: | |
| # docling μ€λ₯ νμΈ | |
| error_str = str(e) | |
| if "Invalid code point" in error_str or "RuntimeError" in error_str: | |
| logger.warning(f"docling μ²λ¦¬ μ€λ₯ (μ½λ ν¬μΈνΈ λ¬Έμ ): {error_str}") | |
| logger.info("PyPDFLoaderλ‘ λ체ν©λλ€.") | |
| else: | |
| logger.warning(f"docling μ²λ¦¬ μ€λ₯: {error_str}") | |
| logger.info("PyPDFLoaderλ‘ λ체ν©λλ€.") | |
| # PyPDFLoaderλ‘ λ체 | |
| try: | |
| return self.document_processor.process_pdf(file_path, use_docling=False) | |
| except Exception as inner_e: | |
| logger.error(f"PyPDFLoader μ²λ¦¬ μ€λ₯: {inner_e}", exc_info=True) | |
| raise DocumentProcessingError(f"PDF λ‘λ© μ€ν¨ (PyPDFLoader): {str(inner_e)}") | |
| except DocumentProcessingError: | |
| # μ΄λ―Έ λνλ μμΈλ κ·Έλλ‘ μ λ¬ | |
| raise | |
| except Exception as e: | |
| logger.error(f"PDF μ²λ¦¬ μ€ μ¬κ°ν μ€λ₯: {e}", exc_info=True) | |
| # λΉ μ²ν¬λΌλ λ°ννμ¬ μ 체 μ²λ¦¬κ° μ€λ¨λμ§ μλλ‘ ν¨ | |
| logger.warning(f"'{file_path}' μ²λ¦¬ μ€ν¨λ‘ λΉ μ²ν¬ λͺ©λ‘ λ°ν") | |
| return [] | |
| def _load_file_index(self) -> Dict[str, Dict[str, Any]]: | |
| """ | |
| νμΌ μΈλ±μ€ λ‘λ | |
| Returns: | |
| νμΌ κ²½λ‘ -> λ©νλ°μ΄ν° λ§€ν | |
| """ | |
| if os.path.exists(self.index_file): | |
| try: | |
| with open(self.index_file, 'r', encoding='utf-8') as f: | |
| return json.load(f) | |
| except json.JSONDecodeError as e: | |
| logger.error(f"μΈλ±μ€ νμΌ JSON νμ± μ€ν¨: {e}") | |
| logger.warning("μμλ μΈλ±μ€ νμΌ, μλ‘μ΄ μΈλ±μ€λ₯Ό μμ±ν©λλ€.") | |
| return {} | |
| except Exception as e: | |
| logger.error(f"μΈλ±μ€ νμΌ λ‘λ μ€ν¨: {e}") | |
| return {} | |
| return {} | |
| def _save_file_index(self) -> None: | |
| """ | |
| νμΌ μΈλ±μ€ μ μ₯ | |
| """ | |
| try: | |
| with open(self.index_file, 'w', encoding='utf-8') as f: | |
| json.dump(self.file_index, f, ensure_ascii=False, indent=2) | |
| logger.debug("νμΌ μΈλ±μ€ μ μ₯ μλ£") | |
| except Exception as e: | |
| logger.error(f"νμΌ μΈλ±μ€ μ μ₯ μ€ν¨: {e}") | |
| raise IOError(f"νμΌ μΈλ±μ€ μ μ₯ μ€ν¨: {str(e)}") | |
| def _calculate_file_hash(self, file_path: str) -> str: | |
| """ | |
| νμΌ ν΄μ κ³μ° | |
| Args: | |
| file_path: νμΌ κ²½λ‘ | |
| Returns: | |
| MD5 ν΄μκ° | |
| """ | |
| if not os.path.exists(file_path): | |
| logger.error(f"ν΄μ κ³μ° μ€ν¨ - νμΌμ΄ μ‘΄μ¬νμ§ μμ: {file_path}") | |
| raise FileNotFoundError(f"νμΌμ΄ μ‘΄μ¬νμ§ μμ: {file_path}") | |
| try: | |
| hasher = hashlib.md5() | |
| with open(file_path, 'rb') as f: | |
| buf = f.read(65536) | |
| while len(buf) > 0: | |
| hasher.update(buf) | |
| buf = f.read(65536) | |
| return hasher.hexdigest() | |
| except Exception as e: | |
| logger.error(f"νμΌ ν΄μ κ³μ° μ€ μ€λ₯: {e}") | |
| raise IOError(f"νμΌ ν΄μ κ³μ° μ€ν¨: {str(e)}") | |
| def _is_file_processed(self, file_path: str) -> bool: | |
| """ | |
| νμΌμ΄ μ΄λ―Έ μ²λ¦¬λμκ³ λ³κ²½λμ§ μμλμ§ νμΈ | |
| Args: | |
| file_path: νμΌ κ²½λ‘ | |
| Returns: | |
| μ²λ¦¬ μ¬λΆ | |
| """ | |
| # νμΌ μ‘΄μ¬ νμΈ | |
| if not os.path.exists(file_path): | |
| logger.warning(f"νμΌμ΄ μ‘΄μ¬νμ§ μμ: {file_path}") | |
| return False | |
| # μΈλ±μ€μ νμΌ μ‘΄μ¬ μ¬λΆ νμΈ | |
| if file_path not in self.file_index: | |
| return False | |
| try: | |
| # νμ¬ ν΄μκ° κ³μ° | |
| current_hash = self._calculate_file_hash(file_path) | |
| # μ μ₯λ ν΄μκ°κ³Ό λΉκ΅ | |
| if self.file_index[file_path]['hash'] != current_hash: | |
| logger.info(f"νμΌ λ³κ²½ κ°μ§: {file_path}") | |
| return False | |
| # μ²ν¬ νμΌ μ‘΄μ¬ νμΈ | |
| chunks_path = self.file_index[file_path]['chunks_path'] | |
| if not os.path.exists(chunks_path): | |
| logger.warning(f"μ²ν¬ νμΌμ΄ μ‘΄μ¬νμ§ μμ: {chunks_path}") | |
| return False | |
| return True | |
| except Exception as e: | |
| logger.error(f"νμΌ μ²λ¦¬ μν νμΈ μ€ μ€λ₯: {e}") | |
| return False | |
| def _get_chunks_path(self, file_hash: str) -> str: | |
| """ | |
| μ²ν¬ νμΌ κ²½λ‘ μμ± | |
| Args: | |
| file_hash: νμΌ ν΄μκ° | |
| Returns: | |
| μ²ν¬ νμΌ κ²½λ‘ | |
| """ | |
| return os.path.join(self.chunks_dir, f"{file_hash}.pkl") | |
    def _save_chunks(self, file_path: str, chunks: List[Document]) -> None:
        """
        Persist a file's chunks to the cache and record it in the file index.

        Args:
            file_path: original source file path.
            chunks: list of document chunks extracted from that file.

        Raises:
            IOError: when hashing, pickling or index saving fails (wrapped).
        """
        try:
            # The content hash doubles as the cache key and change detector.
            file_hash = self._calculate_file_hash(file_path)
            chunks_path = self._get_chunks_path(file_hash)
            # Pickle the chunk list; read back later by _load_chunks().
            with open(chunks_path, 'wb') as f:
                pickle.dump(chunks, f)
            # Update the in-memory index entry for this file...
            self.file_index[file_path] = {
                'hash': file_hash,
                'chunks_path': chunks_path,
                'last_processed': time.time(),
                'chunks_count': len(chunks),
                'file_size': os.path.getsize(file_path),
                'file_name': os.path.basename(file_path)
            }
            # ...and persist the whole index immediately so a crash later
            # cannot leave the on-disk index out of sync with the chunk files.
            self._save_file_index()
            logger.info(f"μ²ν¬ μ μ₯ μλ£: {file_path} ({len(chunks)}κ° μ²ν¬)")
        except Exception as e:
            logger.error(f"μ²ν¬ μ μ₯ μ€ν¨: {e}", exc_info=True)
            raise IOError(f"μ²ν¬ μ μ₯ μ€ν¨: {str(e)}")
    def _load_chunks(self, file_path: str) -> List[Document]:
        """
        Load a file's cached chunks from disk.

        Args:
            file_path: original source file path (key into the file index).

        Returns:
            The unpickled list of document chunks.

        Raises:
            KeyError: when the file is not in the index.
            FileNotFoundError: when the chunk file is missing.
            IOError: when unpickling fails or any other load error occurs.
        """
        if file_path not in self.file_index:
            logger.error(f"μΈλ±μ€μ νμΌμ΄ μ‘΄μ¬νμ§ μμ: {file_path}")
            raise KeyError(f"μΈλ±μ€μ νμΌμ΄ μ‘΄μ¬νμ§ μμ: {file_path}")
        chunks_path = self.file_index[file_path]['chunks_path']
        if not os.path.exists(chunks_path):
            logger.error(f"μ²ν¬ νμΌμ΄ μ‘΄μ¬νμ§ μμ: {chunks_path}")
            raise FileNotFoundError(f"μ²ν¬ νμΌμ΄ μ‘΄μ¬νμ§ μμ: {chunks_path}")
        try:
            # NOTE: pickle.load is acceptable here only because these are
            # locally written cache files; never point chunks_path at
            # untrusted data.
            with open(chunks_path, 'rb') as f:
                chunks = pickle.load(f)
            logger.info(f"μ²ν¬ λ‘λ μλ£: {file_path} ({len(chunks)}κ° μ²ν¬)")
            return chunks
        except pickle.UnpicklingError as e:
            logger.error(f"μ²ν¬ νμΌ μμ§λ ¬ν μ€ν¨: {e}")
            raise IOError(f"μ²ν¬ νμΌ μμ: {str(e)}")
        except Exception as e:
            logger.error(f"μ²ν¬ λ‘λ μ€ν¨: {e}", exc_info=True)
            raise IOError(f"μ²ν¬ λ‘λ μ€ν¨: {str(e)}")
    def _verify_pdf_directory(self):
        """Validate the PDF directory and check that it contains PDF files.

        Creates the directory when missing; raises ConfigurationError when
        the configured path exists but is not a directory. When the directory
        holds no PDFs, a handful of alternative "documents" locations are
        probed and, if one contains PDFs, self.pdf_directory is switched to
        that path (side effect).
        """
        try:
            # Create the directory when it does not exist yet.
            if not os.path.exists(self.pdf_directory):
                try:
                    logger.warning(f"PDF λλ ν λ¦¬κ° μ‘΄μ¬νμ§ μμ μμ±ν©λλ€: {self.pdf_directory}")
                    os.makedirs(self.pdf_directory, exist_ok=True)
                except Exception as e:
                    logger.error(f"PDF λλ ν 리 μμ± μ€ν¨: {e}")
                    raise
            # The path must actually be a directory.
            if not os.path.isdir(self.pdf_directory):
                logger.error(f"PDF κ²½λ‘κ° λλ ν λ¦¬κ° μλλλ€: {self.pdf_directory}")
                raise ConfigurationError(f"PDF κ²½λ‘κ° λλ ν λ¦¬κ° μλλλ€: {self.pdf_directory}")
            # Look for PDF files (case-insensitive extension match).
            pdf_files = [f for f in os.listdir(self.pdf_directory) if f.lower().endswith('.pdf')]
            if pdf_files:
                logger.info(f"PDF λλ ν 리μμ {len(pdf_files)}κ°μ PDF νμΌμ μ°Ύμμ΅λλ€: {pdf_files}")
            else:
                # Probe alternative locations for a documents folder with PDFs.
                alternative_paths = [
                    "./documents",
                    "../documents",
                    "documents",
                    os.path.join(os.getcwd(), "documents")
                ]
                found_pdfs = False
                for alt_path in alternative_paths:
                    if os.path.exists(alt_path) and os.path.isdir(alt_path):
                        alt_pdf_files = [f for f in os.listdir(alt_path) if f.lower().endswith('.pdf')]
                        if alt_pdf_files:
                            logger.warning(f"λ체 κ²½λ‘ '{alt_path}'μμ PDF νμΌμ μ°Ύμμ΅λλ€. μ΄ κ²½λ‘λ₯Ό μ¬μ©ν©λλ€.")
                            # Side effect: redirect the app to the alt path.
                            self.pdf_directory = os.path.abspath(alt_path)
                            found_pdfs = True
                            break
                if not found_pdfs:
                    logger.warning(f"PDF λλ ν 리μ PDF νμΌμ΄ μμ΅λλ€: {self.pdf_directory}")
                    logger.info("PDF νμΌμ λλ ν 리μ μΆκ°ν΄μ£ΌμΈμ.")
        except Exception as e:
            logger.error(f"PDF λλ ν 리 κ²μ¦ μ€ μ€λ₯: {e}", exc_info=True)
            raise
    def auto_process_documents(self) -> str:
        """
        Automatically process the PDF files in the documents folder.

        Loads cached chunks where possible, (re)processes new/changed files,
        rebuilds or updates the vector index, then initializes a RAG chain,
        degrading through RAGChain -> FallbackRAGChain -> SimpleRAGChain.

        Returns:
            A human-readable result summary (also used as an error report).
        """
        try:
            start_time = time.time()
            # Collect the PDF file list; missing-directory and permission
            # failures are reported as user-facing messages, not exceptions.
            try:
                pdf_files = []
                logger.info(f"PDF νμΌ κ²μ κ²½λ‘: {self.pdf_directory}")
                if os.path.exists(self.pdf_directory) and os.path.isdir(self.pdf_directory):
                    # Dump the directory contents (debugging aid).
                    dir_contents = os.listdir(self.pdf_directory)
                    logger.info(f"λλ ν 리 λ΄μ©: {dir_contents}")
                    # Keep only real *.pdf files (case-insensitive extension).
                    for filename in os.listdir(self.pdf_directory):
                        if filename.lower().endswith('.pdf'):
                            file_path = os.path.join(self.pdf_directory, filename)
                            if os.path.isfile(file_path):  # must be a regular file
                                pdf_files.append(file_path)
                                logger.info(f"PDF νμΌ μ°Ύμ: {file_path}")
                    # Log everything that was found.
                    logger.info(f"λ°κ²¬λ λͺ¨λ PDF νμΌ: {pdf_files}")
            except FileNotFoundError:
                logger.error(f"PDF λλ ν 리λ₯Ό μ°Ύμ μ μμ: {self.pdf_directory}")
                return f"'{self.pdf_directory}' λλ ν 리λ₯Ό μ°Ύμ μ μμ΅λλ€. λλ ν λ¦¬κ° μ‘΄μ¬νλμ§ νμΈνμΈμ."
            except PermissionError:
                logger.error(f"PDF λλ ν 리 μ κ·Ό κΆν μμ: {self.pdf_directory}")
                return f"'{self.pdf_directory}' λλ ν 리μ μ κ·Όν μ μμ΅λλ€. κΆνμ νμΈνμΈμ."
            if not pdf_files:
                logger.warning(f"'{self.pdf_directory}' ν΄λμ PDF νμΌμ΄ μμ΅λλ€.")
                return f"'{self.pdf_directory}' ν΄λμ PDF νμΌμ΄ μμ΅λλ€."
            logger.info(f"λ°κ²¬λ PDF νμΌ: {len(pdf_files)}κ°")
            # Process each PDF, tracking per-file outcomes for the summary.
            new_files = []
            updated_files = []
            cached_files = []
            failed_files = []
            all_chunks = []
            # NOTE(review): self.processed_files is appended to but never
            # cleared here - repeated runs accumulate duplicate names.
            for file_path in pdf_files:
                try:
                    if self._is_file_processed(file_path):
                        # Unchanged file: load its chunks from the cache.
                        try:
                            chunks = self._load_chunks(file_path)
                            all_chunks.extend(chunks)
                            cached_files.append(file_path)
                            self.processed_files.append(os.path.basename(file_path))
                        except Exception as e:
                            logger.error(f"μΊμλ μ²ν¬ λ‘λ μ€ν¨: {e}")
                            # Cache load failed - reprocess the file from scratch.
                            logger.info(f"μΊμ μ€ν¨λ‘ νμΌ μ¬μ²λ¦¬: {file_path}")
                            chunks = self._process_pdf_file(file_path)
                            if chunks:
                                self._save_chunks(file_path, chunks)
                                all_chunks.extend(chunks)
                                updated_files.append(file_path)
                                self.processed_files.append(os.path.basename(file_path))
                            else:
                                failed_files.append(file_path)
                    else:
                        # New or modified file.
                        logger.info(f"μ²λ¦¬ μ€: {file_path}")
                        try:
                            # Use the docling/PyPDFLoader processing method.
                            chunks = self._process_pdf_file(file_path)
                            if chunks:  # only persist when chunks were extracted
                                # Save the chunks to the cache.
                                self._save_chunks(file_path, chunks)
                                all_chunks.extend(chunks)
                                if file_path in self.file_index:
                                    updated_files.append(file_path)
                                else:
                                    new_files.append(file_path)
                                self.processed_files.append(os.path.basename(file_path))
                            else:
                                logger.warning(f"'{file_path}' μ²λ¦¬ μ€ν¨: μΆμΆλ μ²ν¬ μμ")
                                failed_files.append(file_path)
                        except Exception as e:
                            logger.error(f"'{file_path}' μ²λ¦¬ μ€ μ€λ₯: {e}", exc_info=True)
                            failed_files.append(file_path)
                except Exception as e:
                    logger.error(f"'{file_path}' νμΌ μ²λ¦¬ 루ν μ€ μ€λ₯: {e}", exc_info=True)
                    failed_files.append(file_path)
            # Keep every chunk in memory for indexing/retrieval.
            self.documents = all_chunks
            processing_time = time.time() - start_time
            logger.info(f"λ¬Έμ μ²λ¦¬ μλ£: {len(all_chunks)}κ° μ²ν¬, {processing_time:.2f}μ΄")
            # Build or update the vector index.
            try:
                self._process_vector_index(new_files, updated_files)
            except Exception as e:
                logger.error(f"λ²‘ν° μΈλ±μ€ μ²λ¦¬ μ€ν¨: {e}", exc_info=True)
                return f"λ¬Έμλ μ²λ¦¬λμμΌλ λ²‘ν° μΈλ±μ€ μμ±μ μ€ν¨νμ΅λλ€: {str(e)}"
            # Initialize the RAG chain, degrading through fallbacks:
            # RAGChain -> FallbackRAGChain -> SimpleRAGChain.
            if RAG_CHAIN_AVAILABLE:
                try:
                    logger.info("RAGChainμΌλ‘ μ΄κΈ°νλ₯Ό μλν©λλ€.")
                    self.rag_chain = RAGChain(self.vector_store)
                    self.is_initialized = True
                    logger.info("RAG μ²΄μΈ μ΄κΈ°ν μ±κ³΅")
                except Exception as e:
                    logger.error(f"RAG μ²΄μΈ μ΄κΈ°ν μ€ν¨: {e}", exc_info=True)
                    # Try FallbackRAGChain next.
                    try:
                        logger.info("FallbackRAGChainμΌλ‘ λ체ν©λλ€...")
                        from fallback_rag_chain import FallbackRAGChain
                        self.rag_chain = FallbackRAGChain(self.vector_store)
                        self.is_initialized = True
                        logger.info("ν΄λ°± RAG μ²΄μΈ μ΄κΈ°ν μ±κ³΅")
                    except Exception as fallback_e:
                        logger.error(f"ν΄λ°± RAG μ²΄μΈ μ΄κΈ°ν μ€ν¨: {fallback_e}", exc_info=True)
                        # Last resort: SimpleRAGChain.
                        try:
                            logger.info("SimpleRAGChainμΌλ‘ λ체ν©λλ€...")
                            from simple_rag_chain import SimpleRAGChain
                            # Fetch API settings (config module first, env vars second).
                            try:
                                from config import DEEPSEEK_API_KEY, DEEPSEEK_MODEL, DEEPSEEK_ENDPOINT
                                logger.info(f"μ€μ νμΌμμ DeepSeek API μ 보λ₯Ό λ‘λνμ΅λλ€: λͺ¨λΈ={DEEPSEEK_MODEL}")
                            except ImportError:
                                # Config unavailable - read from environment variables.
                                DEEPSEEK_API_KEY = os.environ.get("DEEPSEEK_API_KEY", "")
                                DEEPSEEK_MODEL = os.environ.get("DEEPSEEK_MODEL", "deepseek-chat")
                                DEEPSEEK_ENDPOINT = os.environ.get("DEEPSEEK_ENDPOINT",
                                                                   "https://api.deepseek.com/v1/chat/completions")
                                logger.info(f"νκ²½ λ³μμμ DeepSeek API μ 보λ₯Ό λ‘λνμ΅λλ€: λͺ¨λΈ={DEEPSEEK_MODEL}")
                            # NOTE(review): the DEEPSEEK_* values loaded above are
                            # never passed to SimpleRAGChain - confirm whether it
                            # reads them itself or this is dead code.
                            self.rag_chain = SimpleRAGChain(self.vector_store)
                            self.is_initialized = True
                            logger.info("SimpleRAGChain μ΄κΈ°ν μ±κ³΅")
                        except Exception as simple_e:
                            logger.error(f"λͺ¨λ RAG μ²΄μΈ μ΄κΈ°ν μ€ν¨: {simple_e}", exc_info=True)
                            # NOTE(review): this message interpolates the OUTER
                            # exception `e`, not `simple_e` - likely a bug.
                            return f"λ¬Έμμ λ²‘ν° μΈλ±μ€λ μ²λ¦¬λμμΌλ RAG μ²΄μΈ μ΄κΈ°νμ μ€ν¨νμ΅λλ€: {str(e)}"
            else:
                # RAGChain itself is unavailable - go straight to the fallbacks.
                try:
                    logger.info("κΈ°λ³Έ RAG Chainμ μ¬μ©ν μ μμ΄ λ체 λ²μ μ μλν©λλ€...")
                    # Try FallbackRAGChain first.
                    try:
                        from fallback_rag_chain import FallbackRAGChain
                        self.rag_chain = FallbackRAGChain(self.vector_store)
                        self.is_initialized = True
                        logger.info("ν΄λ°± RAG μ²΄μΈ μ΄κΈ°ν μ±κ³΅")
                    except Exception as fallback_e:
                        logger.error(f"ν΄λ°± RAG μ²΄μΈ μ΄κΈ°ν μ€ν¨: {fallback_e}", exc_info=True)
                        # Last resort: SimpleRAGChain.
                        try:
                            from simple_rag_chain import SimpleRAGChain
                            self.rag_chain = SimpleRAGChain(self.vector_store)
                            self.is_initialized = True
                            logger.info("SimpleRAGChain μ΄κΈ°ν μ±κ³΅")
                        except Exception as simple_e:
                            logger.error(f"λͺ¨λ RAG μ²΄μΈ μ΄κΈ°ν μ€ν¨: {simple_e}", exc_info=True)
                            return f"λ¬Έμμ λ²‘ν° μΈλ±μ€λ μ²λ¦¬λμμΌλ RAG μ²΄μΈ μ΄κΈ°νμ μ€ν¨νμ΅λλ€"
                except Exception as e:
                    logger.error(f"RAG μ²΄μΈ μ΄κΈ°ν μ€ν¨: {e}", exc_info=True)
                    return f"λ¬Έμμ λ²‘ν° μΈλ±μ€λ μ²λ¦¬λμμΌλ RAG μ²΄μΈ μ΄κΈ°νμ μ€ν¨νμ΅λλ€: {str(e)}"
            # Build the success summary message.
            result_message = f"""λ¬Έμ μ²λ¦¬ μλ£!
- μ²λ¦¬λ νμΌ: {len(pdf_files)}κ°
- μΊμλ νμΌ: {len(cached_files)}κ°
- μ νμΌ: {len(new_files)}κ°
- μ λ°μ΄νΈλ νμΌ: {len(updated_files)}κ°
- μ€ν¨ν νμΌ: {len(failed_files)}κ°
- μ΄ μ²ν¬ μ: {len(all_chunks)}κ°
- μ²λ¦¬ μκ°: {processing_time:.2f}μ΄
μ΄μ μ§λ¬Έν μ€λΉκ° λμμ΅λλ€!"""
            return result_message
        except Exception as e:
            error_message = f"λ¬Έμ μ²λ¦¬ μ€ μ€λ₯ λ°μ: {str(e)}"
            logger.error(error_message, exc_info=True)
            return error_message
    def _process_vector_index(self, new_files: List[str], updated_files: List[str]) -> None:
        """
        Build, update and persist the vector index.

        Args:
            new_files: files added in this run.
            updated_files: files changed in this run.

        Raises:
            VectorStoreError: when persisting the index fails.
        """
        # Reuse a saved index when the index directory is non-empty.
        if os.path.exists(self.vector_index_dir) and any(os.listdir(self.vector_index_dir)):
            try:
                logger.info("μ μ₯λ λ²‘ν° μΈλ±μ€ λ‘λ μ€...")
                vector_store_loaded = self.vector_store.load_local(self.vector_index_dir)
                # Confirm the load actually produced a usable index.
                if self.vector_store.vector_store is not None:
                    # Only re-embed when something is new or changed.
                    # NOTE(review): this adds ALL of self.documents, not just
                    # the new/updated ones - cached documents may end up
                    # indexed twice. Confirm intended behavior.
                    if new_files or updated_files:
                        logger.info("λ²‘ν° μΈλ±μ€ μ λ°μ΄νΈ μ€...")
                        self.vector_store.add_documents(self.documents)
                    logger.info("λ²‘ν° μΈλ±μ€ λ‘λ μλ£")
                else:
                    logger.warning("λ²‘ν° μΈλ±μ€λ₯Ό λ‘λνμΌλ μ ν¨νμ§ μμ, μλ‘ μμ±ν©λλ€.")
                    self.vector_store.create_or_load(self.documents)
            except Exception as e:
                logger.error(f"λ²‘ν° μΈλ±μ€ λ‘λ μ€ν¨, μλ‘ μμ±ν©λλ€: {e}", exc_info=True)
                # Rebuild from scratch on any load failure.
                self.vector_store.create_or_load(self.documents)
        else:
            # No saved index yet - create one from the current documents.
            logger.info("μ λ²‘ν° μΈλ±μ€ μμ± μ€...")
            self.vector_store.create_or_load(self.documents)
        # Persist the (re)built index for the next run.
        if self.vector_store and self.vector_store.vector_store is not None:
            try:
                logger.info(f"λ²‘ν° μΈλ±μ€ μ μ₯ μ€: {self.vector_index_dir}")
                save_result = self.vector_store.save_local(self.vector_index_dir)
                logger.info(f"λ²‘ν° μΈλ±μ€ μ μ₯ μλ£: {self.vector_index_dir}")
            except Exception as e:
                logger.error(f"λ²‘ν° μΈλ±μ€ μ μ₯ μ€ν¨: {e}", exc_info=True)
                raise VectorStoreError(f"λ²‘ν° μΈλ±μ€ μ μ₯ μ€ν¨: {str(e)}")
        else:
            logger.warning("λ²‘ν° μΈλ±μ€κ° μ΄κΈ°νλμ§ μμ μ μ₯νμ§ μμ΅λλ€.")
    def reset_cache(self) -> str:
        """
        Reset all caches: chunk files, the file index, and the vector index.

        Returns:
            A human-readable result message (success or the first failure).
        """
        try:
            # Delete all cached chunk files.
            try:
                for filename in os.listdir(self.chunks_dir):
                    file_path = os.path.join(self.chunks_dir, filename)
                    if os.path.isfile(file_path):
                        os.remove(file_path)
                logger.info("μ²ν¬ μΊμ νμΌ μμ μλ£")
            except Exception as e:
                logger.error(f"μ²ν¬ νμΌ μμ μ€ μ€λ₯: {e}")
                return f"μ²ν¬ νμΌ μμ μ€ μ€λ₯ λ°μ: {str(e)}"
            # Reset and persist an empty file index.
            self.file_index = {}
            try:
                self._save_file_index()
                logger.info("νμΌ μΈλ±μ€ μ΄κΈ°ν μλ£")
            except Exception as e:
                logger.error(f"μΈλ±μ€ νμΌ μ΄κΈ°ν μ€ μ€λ₯: {e}")
                return f"μΈλ±μ€ νμΌ μ΄κΈ°ν μ€ μ€λ₯ λ°μ: {str(e)}"
            # Delete the persisted vector index files.
            try:
                for filename in os.listdir(self.vector_index_dir):
                    file_path = os.path.join(self.vector_index_dir, filename)
                    if os.path.isfile(file_path):
                        os.remove(file_path)
                logger.info("λ²‘ν° μΈλ±μ€ νμΌ μμ μλ£")
            except Exception as e:
                logger.error(f"λ²‘ν° μΈλ±μ€ νμΌ μμ μ€ μ€λ₯: {e}")
                return f"λ²‘ν° μΈλ±μ€ νμΌ μμ μ€ μ€λ₯ λ°μ: {str(e)}"
            # Clear in-memory state so the next run reprocesses everything.
            self.documents = []
            self.processed_files = []
            self.is_initialized = False
            logger.info("μΊμ μ΄κΈ°ν μλ£")
            return "μΊμκ° μ΄κΈ°νλμμ΅λλ€. λ€μ μ€ν μ λͺ¨λ λ¬Έμκ° λ€μ μ²λ¦¬λ©λλ€."
        except Exception as e:
            error_msg = f"μΊμ μ΄κΈ°ν μ€ μ€λ₯ λ°μ: {str(e)}"
            logger.error(error_msg, exc_info=True)
            return error_msg
| def process_query(self, query: str, chat_history: List[Tuple[str, str]]) -> Tuple[str, List[Tuple[str, str]]]: | |
| """ | |
| μ¬μ©μ 쿼리 μ²λ¦¬ | |
| Args: | |
| query: μ¬μ©μ μ§λ¬Έ | |
| chat_history: λν κΈ°λ‘ | |
| Returns: | |
| μλ΅ λ° μ λ°μ΄νΈλ λν κΈ°λ‘ | |
| """ | |
| if not query or not query.strip(): | |
| response = "μ§λ¬Έμ΄ λΉμ΄ μμ΅λλ€. μ§λ¬Έμ μ λ ₯ν΄ μ£ΌμΈμ." | |
| chat_history.append((query, response)) | |
| return "", chat_history | |
| if not self.is_initialized: | |
| response = "λ¬Έμ λ‘λκ° μ΄κΈ°νλμ§ μμμ΅λλ€. μλ λ‘λλ₯Ό μλν©λλ€." | |
| chat_history.append((query, response)) | |
| # μλ λ‘λ μλ | |
| try: | |
| init_result = self.auto_process_documents() | |
| if not self.is_initialized: | |
| response = f"λ¬Έμλ₯Ό λ‘λν μ μμ΅λλ€. 'documents' ν΄λμ PDF νμΌμ΄ μλμ§ νμΈνμΈμ. μ΄κΈ°ν κ²°κ³Ό: {init_result}" | |
| chat_history.append((query, response)) | |
| return "", chat_history | |
| except Exception as e: | |
| response = f"λ¬Έμ λ‘λ μ€ μ€λ₯ λ°μ: {str(e)}" | |
| logger.error(f"μλ λ‘λ μ€ν¨: {e}", exc_info=True) | |
| chat_history.append((query, response)) | |
| return "", chat_history | |
| try: | |
| # RAG μ²΄μΈ μ€ν λ° μλ΅ μμ± | |
| start_time = time.time() | |
| logger.info(f"쿼리 μ²λ¦¬ μμ: {query}") | |
| # rag_chainμ΄ μ΄κΈ°νλμλμ§ νμΈ | |
| if not hasattr(self, 'rag_chain') or self.rag_chain is None: | |
| raise RAGInitializationError("RAG 체μΈμ΄ μ΄κΈ°νλμ§ μμμ΅λλ€") | |
| # 1. λ¨Όμ νμ€ RAG 체μΈμΌλ‘ μλ | |
| try: | |
| response = self.rag_chain.run(query) | |
| logger.info(f"κΈ°λ³Έ RAG 체μΈμΌλ‘ μλ΅ μμ± μ±κ³΅") | |
| except Exception as rag_error: | |
| logger.error(f"κΈ°λ³Έ RAG μ²΄μΈ μ€ν μ€ν¨: {rag_error}, λμ μλ") | |
| # 2. DeepSeek API μ§μ νΈμΆ μλ (RAG μ²΄μΈ μ°ν) | |
| try: | |
| # DeepSeek API μ 보 κ°μ Έμ€κΈ° | |
| try: | |
| from config import DEEPSEEK_API_KEY, DEEPSEEK_MODEL, DEEPSEEK_ENDPOINT | |
| except ImportError: | |
| # μ€μ λͺ¨λμμ κ°μ Έμ¬ μ μλ κ²½μ° κΈ°λ³Έκ° μ€μ | |
| DEEPSEEK_API_KEY = os.environ.get("DEEPSEEK_API_KEY", "") | |
| DEEPSEEK_MODEL = os.environ.get("DEEPSEEK_MODEL", "deepseek-chat") | |
| DEEPSEEK_ENDPOINT = os.environ.get("DEEPSEEK_ENDPOINT", | |
| "https://api.deepseek.com/v1/chat/completions") | |
| # μ§μ API νΈμΆ ν¨μ μ μ (μΈλΆ λͺ¨λ μμ‘΄μ± μ κ±°) | |
| def direct_api_call(query, context, api_key, model_name, endpoint, max_retries=3, timeout=60): | |
| """DeepSeek API μ§μ νΈμΆ ν¨μ""" | |
| import requests | |
| import json | |
| import time | |
| # ν둬ννΈ κΈΈμ΄ μ ν | |
| if len(context) > 6000: | |
| context = context[:2500] + "\n...(μ€λ΅)...\n" + context[-2500:] | |
| # ν둬ννΈ κ΅¬μ± | |
| prompt = f""" | |
| λ€μ μ 보λ₯Ό κΈ°λ°μΌλ‘ μ§λ¬Έμ μ ννκ² λ΅λ³ν΄μ£ΌμΈμ. | |
| μ§λ¬Έ: {query} | |
| μ°Έκ³ μ 보: | |
| {context} | |
| μ°Έκ³ μ 보μ λ΅μ΄ μμΌλ©΄ λ°λμ κ·Έ μ 보λ₯Ό κΈ°λ°μΌλ‘ λ΅λ³νμΈμ. | |
| μ°Έκ³ μ 보μ λ΅μ΄ μλ κ²½μ°μλ μΌλ°μ μΈ μ§μμ νμ©νμ¬ λ΅λ³ν μ μμ§λ§, "μ 곡λ λ¬Έμμλ μ΄ μ λ³΄κ° μμΌλ, μΌλ°μ μΌλ‘λ..." μμΌλ‘ μμνμΈμ. | |
| λ΅λ³μ μ ννκ³ κ°κ²°νκ² μ 곡νλ, κ°λ₯ν μ°Έκ³ μ 보μμ κ·Όκ±°λ₯Ό μ°Ύμ μ€λͺ ν΄μ£ΌμΈμ. | |
| μ°Έκ³ μ 보μ μΆμ²λ ν¨κ» μλ €μ£ΌμΈμ. | |
| """ | |
| # API μμ² μλ | |
| headers = { | |
| "Content-Type": "application/json", | |
| "Authorization": f"Bearer {api_key}" | |
| } | |
| payload = { | |
| "model": model_name, | |
| "messages": [{"role": "user", "content": prompt}], | |
| "temperature": 0.3, | |
| "max_tokens": 1000 | |
| } | |
| # μ¬μλ λ‘μ§ | |
| retry_delay = 1.0 | |
| for attempt in range(max_retries): | |
| try: | |
| logger.info(f"DeepSeek API μ§μ νΈμΆ μλ ({attempt + 1}/{max_retries})...") | |
| response = requests.post( | |
| endpoint, | |
| headers=headers, | |
| json=payload, | |
| timeout=timeout | |
| ) | |
| if response.status_code == 200: | |
| result = response.json() | |
| content = result.get("choices", [{}])[0].get("message", {}).get("content", "") | |
| logger.info(f"DeepSeek API μ§μ νΈμΆ μ±κ³΅") | |
| return content | |
| else: | |
| logger.warning(f"API μ€λ₯: μν μ½λ {response.status_code}") | |
| # μμ² νλμΈ κ²½μ° λ μ€λ λκΈ° | |
| if response.status_code == 429: | |
| retry_delay = min(retry_delay * 3, 15) | |
| else: | |
| retry_delay = min(retry_delay * 2, 10) | |
| if attempt < max_retries - 1: | |
| logger.info(f"{retry_delay}μ΄ ν μ¬μλ...") | |
| time.sleep(retry_delay) | |
| except Exception as e: | |
| logger.error(f"API νΈμΆ μ€λ₯: {e}") | |
| if attempt < max_retries - 1: | |
| logger.info(f"{retry_delay}μ΄ ν μ¬μλ...") | |
| time.sleep(retry_delay) | |
| retry_delay = min(retry_delay * 2, 10) | |
| # λͺ¨λ μλ μ€ν¨ | |
| raise Exception("μ΅λ μ¬μλ νμ μ΄κ³Ό") | |
| # λ²‘ν° κ²μ μν | |
| if self.vector_store and hasattr(self.vector_store, "similarity_search"): | |
| logger.info("λ²‘ν° κ²μ μν...") | |
| docs = self.vector_store.similarity_search(query, k=5) | |
| # κ²μ κ²°κ³Ό 컨ν μ€νΈ κ΅¬μ± | |
| context_parts = [] | |
| for i, doc in enumerate(docs, 1): | |
| source = doc.metadata.get("source", "μ μ μλ μΆμ²") | |
| page = doc.metadata.get("page", "") | |
| source_info = f"{source}" | |
| if page: | |
| source_info += f" (νμ΄μ§: {page})" | |
| context_parts.append(f"[μ°Έκ³ μλ£ {i}] - μΆμ²: {source_info}\n{doc.page_content}\n") | |
| context = "\n".join(context_parts) | |
| # μ§μ API νΈμΆ | |
| logger.info("DeepSeek API μ§μ νΈμΆ μλ...") | |
| response = direct_api_call( | |
| query, | |
| context, | |
| DEEPSEEK_API_KEY, | |
| DEEPSEEK_MODEL, | |
| DEEPSEEK_ENDPOINT, | |
| max_retries=3, | |
| timeout=120 | |
| ) | |
| logger.info("DeepSeek API μ§μ νΈμΆ μ±κ³΅") | |
| else: | |
| raise Exception("λ²‘ν° μ€ν μ΄κ° μ΄κΈ°νλμ§ μμμ΅λλ€") | |
| except Exception as direct_api_error: | |
| logger.error(f"DeepSeek API μ§μ νΈμΆ μ€ν¨: {direct_api_error}, κ²μ κ²°κ³Ό λ°ν") | |
| # 3. κ²μ κ²°κ³Όλ§μ΄λΌλ λ°ν | |
| try: | |
| # λ²‘ν° κ²μ μν | |
| if self.vector_store and hasattr(self.vector_store, "similarity_search"): | |
| docs = self.vector_store.similarity_search(query, k=5) | |
| # κ²μ κ²°κ³Ό 컨ν μ€νΈ κ΅¬μ± | |
| context_parts = [] | |
| for i, doc in enumerate(docs, 1): | |
| source = doc.metadata.get("source", "μ μ μλ μΆμ²") | |
| page = doc.metadata.get("page", "") | |
| source_info = f"{source}" | |
| if page: | |
| source_info += f" (νμ΄μ§: {page})" | |
| context_parts.append(f"[μ°Έκ³ μλ£ {i}] - μΆμ²: {source_info}\n{doc.page_content}\n") | |
| context = "\n".join(context_parts) | |
| # κ°λ¨ν μλ΅ μμ± | |
| predefined_answers = { | |
| "λνλ―Όκ΅μ μλ": "λνλ―Όκ΅μ μλλ μμΈμ λλ€.", | |
| "μλ": "λνλ―Όκ΅μ μλλ μμΈμ λλ€.", | |
| "λꡬμΌ": "μ λ RAG κΈ°λ° μ§μμλ΅ μμ€ν μ λλ€. λ¬Έμλ₯Ό κ²μνκ³ κ΄λ ¨ μ 보λ₯Ό μ°Ύμλ립λλ€.", | |
| "μλ ": "μλ νμΈμ! 무μμ λμλ릴κΉμ?", | |
| "λν΄": "μ¬μ©μμ μ§λ¬Έμ λ΅λ³νκΈ° μν΄ λ¬Έμλ₯Ό κ²μνκ³ μμ΅λλ€. 무μμ μλ €λ릴κΉμ?" | |
| } | |
| # μ§λ¬Έμ λ§λ 미리 μ μλ μλ΅μ΄ μλμ§ νμΈ | |
| for key, answer in predefined_answers.items(): | |
| if key in query.lower(): | |
| response = answer | |
| logger.info(f"미리 μ μλ μλ΅ μ 곡: {key}") | |
| break | |
| else: | |
| # 미리 μ μλ μλ΅μ΄ μμΌλ©΄ κ²μ κ²°κ³Όλ§ νμ | |
| response = f""" | |
| API μλ² μ°κ²°μ λ¬Έμ κ° μμ΄ κ²μ κ²°κ³Όλ§ νμν©λλ€. | |
| μ§λ¬Έ: {query} | |
| κ²μλ κ΄λ ¨ λ¬Έμ: | |
| {context} | |
| [μ°Έκ³ ] API μ°κ²° λ¬Έμ λ‘ μΈν΄ μλ μμ½μ΄ μ 곡λμ§ μμ΅λλ€. λ€μ μλνκ±°λ λ€λ₯Έ μ§λ¬Έμ ν΄λ³΄μΈμ. | |
| """ | |
| logger.info("κ²μ κ²°κ³Όλ§ νμ") | |
| else: | |
| response = f"API μ°κ²° λ° λ²‘ν° κ²μμ λͺ¨λ μ€ν¨νμ΅λλ€. μμ€ν κ΄λ¦¬μμκ² λ¬ΈμνμΈμ." | |
| except Exception as fallback_error: | |
| logger.error(f"μ΅μ’ ν΄λ°± μλ΅ μμ± μ€ν¨: {fallback_error}") | |
| # 4. μ΅νμ λ°©λ²: μ€λ₯ λ©μμ§λ₯Ό μλ΅μΌλ‘ λ°ν | |
| if "Connection error" in str(rag_error) or "timeout" in str(rag_error).lower(): | |
| response = f""" | |
| API μλ² μ°κ²°μ λ¬Έμ κ° μμ΅λλ€. μ μ ν λ€μ μλν΄μ£ΌμΈμ. | |
| μ§λ¬Έ: {query} | |
| [μ°Έκ³ ] νμ¬ DeepSeek API μλ²μμ μ°κ²°μ΄ μννμ§ μμ΅λλ€. μ΄λ‘ μΈν΄ μ§λ¬Έμ λν μλ΅μ μ 곡ν μ μμ΅λλ€. | |
| """ | |
| else: | |
| response = f"쿼리 μ²λ¦¬ μ€ μ€λ₯κ° λ°μνμ΅λλ€: {str(rag_error)}" | |
| end_time = time.time() | |
| query_time = end_time - start_time | |
| logger.info(f"쿼리 μ²λ¦¬ μλ£: {query_time:.2f}μ΄") | |
| chat_history.append((query, response)) | |
| return "", chat_history | |
| except RAGInitializationError as e: | |
| error_msg = f"RAG μμ€ν μ΄κΈ°ν μ€λ₯: {str(e)}. 'documents' ν΄λμ PDF νμΌμ΄ μλμ§ νμΈνκ³ , μ¬μμν΄ λ³΄μΈμ." | |
| logger.error(f"쿼리 μ²λ¦¬ μ€ RAG μ΄κΈ°ν μ€λ₯: {e}", exc_info=True) | |
| chat_history.append((query, error_msg)) | |
| return "", chat_history | |
| except (VectorStoreError, DocumentProcessingError) as e: | |
| error_msg = f"λ¬Έμ μ²λ¦¬ μμ€ν μ€λ₯: {str(e)}. λ¬Έμ νμμ΄ μ¬λ°λ₯Έμ§ νμΈν΄ 보μΈμ." | |
| logger.error(f"쿼리 μ²λ¦¬ μ€ λ¬Έμ/λ²‘ν° μ€ν μ΄ μ€λ₯: {e}", exc_info=True) | |
| chat_history.append((query, error_msg)) | |
| return "", chat_history | |
| except Exception as e: | |
| error_msg = f"쿼리 μ²λ¦¬ μ€ μ€λ₯ λ°μ: {str(e)}" | |
| logger.error(f"쿼리 μ²λ¦¬ μ€ μμμΉ λͺ»ν μ€λ₯: {e}", exc_info=True) | |
| chat_history.append((query, error_msg)) | |
| return "", chat_history | |
def launch_app(self) -> None:
    """
    Launch the Gradio web UI.

    Builds the layout (status/cache panel, chatbot, text and voice input),
    wires up the event handlers, and blocks in ``app.launch()``.
    Returns early when gradio is not installed; any other failure is
    logged and printed, never raised to the caller.
    """
    try:
        import gradio as gr
    except ImportError:
        logger.error("Gradio λΌμ΄λΈλ¬λ¦¬λ₯Ό μ°Ύμ μ μμ΅λλ€. pip install gradioλ‘ μ€μΉνμΈμ.")
        print("Gradio λΌμ΄λΈλ¬λ¦¬λ₯Ό μ°Ύμ μ μμ΅λλ€. pip install gradioλ‘ μ€μΉνμΈμ.")
        return
    app_instance = self  # captured by the nested event handlers below
    try:
        with gr.Blocks(title="PDF λ¬Έμ κΈ°λ° RAG μ±λ΄") as app:
            gr.Markdown("# PDF λ¬Έμ κΈ°λ° RAG μ±λ΄")
            gr.Markdown(f"* μ¬μ© μ€μΈ LLM λͺ¨λΈ: **{LLM_MODEL}**")
            # Escape backslashes so Windows paths render correctly in Markdown.
            actual_pdf_dir = (
                self.pdf_directory.replace("\\", "\\\\")
                if os.name == 'nt' else self.pdf_directory
            )
            gr.Markdown(f"* PDF λ¬Έμ ν΄λ: **{actual_pdf_dir}**")
            with gr.Row():
                with gr.Column(scale=1):
                    status_box = gr.Textbox(
                        label="λ¬Έμ μ²λ¦¬ μν",
                        value=self._get_status_message(),
                        lines=5,
                        interactive=False
                    )
                    refresh_button = gr.Button("λ¬Έμ μλ‘ μ½κΈ°", variant="primary")
                    reset_button = gr.Button("μΊμ μ΄κΈ°ν", variant="stop")
                    status_info = gr.Markdown(
                        value=f"μμ€ν μν: {'μ΄κΈ°νλ¨' if self.is_initialized else 'μ΄κΈ°νλμ§ μμ'}"
                    )
                    with gr.Accordion("μΊμ μΈλΆ μ 보", open=False):
                        cache_info = gr.Textbox(
                            label="μΊμλ νμΌ μ 보",
                            value=self._get_cache_info(),
                            lines=5,
                            interactive=False
                        )
                with gr.Column(scale=2):
                    chatbot = gr.Chatbot(
                        label="λν λ΄μ©",
                        bubble_full_width=False,
                        height=500,
                        show_copy_button=True
                    )
                    with gr.Row():
                        with gr.Column(scale=4):
                            query_box = gr.Textbox(
                                label="μ§λ¬Έ",
                                placeholder="μ²λ¦¬λ λ¬Έμ λ΄μ©μ λν΄ μ§λ¬ΈνμΈμ...",
                                lines=2
                            )
                        with gr.Column(scale=1):
                            audio_input = gr.Audio(
                                sources=["microphone"],
                                type="numpy",
                                label="μμ±μΌλ‘ μ§λ¬ΈνκΈ°"
                            )
                    with gr.Row():
                        submit_btn = gr.Button("μ μ‘", variant="primary")
                        clear_chat_button = gr.Button("λν μ΄κΈ°ν")

            def process_audio(audio):
                """Transcribe a recorded clip via VITO STT; return text or an error message."""
                logger.info("μμ± μΈμ μ²λ¦¬ μμ...")
                try:
                    from vito_stt import VitoSTT
                    import soundfile as sf
                    import tempfile
                    if audio is None:
                        return "μμ±μ΄ λ Ήμλμ§ μμμ΅λλ€."
                    sr, y = audio
                    logger.info(f"μ€λμ€ λ Ήμ λ°μ΄ν° μμ : μνλ μ΄νΈ={sr}Hz, κΈΈμ΄={len(y)}μν")
                    # Clips shorter than one second are almost certainly accidental taps.
                    if len(y) / sr < 1.0:
                        return "λ Ήμλ μμ±μ΄ λ무 μ§§μ΅λλ€. λ€μ μλν΄μ£ΌμΈμ."
                    # delete=False: the path must survive the context so VITO can read it.
                    with tempfile.NamedTemporaryFile(suffix=".wav", delete=False) as tmp:
                        tmp_path = tmp.name
                    sf.write(tmp_path, y, sr, format="WAV")
                    logger.info(f"μμ WAV νμΌ μ μ₯λ¨: {tmp_path}")
                    vito = VitoSTT()
                    with open(tmp_path, "rb") as f:
                        audio_bytes = f.read()
                    result = vito.transcribe_audio(audio_bytes, language="ko")
                    # Best-effort cleanup; a leftover temp file is not fatal.
                    try:
                        os.unlink(tmp_path)
                        logger.info("μμ μ€λμ€ νμΌ μμ λ¨")
                    except Exception as e:
                        logger.warning(f"μμ νμΌ μμ μ€ν¨: {e}")
                    if result.get("success"):
                        recognized_text = result.get("text", "")
                        logger.info(f"μμ±μΈμ μ±κ³΅: {recognized_text}")
                        return recognized_text
                    error_msg = f"μμ± μΈμ μ€ν¨: {result.get('error', 'μ μ μλ μ€λ₯')}"
                    logger.error(error_msg)
                    return error_msg
                except ImportError as e:
                    logger.error(f"νμν λΌμ΄λΈλ¬λ¦¬ λλ½: {e}")
                    return ("μμ±μΈμμ νμν λΌμ΄λΈλ¬λ¦¬κ° μ€μΉλμ§ μμμ΅λλ€. "
                            "pip install soundfile numpy requests λ₯Ό μ€νν΄μ£ΌμΈμ.")
                except Exception as e:
                    logger.error(f"μμ± μ²λ¦¬ μ€ μ€λ₯ λ°μ: {e}", exc_info=True)
                    return f"μμ± μ²λ¦¬ μ€ μ€λ₯ λ°μ: {str(e)}"

            def process_audio_and_submit(audio, chat_history):
                """Transcribe, then submit the text as a query; STT errors are shown but not submitted."""
                recognized_text = process_audio(audio)
                if (not recognized_text
                        or recognized_text.startswith("μμ± μΈμ μ€ν¨")
                        or recognized_text.startswith("μμ± μ²λ¦¬ μ€ μ€λ₯")):
                    return recognized_text, chat_history
                return app_instance.process_query(recognized_text, chat_history)

            def update_ui_after_refresh(result):
                """Refresh all status widgets after a document reload or cache reset."""
                return (
                    result,
                    app_instance._get_status_message(),
                    f"μμ€ν μν: {'μ΄κΈ°νλ¨' if app_instance.is_initialized else 'μ΄κΈ°νλμ§ μμ'}",
                    app_instance._get_cache_info()
                )

            # BUG FIX: stop_recording was previously bound twice (once to
            # process_audio_and_submit, once to plain process_audio), so each
            # recording was transcribed twice and the second handler overwrote
            # query_box after the first had already submitted the question.
            # Keep only the transcribe-and-submit binding.
            audio_input.stop_recording(
                fn=process_audio_and_submit,
                inputs=[audio_input, chatbot],
                outputs=[query_box, chatbot]
            )
            refresh_button.click(
                fn=lambda: update_ui_after_refresh(self.auto_process_documents()),
                inputs=[],
                outputs=[status_box, status_box, status_info, cache_info]
            )
            reset_button.click(
                fn=lambda: update_ui_after_refresh(
                    f"{self.reset_cache()}\n\n{self.auto_process_documents()}"
                ),
                inputs=[],
                outputs=[status_box, status_box, status_info, cache_info]
            )
            submit_btn.click(
                fn=self.process_query,
                inputs=[query_box, chatbot],
                outputs=[query_box, chatbot]
            )
            query_box.submit(
                fn=self.process_query,
                inputs=[query_box, chatbot],
                outputs=[query_box, chatbot]
            )
            clear_chat_button.click(
                fn=lambda: [],
                outputs=[chatbot]
            )
        app.launch(share=False)
    except Exception as e:
        logger.error(f"Gradio μ± μ€ν μ€ μ€λ₯ λ°μ: {e}", exc_info=True)
        print(f"Gradio μ± μ€ν μ€ μ€λ₯ λ°μ: {e}")
| def _get_status_message(self) -> str: | |
| """ | |
| νμ¬ μ²λ¦¬ μν λ©μμ§ μμ± | |
| Returns: | |
| μν λ©μμ§ | |
| """ | |
| if not self.processed_files: | |
| return "μ²λ¦¬λ λ¬Έμκ° μμ΅λλ€. 'λ¬Έμ μλ‘ μ½κΈ°' λ²νΌμ ν΄λ¦νμΈμ." | |
| # DeepSeek API μν νμΈ | |
| from config import USE_DEEPSEEK, DEEPSEEK_API_KEY, DEEPSEEK_MODEL | |
| model_info = "" | |
| if USE_DEEPSEEK and DEEPSEEK_API_KEY: | |
| # DeepSeek API ν μ€νΈ μν | |
| try: | |
| # ν μ€νΈ ν¨μ κ°μ Έμ€κΈ° μλ | |
| try: | |
| from deepseek_utils import test_deepseek_api | |
| # DeepSeek μ€μ κ°μ Έμ€κΈ° | |
| from config import DEEPSEEK_ENDPOINT | |
| # API ν μ€νΈ | |
| test_result = test_deepseek_api(DEEPSEEK_API_KEY, DEEPSEEK_ENDPOINT, DEEPSEEK_MODEL) | |
| if test_result["success"]: | |
| model_info = f"\nDeepSeek API μν: μ μ ({DEEPSEEK_MODEL})" | |
| else: | |
| model_info = f"\nDeepSeek API μν: μ€λ₯ - {test_result['message']}" | |
| except ImportError: | |
| # μ§μ ν μ€νΈ μ€ν | |
| import requests | |
| import json | |
| # DeepSeek μ€μ κ°μ Έμ€κΈ° | |
| from config import DEEPSEEK_ENDPOINT | |
| # ν μ€νΈμ© κ°λ¨ν ν둬ννΈ | |
| test_prompt = "Hello, please respond with a short greeting." | |
| # API μμ² ν€λ λ° λ°μ΄ν° | |
| headers = { | |
| "Content-Type": "application/json", | |
| "Authorization": f"Bearer {DEEPSEEK_API_KEY}" | |
| } | |
| payload = { | |
| "model": DEEPSEEK_MODEL, | |
| "messages": [{"role": "user", "content": test_prompt}], | |
| "temperature": 0.7, | |
| "max_tokens": 50 | |
| } | |
| # API μμ² μ μ‘ | |
| try: | |
| response = requests.post( | |
| DEEPSEEK_ENDPOINT, | |
| headers=headers, | |
| data=json.dumps(payload), | |
| timeout=5 # 5μ΄ νμμμ (UI λ°μμ± μ μ§) | |
| ) | |
| # μλ΅ νμΈ | |
| if response.status_code == 200: | |
| model_info = f"\nDeepSeek API μν: μ μ ({DEEPSEEK_MODEL})" | |
| else: | |
| error_message = response.text[:100] | |
| model_info = f"\nDeepSeek API μν: μ€λ₯ (μν μ½λ: {response.status_code})" | |
| except Exception as e: | |
| model_info = f"\nDeepSeek API μν: μ°κ²° μ€ν¨ ({str(e)[:100]})" | |
| except Exception as e: | |
| model_info = f"\nDeepSeek API μν νμΈ μ€ν¨: {str(e)[:100]}" | |
| return f"μ²λ¦¬λ λ¬Έμ ({len(self.processed_files)}κ°): {', '.join(self.processed_files)}{model_info}" | |
| def _get_cache_info(self) -> str: | |
| """ | |
| μΊμ μΈλΆ μ 보 λ©μμ§ μμ± | |
| Returns: | |
| μΊμ μ 보 λ©μμ§ | |
| """ | |
| if not self.file_index: | |
| return "μΊμλ νμΌμ΄ μμ΅λλ€." | |
| file_info = "" | |
| for file_path, info in self.file_index.items(): | |
| file_name = info.get('file_name', os.path.basename(file_path)) | |
| chunks_count = info.get('chunks_count', 0) | |
| file_size = info.get('file_size', 0) | |
| last_processed = info.get('last_processed', 0) | |
| # νμΌ ν¬κΈ°λ₯Ό μ¬λμ΄ μ½κΈ° μ¬μ΄ ννλ‘ λ³ν | |
| if file_size < 1024: | |
| size_str = f"{file_size} bytes" | |
| elif file_size < 1024 * 1024: | |
| size_str = f"{file_size / 1024:.1f} KB" | |
| else: | |
| size_str = f"{file_size / (1024 * 1024):.1f} MB" | |
| # λ§μ§λ§ μ²λ¦¬ μκ°μ λ μ§/μκ° νμμΌλ‘ λ³ν | |
| if last_processed: | |
| from datetime import datetime | |
| last_time = datetime.fromtimestamp(last_processed).strftime('%Y-%m-%d %H:%M:%S') | |
| else: | |
| last_time = "μ μ μμ" | |
| file_info += f"- {file_name}: {chunks_count}κ° μ²ν¬, {size_str}, λ§μ§λ§ μ²λ¦¬: {last_time}\n" | |
| return file_info | |
# Script entry point: construct the RAG chat application and start the
# Gradio UI (blocks until the server is shut down).
if __name__ == "__main__":
    app = AutoRAGChatApp()
    app.launch_app()