"""
문서 처리 유틸리티 모듈
"""

import os
import re
import logging
from typing import List, Dict, Any, Optional, Tuple, Union
import numpy as np

# Module-wide logger. A console handler is attached only when
# ``hasHandlers()`` reports that neither this logger nor any of its ancestors
# already has one, so logging configured elsewhere is left untouched.
logger = logging.getLogger("DocProcessor")
if not logger.hasHandlers():
    logger.setLevel(logging.INFO)
    formatter = logging.Formatter('%(asctime)s - %(name)s - %(levelname)s - %(message)s')
    handler = logging.StreamHandler()
    handler.setFormatter(formatter)
    logger.addHandler(handler)

class DocumentProcessor:
    """Stateless helpers for cleaning, chunking and loading text documents.

    Every method is a ``@staticmethod``; the class only acts as a namespace.
    """

    # Extensions processed by ``load_documents_from_directory`` when the
    # caller does not pass its own list.
    _DEFAULT_EXTENSIONS = (".txt", ".md", ".csv")

    # ``[^>]*`` (instead of ``.*?``) lets a tag span several lines.
    _HTML_TAG_RE = re.compile(r'<[^>]*>')
    _URL_RE = re.compile(r'https?://\S+|www\.\S+')
    _WHITESPACE_RE = re.compile(r'\s+')

    @staticmethod
    def split_text(
        text: str,
        chunk_size: int = 512,
        chunk_overlap: int = 50,
        separator: str = "\n"
    ) -> List[str]:
        """
        Split text into chunks of at most ``chunk_size`` characters.

        The text is cut at ``separator`` and the resulting parts are packed
        greedily into chunks. Trailing parts of a finished chunk are repeated
        at the start of the next one as overlap, as long as they fit into
        ``chunk_overlap`` characters and still leave room for the next part.
        A single part longer than ``chunk_size`` is cut into ``chunk_size``
        sized slices so that the size limit always holds.

        Args:
            text: Text to split.
            chunk_size: Maximum number of characters per chunk.
            chunk_overlap: Maximum number of characters repeated between
                consecutive chunks.
            separator: String to split on. An empty string splits the text
                into individual characters.

        Returns:
            List of chunks; empty when ``text`` is empty or
            ``chunk_size`` is not positive.
        """
        if not text or chunk_size <= 0:
            return []

        sep_len = len(separator)
        # str.split() rejects an empty separator, so fall back to characters.
        raw_parts = text.split(separator) if separator else list(text)

        # Cut parts that could never fit into a chunk on their own.
        parts: List[str] = []
        for raw in raw_parts:
            if len(raw) > chunk_size:
                parts.extend(
                    raw[start:start + chunk_size]
                    for start in range(0, len(raw), chunk_size)
                )
            else:
                parts.append(raw)

        chunks: List[str] = []
        current: List[str] = []
        current_len = 0  # always equals len(separator.join(current))

        for part in parts:
            part_len = len(part)

            if current and current_len + sep_len + part_len > chunk_size:
                # Adding this part would overflow: emit the chunk built so far.
                chunks.append(separator.join(current))

                # Collect trailing parts (newest first) that fit the overlap
                # budget, separators included.
                carried: List[str] = []
                carried_len = 0
                for token in reversed(current):
                    candidate = carried_len + len(token) + (sep_len if carried else 0)
                    if candidate > chunk_overlap:
                        break
                    carried.append(token)
                    carried_len = candidate

                # Drop the oldest carried parts until overlap + next part
                # respects chunk_size again.
                while carried and carried_len + sep_len + part_len > chunk_size:
                    oldest = carried.pop()
                    carried_len -= len(oldest) + (sep_len if carried else 0)

                carried.reverse()  # restore original text order
                current = carried
                current_len = carried_len

            # ``current`` may hold empty strings, so test the list itself
            # rather than current_len to decide whether a separator is needed.
            current_len = current_len + sep_len + part_len if current else part_len
            current.append(part)

        # Flush the last, possibly partial, chunk.
        if current:
            chunks.append(separator.join(current))

        return chunks

    @staticmethod
    def clean_text(text: str, remove_urls: bool = True, remove_extra_whitespace: bool = True) -> str:
        """
        Strip HTML tags and, optionally, URLs and redundant whitespace.

        Args:
            text: Text to clean.
            remove_urls: Whether to delete ``http(s)://`` and ``www.`` URLs.
            remove_extra_whitespace: Whether to collapse every whitespace run
                (newlines included) into one space and trim both ends.

        Returns:
            The cleaned text; an empty string when ``text`` is empty.
        """
        if not text:
            return ""

        # Tags go first: the URL pattern consumes everything up to the next
        # whitespace, so on '<a href="http://x">link</a>' it would otherwise
        # swallow the closing '>' and the link text, leaving a broken tag.
        text = DocumentProcessor._HTML_TAG_RE.sub('', text)

        if remove_urls:
            text = DocumentProcessor._URL_RE.sub('', text)

        if remove_extra_whitespace:
            text = DocumentProcessor._WHITESPACE_RE.sub(' ', text).strip()

        return text

    @staticmethod
    def text_to_documents(
        text: str,
        metadata: Optional[Dict[str, Any]] = None,
        chunk_size: int = 512,
        chunk_overlap: int = 50
    ) -> List[Dict[str, Any]]:
        """
        Clean a text, chunk it and wrap each chunk in a document dict.

        Args:
            text: Text to convert.
            metadata: Extra key/value pairs merged into every document. They
                are applied last, so a colliding key overrides ``text``,
                ``index`` or ``chunk_count``.
            chunk_size: Maximum number of characters per chunk.
            chunk_overlap: Maximum number of characters repeated between
                consecutive chunks.

        Returns:
            One dict per chunk with the keys ``text``, ``index`` and
            ``chunk_count`` plus the metadata entries.
        """
        if not text:
            return []

        cleaned = DocumentProcessor.clean_text(text)

        # clean_text() has collapsed all whitespace (newlines included) into
        # single spaces, so the space is the only separator left to split on.
        chunks = DocumentProcessor.split_text(
            cleaned,
            chunk_size=chunk_size,
            chunk_overlap=chunk_overlap,
            separator=" "
        )

        total = len(chunks)
        documents = []
        for position, chunk in enumerate(chunks):
            doc = {
                "text": chunk,
                "index": position,
                "chunk_count": total
            }
            if metadata:
                doc.update(metadata)
            documents.append(doc)

        return documents

    @staticmethod
    def _read_text_file(file_path: str, display_name: str) -> str:
        """Read a text file as UTF-8 (BOM tolerated), falling back to CP949."""
        try:
            # 'utf-8-sig' decodes plain UTF-8 too and drops a leading BOM.
            with open(file_path, 'r', encoding='utf-8-sig') as f:
                return f.read()
        except UnicodeDecodeError:
            # CP949 is the legacy default encoding on Korean Windows.
            logger.info(f"UTF-8 디코딩 실패, CP949로 시도: {display_name}")
            with open(file_path, 'r', encoding='cp949') as f:
                return f.read()

    @staticmethod
    def load_documents_from_directory(
        directory: str,
        extensions: Optional[List[str]] = None,
        recursive: bool = True,
        chunk_size: int = 512,
        chunk_overlap: int = 50
    ) -> List[Dict[str, Any]]:
        """
        Load every matching file below a directory and chunk its content.

        Files that cannot be read or processed are logged and skipped.

        Args:
            directory: Directory to scan.
            extensions: File extensions to process, matched case-insensitively
                and with or without the leading dot. Defaults to
                ``.txt``, ``.md`` and ``.csv``.
            recursive: Whether sub-directories are scanned as well.
            chunk_size: Maximum number of characters per chunk.
            chunk_overlap: Maximum number of characters repeated between
                consecutive chunks.

        Returns:
            Document dicts as produced by ``text_to_documents``, each carrying
            ``source`` (path relative to ``directory``), ``filename``,
            ``filetype`` and ``filepath``. Empty when ``directory`` is not a
            directory.
        """
        if not os.path.isdir(directory):
            logger.error(f"디렉토리를 찾을 수 없습니다: {directory}")
            return []

        if extensions is None:
            extensions = DocumentProcessor._DEFAULT_EXTENSIONS

        # Normalise once to the form os.path.splitext() yields after lower():
        # lower-case with a leading dot ("" stays "" for extension-less files).
        wanted_exts = set()
        for raw_ext in extensions:
            normalized = raw_ext.lower()
            if normalized and not normalized.startswith("."):
                normalized = "." + normalized
            wanted_exts.add(normalized)

        documents = []

        for root, dirs, files in os.walk(directory):
            if not recursive:
                # Emptying the list in place stops os.walk from descending.
                dirs[:] = []

            for filename in files:
                ext = os.path.splitext(filename)[1].lower()
                if ext not in wanted_exts:
                    continue

                file_path = os.path.join(root, filename)
                rel_path = os.path.relpath(file_path, directory)

                # Best effort: one unreadable file must not abort the scan.
                try:
                    logger.info(f"파일 로드 중: {rel_path}")
                    content = DocumentProcessor._read_text_file(file_path, rel_path)

                    metadata = {
                        "source": rel_path,
                        "filename": filename,
                        "filetype": ext[1:],
                        "filepath": file_path
                    }

                    file_docs = DocumentProcessor.text_to_documents(
                        content,
                        metadata=metadata,
                        chunk_size=chunk_size,
                        chunk_overlap=chunk_overlap
                    )

                    documents.extend(file_docs)
                    logger.info(f"{len(file_docs)}개 청크 추출: {rel_path}")

                except Exception as e:
                    logger.error(f"파일 '{rel_path}' 처리 중 오류 발생: {e}")
                    continue

        logger.info(f"총 {len(documents)}개 문서 청크를 로드했습니다.")
        return documents

    @staticmethod
    def prepare_rag_context(results: List[Dict[str, Any]], field: str = "text") -> List[str]:
        """
        Extract the context texts for RAG from search results.

        Args:
            results: Search result dicts.
            field: Key under which each result stores its text.

        Returns:
            The ``field`` value of every result that has that key, in input
            order; results without the key are skipped.
        """
        return [result[field] for result in results if field in result]