import logging
from typing import List, Optional

from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_community.document_loaders.text import TextLoader
from langchain_core.documents import Document

from .config import Config

logger = logging.getLogger(__name__)


class DocumentProcessor:
    """Loads text documents and splits them into chunks for downstream use."""

    def __init__(self, chunk_size: Optional[int] = None, chunk_overlap: Optional[int] = None):
        config = Config.get_doc_processing_config()
        self.chunk_size = chunk_size or config['chunk_size']
        self.chunk_overlap = chunk_overlap or config['chunk_overlap']
        self.text_splitter = RecursiveCharacterTextSplitter(
            chunk_size=self.chunk_size,
            chunk_overlap=self.chunk_overlap,
            length_function=len,
            separators=["\n\n", "\n", " ", ""]
        )

    def load_document(self, file_path: str, encoding: Optional[str] = None) -> List[Document]:
        """Load a text file into a list of Documents."""
        try:
            config = Config.get_doc_processing_config()
            encoding = encoding or config['encoding']
            logger.info(f"Loading document from {file_path}")
            loader = TextLoader(file_path, encoding=encoding)
            documents = loader.load()
            logger.info(f"Successfully loaded {len(documents)} document(s)")
            return documents
        except Exception as e:
            logger.error(f"Error loading document from {file_path}: {e}")
            # Bare raise preserves the original traceback.
            raise

    def chunk_documents(self, documents: List[Document]) -> List[Document]:
        """Split loaded documents into overlapping chunks."""
        try:
            logger.info(f"Chunking {len(documents)} document(s)")
            chunks = self.text_splitter.split_documents(documents)
            logger.info(f"Successfully created {len(chunks)} chunk(s)")
            return chunks
        except Exception as e:
            logger.error(f"Error chunking documents: {e}")
            raise

    def process_document(self, file_path: str) -> List[Document]:
        """Load a file and return its chunks in one call."""
        try:
            documents = self.load_document(file_path)
            chunks = self.chunk_documents(documents)
            logger.info(f"Document processing completed: {len(chunks)} chunks created")
            return chunks
        except Exception as e:
            logger.error(f"Error processing document: {e}")
            raise

    def get_document_stats(self, chunks: List[Document]) -> dict:
        """Return basic size statistics for a list of chunks."""
        if not chunks:
            return {
                'total_chunks': 0,
                'total_characters': 0,
                'avg_chunk_size': 0,
                'min_chunk_size': 0,
                'max_chunk_size': 0
            }
        chunk_sizes = [len(chunk.page_content) for chunk in chunks]
        total_chars = sum(chunk_sizes)
        return {
            'total_chunks': len(chunks),
            'total_characters': total_chars,
            'avg_chunk_size': total_chars / len(chunks),
            'min_chunk_size': min(chunk_sizes),
            'max_chunk_size': max(chunk_sizes)
        }
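

# Hypothetical usage sketch (not part of the original module). It assumes
# Config.get_doc_processing_config() returns a dict with 'chunk_size',
# 'chunk_overlap', and 'encoding' keys, and that a plain-text file such as
# "sample.txt" exists for the caller. Kept in comments so importing the
# module has no side effects.
#
#     processor = DocumentProcessor(chunk_size=500, chunk_overlap=50)
#     chunks = processor.process_document("sample.txt")
#     stats = processor.get_document_stats(chunks)
#     print(stats['total_chunks'], stats['avg_chunk_size'])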