import logging
from typing import List, Optional

from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_community.document_loaders.text import TextLoader
from langchain_core.documents import Document

from .config import Config

logger = logging.getLogger(__name__)


class DocumentProcessor:
    """Loads text documents and splits them into chunks for downstream use."""

    def __init__(self, chunk_size: Optional[int] = None, chunk_overlap: Optional[int] = None):
        config = Config.get_doc_processing_config()
        self.chunk_size = chunk_size or config['chunk_size']
        self.chunk_overlap = chunk_overlap or config['chunk_overlap']
        self.text_splitter = RecursiveCharacterTextSplitter(
            chunk_size=self.chunk_size,
            chunk_overlap=self.chunk_overlap,
            length_function=len,
            separators=["\n\n", "\n", " ", ""]
        )

    def load_document(self, file_path: str, encoding: Optional[str] = None) -> List[Document]:
        """Load a text file into a list of Documents."""
        try:
            config = Config.get_doc_processing_config()
            encoding = encoding or config['encoding']
            logger.info(f"Loading document from {file_path}")
            loader = TextLoader(file_path, encoding=encoding)
            documents = loader.load()
            logger.info(f"Successfully loaded {len(documents)} document(s)")
            return documents
        except Exception as e:
            logger.error(f"Error loading document from {file_path}: {e}")
            # Bare raise preserves the original traceback.
            raise

    def chunk_documents(self, documents: List[Document]) -> List[Document]:
        """Split loaded documents into overlapping chunks."""
        try:
            logger.info(f"Chunking {len(documents)} document(s)")
            chunks = self.text_splitter.split_documents(documents)
            logger.info(f"Successfully created {len(chunks)} chunk(s)")
            return chunks
        except Exception as e:
            logger.error(f"Error chunking documents: {e}")
            raise

    def process_document(self, file_path: str) -> List[Document]:
        """Load a file and return its chunks in one call."""
        try:
            documents = self.load_document(file_path)
            chunks = self.chunk_documents(documents)
            logger.info(f"Document processing completed: {len(chunks)} chunks created")
            return chunks
        except Exception as e:
            logger.error(f"Error processing document: {e}")
            raise

    def get_document_stats(self, chunks: List[Document]) -> dict:
        """Return basic size statistics for a list of chunks."""
        if not chunks:
            return {
                'total_chunks': 0,
                'total_characters': 0,
                'avg_chunk_size': 0,
                'min_chunk_size': 0,
                'max_chunk_size': 0
            }
        chunk_sizes = [len(chunk.page_content) for chunk in chunks]
        total_chars = sum(chunk_sizes)
        return {
            'total_chunks': len(chunks),
            'total_characters': total_chars,
            'avg_chunk_size': total_chars / len(chunks),
            'min_chunk_size': min(chunk_sizes),
            'max_chunk_size': max(chunk_sizes)
        }
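

# Hypothetical usage sketch (not part of the original module). It assumes
# Config.get_doc_processing_config() returns a dict with 'chunk_size',
# 'chunk_overlap', and 'encoding' keys, and that a plain-text file such as
# "sample.txt" exists for the caller. Kept in comments so importing the
# module has no side effects.
#
#     processor = DocumentProcessor(chunk_size=500, chunk_overlap=50)
#     chunks = processor.process_document("sample.txt")
#     stats = processor.get_document_stats(chunks)
#     print(stats['total_chunks'], stats['avg_chunk_size'])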