Spaces:

Alamgirapi
/

Try

No application file

App Files Files Community

Try / src /agenticRAG /components /document_parsing.py

Alamgirapi

Upload folder src

b325aad verified 6 months ago

raw

history blame contribute delete

6.88 kB

	import os
	from typing import List, Union
	from pathlib import Path

	# LangChain imports
	from langchain.text_splitter import RecursiveCharacterTextSplitter
	from langchain_community.document_loaders import (
	PyPDFLoader,
	Docx2txtLoader,
	TextLoader,
	UnstructuredMarkdownLoader
	)
	from langchain.schema import Document

	class DocumentChunker:
	    """
	    Read PDF, DOCX, TXT and Markdown files with LangChain loaders and split
	    their text into overlapping chunks.

	    Loading is best-effort: a path that is not a file, has an unsupported
	    extension, or fails to parse is reported with ``print`` and yields an
	    empty list instead of raising.
	    """

	    def __init__(self, chunk_size: int = 1000, chunk_overlap: int = 200):
	        """
	        Initialize the DocumentChunker

	        Args:
	            chunk_size (int): Size of each chunk in characters (measured
	                with ``len``)
	            chunk_overlap (int): Number of characters to overlap between
	                chunks
	        """
	        self.chunk_size = chunk_size
	        self.chunk_overlap = chunk_overlap
	        self.text_splitter = RecursiveCharacterTextSplitter(
	            chunk_size=chunk_size,
	            chunk_overlap=chunk_overlap,
	            length_function=len,
	            separators=["\n\n", "\n", " ", ""],
	        )

	    def _load(
	        self,
	        loader_cls,
	        file_path: Union[str, Path],
	        label: str,
	        **loader_kwargs,
	    ) -> List[Document]:
	        """
	        Build ``loader_cls`` for ``file_path`` and return what it loads.

	        Shared implementation behind the ``read_*`` methods.

	        Args:
	            loader_cls: Loader class (or factory) called with the path
	            file_path (Union[str, Path]): Path to the file to read
	            label (str): File-type name used in the error message
	            **loader_kwargs: Extra keyword arguments for ``loader_cls``

	        Returns:
	            List[Document]: Loaded documents, or ``[]`` if anything failed
	        """
	        try:
	            # Loaders are handed a plain string so that Path inputs work
	            # regardless of what the loader itself accepts.
	            loader = loader_cls(os.fspath(file_path), **loader_kwargs)
	            return loader.load()
	        except Exception as e:
	            # Deliberate best-effort: report the failure and carry on so one
	            # bad file does not abort a batch.
	            print(f"Error reading {label} file {file_path}: {e}")
	            return []

	    def read_pdf(self, file_path: Union[str, Path]) -> List[Document]:
	        """Read PDF file and return documents ([] on failure)"""
	        return self._load(PyPDFLoader, file_path, "PDF")

	    def read_docx(self, file_path: Union[str, Path]) -> List[Document]:
	        """Read DOCX file and return documents ([] on failure)"""
	        return self._load(Docx2txtLoader, file_path, "DOCX")

	    def read_txt(self, file_path: Union[str, Path]) -> List[Document]:
	        """Read UTF-8 encoded TXT file and return documents ([] on failure)"""
	        return self._load(TextLoader, file_path, "TXT", encoding='utf-8')

	    def read_md(self, file_path: Union[str, Path]) -> List[Document]:
	        """Read Markdown file and return documents ([] on failure)"""
	        return self._load(UnstructuredMarkdownLoader, file_path, "MD")

	    def load_document(self, file_path: Union[str, Path]) -> List[Document]:
	        """
	        Load document based on file extension

	        The extension is compared case-insensitively. Supported extensions
	        are ``.pdf``, ``.docx``, ``.txt`` and ``.md``.

	        Args:
	            file_path (Union[str, Path]): Path to the document file

	        Returns:
	            List[Document]: List of loaded documents; ``[]`` when the
	                extension is unsupported or reading failed
	        """
	        file_extension = Path(file_path).suffix.lower()

	        # Bound methods are looked up on self so subclass overrides of the
	        # individual readers are honoured.
	        readers = {
	            '.pdf': self.read_pdf,
	            '.docx': self.read_docx,
	            '.txt': self.read_txt,
	            '.md': self.read_md,
	        }
	        reader = readers.get(file_extension)
	        if reader is None:
	            print(f"Unsupported file type: {file_extension}")
	            return []
	        return reader(file_path)

	    def chunk_documents(self, documents: List[Document]) -> List[str]:
	        """
	        Chunk documents and return list of strings

	        Args:
	            documents (List[Document]): List of documents to chunk

	        Returns:
	            List[str]: ``page_content`` of every chunk, in splitter order;
	                ``[]`` when ``documents`` is empty
	        """
	        if not documents:
	            return []

	        chunks = self.text_splitter.split_documents(documents)
	        return [chunk.page_content for chunk in chunks]

	    def process_file(self, file_path: Union[str, Path]) -> List[str]:
	        """
	        Process a single file: load and chunk it

	        Args:
	            file_path (Union[str, Path]): Path to the file to process

	        Returns:
	            List[str]: List of chunked text strings; ``[]`` when the path
	                is not an existing file or nothing could be loaded
	        """
	        # isfile (not exists) so that directories are rejected here instead
	        # of falling through to the extension check with an empty suffix.
	        if not os.path.isfile(file_path):
	            print(f"File not found: {file_path}")
	            return []

	        documents = self.load_document(file_path)
	        if not documents:
	            print(f"No content loaded from {file_path}")
	            return []

	        chunks = self.chunk_documents(documents)

	        print(f"Successfully processed {file_path}: {len(chunks)} chunks created")
	        return chunks

	    def process_multiple_files(self, file_paths: List[Union[str, Path]]) -> List[str]:
	        """
	        Process multiple files and return combined chunks

	        Files are processed in the order given; files that fail contribute
	        no chunks.

	        Args:
	            file_paths (List[Union[str, Path]]): File paths to process

	        Returns:
	            List[str]: Combined list of chunked text strings
	        """
	        all_chunks: List[str] = []

	        for file_path in file_paths:
	            all_chunks.extend(self.process_file(file_path))

	        return all_chunks


	# Example usage and utility functions
	def main():
	    """Demonstrate DocumentChunker on one sample file, then on a batch.

	    Returns:
	        List[str]: Chunks produced from the batch of sample files
	    """
	    demo_chunker = DocumentChunker(chunk_size=800, chunk_overlap=100)

	    # Single-file walkthrough; replace the path with a real document.
	    single_file_chunks = demo_chunker.process_file("example.pdf")

	    if single_file_chunks:
	        first_chunk = single_file_chunks[0]
	        # Show at most the first 200 characters of the first chunk.
	        if len(first_chunk) > 200:
	            preview = first_chunk[:200] + "..."
	        else:
	            preview = first_chunk

	        print(f"Total chunks: {len(single_file_chunks)}")
	        print("\nFirst chunk preview:")
	        print(preview)

	    # Batch walkthrough: one sample path per supported file type.
	    sample_paths = ["document1.pdf", "document2.docx", "document3.txt", "document4.md"]
	    combined_chunks = demo_chunker.process_multiple_files(sample_paths)
	    print(f"\nTotal chunks from all files: {len(combined_chunks)}")

	    return combined_chunks


	def create_chunker_with_custom_settings(
	    chunk_size: int = 1000,
	    chunk_overlap: int = 200,
	) -> DocumentChunker:
	    """
	    Build a DocumentChunker from the given chunking parameters

	    Args:
	        chunk_size (int): Size of each chunk
	        chunk_overlap (int): Overlap between chunks

	    Returns:
	        DocumentChunker: New chunker constructed with these settings
	    """
	    settings = {"chunk_size": chunk_size, "chunk_overlap": chunk_overlap}
	    return DocumentChunker(**settings)


	if __name__ == "__main__":
	    # Run the demo only when executed as a script, not when imported.
	    main()