Spaces:
No application file
No application file
| import os | |
| from typing import List, Union | |
| from pathlib import Path | |
| # LangChain imports | |
| from langchain.text_splitter import RecursiveCharacterTextSplitter | |
| from langchain_community.document_loaders import ( | |
| PyPDFLoader, | |
| Docx2txtLoader, | |
| TextLoader, | |
| UnstructuredMarkdownLoader | |
| ) | |
| from langchain.schema import Document | |
| class DocumentChunker: | |
| """ | |
| A class to read various document types and chunk them using LangChain | |
| """ | |
| def __init__(self, chunk_size: int = 1000, chunk_overlap: int = 200): | |
| """ | |
| Initialize the DocumentChunker | |
| Args: | |
| chunk_size (int): Size of each chunk in characters | |
| chunk_overlap (int): Number of characters to overlap between chunks | |
| """ | |
| self.chunk_size = chunk_size | |
| self.chunk_overlap = chunk_overlap | |
| self.text_splitter = RecursiveCharacterTextSplitter( | |
| chunk_size=chunk_size, | |
| chunk_overlap=chunk_overlap, | |
| length_function=len, | |
| separators=["\n\n", "\n", " ", ""] | |
| ) | |
| def read_pdf(self, file_path: str) -> List[Document]: | |
| """Read PDF file and return documents""" | |
| try: | |
| loader = PyPDFLoader(file_path) | |
| documents = loader.load() | |
| return documents | |
| except Exception as e: | |
| print(f"Error reading PDF file {file_path}: {e}") | |
| return [] | |
| def read_docx(self, file_path: str) -> List[Document]: | |
| """Read DOCX file and return documents""" | |
| try: | |
| loader = Docx2txtLoader(file_path) | |
| documents = loader.load() | |
| return documents | |
| except Exception as e: | |
| print(f"Error reading DOCX file {file_path}: {e}") | |
| return [] | |
| def read_txt(self, file_path: str) -> List[Document]: | |
| """Read TXT file and return documents""" | |
| try: | |
| loader = TextLoader(file_path, encoding='utf-8') | |
| documents = loader.load() | |
| return documents | |
| except Exception as e: | |
| print(f"Error reading TXT file {file_path}: {e}") | |
| return [] | |
| def read_md(self, file_path: str) -> List[Document]: | |
| """Read Markdown file and return documents""" | |
| try: | |
| loader = UnstructuredMarkdownLoader(file_path) | |
| documents = loader.load() | |
| return documents | |
| except Exception as e: | |
| print(f"Error reading MD file {file_path}: {e}") | |
| return [] | |
| def load_document(self, file_path: str) -> List[Document]: | |
| """ | |
| Load document based on file extension | |
| Args: | |
| file_path (str): Path to the document file | |
| Returns: | |
| List[Document]: List of loaded documents | |
| """ | |
| file_extension = Path(file_path).suffix.lower() | |
| if file_extension == '.pdf': | |
| return self.read_pdf(file_path) | |
| elif file_extension == '.docx': | |
| return self.read_docx(file_path) | |
| elif file_extension == '.txt': | |
| return self.read_txt(file_path) | |
| elif file_extension == '.md': | |
| return self.read_md(file_path) | |
| else: | |
| print(f"Unsupported file type: {file_extension}") | |
| return [] | |
| def chunk_documents(self, documents: List[Document]) -> List[str]: | |
| """ | |
| Chunk documents and return list of strings | |
| Args: | |
| documents (List[Document]): List of documents to chunk | |
| Returns: | |
| List[str]: List of chunked text strings | |
| """ | |
| if not documents: | |
| return [] | |
| # Split documents into chunks | |
| chunks = self.text_splitter.split_documents(documents) | |
| # Extract text content from chunks | |
| chunk_texts = [chunk.page_content for chunk in chunks] | |
| return chunk_texts | |
| def process_file(self, file_path: str) -> List[str]: | |
| """ | |
| Process a single file: load and chunk it | |
| Args: | |
| file_path (str): Path to the file to process | |
| Returns: | |
| List[str]: List of chunked text strings | |
| """ | |
| if not os.path.exists(file_path): | |
| print(f"File not found: {file_path}") | |
| return [] | |
| # Load document | |
| documents = self.load_document(file_path) | |
| if not documents: | |
| print(f"No content loaded from {file_path}") | |
| return [] | |
| # Chunk documents | |
| chunks = self.chunk_documents(documents) | |
| print(f"Successfully processed {file_path}: {len(chunks)} chunks created") | |
| return chunks | |
| def process_multiple_files(self, file_paths: List[str]) -> List[str]: | |
| """ | |
| Process multiple files and return combined chunks | |
| Args: | |
| file_paths (List[str]): List of file paths to process | |
| Returns: | |
| List[str]: Combined list of chunked text strings | |
| """ | |
| all_chunks = [] | |
| for file_path in file_paths: | |
| chunks = self.process_file(file_path) | |
| all_chunks.extend(chunks) | |
| return all_chunks | |
| # Example usage and utility functions | |
| def main(): | |
| """Example usage of the DocumentChunker class""" | |
| # Initialize chunker with custom parameters | |
| chunker = DocumentChunker(chunk_size=800, chunk_overlap=100) | |
| # Example: Process a single file | |
| file_path = "example.pdf" # Replace with your file path | |
| chunks = chunker.process_file(file_path) | |
| if chunks: | |
| print(f"Total chunks: {len(chunks)}") | |
| print("\nFirst chunk preview:") | |
| print(chunks[0][:200] + "..." if len(chunks[0]) > 200 else chunks[0]) | |
| # Example: Process multiple files | |
| file_paths = [ | |
| "document1.pdf", | |
| "document2.docx", | |
| "document3.txt", | |
| "document4.md" | |
| ] | |
| all_chunks = chunker.process_multiple_files(file_paths) | |
| print(f"\nTotal chunks from all files: {len(all_chunks)}") | |
| return all_chunks | |
| def create_chunker_with_custom_settings(chunk_size: int = 1000, | |
| chunk_overlap: int = 200) -> DocumentChunker: | |
| """ | |
| Create a DocumentChunker with custom settings | |
| Args: | |
| chunk_size (int): Size of each chunk | |
| chunk_overlap (int): Overlap between chunks | |
| Returns: | |
| DocumentChunker: Configured chunker instance | |
| """ | |
| return DocumentChunker(chunk_size=chunk_size, chunk_overlap=chunk_overlap) | |
| if __name__ == "__main__": | |
| main() |