# Spaces: Alamgirapi / Try
# No application file (Try)
# File size: 6,880 Bytes
# b325aad


import os
from typing import List, Union
from pathlib import Path

# LangChain imports
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_community.document_loaders import (
    PyPDFLoader,
    Docx2txtLoader,
    TextLoader,
    UnstructuredMarkdownLoader
)
from langchain.schema import Document

class DocumentChunker:
    """Read PDF, DOCX, TXT and Markdown files and split them into text chunks.

    Each format is read with the matching LangChain loader and the resulting
    documents are split with a ``RecursiveCharacterTextSplitter``.

    Every method is best-effort: problems are reported with ``print`` and
    yield an empty list rather than raising.
    """

    # Lower-cased file suffix -> name of the reader method that handles it.
    # Names (not bound methods) are stored so that subclasses overriding a
    # reader are picked up by ``load_document``.
    _READERS = {
        ".pdf": "read_pdf",
        ".docx": "read_docx",
        ".txt": "read_txt",
        ".md": "read_md",
    }

    def __init__(self, chunk_size: int = 1000, chunk_overlap: int = 200):
        """Initialize the DocumentChunker.

        Args:
            chunk_size (int): Size of each chunk in characters.
            chunk_overlap (int): Number of characters to overlap between chunks.
        """
        self.chunk_size = chunk_size
        self.chunk_overlap = chunk_overlap
        self.text_splitter = RecursiveCharacterTextSplitter(
            chunk_size=chunk_size,
            chunk_overlap=chunk_overlap,
            length_function=len,
            # Tried in order: paragraphs, lines, words, then single characters.
            separators=["\n\n", "\n", " ", ""],
        )

    def _read_with(self, loader_cls, kind: str, file_path: str,
                   **loader_kwargs) -> List[Document]:
        """Load ``file_path`` with ``loader_cls``, returning ``[]`` on any error.

        Args:
            loader_cls: Loader class; called as
                ``loader_cls(file_path, **loader_kwargs)`` and then ``.load()``.
            kind (str): Format label used in the error message (e.g. ``"PDF"``).
            file_path (str): Path to the file to read.
            **loader_kwargs: Extra keyword arguments for the loader.

        Returns:
            List[Document]: Loaded documents, or ``[]`` if loading failed.
        """
        try:
            return loader_cls(file_path, **loader_kwargs).load()
        except Exception as e:
            # Deliberately broad: one unreadable file must not abort a batch.
            print(f"Error reading {kind} file {file_path}: {e}")
            return []

    def read_pdf(self, file_path: str) -> List[Document]:
        """Read a PDF file and return its documents (``[]`` on failure)."""
        return self._read_with(PyPDFLoader, "PDF", file_path)

    def read_docx(self, file_path: str) -> List[Document]:
        """Read a DOCX file and return its documents (``[]`` on failure)."""
        return self._read_with(Docx2txtLoader, "DOCX", file_path)

    def read_txt(self, file_path: str) -> List[Document]:
        """Read a UTF-8 text file and return its documents (``[]`` on failure)."""
        return self._read_with(TextLoader, "TXT", file_path, encoding='utf-8')

    def read_md(self, file_path: str) -> List[Document]:
        """Read a Markdown file and return its documents (``[]`` on failure)."""
        return self._read_with(UnstructuredMarkdownLoader, "MD", file_path)

    def load_document(self, file_path: str) -> List[Document]:
        """Load a document with the reader selected by its file extension.

        The extension is matched case-insensitively against ``.pdf``,
        ``.docx``, ``.txt`` and ``.md``.

        Args:
            file_path (str): Path to the document file.

        Returns:
            List[Document]: Loaded documents, or ``[]`` when the extension is
            unsupported or the reader fails.
        """
        file_extension = Path(file_path).suffix.lower()
        reader_name = self._READERS.get(file_extension)

        if reader_name is None:
            print(f"Unsupported file type: {file_extension}")
            return []

        return getattr(self, reader_name)(file_path)

    def chunk_documents(self, documents: List[Document]) -> List[str]:
        """Split documents into chunks and return the chunk texts.

        Args:
            documents (List[Document]): Documents to chunk.

        Returns:
            List[str]: ``page_content`` of every chunk, in splitter order;
            ``[]`` when ``documents`` is empty.
        """
        if not documents:
            return []

        chunks = self.text_splitter.split_documents(documents)
        return [chunk.page_content for chunk in chunks]

    def process_file(self, file_path: str) -> List[str]:
        """Load a single file and chunk it.

        Args:
            file_path (str): Path to the file to process.

        Returns:
            List[str]: Chunk texts, or ``[]`` when the path does not exist, is
            not a regular file, has an unsupported type, or yields no content.
        """
        if not os.path.exists(file_path):
            print(f"File not found: {file_path}")
            return []

        # A directory passes the existence check but can never be loaded;
        # reject it here instead of surfacing a confusing loader error.
        if not os.path.isfile(file_path):
            print(f"Not a file: {file_path}")
            return []

        documents = self.load_document(file_path)

        if not documents:
            print(f"No content loaded from {file_path}")
            return []

        chunks = self.chunk_documents(documents)

        print(f"Successfully processed {file_path}: {len(chunks)} chunks created")
        return chunks

    def process_multiple_files(self, file_paths: Union[str, List[str]]) -> List[str]:
        """Process several files and return all their chunks combined.

        Args:
            file_paths (Union[str, List[str]]): File paths to process. A single
                path is accepted as well and treated as a one-element list.

        Returns:
            List[str]: Chunk texts of all files, in the order the files were
            given. Files that cannot be processed contribute nothing.
        """
        # Iterating a bare string would process it character by character.
        if isinstance(file_paths, (str, os.PathLike)):
            file_paths = [file_paths]

        all_chunks = []

        for file_path in file_paths:
            all_chunks.extend(self.process_file(file_path))

        return all_chunks


# Example usage and utility functions
def main():
    """Demonstrate DocumentChunker on one file and then on a batch of files.

    Prints a short preview of the first chunk of ``example.pdf`` (when it
    produced any chunks) and the total chunk count of the batch.

    Returns:
        The combined chunk list produced for the batch of example files.
    """
    demo_chunker = DocumentChunker(chunk_size=800, chunk_overlap=100)

    # Single-file example; replace the path with a real file.
    single_result = demo_chunker.process_file("example.pdf")

    if single_result:
        print(f"Total chunks: {len(single_result)}")
        print("\nFirst chunk preview:")
        first_chunk = single_result[0]
        # Show at most 200 characters, marking truncation with an ellipsis.
        if len(first_chunk) > 200:
            print(first_chunk[:200] + "...")
        else:
            print(first_chunk)

    # Batch example covering every supported format.
    batch_paths = ["document1.pdf", "document2.docx", "document3.txt", "document4.md"]
    batch_result = demo_chunker.process_multiple_files(batch_paths)
    print(f"\nTotal chunks from all files: {len(batch_result)}")

    return batch_result


def create_chunker_with_custom_settings(chunk_size: int = 1000,
                                       chunk_overlap: int = 200) -> DocumentChunker:
    """Build a DocumentChunker from the given chunking settings.

    Args:
        chunk_size (int): Size of each chunk.
        chunk_overlap (int): Overlap between chunks.

    Returns:
        DocumentChunker: Chunker constructed with these two settings.
    """
    settings = {"chunk_size": chunk_size, "chunk_overlap": chunk_overlap}
    return DocumentChunker(**settings)


# Run the example only when this file is executed as a script, not on import.
if __name__ == "__main__":
    main()