Spaces:
No application file
No application file
File size: 6,880 Bytes
b325aad |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 |
import os
from typing import List, Union
from pathlib import Path
# LangChain imports
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_community.document_loaders import (
PyPDFLoader,
Docx2txtLoader,
TextLoader,
UnstructuredMarkdownLoader
)
from langchain.schema import Document
class DocumentChunker:
    """
    Read PDF, DOCX, TXT and Markdown files and split them into text chunks.

    Loading is delegated to the LangChain community loaders and splitting to
    ``RecursiveCharacterTextSplitter``. Failures (missing file, unsupported
    extension, loader error) are reported with ``print`` and turned into an
    empty list, so processing a batch of files is not aborted by one bad
    file.
    """

    # Lower-cased file suffix -> name of the reader method handling it.
    # Method names (not functions) are stored so that a subclass overriding
    # a reader is still picked up by ``load_document``.
    _READERS = {
        ".pdf": "read_pdf",
        ".docx": "read_docx",
        ".txt": "read_txt",
        ".md": "read_md",
    }

    def __init__(self, chunk_size: int = 1000, chunk_overlap: int = 200):
        """
        Initialize the DocumentChunker

        Args:
            chunk_size (int): Size of each chunk in characters
            chunk_overlap (int): Number of characters to overlap between chunks
        """
        self.chunk_size = chunk_size
        self.chunk_overlap = chunk_overlap
        # Lengths are measured with ``len`` (characters). Separators are
        # listed as: blank line, newline, space, then the empty string.
        self.text_splitter = RecursiveCharacterTextSplitter(
            chunk_size=chunk_size,
            chunk_overlap=chunk_overlap,
            length_function=len,
            separators=["\n\n", "\n", " ", ""]
        )

    def _load_with(self, make_loader, label: str,
                   file_path: Union[str, Path]) -> "List[Document]":
        """
        Build a loader for ``file_path`` and load it, reporting any failure.

        Args:
            make_loader: Callable that takes the path as a string and returns
                an object with a ``load()`` method
            label (str): File kind shown in the error message (e.g. "PDF")
            file_path: Path to the file, as a string or path-like object

        Returns:
            List[Document]: Loaded documents, or an empty list on any error
        """
        try:
            # Loaders are handed a plain string so path-like inputs behave
            # exactly like string inputs.
            return make_loader(os.fspath(file_path)).load()
        except Exception as e:
            # Deliberate best-effort boundary: report and continue.
            print(f"Error reading {label} file {file_path}: {e}")
            return []

    def read_pdf(self, file_path: Union[str, Path]) -> "List[Document]":
        """Read PDF file and return documents (empty list on error)"""
        return self._load_with(PyPDFLoader, "PDF", file_path)

    def read_docx(self, file_path: Union[str, Path]) -> "List[Document]":
        """Read DOCX file and return documents (empty list on error)"""
        return self._load_with(Docx2txtLoader, "DOCX", file_path)

    def read_txt(self, file_path: Union[str, Path]) -> "List[Document]":
        """Read TXT file as UTF-8 and return documents (empty list on error)"""
        return self._load_with(
            lambda path: TextLoader(path, encoding='utf-8'), "TXT", file_path
        )

    def read_md(self, file_path: Union[str, Path]) -> "List[Document]":
        """Read Markdown file and return documents (empty list on error)"""
        return self._load_with(UnstructuredMarkdownLoader, "MD", file_path)

    def load_document(self, file_path: Union[str, Path]) -> "List[Document]":
        """
        Load document based on file extension

        The extension is compared case-insensitively. Supported extensions
        are .pdf, .docx, .txt and .md; anything else is reported and yields
        an empty list.

        Args:
            file_path: Path to the document file (string or path-like)

        Returns:
            List[Document]: List of loaded documents
        """
        file_extension = Path(file_path).suffix.lower()
        reader_name = self._READERS.get(file_extension)
        if reader_name is None:
            print(f"Unsupported file type: {file_extension}")
            return []
        return getattr(self, reader_name)(file_path)

    def chunk_documents(self, documents: "List[Document]") -> List[str]:
        """
        Chunk documents and return list of strings

        Only the ``page_content`` of each chunk is returned.

        Args:
            documents (List[Document]): List of documents to chunk

        Returns:
            List[str]: List of chunked text strings
        """
        if not documents:
            return []
        chunks = self.text_splitter.split_documents(documents)
        return [chunk.page_content for chunk in chunks]

    def process_file(self, file_path: Union[str, Path]) -> List[str]:
        """
        Process a single file: load and chunk it

        Args:
            file_path: Path to the file to process (string or path-like)

        Returns:
            List[str]: List of chunked text strings; empty when the path is
            not an existing regular file or nothing could be loaded
        """
        # isfile (not exists): a directory is not something a loader can
        # read, so it is rejected here instead of failing further down.
        if not os.path.isfile(file_path):
            print(f"File not found: {file_path}")
            return []

        documents = self.load_document(file_path)
        if not documents:
            print(f"No content loaded from {file_path}")
            return []

        chunks = self.chunk_documents(documents)
        print(f"Successfully processed {file_path}: {len(chunks)} chunks created")
        return chunks

    def process_multiple_files(self, file_paths: List[Union[str, Path]]) -> List[str]:
        """
        Process multiple files and return combined chunks

        Files are processed in the given order; a file that yields nothing
        contributes no chunks.

        Args:
            file_paths: List of file paths to process (strings or path-like)

        Returns:
            List[str]: Combined list of chunked text strings
        """
        all_chunks: List[str] = []
        for file_path in file_paths:
            all_chunks.extend(self.process_file(file_path))
        return all_chunks
# Example usage and utility functions
def main():
    """
    Demonstrate DocumentChunker on one file and then on a batch of files.

    Prints the chunk count and a preview (at most 200 characters) of the
    first chunk for the single file, then the total chunk count for the
    batch.

    Returns:
        List[str]: Chunks produced by the batch run
    """
    # Chunker configured with non-default size and overlap
    demo_chunker = DocumentChunker(chunk_size=800, chunk_overlap=100)

    # Single-file run
    single_path = "example.pdf"  # Replace with your file path
    single_chunks = demo_chunker.process_file(single_path)
    if single_chunks:
        first_chunk = single_chunks[0]
        print(f"Total chunks: {len(single_chunks)}")
        print("\nFirst chunk preview:")
        # Long chunks are cut to 200 characters and marked with an ellipsis
        if len(first_chunk) > 200:
            print(first_chunk[:200] + "...")
        else:
            print(first_chunk)

    # Batch run over one file of each supported type
    batch_paths = [
        "document1.pdf",
        "document2.docx",
        "document3.txt",
        "document4.md",
    ]
    batch_chunks = demo_chunker.process_multiple_files(batch_paths)
    print(f"\nTotal chunks from all files: {len(batch_chunks)}")
    return batch_chunks
def create_chunker_with_custom_settings(chunk_size: int = 1000,
                                        chunk_overlap: int = 200) -> DocumentChunker:
    """
    Build a DocumentChunker from the given chunking parameters

    Args:
        chunk_size (int): Size of each chunk
        chunk_overlap (int): Overlap between chunks

    Returns:
        DocumentChunker: Chunker constructed with these two settings
    """
    settings = {"chunk_size": chunk_size, "chunk_overlap": chunk_overlap}
    return DocumentChunker(**settings)
# Run the example only when this file is executed as a script.
if __name__ == "__main__":
    main()