"""
Document Chunking Module
Splits documents into smaller chunks optimized for embedding and retrieval.
"""
from langchain_text_splitters import RecursiveCharacterTextSplitter
def chunk_documents(documents, chunk_size=1000, chunk_overlap=200):
    """
    Break a list of documents into overlapping chunks suitable for embedding.

    Args:
        documents: List of LangChain Document objects
        chunk_size: Maximum size of each chunk in characters (default: 1000)
        chunk_overlap: Number of characters to overlap between chunks (default: 200)

    Returns:
        list: List of chunked Document objects with preserved metadata
    """
    # Prefer breaking at paragraph boundaries, then lines, then words,
    # falling back to individual characters when nothing else fits.
    boundary_preferences = ["\n\n", "\n", " ", ""]
    splitter = RecursiveCharacterTextSplitter(
        chunk_size=chunk_size,
        chunk_overlap=chunk_overlap,
        length_function=len,
        add_start_index=True,  # record each chunk's offset in its source doc
        separators=boundary_preferences,
    )
    return splitter.split_documents(documents)
if __name__ == "__main__":
    from pathlib import Path
    import sys

    # Make the package root importable when this module is run as a script.
    sys.path.insert(0, str(Path(__file__).parent.parent))
    from loader.ingest import load_upb_documents

    print(" Loading documents...\n")
    documents = load_upb_documents()
    print(f" Loaded {len(documents)} documents")
    print(f" Total characters: {sum(len(doc.page_content) for doc in documents):,}\n")

    print(" Chunking documents...")
    chunks = chunk_documents(documents)
    print(f"\n Created {len(chunks)} chunks")
    # Guard against an empty corpus: integer division by zero otherwise.
    if chunks:
        avg_size = sum(len(c.page_content) for c in chunks) // len(chunks)
        print(f" Average chunk size: {avg_size:,} characters")
    else:
        print(" No chunks produced.")

    # Tally chunks by the 'category' metadata field carried over from loading.
    chunk_categories = {}
    for chunk in chunks:
        cat = chunk.metadata.get('category', 'unknown')
        chunk_categories[cat] = chunk_categories.get(cat, 0) + 1
    print("\nChunks by category:")
    for cat, count in sorted(chunk_categories.items()):
        print(f" - {cat}: {count} chunks")
    print("\nChunks ready for embedding!")