File size: 2,172 Bytes
c03c816
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
843c84e
c03c816
 
843c84e
 
c03c816
843c84e
c03c816
 
843c84e
 
c03c816
 
 
 
 
 
843c84e
 
c03c816
 
843c84e
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
"""
Document Chunking Module
Splits documents into smaller chunks optimized for embedding and retrieval.
"""

from langchain_text_splitters import RecursiveCharacterTextSplitter


def chunk_documents(documents, chunk_size=1000, chunk_overlap=200):
    """
    Break a list of documents into overlapping chunks suitable for embedding.

    Args:
        documents: List of LangChain Document objects to split.
        chunk_size: Upper bound on each chunk's length in characters (default: 1000).
        chunk_overlap: Characters shared between consecutive chunks (default: 200).

    Returns:
        list: Chunked Document objects; each chunk keeps its source metadata,
        plus a start index recorded by the splitter.
    """
    # Coarsest separator first; the splitter falls back to finer ones as needed.
    split_priority = [
        "\n\n",  # paragraph boundaries (preferred)
        "\n",    # line boundaries
        " ",     # word boundaries
        "",      # character-level fallback
    ]
    splitter = RecursiveCharacterTextSplitter(
        chunk_size=chunk_size,
        chunk_overlap=chunk_overlap,
        length_function=len,
        add_start_index=True,  # record each chunk's offset in its source document
        separators=split_priority,
    )
    return splitter.split_documents(documents)


if __name__ == "__main__":
    from pathlib import Path
    import sys
    
    # Add src to path so sibling packages (loader/) resolve when run directly.
    sys.path.insert(0, str(Path(__file__).parent.parent))
    
    from loader.ingest import load_upb_documents
    
    print(" Loading documents...\n")
    documents = load_upb_documents()
    
    print(f" Loaded {len(documents)} documents")
    print(f" Total characters: {sum(len(doc.page_content) for doc in documents):,}\n")
    
    print(" Chunking documents...")
    chunks = chunk_documents(documents)
    
    print(f"\n Created {len(chunks)} chunks")
    # Guard the integer division: an empty corpus would otherwise raise
    # ZeroDivisionError here.
    if chunks:
        print(f" Average chunk size: {sum(len(c.page_content) for c in chunks) // len(chunks):,} characters")
    else:
        print(" Average chunk size: 0 characters")

    # Tally chunks per source category (metadata set by the loader;
    # 'unknown' for documents without one).
    chunk_categories = {}
    for chunk in chunks:
        cat = chunk.metadata.get('category', 'unknown')
        chunk_categories[cat] = chunk_categories.get(cat, 0) + 1

    print("\nChunks by category:")
    for cat, count in sorted(chunk_categories.items()):
        print(f"  - {cat}: {count} chunks")

    print("\nChunks ready for embedding!")