File size: 2,514 Bytes
939a9f4
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
"""
Chunker module
--------------
Purpose: Split text into smaller chunks.
"""

from typing import List, Dict
from dataclasses import dataclass

@dataclass
class Chunk:
    """
    One window of whitespace-separated words cut from a larger text.

    Attributes:
        text: The words of this chunk joined by single spaces.
        chunk_id: Zero-based position of this chunk in the list it was
            returned in.
        start_idx: Index of the chunk's first word within the source
            text's word list (a word offset, not a character offset).
        word_count: Number of words contained in ``text``.
    """
    text: str
    chunk_id: int
    start_idx: int
    word_count: int


def chunk_text(text: str, chunk_size: int = 500, overlap: int = 50) -> List[Chunk]:
    """
    Split text into overlapping chunks of whitespace-separated words.

    The text is split with ``str.split()``, so any run of whitespace is a
    separator and the original spacing is not preserved: each chunk's text
    is its words re-joined with single spaces. Consecutive chunks start
    ``chunk_size - overlap`` words apart.

    Args:
        text (str): The text to split into chunks.
        chunk_size (int): Maximum number of words per chunk. Must be >= 1.
        overlap (int): Number of words shared between one chunk and the
            next. Must satisfy ``0 <= overlap < chunk_size``.

    Returns:
        List[Chunk]: The chunks in document order. Empty if ``text``
        contains no words. The last chunk may be shorter than
        ``chunk_size``.

    Raises:
        ValueError: If ``chunk_size`` is not positive, or ``overlap`` is
            negative or not smaller than ``chunk_size``.
    """
    # Validate up front: a zero stride makes range() fail with an unrelated
    # message, a negative stride silently yields no chunks at all, and a
    # negative overlap would silently skip words between chunks.
    if chunk_size <= 0:
        raise ValueError(f"chunk_size must be positive, got {chunk_size}")
    if overlap < 0 or overlap >= chunk_size:
        raise ValueError(
            f"overlap must satisfy 0 <= overlap < chunk_size, "
            f"got overlap={overlap} with chunk_size={chunk_size}"
        )

    words = text.split()
    stride = chunk_size - overlap
    chunks: List[Chunk] = []

    for start in range(0, len(words), stride):
        window = words[start:start + chunk_size]
        chunks.append(
            Chunk(
                text=' '.join(window),
                chunk_id=len(chunks),
                start_idx=start,
                word_count=len(window),
            )
        )
        # Once a window reaches the end of the text, any further window
        # would contain only words already present in this one, so stop
        # instead of emitting a redundant trailing chunk.
        if start + chunk_size >= len(words):
            break

    return chunks


def chunk_documents(
    documents: Dict[str, str],
    chunk_size: int = 500,
    overlap: int = 50
) -> Dict[str, List[Chunk]]:
    """
    Run ``chunk_text`` over every document in a mapping.

    Args:
        documents: Dict of {doc_id: text}
        chunk_size: Chunk size passed through to ``chunk_text``
        overlap: Overlap passed through to ``chunk_text``

    Returns:
        Dict of {doc_id: [chunks]} with one entry per input document,
        in the same key order as ``documents``.

    Example:
        >>> docs = {"doc1": "Text 1", "doc2": "Text 2"}
        >>> chunked = chunk_documents(docs)
        >>> "doc1" in chunked
        True
    """
    return {
        name: chunk_text(body, chunk_size, overlap)
        for name, body in documents.items()
    }

if __name__ == "__main__":
    # Demo: chunk a short sample paragraph and print a one-line summary
    # (id, word count, first 60 characters) for each resulting chunk.
    sample = """
    Machine Learning is a subset of artificial intelligence that involves training models to make predictions or decisions based on data. It is a powerful tool for solving a wide range of problems, from image recognition to natural language processing. In this article, we will explore the basics of machine learning and how it can be used to solve real-world problems.
    """

    pieces = chunk_text(sample, chunk_size=50, overlap=10)
    print(f"Split into {len(pieces)} chunks:")
    for piece in pieces:
        preview = piece.text[:60]
        print(f"  Chunk {piece.chunk_id}: {piece.word_count} words | {preview}...")