File size: 6,011 Bytes
9901473
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
"""
Paragraphization and fixed-size character chunking with overlap.
"""
from typing import List, Dict


def split_into_paragraphs(cleaned: str) -> list:
    """
    Break cleaned text into a list of paragraph strings.

    The text is cut at every blank-line boundary (two consecutive newline
    characters). Each fragment has its surrounding whitespace removed, and
    fragments that are empty after stripping are discarded.

    Args:
        cleaned: The text to split.

    Returns:
        The non-empty, stripped paragraphs in their original order.
    """
    stripped_fragments = map(str.strip, cleaned.split('\n\n'))
    # Keep only fragments that still contain something after stripping.
    return [fragment for fragment in stripped_fragments if fragment]


# Separator placed between paragraphs when they are joined into a chunk.
_PARAGRAPH_SEPARATOR = "\n\n"


def _next_chunk_start(paragraphs: list, first_idx: int, last_idx: int,
                      size: int, overlap: int) -> int:
    """
    Pick the paragraph index at which the chunk following
    ``paragraphs[first_idx..last_idx]`` should begin.

    Walks backwards from the chunk's last paragraph, summing paragraph
    lengths (plus the separator length between paragraphs), until at least
    ``overlap`` characters are covered. The paragraph reached becomes the
    start of the next chunk, so it is shared by both chunks.

    Returns ``last_idx + 1`` (no overlap) when:
      * ``overlap`` is not positive,
      * the overlap could only be satisfied by going back to ``first_idx``
        (restarting there would repeat the same chunk), or
      * the shared tail would already reach ``size`` characters (the next
        chunk would then be wholly contained in the current one).
    """
    no_overlap = last_idx + 1
    if overlap <= 0:
        return no_overlap

    covered = 0
    # first_idx itself is excluded: the next chunk must start strictly later
    # than the current one so that every chunk makes forward progress.
    for idx in range(last_idx, first_idx, -1):
        covered += len(paragraphs[idx])
        if idx < last_idx:
            covered += len(_PARAGRAPH_SEPARATOR)
        if covered >= size:
            return no_overlap
        if covered >= overlap:
            return idx
    return no_overlap


def chunk_paragraphs(paragraphs: list, size: int, overlap: int, book: str) -> List[Dict]:
    """
    Group paragraphs into chunks of roughly ``size`` characters, with
    paragraph-aligned overlap between consecutive chunks.

    Whole paragraphs are joined with a blank line until the chunk text is at
    least ``size`` characters long (the final chunk may be shorter). When
    ``overlap`` is positive, the next chunk starts at the paragraph that
    contains the beginning of the last ``overlap`` characters of the current
    chunk, so trailing paragraphs are repeated; paragraphs are never split.
    No overlap is produced when it would require restarting at the chunk's
    first paragraph or when the repeated tail alone would reach ``size``.

    Progress is reported on stdout via ``print``.

    Args:
        paragraphs: Paragraph strings, in document order.
        size: Target minimum chunk length in characters. With ``size <= 0``
            no chunks are produced.
        overlap: Desired number of trailing characters to share with the
            next chunk; values ``<= 0`` disable overlap.
        book: Source name, used in chunk ids and metadata.

    Returns:
        A list of dicts of the form
        ``{'id': '<book>_chunk_<n>', 'text': str,
           'meta': {'book', 'para_idx_start', 'para_idx_end', 'char_count'}}``
        where the paragraph indices are inclusive positions in ``paragraphs``.
    """
    total_paragraphs = len(paragraphs)
    print(f"📚 Chunking '{book}': {total_paragraphs} paragraphs, size={size}, overlap={overlap}")

    chunks: List[Dict] = []
    start = 0
    while start < total_paragraphs:
        # Collect whole paragraphs until the target size is reached.
        parts = []
        text_len = 0
        cursor = start
        while cursor < total_paragraphs and text_len < size:
            para = paragraphs[cursor]
            if text_len:
                parts.append(para)
                text_len += len(_PARAGRAPH_SEPARATOR) + len(para)
            else:
                # Only empty paragraphs (or nothing) so far: begin the text
                # here so the chunk never starts with a dangling separator.
                parts = [para]
                text_len = len(para)
            cursor += 1

        if not text_len:
            # Either size <= 0 or only empty paragraphs remained.
            break

        chunk_text = _PARAGRAPH_SEPARATOR.join(parts)
        chunk_id = len(chunks)
        last_idx = cursor - 1

        chunks.append({
            'id': f'{book}_chunk_{chunk_id}',
            'text': chunk_text,
            'meta': {
                'book': book,
                'para_idx_start': start,
                'para_idx_end': last_idx,
                'char_count': len(chunk_text),
            },
        })

        progress_pct = (cursor / total_paragraphs) * 100
        print(f"  Chunk {chunk_id}: paras {start}-{last_idx}, {len(chunk_text)} chars ({progress_pct:.1f}% complete)")

        if cursor < total_paragraphs:
            # Always returns an index > start, so the loop terminates.
            start = _next_chunk_start(paragraphs, start, last_idx, size, overlap)
        else:
            start = cursor

    print(f"✅ Created {len(chunks)} chunks from '{book}'")
    return chunks