Tuminha commited on
Commit
9901473
·
verified ·
1 Parent(s): 150db75

Upload src/chunk.py with huggingface_hub

Browse files
Files changed (1) hide show
  1. src/chunk.py +140 -0
src/chunk.py ADDED
@@ -0,0 +1,140 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ Paragraphization and fixed-size character chunking with overlap.
3
+ """
4
+ from typing import List, Dict
5
+
6
+
7
def split_into_paragraphs(cleaned: str) -> list:
    """
    Split cleaned text into paragraphs.

    Splits on double newlines, strips surrounding whitespace from each
    fragment, and drops fragments that are empty after stripping.

    # Acceptance:
    # - Returns a list of paragraph strings.
    """
    stripped = (fragment.strip() for fragment in cleaned.split('\n\n'))
    return [fragment for fragment in stripped if fragment]
19
+
20
+
21
def chunk_paragraphs(paragraphs: list, size: int, overlap: int, book: str) -> List[Dict]:
    """
    Make fixed-size chunks with overlap; attach source metadata.

    Paragraphs are accumulated (joined with a blank-line separator) until
    a chunk reaches at least ``size`` characters.  When ``overlap`` > 0,
    the next chunk starts at the earliest trailing paragraph of the current
    chunk whose suffix covers at least ``overlap`` characters, so
    consecutive chunks share context for retrieval.

    The start index always moves strictly forward, so termination is
    guaranteed by construction.  (The previous slide-back logic refused
    single-paragraph overlaps via its ``next_start_idx < para_end_idx``
    guard and needed explicit stuck-detection patches to avoid infinite
    loops; both problems are gone here.)

    Args:
        paragraphs: paragraph strings, in document order.
        size: target minimum chunk size in characters.
        overlap: desired overlapping characters between consecutive
            chunks (0 disables overlap).
        book: source identifier, embedded in each chunk id for citations.

    Returns:
        List of dicts:
        {id, text, meta: {book, para_idx_start, para_idx_end, char_count}}.
    """
    chunks: List[Dict] = []
    n = len(paragraphs)
    start = 0

    print(f"📚 Chunking '{book}': {n} paragraphs, size={size}, overlap={overlap}")

    while start < n:
        # Accumulate whole paragraphs until the chunk reaches the target size.
        parts = []
        length = 0
        end = start
        while end < n and length < size:
            para = paragraphs[end]
            # The "\n\n" separator adds 2 characters between paragraphs.
            length += len(para) + (2 if parts else 0)
            parts.append(para)
            end += 1

        # size <= 0 accumulates nothing; stop rather than loop forever.
        if not parts:
            break

        text = "\n\n".join(parts)
        chunks.append({
            'id': f'{book}_chunk_{len(chunks)}',
            'text': text,
            'meta': {
                'book': book,
                'para_idx_start': start,
                'para_idx_end': end - 1,
                'char_count': len(text),
            },
        })

        if end >= n:
            break

        # Choose the next start: walk backwards over the chunk's trailing
        # paragraphs until they cover >= `overlap` characters.  `idx` never
        # reaches `start`, so `next_start > start` — strict forward progress.
        next_start = end
        if overlap > 0:
            covered = 0
            for idx in range(end - 1, start, -1):
                covered += len(paragraphs[idx]) + (2 if covered else 0)
                if covered >= overlap:
                    next_start = idx
                    break
        start = next_start

    print(f"✅ Created {len(chunks)} chunks from '{book}'")
    return chunks