Spaces:
Sleeping
Sleeping
File size: 6,011 Bytes
9901473 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 |
"""
Paragraphization and fixed-size character chunking with overlap.
"""
import re
from typing import List, Dict
def split_into_paragraphs(cleaned: str, min_length: int = 1) -> List[str]:
    """
    Split cleaned text into paragraphs.

    A paragraph break is one or more blank lines. A line that contains only
    whitespace (spaces, tabs, a stray carriage return) counts as blank, so
    text with CRLF line endings or indented blank lines is split the same
    way as text that uses a bare double newline.

    Args:
        cleaned: The text to split. An empty or None value yields no
            paragraphs.
        min_length: Minimum length, in characters after stripping, that a
            fragment needs in order to be kept. The default of 1 drops only
            empty fragments; raise it to drop very short fragments. Values
            below 1 are treated as 1 so empty strings are never returned.

    Returns:
        The paragraphs in document order, each stripped of leading and
        trailing whitespace.
    """
    if not cleaned:
        return []

    threshold = max(min_length, 1)
    # \s* swallows whitespace-only lines and runs of extra newlines, so any
    # number of consecutive blank lines is treated as a single break.
    fragments = (part.strip() for part in re.split(r"\n\s*\n", cleaned))
    return [para for para in fragments if len(para) >= threshold]
def chunk_paragraphs(paragraphs: list, size: int, overlap: int, book: str) -> List[Dict]:
    """
    Group paragraphs into chunks of roughly ``size`` characters, with overlap.

    Whole paragraphs are appended, separated by a blank line, until the chunk
    text reaches ``size`` characters. Paragraphs are never cut, so a chunk
    can exceed ``size`` by up to one paragraph.

    When ``overlap`` is positive, the next chunk restarts at the latest
    paragraph from which the tail of the current chunk is at least
    ``overlap`` characters long. The restart is skipped when it would begin
    at the current chunk's first paragraph, or when that tail alone already
    holds ``size`` characters; in both cases the next chunk would contain
    nothing new. The start index therefore grows on every iteration, which
    guarantees termination.

    Progress messages are printed to stdout.

    Args:
        paragraphs: Paragraph strings in document order.
        size: Target chunk length in characters; must be positive.
        overlap: Minimum number of trailing characters to repeat at the
            start of the next chunk. Values <= 0 disable overlap.
        book: Source name, used in chunk ids and stored in the metadata.

    Returns:
        A list of dicts ``{'id', 'text', 'meta'}``. ``meta`` holds ``book``,
        ``para_idx_start`` and ``para_idx_end`` (inclusive paragraph
        indices), ``char_count`` (length of ``text``) and ``span``: the
        ``(start, end)`` character offsets, end exclusive, of the covered
        paragraphs within ``'\\n\\n'.join(paragraphs)``.

    Raises:
        ValueError: If ``size`` is not positive.
    """
    if size <= 0:
        raise ValueError(f"size must be a positive number of characters, got {size!r}")

    separator = "\n\n"
    total_paragraphs = len(paragraphs)

    # Start offset of each paragraph inside separator.join(paragraphs); used
    # to report every chunk's character span in the source text.
    offsets = []
    cursor = 0
    for para in paragraphs:
        offsets.append(cursor)
        cursor += len(para) + len(separator)

    print(f"📚 Chunking '{book}': {total_paragraphs} paragraphs, size={size}, overlap={overlap}")

    chunks = []
    start = 0
    while start < total_paragraphs:
        # Accumulate whole paragraphs until the target size is reached.
        chunk_text = ""
        stop = start  # exclusive end of the paragraph range
        while stop < total_paragraphs and len(chunk_text) < size:
            para = paragraphs[stop]
            chunk_text = chunk_text + separator + para if chunk_text else para
            stop += 1

        if not chunk_text:
            # Only empty paragraphs were left, so there is nothing to emit.
            break

        end = stop - 1
        chunk_id = len(chunks)
        chunks.append({
            'id': f'{book}_chunk_{chunk_id}',
            'text': chunk_text,
            'meta': {
                'book': book,
                'para_idx_start': start,
                'para_idx_end': end,
                'char_count': len(chunk_text),
                'span': (offsets[start], offsets[end] + len(paragraphs[end])),
            },
        })

        progress_pct = (stop / total_paragraphs) * 100
        print(f" Chunk {chunk_id}: paras {start}-{end}, {len(chunk_text)} chars ({progress_pct:.1f}% complete)")

        next_start = stop
        if overlap > 0 and stop < total_paragraphs:
            # Walk back from the last paragraph, measuring the chunk's tail.
            # The range stops before `start`, so the next chunk can never
            # begin where this one did.
            tail_len = 0
            for idx in range(end, start, -1):
                tail_len += len(paragraphs[idx])
                if idx < end:
                    tail_len += len(separator)
                if tail_len >= size:
                    # The tail alone would fill a chunk; restarting here
                    # would only repeat text that was already emitted.
                    break
                if tail_len >= overlap:
                    next_start = idx
                    break
        start = next_start

    print(f"✅ Created {len(chunks)} chunks from '{book}'")
    return chunks
|