Spaces:
Sleeping
Sleeping
File size: 1,350 Bytes
4d592a4 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 |
import re
from typing import List, Optional, Dict, Any
def intelligent_chunk(text: str, chunk_size: int = 512, overlap: int = 50) -> List[str]:
    """Split *text* into sentence-aligned chunks with a bounded word overlap.

    The text is split into sentences after ``.``, ``!`` or ``?`` followed by
    whitespace. Sentences are packed greedily into chunks of at most
    ``chunk_size`` whitespace-separated words. When a chunk is closed, its
    trailing sentences are repeated at the start of the next chunk as
    context, as long as they total at most ``overlap`` words.

    Args:
        text: The text to split.
        chunk_size: Maximum number of words per chunk. A single sentence
            longer than this is never split; it becomes a chunk on its own.
        overlap: Maximum number of words carried over from the end of one
            chunk to the start of the next. Measured in words, the same unit
            as ``chunk_size``. Zero or negative values disable the overlap.

    Returns:
        The chunks in document order, each one the sentences joined by a
        single space. Empty or whitespace-only text yields an empty list.
    """
    # Stripping first avoids an empty trailing "sentence" when the text ends
    # in whitespace; the filter covers the empty-text case (split gives [""]).
    sentences = [s for s in re.split(r"(?<=[.!?])\s+", text.strip()) if s]
    max_overlap = max(overlap, 0)

    chunks: List[str] = []
    current_chunk: List[str] = []
    current_counts: List[int] = []  # word count of each sentence in current_chunk
    current_length = 0

    for sentence in sentences:
        sentence_length = len(sentence.split())

        if current_chunk and current_length + sentence_length > chunk_size:
            chunks.append(" ".join(current_chunk))

            # Carry over whole trailing sentences, but never more words than
            # still leave room for the incoming sentence. Because the closed
            # chunk plus this sentence exceeded chunk_size, the budget is
            # always smaller than the closed chunk, so the carry-over is a
            # strict suffix and chunks cannot grow without bound.
            budget = min(max_overlap, chunk_size - sentence_length)
            keep = 0
            carried = 0
            for count in reversed(current_counts):
                if carried + count > budget:
                    break
                carried += count
                keep += 1

            # Slice from an explicit start index: xs[-0:] would be the whole
            # list, not an empty one.
            start = len(current_chunk) - keep
            current_chunk = current_chunk[start:]
            current_counts = current_counts[start:]
            current_length = carried

        current_chunk.append(sentence)
        current_counts.append(sentence_length)
        current_length += sentence_length

    if current_chunk:
        chunks.append(" ".join(current_chunk))
    return chunks
def create_chunk_metadata(
    document_id: str,
    chunk_index: int,
    page_number: Optional[int] = None,
    section: Optional[str] = None,
    total_chunks: int = 0,
) -> Dict[str, Any]:
    """Bundle the identifying fields of one chunk into a metadata dict.

    Args:
        document_id: Identifier of the document the chunk belongs to.
        chunk_index: Index of the chunk.
        page_number: Page number for the chunk, or ``None`` if not given.
        section: Section label for the chunk, or ``None`` if not given.
        total_chunks: Total chunk count to record; defaults to ``0``.

    Returns:
        A new dict whose keys are the parameter names and whose values are
        the arguments, passed through unchanged.
    """
    metadata: Dict[str, Any] = dict(
        document_id=document_id,
        chunk_index=chunk_index,
        page_number=page_number,
        section=section,
        total_chunks=total_chunks,
    )
    return metadata
|