yuvrajsingh6
feat: RAG system with OCR for Hugging Face Spaces
4d592a4
import re
from typing import List, Optional, Dict, Any
def intelligent_chunk(text: str, chunk_size: int = 512, overlap: int = 50) -> List[str]:
sentences = re.split(r"(?<=[.!?])\s+", text)
chunks = []
current_chunk = []
current_length = 0
for sentence in sentences:
sentence_length = len(sentence.split())
if current_length + sentence_length > chunk_size and current_chunk:
chunks.append(" ".join(current_chunk))
overlap_sentences = (
current_chunk[-overlap:]
if len(current_chunk) > overlap
else current_chunk
)
current_chunk = overlap_sentences + [sentence]
current_length = sum(len(s.split()) for s in current_chunk)
else:
current_chunk.append(sentence)
current_length += sentence_length
if current_chunk:
chunks.append(" ".join(current_chunk))
return chunks
def create_chunk_metadata(
document_id: str,
chunk_index: int,
page_number: Optional[int] = None,
section: Optional[str] = None,
total_chunks: int = 0,
) -> Dict[str, Any]:
return {
"document_id": document_id,
"chunk_index": chunk_index,
"page_number": page_number,
"section": section,
"total_chunks": total_chunks,
}