Seif-aber
implemented pdf chat assistant with gemini and RAG
edac567
raw
history blame contribute delete
963 Bytes
"""Basic text cleaning and fixed-size overlapping chunking utilities."""
from typing import List
def clean_text(text: str) -> str:
    """
    Collapse every run of whitespace into a single space.

    Leading and trailing whitespace is removed as a side effect of
    splitting on whitespace and re-joining the pieces.

    Args:
        text: Raw text.

    Returns:
        The text with words separated by exactly one space.
    """
    # str.split() with no separator splits on any whitespace run and
    # discards empty pieces, so newlines/tabs/multiple spaces all vanish.
    words = text.split()
    single_spaced = " ".join(words)
    return single_spaced
def chunk_pdf_text(pdf_text: str, chunk_size: int = 1000, overlap: int = 100) -> List[str]:
    """
    Split text into fixed-size chunks that overlap their neighbours.

    Each chunk starts ``chunk_size - overlap`` characters after the previous
    one. Splitting stops as soon as a chunk reaches the end of the text, so
    the result never ends with a chunk that merely repeats the tail of the
    chunk before it.

    Args:
        pdf_text: Full text.
        chunk_size: Max chars per chunk.
        overlap: Chars shared between consecutive chunks.

    Returns:
        List of chunk strings in document order; empty for empty input.

    Raises:
        ValueError: If chunk_size is not greater than overlap, or if
            overlap is negative.
    """
    if chunk_size <= overlap:
        raise ValueError("chunk_size must be greater than overlap")
    if overlap < 0:
        # A negative overlap would make the stride exceed chunk_size and
        # silently skip characters between consecutive chunks.
        raise ValueError("overlap must be non-negative")

    chunks: List[str] = []
    length = len(pdf_text)
    step = chunk_size - overlap  # distance between consecutive chunk starts
    start = 0
    while start < length:
        end = min(start + chunk_size, length)
        chunks.append(pdf_text[start:end])
        if end == length:
            # The text is fully covered; another iteration would only
            # re-emit part of the overlap region as a redundant chunk.
            break
        start += step
    return chunks