Spaces:
Sleeping
Sleeping
File size: 963 Bytes
edac567 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 |
"""Basic text cleaning and fixed-size overlapping chunking utilities."""
from typing import List
def clean_text(text: str) -> str:
    """
    Collapse every run of whitespace into one space and trim both ends.

    Args:
        text: Raw text.

    Returns:
        The whitespace-separated pieces of ``text`` joined by single spaces.
    """
    # str.split() with no arguments splits on any whitespace run and
    # discards leading/trailing whitespace, so no empty pieces appear.
    words = text.split()
    separator = " "
    return separator.join(words)
def chunk_pdf_text(pdf_text: str, chunk_size: int = 1000, overlap: int = 100) -> List[str]:
    """
    Split text into fixed-size chunks where consecutive chunks share characters.

    Args:
        pdf_text: Full text.
        chunk_size: Max chars per chunk.
        overlap: Number of chars at the end of one chunk that are repeated
            at the start of the next chunk.

    Returns:
        List of chunk strings in document order. Empty list for empty text.

    Raises:
        ValueError: If overlap is negative, or chunk_size is not greater
            than overlap.
    """
    if overlap < 0:
        # A negative overlap would make the stride exceed chunk_size and
        # silently drop the characters between consecutive chunks.
        raise ValueError("overlap must be non-negative")
    if chunk_size <= overlap:
        raise ValueError("chunk_size must be greater than overlap")

    step = chunk_size - overlap  # always >= 1 after the checks above
    length = len(pdf_text)
    chunks: List[str] = []
    start = 0
    while start < length:
        end = min(start + chunk_size, length)
        chunks.append(pdf_text[start:end])
        if end == length:
            # This chunk already reaches the end of the text; another pass
            # would only emit a suffix fully contained in this chunk.
            break
        start += step
    return chunks