Spaces:
Sleeping
Sleeping
File size: 831 Bytes
c16e1c9 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 |
import re
def extract_text(text: str, max_words: int = 300) -> list:
    """Split raw text into chunks of at most ``max_words`` words.

    Every run of whitespace (spaces, tabs, newlines) is collapsed to a
    single space before splitting, so the original line and paragraph
    layout is not preserved in the returned chunks.

    Args:
        text: Raw text input.
        max_words: Maximum number of words per chunk (default 300).
            Must be at least 1.

    Returns:
        A list of strings. Each chunk holds exactly ``max_words``
        space-joined words, except the last one, which may be shorter.
        An empty or whitespace-only ``text`` yields an empty list.

    Raises:
        ValueError: If ``max_words`` is smaller than 1.
    """
    # A non-positive limit has no sensible meaning; previously it silently
    # degraded into one-word chunks, so reject it explicitly instead.
    if max_words < 1:
        raise ValueError(f"max_words must be >= 1, got {max_words!r}")

    # Normalize whitespace so that splitting on a single space never
    # produces empty "words".
    clean = re.sub(r'\s+', ' ', text).strip()
    if not clean:
        return []

    words = clean.split(" ")

    # Step through the word list in strides of max_words; the final slice
    # is naturally shorter when the word count is not an exact multiple.
    return [
        " ".join(words[start:start + max_words])
        for start in range(0, len(words), max_words)
    ]
|