Spaces:
Sleeping
Sleeping
File size: 364 Bytes
# Revision 2068d15
import re
def clean_text(text: str) -> str:
    """Normalise whitespace in *text*.

    Runs of consecutive newlines are collapsed to a single newline, runs of
    spaces and tabs are collapsed to a single space, and leading/trailing
    whitespace is removed from the result.

    Args:
        text: The raw string to normalise.

    Returns:
        The whitespace-normalised string.
    """
    # Applied in order: newline runs first, then horizontal whitespace runs.
    substitutions = (
        (r"\n+", "\n"),
        (r"[ \t]+", " "),
    )
    cleaned = text
    for pattern, replacement in substitutions:
        cleaned = re.sub(pattern, replacement, cleaned)
    return cleaned.strip()
def preprocess_documents(documents: list[dict]) -> list[dict]:
    """Return copies of *documents* with their ``"content"`` value cleaned.

    Each input dict is shallow-copied, and the copy's ``"content"`` entry is
    replaced by the result of ``clean_text`` applied to the original value.
    The input dicts themselves are not modified.

    Args:
        documents: Dicts that each contain a ``"content"`` key.

    Returns:
        A new list of new dicts, in the same order as the input.

    Raises:
        KeyError: If a document has no ``"content"`` key.
    """
    processed = []
    for doc in documents:
        # Copy first so every other key is carried over unchanged.
        updated = dict(doc)
        updated["content"] = clean_text(doc["content"])
        processed.append(updated)
    return processed