RAG_backend / src /ingestion /preprocessor.py
vinimoreira's picture
Add files for RAG backend
2068d15 verified
raw
history blame contribute delete
364 Bytes
import re
def clean_text(text: str) -> str:
    """Normalize the whitespace in *text*.

    Line endings are unified to LF, every run of spaces/tabs becomes a
    single space, blank (or whitespace-only) lines are dropped, spaces
    touching a line break are removed, and the result is stripped of
    leading and trailing whitespace.

    Args:
        text: The raw text to clean.

    Returns:
        The cleaned text; an empty string if *text* holds only whitespace.
    """
    # Treat CRLF and lone CR as line breaks; otherwise "\r\n\r\n" is never
    # seen as consecutive newlines and blank lines survive.
    text = re.sub(r"\r\n?", "\n", text)
    # Collapse horizontal whitespace first so the next step only has to
    # deal with single spaces.
    text = re.sub(r"[ \t]+", " ", text)
    # Collapse newline runs together with any spaces around/between them,
    # so whitespace-only lines count as blank and lines carry no padding.
    text = re.sub(r" ?\n[ \n]*", "\n", text)
    return text.strip()
def preprocess_documents(documents: list[dict]) -> list[dict]:
    """Return copies of *documents* with each "content" value cleaned.

    Each input dict is shallow-copied, so the caller's dicts are left
    unmodified; every key other than "content" is carried over as-is.

    Args:
        documents: Dicts that each have a "content" entry to clean.

    Returns:
        A new list of new dicts, in the same order as *documents*.

    Raises:
        KeyError: If a document has no "content" key.
    """
    cleaned = []
    for entry in documents:
        # Copy first so the original document is never mutated.
        updated = dict(entry)
        updated["content"] = clean_text(entry["content"])
        cleaned.append(updated)
    return cleaned