File size: 364 Bytes
2068d15
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
import re

def clean_text(text: str) -> str:
    """Normalize whitespace in *text*.

    Line endings are unified to a single line feed, runs of blank lines
    (including lines that hold only spaces or tabs) collapse to one line
    feed, runs of spaces/tabs collapse to one space, and leading/trailing
    whitespace is removed.

    Args:
        text: The raw text to clean.

    Returns:
        The cleaned text.
    """
    # Unify CRLF and lone CR first; otherwise the carriage returns sit
    # between line feeds and prevent blank-line runs from collapsing.
    text = re.sub(r"\r\n?", "\n", text)
    # A line containing only spaces/tabs is still visually blank, so fold
    # it into the surrounding run of line feeds.
    text = re.sub(r"\n(?:[ \t]*\n)+", "\n", text)
    text = re.sub(r"[ \t]+", " ", text)
    return text.strip()


def preprocess_documents(documents: list[dict]) -> list[dict]:
    """Return copies of *documents* with their ``"content"`` value cleaned.

    Each input dict is shallow-copied, and the copy's ``"content"`` entry is
    replaced by the result of ``clean_text`` applied to the original value.
    The input dicts themselves are not modified.

    Args:
        documents: Dicts that each carry a ``"content"`` key.

    Returns:
        A new list of new dicts, in the same order as the input.

    Raises:
        KeyError: If a document has no ``"content"`` key.
    """
    processed = []
    for entry in documents:
        cleaned = dict(entry)
        # Overwriting an existing key keeps its original position in the copy.
        cleaned["content"] = clean_text(entry["content"])
        processed.append(cleaned)
    return processed