File size: 274 Bytes
04e4b39
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
from pathlib import Path

# Corpus file that is rewritten in place when this script is run directly.
SRC = Path("data/corpus_raw.txt")


def normalize_text(raw: str) -> str:
    """Return *raw* with LF-only line endings and no surrounding whitespace.

    Both ``\\r\\n`` pairs and lone ``\\r`` characters become ``\\n``, which is
    the same translation text-mode reading applies; leading and trailing
    whitespace is then stripped.
    """
    return raw.replace("\r\n", "\n").replace("\r", "\n").strip()


def clean_corpus(path: Path = SRC) -> None:
    """Rewrite *path* in place as stripped, LF-terminated UTF-8 text.

    Bytes that are not valid UTF-8 are dropped (``errors="ignore"``).

    Args:
        path: File to clean; defaults to ``SRC``.

    Raises:
        OSError: If the file cannot be read or written.
    """
    # Use bytes I/O on both sides: text-mode writing translates "\n" to the
    # platform line separator, which would re-introduce "\r\n" on Windows
    # and undo the normalization.
    raw = path.read_bytes().decode("utf-8", errors="ignore")
    path.write_bytes(normalize_text(raw).encode("utf-8"))


if __name__ == "__main__":
    clean_corpus()
    print("cleaned corpus in-place.")