File size: 274 Bytes
04e4b39 |
1 2 3 4 5 6 7 8 9 |
from pathlib import Path
# Corpus file that is cleaned in place when this module is run as a script.
SRC = Path("data/corpus_raw.txt")


def clean_corpus(path: Path = SRC) -> None:
    """Normalise line endings and trim surrounding whitespace of *path* in place.

    The file is decoded as UTF-8, every ``\\r\\n`` and lone ``\\r`` is turned
    into ``\\n``, leading/trailing whitespace is stripped, and the result is
    written back over the same file as UTF-8.

    Bytes are read and written directly (rather than through text mode) so
    that no platform newline translation happens: text-mode writing would
    turn each ``\\n`` back into ``\\r\\n`` on Windows and undo the cleaning.

    Args:
        path: File to rewrite. Defaults to ``SRC``.

    Raises:
        OSError: If the file cannot be read or written.
    """
    # NOTE: errors="ignore" silently drops undecodable bytes, and because the
    # file is overwritten, those bytes are lost for good.
    text = path.read_bytes().decode("utf-8", errors="ignore")
    # Replace CRLF first so a "\r\n" pair does not become two newlines.
    text = text.replace("\r\n", "\n").replace("\r", "\n").strip()
    path.write_bytes(text.encode("utf-8"))


if __name__ == "__main__":
    clean_corpus()
    print("cleaned corpus in-place.")
|