"""Normalize line endings and trim surrounding whitespace of the raw corpus, in place."""

from pathlib import Path

SRC = Path("data/corpus_raw.txt")


def clean_text(text: str) -> str:
    """Return *text* with CRLF converted to LF and outer whitespace stripped.

    Note that ``str.strip()`` also removes a trailing newline, so the result
    never ends with one.
    """
    return text.replace("\r\n", "\n").strip()


def clean_corpus(path: Path = SRC) -> None:
    """Rewrite the file at *path* with its cleaned contents.

    Args:
        path: Text file to clean in place. Defaults to ``SRC``.

    Raises:
        OSError: If the file cannot be read or written.
    """
    # errors="ignore" silently drops bytes that are not valid UTF-8; this is
    # kept as a best-effort read rather than failing on a dirty corpus.
    raw = path.read_text(encoding="utf-8", errors="ignore")
    cleaned = clean_text(raw)
    # Write with newline="\n" so "\n" is not translated to os.linesep;
    # without it the output would get CRLF line endings again on Windows.
    with path.open("w", encoding="utf-8", newline="\n") as out:
        out.write(cleaned)


if __name__ == "__main__":
    clean_corpus()
    print("cleaned corpus in-place.")