"""Normalise line endings and trim surrounding whitespace of the raw corpus, in place."""

from pathlib import Path

# Corpus file that is read, cleaned and overwritten when run as a script.
SRC = Path("data/corpus_raw.txt")


def clean_text(text: str) -> str:
    """Return *text* with CRLF/CR line endings converted to LF and outer whitespace stripped."""
    # Replace "\r\n" first so a CRLF pair does not turn into two newlines.
    return text.replace("\r\n", "\n").replace("\r", "\n").strip()


def clean_file(path: Path) -> None:
    """Rewrite *path* in place with its contents passed through ``clean_text``.

    The file is decoded as UTF-8; the cleaned text is written back as UTF-8
    with LF line endings on every platform.
    """
    path = Path(path)
    # NOTE: errors="ignore" silently drops bytes that are not valid UTF-8.
    # Because the file is overwritten below, those bytes are lost for good.
    raw = path.read_text(encoding="utf-8", errors="ignore")
    cleaned = clean_text(raw)
    # newline="\n" disables platform newline translation; without it a
    # text-mode write on Windows would turn every "\n" back into "\r\n".
    with path.open("w", encoding="utf-8", newline="\n") as fh:
        fh.write(cleaned)


def main() -> None:
    """Clean the corpus at ``SRC`` in place and report completion."""
    clean_file(SRC)
    print("cleaned corpus in-place.")


if __name__ == "__main__":
    main()