Physics-Tutor-Model / train /prepare_corpus.py
adityashisharma's picture
Upload 6 files
04e4b39 verified
from pathlib import Path
SRC = Path("data/corpus_raw.txt")
def normalize_corpus_text(text: str) -> str:
    """Return *text* with CRLF line endings turned into LF and outer whitespace removed.

    Args:
        text: Raw corpus contents.

    Returns:
        The text with every ``"\\r\\n"`` replaced by ``"\\n"`` and leading/trailing
        whitespace stripped.
    """
    return text.replace("\r\n", "\n").strip()


def main() -> None:
    """Clean the corpus file at ``SRC`` in place and report completion on stdout."""
    # errors="ignore" silently drops bytes that are not valid UTF-8 instead of
    # raising, so a partially corrupt corpus is still processed (best effort).
    raw = SRC.read_text(encoding="utf-8", errors="ignore")
    cleaned = normalize_corpus_text(raw)
    # newline="\n" turns off newline translation on write; with the default
    # text-mode behaviour Windows would write "\r\n" again and undo the
    # normalisation performed above.
    with SRC.open("w", encoding="utf-8", newline="\n") as handle:
        handle.write(cleaned)
    print("cleaned corpus in-place.")


if __name__ == "__main__":
    main()