Tarush-AI
/

AureliusGPT

small-language-model

Model card Files Files and versions

AureliusGPT / model /vocab /preprocess.py

Tarush-AI's picture

Upload folder using huggingface_hub

f451089 verified about 2 months ago

history blame contribute delete

3.51 kB

	import re, os, unicodedata
	import sys
	from config import PROJECT_ROOT, greekwords

	class Preprocessor:
	    """Clean raw text and break it into sentence-level samples.

	    Text that contains both the "THE FIRST BOOK" and "APPENDIX" markers is
	    treated as the original Meditations layout: it is trimmed to the span
	    between those markers and its book/section headings are rewritten as
	    <BEGIN>/<END> markers. Any other text only receives the minimal cleanup
	    and is split on blank lines.
	    """

	    def __init__(self):
	        pass

	    def process(self, text: str) -> tuple[str, list[str]]:
	        """Normalise *text* and split it into sentences.

	        Args:
	            text: Raw input text.

	        Returns:
	            A ``(cleaned_text, sentences)`` tuple: the fully cleaned text and
	            the flat list of non-empty, stripped sentences extracted from it.

	        Raises:
	            ValueError: If "APPENDIX" only occurs before "THE FIRST BOOK",
	                because the end marker is searched for after trimming the
	                start of the text.
	        """
	        meditations = unicodedata.normalize("NFC", text)

	        # Check if the text has the expected structure of the original Meditations
	        is_original = "THE FIRST BOOK" in meditations and "APPENDIX" in meditations

	        if is_original:
	            # Keep only the span from the first book up to the appendix.
	            startindex = meditations.index("THE FIRST BOOK")
	            meditations = meditations[startindex:]
	            endindex = meditations.index("APPENDIX")
	            meditations = meditations[:endindex]

	            book_name = r"THE\s+[A-Z]+\s+BOOK\s+[IVXLCDM]+\.\s"
	            section_name = r"\n\n[IVXLCDM]+\. "
	            book_end = r"\n\n\n\n"

	            # Order matters: the four-newline run must be rewritten before
	            # the section pattern, which itself starts with two newlines.
	            meditations = re.sub(book_name, "<BEGIN> \n", meditations)
	            meditations = re.sub(book_end, "<END> ", meditations)
	            meditations = re.sub(section_name, "\n <END> \n <BEGIN> \n", meditations)

	            # Raw f-string with a plain "|" so the two heading patterns form a
	            # regex alternation; a backslash before the pipe would be an
	            # invalid string escape and would match a literal "|" instead.
	            split_pattern = rf"{book_name}|{section_name}"
	        else:
	            # Minimal processing for synthetic data
	            split_pattern = r"\n\n"  # Split by paragraphs or double newlines

	        underline = r"[_]+"
	        # A single newline that is not part of a blank-line separator.
	        newline_in_sentence = r"(?<!\n)\n(?!\n)"

	        meditations = re.sub(underline, "", meditations)
	        meditations = re.sub(newline_in_sentence, " ", meditations)

	        # Apply every replacement configured in the greekwords mapping.
	        for key, value in greekwords.items():
	            meditations = meditations.replace(key, value)

	        raw_sections = re.split(split_pattern, meditations)

	        final_sections = []
	        for section in raw_sections:
	            if not section.strip():
	                continue
	            final_sections.extend(self.split_into_sentences(section))

	        return meditations, final_sections

	    def split_into_sentences(self, text: str) -> list[str]:
	        """Split *text* after each '.', '!' or '?' that is followed by whitespace.

	        Returns the stripped, non-empty pieces in their original order.
	        """
	        sentences = re.split(r'(?<=[.!?])\s+', text)
	        return [s.strip() for s in sentences if s.strip()]

	    def test(self, file) -> None:
	        """Run the preprocessor once and save the cleaned text next to this module.

	        Args:
	            file: Text content to preprocess (the content itself, not a
	                path). When it is falsy, or when processing it raises, the
	                bundled ``preprocess_test.txt`` is processed instead.

	        The cleaned text is written to ``preprocess_test_output.txt`` in this
	        module's directory and the output path is printed.
	        """
	        processed = None
	        if file:
	            try:
	                processed = self.process(file)
	            except Exception:
	                # Deliberate best-effort: any failure falls back to the default file.
	                print("The processed file is not compliant with preprocess' requirements. Falling back to default file.\n")
	                processed = None
	        if processed is None:
	            test_file_path = os.path.join(os.path.dirname(__file__), "preprocess_test.txt")
	            # Explicit encoding so the result does not depend on the platform default.
	            with open(test_file_path, "r", encoding="utf-8") as f:
	                processed = self.process(f.read())

	        output_file_path = os.path.join(os.path.dirname(__file__), "preprocess_test_output.txt")
	        with open(output_file_path, "w", encoding="utf-8") as f:
	            f.write(processed[0])

	        print(f"Saved to {output_file_path}.")

	if __name__ == "__main__":
	    # Command-line entry point: `preprocess.py test [path-to-text-file]`.
	    # NOTE(review): the messages below are advisory only; the self-test at
	    # the bottom runs regardless of the arguments, as in the original flow.
	    cli_args = sys.argv[1:]
	    file = None

	    if not cli_args:
	        print("Preprocessing logic is wrapped into overall training functionality.")
	    elif cli_args[0] != "test":
	        print("Only permitted argument is 'test'; Please try again.")

	    if len(cli_args) > 1:
	        # Optional second argument: a text file whose content is preprocessed.
	        filepath = cli_args[1]
	        try:
	            with open(filepath, "r", encoding="utf-8") as f:
	                file = f.read()
	        except (OSError, UnicodeError):
	            # Unreadable or undecodable file: let test() use its bundled sample.
	            print("Invalid filepath, falling back to original test.")
	            file = None

	    Preprocessor().test(file)