Spaces:

Eli-Iustus
/

Vision

Sleeping

Vision / training /generate_nlp_data.py

Upload 321 files

2013cf0 verified about 2 months ago

1.22 kB

	import os
	import json

	def generate_nlp_dataset(output_file: str = "training/nlp_data.json") -> None:
	    """Write a fixed list of OCR-mistake/correction pairs to a JSON file.

	    Each entry maps a garbled token (``"input"``) to the word it is meant
	    to represent (``"target"``). The resulting file is intended as seed
	    data for a spellchecker dictionary or for fine-tuning an NLP model.

	    Args:
	        output_file: Destination path of the JSON file. Missing parent
	            directories are created; a bare filename is written to the
	            current working directory.

	    Raises:
	        OSError: If the parent directory or the file cannot be created.
	    """
	    data = [
	        {"input": "th3", "target": "the"},
	        {"input": "p3ople", "target": "people"},
	        {"input": "v0ice", "target": "voice"},
	        {"input": "no4hij", "target": "nothing"},
	        {"input": "Ia", "target": "in"},
	        {"input": "0f", "target": "of"},
	        {"input": "joshu4", "target": "joshua"},
	        {"input": "he11o", "target": "hello"},
	        {"input": "w0r1d", "target": "world"},
	        {"input": "re-ling", "target": "feeling"},
	        {"input": "odia", "target": "who"},
	        {"input": "wheo", "target": "who"},
	        {"input": "4!", "target": "voice"},
	        {"input": "314", "target": "a"},
	    ]

	    # os.path.dirname() returns "" for a bare filename and os.makedirs("")
	    # raises FileNotFoundError, so only create a directory when one is named.
	    parent_dir = os.path.dirname(output_file)
	    if parent_dir:
	        os.makedirs(parent_dir, exist_ok=True)

	    # Pin the encoding so the output does not depend on the platform locale.
	    with open(output_file, "w", encoding="utf-8") as f:
	        json.dump(data, f, indent=4)

	    print(f"NLP Dataset created: {output_file}")

	# Build the dataset only when this file is executed as a script,
	# not when it is imported as a module.
	if __name__ == "__main__":
	    generate_nlp_dataset()