Vision / training /generate_nlp_data.py
Eli-Iustus's picture
Upload 321 files
2013cf0 verified
import os
import json
def generate_nlp_dataset(output_file: str = "training/nlp_data.json") -> None:
    """Write a small OCR-error correction dataset to a JSON file.

    The dataset is a list of ``{"input": ..., "target": ...}`` records, each
    pairing a common OCR misreading with the word it should be corrected to.
    It can be used to seed a spellchecker's dictionary or to fine-tune a
    specialized NLP model.

    Args:
        output_file: Path of the JSON file to create. Any missing parent
            directories are created first. A bare filename (no directory
            component) is written to the current working directory.

    Raises:
        OSError: If the parent directory cannot be created or the file
            cannot be opened for writing.
    """
    data = [
        {"input": "th3", "target": "the"},
        {"input": "p3ople", "target": "people"},
        {"input": "v0ice", "target": "voice"},
        {"input": "no4hij", "target": "nothing"},
        {"input": "Ia", "target": "in"},
        {"input": "0f", "target": "of"},
        {"input": "joshu4", "target": "joshua"},
        {"input": "he11o", "target": "hello"},
        {"input": "w0r1d", "target": "world"},
        {"input": "re-ling", "target": "feeling"},
        {"input": "odia", "target": "who"},
        {"input": "wheo", "target": "who"},
        {"input": "4!", "target": "voice"},
        {"input": "314", "target": "a"},
    ]

    # os.path.dirname() returns "" for a bare filename, and os.makedirs("")
    # raises FileNotFoundError, so only create the directory when there is one.
    parent_dir = os.path.dirname(output_file)
    if parent_dir:
        os.makedirs(parent_dir, exist_ok=True)

    # Pin the encoding so the output does not depend on the platform locale.
    with open(output_file, "w", encoding="utf-8") as f:
        json.dump(data, f, indent=4)

    print(f"NLP Dataset created: {output_file}")
# Script entry point: build the dataset at its default location only when this
# file is executed directly, not when it is imported as a module.
if __name__ == "__main__":
    generate_nlp_dataset()