Spaces:
Sleeping
Sleeping
| import os | |
| import json | |
def generate_nlp_dataset(output_file: str = "training/nlp_data.json") -> None:
    """Write a JSON dataset pairing common OCR mistakes with the correct word.

    Each entry is a dict with an ``"input"`` key (the misread token) and a
    ``"target"`` key (the intended word). The dataset is intended to be used
    to 'train' the spellchecker's dictionary or to fine-tune a specialized
    NLP model.

    Args:
        output_file: Path of the JSON file to write. Missing parent
            directories are created. A bare filename (no directory part)
            is written into the current working directory.

    Raises:
        OSError: If the directory cannot be created or the file cannot be
            written.
    """
    data = [
        {"input": "th3", "target": "the"},
        {"input": "p3ople", "target": "people"},
        {"input": "v0ice", "target": "voice"},
        {"input": "no4hij", "target": "nothing"},
        {"input": "Ia", "target": "in"},
        {"input": "0f", "target": "of"},
        {"input": "joshu4", "target": "joshua"},
        {"input": "he11o", "target": "hello"},
        {"input": "w0r1d", "target": "world"},
        {"input": "re-ling", "target": "feeling"},
        {"input": "odia", "target": "who"},
        {"input": "wheo", "target": "who"},
        {"input": "4!", "target": "voice"},
        {"input": "314", "target": "a"},
    ]

    # os.path.dirname() returns "" for a bare filename, and os.makedirs("")
    # raises FileNotFoundError, so only create the directory when there is one.
    parent_dir = os.path.dirname(output_file)
    if parent_dir:
        os.makedirs(parent_dir, exist_ok=True)

    # Explicit encoding keeps the output independent of the platform locale.
    with open(output_file, "w", encoding="utf-8") as f:
        json.dump(data, f, indent=4)

    print(f"NLP Dataset created: {output_file}")
if __name__ == "__main__":
    # Script entry point: write the dataset to the function's default path.
    generate_nlp_dataset()