"""Convert a line-oriented text file into JSON Lines.

Each input line becomes one JSON object of the form ``{"text": <line>}``
after tabs are turned into spaces and curly double quotes are turned into
straight double quotes.
"""
import json

INPUT_PATH = 'en-ja_full.txt'
OUTPUT_PATH = 'en-ja100.jsonl'
PREVIEW_CHARS = 10000

# One-pass character normalization: tab -> space, curly quotes -> straight.
# Straight quotes need no manual escaping; json.dumps escapes them itself.
_CHAR_TABLE = str.maketrans({'\t': ' ', '”': '"', '“': '"'})


def normalize_line(line):
    """Return *line* cut at its first newline, with tabs/quotes normalized.

    Args:
        line: A raw line of text, with or without a trailing newline.

    Returns:
        The text before the first ``'\\n'``, where each tab is replaced by a
        space and each curly double quote by a straight double quote.
    """
    return line.partition('\n')[0].translate(_CHAR_TABLE)


def to_jsonl(lines):
    """Serialize *lines* as JSON Lines text.

    Args:
        lines: An iterable of raw text lines.

    Returns:
        A single string holding one ``{"text": ...}`` JSON object per input
        line, joined by ``'\\n'`` (no trailing newline). Non-ASCII characters
        are written as-is rather than as ``\\uXXXX`` escapes. An empty
        iterable yields an empty string.
    """
    return '\n'.join(
        json.dumps({"text": normalize_line(line)}, ensure_ascii=False)
        for line in lines
    )


def convert(src=INPUT_PATH, dst=OUTPUT_PATH, preview_chars=PREVIEW_CHARS):
    """Read *src*, write its JSON Lines form to *dst*, and print a preview.

    Args:
        src: Path of the input text file.
        dst: Path of the JSON Lines file to write.
        preview_chars: Number of leading characters of the result to print.

    Returns:
        The full JSON Lines text that was written to *dst*.
    """
    # Explicit UTF-8: the output keeps non-ASCII characters unescaped, so
    # relying on the platform default encoding can fail or corrupt the text.
    with open(src, 'rt', encoding='utf-8') as f:
        data = to_jsonl(f)
    print(data[:preview_chars])
    with open(dst, 'wt', encoding='utf-8') as f:
        f.write(data)
    return data


if __name__ == '__main__':
    convert()