import json
from typing import Callable, Iterable, Optional

from tqdm import tqdm
# Default locations and threshold used when this file is run as a script.
INPUT_PATH = "/home/aiscuser/fhw/data/qwq_python_deduplicated.json"
OUTPUT_PATH = "/home/aiscuser/fhw/data/qwq_python_length.json"
MAX_INSTRUCTION_WORDS = 500


def filter_by_instruction_length(
    src_path: str,
    dst_path: str,
    max_words: int = MAX_INSTRUCTION_WORDS,
    progress: Optional[Callable[[Iterable[str]], Iterable[str]]] = None,
) -> int:
    """Copy the JSON Lines records whose instruction is short enough.

    Each non-blank line of *src_path* is parsed as a JSON object. The line is
    written unchanged to *dst_path* when its ``"instruction"`` value, split on
    whitespace, has at most *max_words* words.

    Args:
        src_path: Path of the JSON Lines file to read.
        dst_path: Path of the file to create (truncated if it exists).
        max_words: Inclusive upper bound on the whitespace-separated word
            count of ``"instruction"``.
        progress: Optional callable that wraps the line iterable and returns
            an iterable over the same lines (for example ``tqdm``).

    Returns:
        The number of lines written to *dst_path*.

    Raises:
        json.JSONDecodeError: If a non-blank line is not valid JSON.
        KeyError: If a record has no ``"instruction"`` key.
        OSError: If either file cannot be opened.
    """
    kept = 0
    # Plain "r"/"w" modes are enough (the input is never written, the output
    # never read back), and ``with`` guarantees both files are flushed and
    # closed even when a malformed line raises.
    with open(src_path, "r", encoding="utf-8") as src, \
            open(dst_path, "w", encoding="utf-8") as dst:
        # Stream the file instead of loading every line into memory first.
        lines = src if progress is None else progress(src)
        for line in lines:
            # A blank line (e.g. a trailing newline at end of file) is not a
            # record; skip it rather than fail inside json.loads.
            if not line.strip():
                continue
            record = json.loads(line)
            if len(record["instruction"].split()) <= max_words:
                # Write the original text, not a re-serialisation, so kept
                # records are byte-for-byte what the input contained.
                dst.write(line)
                kept += 1
    return kept


if __name__ == "__main__":
    # The input is streamed, so tqdm has no length to read and shows a
    # running count instead of a percentage.
    filter_by_instruction_length(INPUT_PATH, OUTPUT_PATH, progress=tqdm)