import json
import os
from typing import Optional

from tqdm import tqdm
# Directory holding the raw JSONL dumps; the filtered output is written here too.
DATA_DIR = "/home/aiscuser/fhw/data"
# File name of the filtered output inside the data directory.
OUTPUT_NAME = "qwq_python_filtered.json"
# Only input files whose name contains this tag are processed.
SOURCE_TAG = "QwQ-32B-Preview"
# Instructions with fewer whitespace-separated words than this are kept as is.
MAX_WORDS = 200
# Only this many leading lines are scanned for a task keyword.
LEADING_LINES = 5

# Characters trimmed from both ends of an instruction.
_TRIM_CHARS = " \n\t"

# A line containing one of these (case-sensitive) is treated as the closing
# request of the instruction when scanning from the bottom.
_REQUEST_SUBSTRINGS = ("How can", "how can", "Can you", "can you")
# ... as is a line that starts with one of these.
_REQUEST_PREFIXES = ("Please", "please")

# A leading line containing one of these (case-sensitive) is treated as the
# task statement.  "Implementing"/"implementing" are covered by
# "Implement"/"implement" because matching is by substring.
_TASK_KEYWORDS = (
    "Provide", "provide",
    "Ensure", "ensure",
    "Write", "write",
    "Implement", "implement",
    "Create", "create",
    "Explain",
    "You are",
    "Given",
    "Writing", "writing",
    "Creating", "creating",
    "Design", "design",
    "Consider", "consider",
    "Calculate", "calculate",
)

# Paragraph openers after which the text is cut; tried in this order.
_RESPONSE_MARKERS = ("\n\nSure", "\n\nCertainly", "\n\nHere", "\n\nNow")


def clean_instruction(text: str) -> Optional[str]:
    """Trim an instruction down to its request, or reject it.

    The text is first stripped of surrounding spaces, newlines and tabs, and
    a literal leading ``"1."`` is removed.  If the result has fewer than
    ``MAX_WORDS`` words it is returned unchanged.  Otherwise the following
    rules are tried in order and the first one that applies wins:

    1. If the text contains ``'?'``, keep everything up to the last ``'?'``.
    2. Scanning lines from the bottom, keep everything up to and including
       the first line that contains a request phrase or starts with
       "Please"/"please".
    3. Among the first ``LEADING_LINES`` lines, keep everything up to and
       including the first line that contains a task keyword.
    4. Cut the text just before the first response marker found, trying the
       markers in the order listed in ``_RESPONSE_MARKERS``.

    Args:
        text: The raw instruction string.

    Returns:
        The cleaned instruction, or ``None`` when the text is long and none
        of the rules applies.
    """
    text = text.strip(_TRIM_CHARS)
    # Remove only a literal leading "1." marker.  str.strip('1.') would treat
    # the argument as a character set and also eat trailing periods and
    # leading/trailing '1' characters.
    if text.startswith("1."):
        text = text[2:].strip(_TRIM_CHARS)

    if len(text.split()) < MAX_WORDS:
        return text

    last_question = text.rfind("?")
    if last_question != -1:
        return text[:last_question + 1]

    lines = text.split("\n")

    for idx in range(len(lines) - 1, -1, -1):
        line = lines[idx]
        if any(phrase in line for phrase in _REQUEST_SUBSTRINGS) or line.startswith(
            _REQUEST_PREFIXES
        ):
            return "\n".join(lines[:idx + 1]).strip(_TRIM_CHARS)

    for idx, line in enumerate(lines[:LEADING_LINES]):
        if any(keyword in line for keyword in _TASK_KEYWORDS):
            return "\n".join(lines[:idx + 1]).strip(_TRIM_CHARS)

    for marker in _RESPONSE_MARKERS:
        cut = text.find(marker)
        if cut != -1:
            return text[:cut].strip(_TRIM_CHARS)

    return None


def main(data_dir: str = DATA_DIR, output_path: Optional[str] = None) -> None:
    """Filter every tagged JSONL file in ``data_dir`` into one output file.

    Each non-blank line of each file whose name contains ``SOURCE_TAG`` is
    parsed as a JSON object; its ``"instruction"`` value is passed through
    ``clean_instruction``.  Records for which that returns ``None`` are
    dropped; all others are written, one JSON object per line, with the
    cleaned instruction.

    Args:
        data_dir: Directory to scan for input files.
        output_path: Destination file; defaults to ``OUTPUT_NAME`` inside
            ``data_dir``.
    """
    if output_path is None:
        output_path = os.path.join(data_dir, OUTPUT_NAME)

    # List the directory before the output file is created.
    names = os.listdir(data_dir)

    with open(output_path, "w", encoding="utf-8") as fw:
        for name in names:
            if SOURCE_TAG not in name:
                continue
            with open(os.path.join(data_dir, name), "r", encoding="utf-8") as f:
                lines = f.readlines()
            for line in tqdm(lines):
                if not line.strip():
                    # A blank line is not a JSON document; skip it.
                    continue
                d = json.loads(line)
                cleaned = clean_instruction(d["instruction"])
                if cleaned is None:
                    continue
                d["instruction"] = cleaned
                fw.write(json.dumps(d) + "\n")


if __name__ == "__main__":
    main()
|