"""Filter QwQ-32B-Preview JSON-lines dumps down to the instruction text.

Every file in the data directory whose name contains ``QwQ-32B-Preview`` is
read line by line as JSON.  The ``"instruction"`` field of each record is
cleaned with :func:`clean_instruction`; records for which no usable
instruction can be isolated are dropped, the rest are written (one JSON
object per line) to a single output file.
"""

import os
import json
from typing import IO, Optional

try:
    from tqdm import tqdm
except ImportError:  # the progress bar is cosmetic; run without it
    def tqdm(iterable, *args, **kwargs):
        """Fallback used when tqdm is not installed: iterate without a bar."""
        return iterable

DATA_DIR = "/home/aiscuser/fhw/data"
OUTPUT_NAME = "qwq_python_filtered.json"
SOURCE_NAME_MARKER = "QwQ-32B-Preview"

# Instructions with fewer whitespace-separated words than this are kept as-is.
MAX_WORDS = 200

# Characters trimmed from both ends of an instruction.
_TRIM_CHARS = " \n\t"

# A line containing one of these substrings, or starting with one of the
# prefixes, is treated as the closing request of the instruction.
TAIL_REQUEST_SUBSTRINGS = ("How can", "how can", "Can you", "can you")
TAIL_REQUEST_PREFIXES = ("Please", "please")

# Only this many leading lines are searched for a task keyword.
HEAD_SCAN_LINES = 5
# "Implementing"/"implementing" are not listed separately: every line that
# contains them also contains "Implement"/"implement".
HEAD_TASK_KEYWORDS = (
    "Provide", "provide",
    "Ensure", "ensure",
    "Write", "write",
    "Implement", "implement",
    "Create", "create",
    "Explain",
    "You are",
    "Given",
    "Writing", "writing",
    "Creating", "creating",
    "Design", "design",
    "Consider", "consider",
    "Calculate", "calculate",
)

# Tried in this priority order (not by position in the text); the text is
# cut at the first occurrence of the first marker that is present.
ANSWER_MARKERS = ("\n\nSure", "\n\nCertainly", "\n\nHere", "\n\nNow")


def clean_instruction(text: str) -> Optional[str]:
    """Return the cleaned instruction, or ``None`` if the record is dropped.

    Rules, applied in order:

    1. Remove one leading ``"1."`` and trim spaces, newlines and tabs.
    2. Fewer than ``MAX_WORDS`` words: return the text unchanged.
    3. Contains ``'?'``: return everything up to and including the last one.
    4. Scanning lines from the bottom, the first line that looks like a
       request (see ``TAIL_REQUEST_*``) ends the instruction; later lines
       are discarded.
    5. Scanning the first ``HEAD_SCAN_LINES`` lines from the top, the first
       line containing a task keyword ends the instruction.
    6. Cut the text at the first ``ANSWER_MARKERS`` entry that occurs.
    7. Otherwise return ``None``.
    """
    # str.strip("1.") would treat "1." as a character set and also eat
    # trailing periods and leading/trailing '1' digits, so remove the
    # literal prefix explicitly.
    if text.startswith("1."):
        text = text[2:]
    text = text.strip(_TRIM_CHARS)

    if len(text.split()) < MAX_WORDS:
        return text

    last_question = text.rfind("?")
    if last_question != -1:
        return text[:last_question + 1].strip(_TRIM_CHARS)

    rows = text.split("\n")

    # Bottom-up: keep everything through the last request-like line.
    for keep in range(len(rows), 0, -1):
        row = rows[keep - 1]
        if row.startswith(TAIL_REQUEST_PREFIXES) or any(
            marker in row for marker in TAIL_REQUEST_SUBSTRINGS
        ):
            return "\n".join(rows[:keep]).strip(_TRIM_CHARS)

    # Top-down: keep everything through the first task-keyword line.
    for index, row in enumerate(rows[:HEAD_SCAN_LINES]):
        if any(keyword in row for keyword in HEAD_TASK_KEYWORDS):
            return "\n".join(rows[:index + 1]).strip(_TRIM_CHARS)

    for marker in ANSWER_MARKERS:
        cut = text.find(marker)
        if cut != -1:
            return text[:cut].strip(_TRIM_CHARS)

    return None


def filter_file(src_path: str, out: IO[str]) -> None:
    """Clean every JSON record in *src_path* and append the kept ones to *out*.

    Blank lines are skipped.  Each remaining line must be a JSON object with
    an ``"instruction"`` string; other fields are written back unchanged.
    """
    with open(src_path, encoding="utf-8") as src:
        # Read everything up front so the progress bar knows the total.
        lines = src.readlines()

    for line in tqdm(lines):
        if not line.strip():
            continue
        record = json.loads(line)
        cleaned = clean_instruction(record["instruction"])
        if cleaned is None:
            continue
        record["instruction"] = cleaned
        out.write(json.dumps(record) + "\n")


def main(data_dir: str = DATA_DIR, output_path: Optional[str] = None) -> None:
    """Filter all matching files in *data_dir* into *output_path*.

    *output_path* defaults to ``OUTPUT_NAME`` inside *data_dir*.  The
    directory is listed before the output file is created, and the output
    file itself is never used as an input.
    """
    if output_path is None:
        output_path = os.path.join(data_dir, OUTPUT_NAME)

    names = os.listdir(data_dir)
    output_abs = os.path.abspath(output_path)

    with open(output_path, "w", encoding="utf-8") as out:
        for name in names:
            if SOURCE_NAME_MARKER not in name:
                continue
            src_path = os.path.join(data_dir, name)
            if os.path.abspath(src_path) == output_abs:
                continue
            filter_file(src_path, out)


if __name__ == "__main__":
    main()