"""Attach a numeric ``score`` to judged records in a JSON-lines file.

Every input line is a JSON object holding a free-text ``quality_judgement``.
The score is parsed out of that text; records with a recoverable score are
written to the output file with an added ``score`` field (a float), and
records with no recoverable score are dropped.
"""
import json
import os  # only referenced by the retired multi-shard loader; kept for compatibility
import re
from typing import Iterable, Optional, TextIO

INPUT_PATH = "/home/aiscuser/fhw/data/llama_instruct_final.json"
OUTPUT_PATH = "/home/aiscuser/fhw/data/llama_instruct_selected.json"

# Strict score formats, tried in this order; the first format that matches
# anywhere in the judgement wins, and its LAST occurrence supplies the score.
_STRICT_PATTERNS = tuple(
    re.compile(pattern)
    for pattern in (
        r"\[\[(\d*\.\d+|\d+)/10\]\]",
        r"\[\[(\d*\.\d+|\d+)\]\]",
        r"\*\*Score: \[(\d*\.\d+|\d+)/10\]\*\*",
        r"\*\*Score: \[(\d*\.\d+|\d+)\]\*\*",
        r"\*\*Score: (\d*\.\d+|\d+)/10\*\*",
        r"\*\*Score: (\d*\.\d+|\d+)\*\*",
        r"\*\*Score:\*\* (\d*\.\d+|\d+)/10",
        r"\*\*Score:\*\* (\d*\.\d+|\d+)",
    )
)

# Any bare integer or decimal, used by the loose fallbacks.
_NUMBER = re.compile(r"\d*\.\d+|\d+")


def extract_score(judgement: str) -> Optional[float]:
    """Return the score embedded in *judgement*, or ``None`` if there is none.

    Resolution order:

    1. The strict formats in ``_STRICT_PATTERNS`` (last occurrence of the
       first format that matches).
    2. The first number following the last literal ``"Score"`` in the text.
    3. The first number anywhere in the text.
    """
    for pattern in _STRICT_PATTERNS:
        matches = pattern.findall(judgement)
        if matches:
            return float(matches[-1])

    # The previous implementation used r"Score(.*?)", whose lazy group always
    # captures "", so this step never succeeded and additionally blanked the
    # text used by step 3. Look at the real tail instead, and take the first
    # number so that "Score: 7/10" yields 7 rather than the denominator.
    _, marker, tail = judgement.rpartition("Score")
    if marker:
        numbers = _NUMBER.findall(tail)
        if numbers:
            return float(numbers[0])

    numbers = _NUMBER.findall(judgement)
    if numbers:
        return float(numbers[0])
    return None


def write_scored_records(lines: Iterable[str], out: TextIO) -> int:
    """Write every scorable record from *lines* to *out* as JSON lines.

    Args:
        lines: JSON-encoded records, one per item; each must contain a
            ``quality_judgement`` string. Blank lines are skipped.
        out: Writable text stream receiving one JSON object per line.

    Returns:
        The number of records written.

    Raises:
        json.JSONDecodeError: If a non-blank line is not valid JSON.
        KeyError: If a record has no ``quality_judgement`` key.
    """
    written = 0
    for line in lines:
        if not line.strip():
            continue
        record = json.loads(line)
        score = extract_score(record["quality_judgement"])
        if score is None:
            # No recoverable score: the record is intentionally left out.
            continue
        record["score"] = score
        out.write(json.dumps(record) + "\n")
        written += 1
    return written


def main() -> None:
    """Read ``INPUT_PATH``, score each record, and write ``OUTPUT_PATH``."""
    # Imported here so the parsing helpers above stay importable without the
    # third-party progress-bar package.
    from tqdm import tqdm

    # NOTE: an earlier variant gathered lines from every file in
    # "processed_data" whose name contained "llama_python_scored".
    with open(INPUT_PATH, "r", encoding="utf-8") as source:
        lines = source.readlines()  # materialized so tqdm can show a total

    with open(OUTPUT_PATH, "w", encoding="utf-8") as sink:
        write_scored_records(tqdm(lines), sink)


if __name__ == "__main__":
    main()