Tsukihjy's picture
download
raw
5.83 kB
import json
import random
def get_random_indices(array_length, num_indices):
    """Draw distinct random indices from ``range(array_length)``.

    The requested count is capped at ``array_length``, so asking for more
    indices than exist returns every index exactly once, in random order.
    """
    # Never request more indices than the population holds.
    sample_size = min(num_indices, array_length)
    return random.sample(range(array_length), sample_size)
def find_first_non_ac(array):
    """Return the first entry of ``array`` that is not ``"AC"``.

    Falls back to ``"AC"`` when every entry equals ``"AC"`` or when
    ``array`` is empty.
    """
    return next((status for status in array if status != "AC"), "AC")
# Algorithm identifiers used to build the result-file names below.
test_als = ["lcb"]
# Models to summarize; uncomment an entry to include it in the run.
model_name_list = [
    # "claude-sonnet-4-20250514-thinking",
    # "deepseek-v3",
    # "qwen3-nothink",
    # "claude4",
    "gpt-4o",
    # "qwen-coder-plus",
    # "Qwen2.5-7B-Instruct",
    # "Qwen2.5-14B-Instruct",
    # "Qwen2.5-32B-Instruct",
    # "Qwen2.5-Coder-7B-Instruct",
    # "Qwen2.5-Coder-14B-Instruct",
    # "Qwen2.5-Coder-32B-Instruct",
]

# Benchmark entries; each is read for item['wrong_code'][0]['problem'] and
# item['tcb_id']. The context manager closes the handle once parsing is done.
with open("/home/luoxianzhen/yang/data/Ours/TestcaseBench-v28.json", "r", encoding="utf-8") as ds_fp:
    ds = json.load(ds_fp)

# Map the problem name of each entry's first wrong code to that entry's tcb_id.
# If two entries share a problem name, the later one wins.
tcb_id_transform = {item['wrong_code'][0]['problem']: item['tcb_id'] for item in ds}

with open("/home/luoxianzhen/yang/data/Ours/all_wrong_code/data/all_wrong_code_subset.json", "r", encoding="utf-8") as subset_fp:
    all_wrong_code = json.load(subset_fp)

# tcb_ids of the problems named in the subset file, in file order.
# Raises KeyError if a name has no entry in tcb_id_transform.
sub_set = [tcb_id_transform[item['name']] for item in all_wrong_code]
import os
# For every (model, algorithm) pair whose result file exists, average the
# per-problem status distribution and write it out as a Markdown table.
for model_name in model_name_list:
    for test_al in test_als:
        result_file = f"/home/luoxianzhen/yang/eval_wrong_code/ALLmode_results/tcb-{model_name}-{test_al}-random-{test_al}-rank1-all.json"
        if not os.path.exists(result_file):
            print(f"{model_name}-{test_al} NOT EXSIT!")
            continue
        # The context manager closes the handle as soon as the JSON is parsed.
        with open(result_file, "r", encoding="utf-8") as result_fp:
            results = json.load(result_fp)

        # Per status: sum over problems of the fraction of codes with that status.
        # Key order here is also the column order of the table rows.
        rank_result = {
            "rank1": {"AC": 0, "CE": 0, "WA": 0, "RE": 0, "TLE": 0, "MLE": 0, "EXE": 0},
        }
        # "total": number of codes seen; "hacked": sum of per-problem fractions
        # of codes whose status is not "AC".
        success_k = {
            "rank1": {"total": 0, "hacked": 0},
        }
        rank_key = "rank1"
        # Number of problems that contributed a rate; the averaging denominator.
        num_problems = 0
        for k, v in results.items():
            # Optional filter restricting the statistics to the subset:
            # if k not in sub_set:
            #     continue
            codes = v['codes']
            rank = len(codes)
            if rank == 0:
                # A problem without codes has no rate; skipping it avoids a
                # division by zero and keeps it out of the average.
                continue
            num_problems += 1
            success_k[rank_key]["total"] += rank

            # Per-problem rate: each code is counted under its first
            # non-"AC" status, or under "AC" if it has none (this also
            # covers codes with an empty status list).
            hacked = 0
            status_present = {
                "AC": 0, "CE": 0, "WA": 0, "RE": 0, "TLE": 0, "MLE": 0, "EXE": 0
            }
            for code in codes:
                verdict = find_first_non_ac(code['status'])
                status_present[verdict] += 1
                if verdict != "AC":
                    hacked += 1
            success_k[rank_key]["hacked"] += hacked / rank
            for key, value in status_present.items():
                rank_result[rank_key][key] += value / rank

        # Build the Markdown table. The "|" inside algorithm_model splits it
        # into the Algorithm and Model columns of the row.
        algorithm_model = f"{test_al}|{model_name}"
        markdown_table = "| Algorithm | Model | Rank | AC | CE | WA | RE | TLE | MLE | EXE | Hack Rate |\n"
        markdown_table += "|----------|--------|------|----|----|----|----|-----|-----|-----|-----------|\n"
        for rank_name in rank_result:
            total = success_k[rank_name]["total"]
            hacked = success_k[rank_name]["hacked"]
            # total > 0 implies num_problems > 0, so the division is safe.
            hack_rate = (hacked / num_problems * 100) if total > 0 else 0
            hack_rate = round(hack_rate, 2)  # keep two decimal places
            # Mean per-problem share of each status, as a percentage;
            # all zeros when no problem contributed.
            status_percentages = []
            for key in rank_result[rank_name]:
                count = rank_result[rank_name][key]
                percentage = (count / num_problems * 100) if num_problems else 0.0
                status_percentages.append(f"{percentage:.2f}%")
            markdown_table += f"| {algorithm_model} | {rank_name} | " + " | ".join(status_percentages) + f" | {hack_rate}% |\n"

        # Save the table to a per-model, per-algorithm .md file.
        with open(f"/home/luoxianzhen/yang/data/add_experience/rank_result-{model_name}-{test_al}-edge_and_random.md", "w", encoding="utf-8") as file:
            file.write(markdown_table)
        print("Markdown 文件已生成: rank_result.md")
# For one model/algorithm pair, count how many codes end with a non-"AC"
# status in the "edge" result file, in the "random" result file, and in both,
# and record the code ids that only one of the two flags.
model_name = "gpt-4o"
test_al = "lcb"
edge_file = f"/home/luoxianzhen/yang/eval_wrong_code/ALLmode_results/tcb-{model_name}-{test_al}-edge-{test_al}-rank1-all.json"
random_file = f"/home/luoxianzhen/yang/eval_wrong_code/ALLmode_results/tcb-{model_name}-{test_al}-random-{test_al}-rank1-all.json"
# Context managers close both handles as soon as the JSON is parsed.
with open(edge_file, "r", encoding="utf-8") as edge_fp:
    edge_result = json.load(edge_fp)
with open(random_file, "r", encoding="utf-8") as random_fp:
    random_result = json.load(random_fp)

edge_count = 0          # codes with a non-"AC" status in the edge file
edge_only_dict = {}     # tcb_id -> code ids flagged by edge only
random_count = 0        # codes with a non-"AC" status in the random file
random_only_dict = {}   # tcb_id -> code ids flagged by random only
all_count = 0           # codes flagged in both files
i = 0                   # number of (edge, random) code pairs compared
for tcb_id in edge_result:
    edge_only_dict[tcb_id] = []
    random_only_dict[tcb_id] = []
    # Codes are paired by position and zip stops at the shorter list.
    # A tcb_id missing from random_result raises KeyError.
    # NOTE(review): assumes both files list a problem's codes in the same
    # order, so edge_code['code_id'] also identifies random_code — confirm.
    for edge_code, random_code in zip(edge_result[tcb_id]['codes'], random_result[tcb_id]['codes']):
        i += 1
        edge_status = find_first_non_ac(edge_code['status'])
        random_status = find_first_non_ac(random_code['status'])
        if edge_status != "AC":
            edge_count += 1
            if random_status == "AC":
                edge_only_dict[tcb_id].append(edge_code['code_id'])
        if random_status != "AC":
            random_count += 1
            if edge_status == "AC":
                random_only_dict[tcb_id].append(edge_code['code_id'])
        if edge_status != "AC" and random_status != "AC":
            all_count += 1
print(f"total {i} | edge_count: {edge_count} | random_count {random_count} | all_count {all_count}")

Xet Storage Details

Size:
5.83 kB
·
Xet hash:
0007b4b737448a16ffb48265cf28cf560389f8cc68ec81cd71fef0db706877b4

Xet efficiently stores files, intelligently splitting them into unique chunks and accelerating uploads and downloads. More info.