| import json | |
| import random | |
def get_random_indices(array_length, num_indices):
    """Return distinct random indices drawn from ``range(array_length)``.

    Args:
        array_length: Length of the sequence the indices refer to.
        num_indices: Number of indices requested.

    Returns:
        A list of unique indices in random order. The number of indices is
        ``num_indices`` clamped to ``[0, array_length]``: asking for more
        than ``array_length`` yields every index once, and a negative
        request (or an empty range) yields an empty list instead of raising
        ``ValueError`` from ``random.sample``.
    """
    population = range(array_length)
    # Never sample more than exists, and never pass a negative count.
    sample_size = min(max(num_indices, 0), len(population))
    return random.sample(population, sample_size)
def find_first_non_ac(array):
    """Return the first entry of ``array`` that differs from ``"AC"``.

    When ``array`` is empty or every entry equals ``"AC"``, the string
    ``"AC"`` is returned.
    """
    return next((entry for entry in array if entry != "AC"), "AC")
# Values substituted for `{test_al}` when building result-file names.
# Only "lcb" is enabled; other values previously listed here were
# "ht", "algo", "crux" and "predo".
test_als = ["lcb"]

# Values substituted for `{model_name}` when building result-file names.
# Uncomment an entry to include it in the run.
model_name_list = [
    "claude-sonnet-4-20250514-thinking",
    # "deepseek-v3",
    # "qwen3-nothink",
    # "claude4",
    # "gpt-4o",
    # "qwen-coder-plus",
    # "Qwen2.5-7B-Instruct",
    # "Qwen2.5-14B-Instruct",
    # "Qwen2.5-32B-Instruct",
    # "Qwen2.5-Coder-7B-Instruct",
    # "Qwen2.5-Coder-14B-Instruct",
    # "Qwen2.5-Coder-32B-Instruct",
]
# Load the benchmark entries; each one carries a `tcb_id` and a `wrong_code`
# list. The file is opened in a `with` block so the handle is closed promptly.
with open("/home/luoxianzhen/yang/data/Ours/TestcaseBench-v28.json", "r", encoding="utf-8") as ds_file:
    ds = json.load(ds_file)

rank_dict = {}          # tcb_id -> number of entries in that item's `wrong_code`
tcb_id_transform = {}   # `problem` value of the first wrong code -> tcb_id
for item in ds:
    tcb_id_transform[item['wrong_code'][0]['problem']] = item['tcb_id']
    rank_dict[item['tcb_id']] = len(item['wrong_code'])

with open("/home/luoxianzhen/yang/data/Ours/all_wrong_code/data/all_wrong_code_subset.json", "r", encoding="utf-8") as subset_file:
    all_wrong_code = json.load(subset_file)

# tcb_ids of the subset entries, looked up by their `name`; a name missing
# from `tcb_id_transform` raises KeyError.
sub_set = [tcb_id_transform[item['name']] for item in all_wrong_code]
| import os | |
def _subsample_code_tests(codes, tests_index):
    """Keep only the sampled test positions of each code's results, in place.

    Args:
        codes: List of dicts, each with parallel ``status`` and ``details``
            lists.
        tests_index: Indices of the tests to keep, in the order they should
            appear.

    A field is left untouched when it is too short to contain the largest
    sampled index, and nothing is changed when ``tests_index`` is empty.
    """
    if not tests_index:
        return
    highest_index = max(tests_index)
    for code in codes:
        for field in ("status", "details"):
            values = code[field]
            if highest_index < len(values):
                code[field] = [values[idx] for idx in tests_index]


# Build the lookup set once so each membership test is O(1); tcb_ids are
# hashable because they are used as keys of `rank_dict`.
selected_ids = set(sub_set)

for model_name in model_name_list:
    for test_al in test_als:
        result_file = f"/home/luoxianzhen/yang/eval_wrong_code/ALLmode_results/tcb-{model_name}-{test_al}-all-wrong-code-{test_al}-rank5-all.json"
        if not os.path.exists(result_file):
            print(f"{model_name}-{test_al} NOT EXSIT!")
            continue
        with open(result_file, "r", encoding="utf-8") as src:
            results = json.load(src)

        for k, v in results.items():
            # Entries outside the subset are written back unchanged.
            if k not in selected_ids:
                continue
            codes = v['codes']
            # Sample size is 1 x the value stored in `rank_dict` for this id.
            nums_of_tests = rank_dict[k]
            # `default=0` avoids a ValueError when an entry has no codes.
            array_length = max((len(code['status']) for code in codes), default=0)
            # The same sampled indices are applied to every code of the entry.
            tests_index = get_random_indices(array_length, nums_of_tests)
            _subsample_code_tests(codes, tests_index)

        # Written once per (model, test_al) after all entries are processed.
        output_file = f"/home/luoxianzhen/yang/eval_wrong_code/ALLmode_results/tcb-{model_name}-{test_al}-all-wrong-code-{test_al}-rank1.json"
        with open(output_file, "w", encoding="utf-8") as dst:
            json.dump(results, dst, indent=4, ensure_ascii=False)
Xet Storage Details
- Size:
- 2.95 kB
- Xet hash:
- fdd308e143c83e3a61641b3b8e78be855360e4d2ae6bb58fd45b78fec868d7ac
·
Xet efficiently stores files, intelligently splitting them into unique chunks and accelerating uploads and downloads. More info.