import json
import re
from collections import defaultdict
def clean_key(original_key: str) -> str:
    """
    Normalize a raw key into the bare ``tcb_id`` form.

    The steps run in this order:

    1. Every '/' is replaced by a space.
    2. Everything up to and including the first '.' is dropped
       (no-op when the key contains no dot).
    3. Everything up to and including the first '」' is dropped
       (no-op when the key contains no such bracket).
    4. Leading and trailing whitespace is stripped.

    :param original_key: Raw key string to clean.
    :return: The cleaned key.
    """
    tcb_id = original_key.replace('/', ' ')
    # [^.]* cannot cross a dot, so only the segment before the first '.' goes.
    tcb_id = re.sub(r'^[^.]*\.', '', tcb_id)
    # Cut just past the first closing bracket, if there is one.
    bracket_end = tcb_id.find('」')
    if bracket_end != -1:
        tcb_id = tcb_id[bracket_end + 1:]
    return tcb_id.strip()
# def process_and_dedup_json(filepath):
#     with open(filepath, 'r', encoding='utf-8') as f:
#         data = json.load(f)
#     deduped_data = {}
#     key_count = defaultdict(int)
#     repeated_keys = set()
#     for key, value in data.items():
#         cleaned_key = clean_key(key)
#         key_count[cleaned_key] += 1
#         if cleaned_key not in deduped_data:
#             deduped_data[cleaned_key] = value
#         else:
#             repeated_keys.add(cleaned_key)
#     return deduped_data, sorted(repeated_keys)
# # Example usage
# input_path = '/home/i-luoxianzhen/data/TestCase-Gen/data/Ours/final_v1.0.json'
# output_path = '/home/i-luoxianzhen/data/TestCase-Gen/data/Ours/final_v2.0.json'
# deduped_json, repeated_ids = process_and_dedup_json(input_path)
# # Write the de-duplicated JSON file
# with open(output_path, 'w', encoding='utf-8') as f:
#     json.dump(deduped_json, f, ensure_ascii=False, indent=2)
# add_keys = ['wrong_code', 'timeLimit', 'memoryLimit', 'rank', 'n_rows', 'balance_var', 'balance_column_sum']
# testcase_bench = []
# with open('/home/i-luoxianzhen/data/TestCase-Gen/data/Ours/TestcaseBench.json', 'r', encoding='utf-8') as f:
#     testcase_bench = json.load(f)
# for item in testcase_bench:
#     tcb_id = item['tcb_id']
#     if tcb_id in deduped_json:
#         for key in add_keys:
#             item[key] = deduped_json[tcb_id][key]
#     else:
#         print(tcb_id)
# with open('/home/i-luoxianzhen/data/TestCase-Gen/data/Ours/TestcaseBench-v2.json', 'w', encoding='utf-8') as f:
#     json.dump(testcase_bench, f, ensure_ascii=False, indent=4)
# import json
# def remove_aa_bb_keys(input_file, output_file):
#     with open(input_file, 'r', encoding='utf-8') as f:
#         data = json.load(f)
#     # If data is a list, process each element
#     # "timeLimit": 1500,
#     # "memoryLimit": 512,
#     if isinstance(data, list):
#         for item in data:
#             item.pop('timeLimit', None)
#             item.pop('memoryLimit', None)
#     else:
#         raise ValueError("JSON 须为数组形式,每个元素是一个对象")
#     # Save the result
#     print(f"共处理 {len(data)} 条记录。")
#     return data
# Example usage
# data = remove_aa_bb_keys('/home/i-luoxianzhen/data/TestCase-Gen/data/Ours/TestcaseBench-v2.json', '/home/i-luoxianzhen/data/TestCase-Gen/data/Ours/TestcaseBench-v3.json')
# for item in data:
#     for w_code_item in item['wrong_code']:
#         w_code_item['problem'] = item['tcb_id']
# with open('/home/i-luoxianzhen/data/TestCase-Gen/data/Ours/TestcaseBench-v3.json', 'w', encoding='utf-8') as f:
#     json.dump(data, f, ensure_ascii=False, indent=4)
# with open("/home/i-luoxianzhen/data/TestCase-Gen/data/Ours/balance_v3.0.dev_with_code.json", 'r', encoding='utf-8') as f:
#     worry_code = json.load(f)
# with open("/home/i-luoxianzhen/data/TestCase-Gen/data/Ours/TestcaseBench-v4.json", 'r', encoding='utf-8') as f:
#     tcb_bench = json.load(f)
# worry_code_dict = {}
# for k, v in worry_code.items():
#     tcb_id = clean_key(k)
#     worry_code_dict[tcb_id] = v
# for item in tcb_bench:
#     if "地底蔷薇" in item['tcb_id']:
#         worry_codes = worry_code_dict[""]
#     else:
#         worry_codes = worry_code_dict[item['tcb_id']]
#     item['wrong_code'] = worry_codes
# with open('/home/i-luoxianzhen/data/TestCase-Gen/data/Ours/TestcaseBench-v5.json', 'w', encoding='utf-8') as f:
#     json.dump(tcb_bench, f, ensure_ascii=False, indent=6)
# Load the v5 benchmark JSON that the rest of the script post-processes.
with open("/home/i-luoxianzhen/data/TestCase-Gen/data/Ours/TestcaseBench-v5.json", 'r', encoding='utf-8') as f:
    tcb_bench = json.load(f)
def remove_freopen_lines(code_str: str) -> str:
    """
    Remove every line that contains the substring 'freopen'.

    The input is split with ``str.splitlines`` and the surviving lines are
    re-joined with a single newline character, so all line endings are
    normalized and a trailing newline in the input is not kept.

    :param code_str: Original source-code string.
    :return: The code string with all 'freopen' lines removed.
    """
    return '\n'.join(
        line for line in code_str.splitlines() if 'freopen' not in line
    )
# Strip freopen lines from every wrong-code sample, then save the result as v6.
for item in tcb_bench:
    # The samples are reached through a nested "wrong_code" -> "wrong_code" lookup.
    for code_item in item["wrong_code"]["wrong_code"]:
        code_item['code'] = remove_freopen_lines(code_item['code'])

with open('/home/i-luoxianzhen/data/TestCase-Gen/data/Ours/TestcaseBench-v6.json', 'w', encoding='utf-8') as f:
    json.dump(tcb_bench, f, ensure_ascii=False, indent=6)
# Xet Storage Details
# - Size: 4.61 kB
# - Xet hash: 325948b8fb047e2614d5ec6e79663890563ad1c8167f1ad79d3481e7b38d6237
# Xet efficiently stores files, intelligently splitting them into unique chunks and accelerating uploads and downloads. More info.