Tsukihjy/testcase / methods /Filter /load_data.py
Tsukihjy's picture
download
raw
6.73 kB
import json
import os
import json
# Root directory that holds the benchmark data and the generated test cases.
# The primary location is used when it exists on this machine; otherwise the
# path under /home/relay is used instead.
base_dir = "/home/luoxianzhen/yang"
base_dir = base_dir if os.path.exists(base_dir) else "/home/relay/luoxianzhen/yang"
def read_jsonl_to_dict(file_path):
    """Load per-problem counters from a JSONL file into a dict keyed by ``tcb_id``.

    Every non-blank line must be a JSON object with a ``tcb_id`` key. The
    ``gen_nums`` and ``right_nums`` counters are optional; a missing counter
    is treated as 0, which matches how the skip filter reads them.

    Args:
        file_path: Path of the UTF-8 encoded JSONL file to read.

    Returns:
        A dict mapping ``tcb_id`` to ``{"gen_nums": ..., "right_nums": ...}``.
        Records whose two counters are both 0 are left out. When a
        ``tcb_id`` appears more than once, the last record wins.

    Raises:
        json.JSONDecodeError: If a non-blank line is not valid JSON.
        KeyError: If a kept record has no ``tcb_id``.
    """
    result = {}
    with open(file_path, "r", encoding="utf-8") as f:
        for line in f:
            line = line.strip()
            # Blank lines (e.g. a trailing newline at end of file) carry no
            # record; skip them instead of letting json.loads("") raise.
            if not line:
                continue
            data = json.loads(line)
            gen_nums = data.get("gen_nums", 0)
            right_nums = data.get("right_nums", 0)
            # Skip entries whose gen_nums and right_nums are both 0.
            if gen_nums == 0 and right_nums == 0:
                continue
            # tcb_id is the key; a repeated id simply overwrites the earlier one.
            result[data["tcb_id"]] = {
                "gen_nums": gen_nums,
                "right_nums": right_nums,
            }
    print(f"has flited {len(result)}")
    return result
def get_data(name="tcb", prefix_dir=None, save_dir=None, testcase_alg="", pass_rate_save_file=""):
    """Build one job dict per sampled solution for problems that have test cases.

    Reads ``{base_dir}/data/Ours/TestcaseBench-v28.json`` (a JSON list of
    problem dicts). Problems whose test-case path
    ``prefix_dir.format(tcb_id)`` does not exist on disk are skipped. For
    every remaining problem, all solutions are used when there are fewer
    than 8; otherwise 8 are drawn at random.

    Args:
        name: Dataset selector; only ``"tcb"`` is supported.
        prefix_dir: Format template with one ``{}`` placeholder that is
            filled with the problem's ``tcb_id`` to give the test-case path.
            Required despite the ``None`` default.
        save_dir: Format template with one ``{}`` placeholder that is filled
            with the problem's ``tcb_id`` to give the output path. Required
            despite the ``None`` default.
        testcase_alg: Label copied verbatim into every job dict.
        pass_rate_save_file: Not used by this function; kept so existing
            callers keep working.

    Returns:
        A list of dicts with the keys ``code``, ``time_limit``,
        ``memory_limit``, ``compileAndRunOptions``, ``test_cases``,
        ``save_path``, ``problem_id`` and ``testcase_alg``.

    Raises:
        ValueError: If ``name`` is not ``"tcb"``.
    """
    import random

    # Fail with a clear message instead of an UnboundLocalError further down.
    if name != "tcb":
        raise ValueError(f"unsupported dataset name: {name!r}")

    # Context manager so the dataset file handle is closed deterministically.
    with open(f"{base_dir}/data/Ours/TestcaseBench-v28.json", "r", encoding="utf-8") as f:
        ds = json.load(f)

    max_solutions = 8
    res = []
    for item in ds:
        test_cases = prefix_dir.format(item['tcb_id'])
        if not os.path.exists(test_cases):
            continue
        solutions = item['solutions']
        if len(solutions) >= max_solutions:
            solutions = random.sample(solutions, max_solutions)
        for c in solutions:
            res.append({
                "code": c['code'],
                "time_limit": item["runtime_limit"],
                "memory_limit": item["memory_limit"],
                "compileAndRunOptions": c["compileAndRunOptions"],
                "test_cases": test_cases,
                "save_path": save_dir.format(item['tcb_id']),
                "problem_id": item['tcb_id'],
                "testcase_alg": testcase_alg,
            })
    return res
def get_data_lenght(name="tcb", prefix_dir=None, save_dir=None, testcase_alg="", pass_rate_save_file=""):
    """Build one job dict per sampled solution for every benchmark problem.

    Reads ``{base_dir}/data/Ours/TestcaseBench-v28.json`` (a JSON list of
    problem dicts). Unlike ``get_data``, no check is made that the
    test-case path exists. For every problem, all solutions are used when
    there are fewer than 8; otherwise 8 are drawn at random.

    Args:
        name: Dataset selector; only ``"tcb"`` is supported.
        prefix_dir: Format template with one ``{}`` placeholder that is
            filled with the problem's ``tcb_id`` to give the test-case path.
            Required despite the ``None`` default.
        save_dir: Format template with one ``{}`` placeholder that is filled
            with the problem's ``tcb_id`` to give the output path. Required
            despite the ``None`` default.
        testcase_alg: Label copied verbatim into every job dict.
        pass_rate_save_file: Not used by this function; kept so existing
            callers keep working.

    Returns:
        A list of dicts with the keys ``code``, ``time_limit``,
        ``memory_limit``, ``compileAndRunOptions``, ``test_cases``,
        ``save_path``, ``problem_id`` and ``testcase_alg``.

    Raises:
        ValueError: If ``name`` is not ``"tcb"``.
    """
    import random

    # Fail with a clear message instead of an UnboundLocalError further down.
    if name != "tcb":
        raise ValueError(f"unsupported dataset name: {name!r}")

    # Context manager so the dataset file handle is closed deterministically.
    with open(f"{base_dir}/data/Ours/TestcaseBench-v28.json", "r", encoding="utf-8") as f:
        ds = json.load(f)

    max_solutions = 8
    res = []
    for item in ds:
        solutions = item['solutions']
        if len(solutions) >= max_solutions:
            solutions = random.sample(solutions, max_solutions)
        for c in solutions:
            res.append({
                "code": c['code'],
                "time_limit": item["runtime_limit"],
                "memory_limit": item["memory_limit"],
                "compileAndRunOptions": c["compileAndRunOptions"],
                "test_cases": prefix_dir.format(item['tcb_id']),
                "save_path": save_dir.format(item['tcb_id']),
                "problem_id": item['tcb_id'],
                "testcase_alg": testcase_alg,
            })
    return res
def normalize_time(data_list):
    """Min-max normalize the ``"time"`` values found in ``data_list``.

    Args:
        data_list: Iterable of mappings; entries without a ``"time"`` key
            are ignored.

    Returns:
        A list with one float per entry that has a ``"time"`` key, in input
        order, scaled so the smallest time maps to 0.0 and the largest to
        1.0. Returns ``[]`` when no entry has a time, and a list of ``0.0``
        when all times are equal.
    """
    # Collect every available time value.
    observed = []
    for entry in data_list:
        if "time" in entry:
            observed.append(entry["time"])

    if not observed:
        return []

    lo = min(observed)
    hi = max(observed)

    # All values identical: the range is 0, so avoid dividing by it.
    if lo == hi:
        return [0.0] * len(observed)

    # Min-max scaling.
    span = hi - lo
    return [(value - lo) / span for value in observed]
def get_data_subset(name="tcb", prefix_dir=None, save_dir=None, testcase_alg="", pass_rate_save_file=""):
    """Build job dicts for the problems whose ``tcb_id`` is in a fixed subset.

    Reads ``{base_dir}/data/Ours/TestcaseBench-v28.json`` (a JSON list of
    problem dicts) and keeps only the problems whose ``tcb_id`` appears in
    the hard-coded ``subsets`` list. For every kept problem, all solutions
    are used when there are fewer than 8 (matching ``get_data``); otherwise
    8 are drawn at random.

    Args:
        name: Dataset selector; only ``"tcb"`` is supported.
        prefix_dir: Format template with one ``{}`` placeholder that is
            filled with the problem's ``tcb_id`` to give the test-case path.
            Required despite the ``None`` default.
        save_dir: Format template with one ``{}`` placeholder that is filled
            with the problem's ``tcb_id`` to give the output path. Required
            despite the ``None`` default.
        testcase_alg: Not used by this function; kept so existing callers
            keep working.
        pass_rate_save_file: Not used by this function; kept so existing
            callers keep working.

    Returns:
        A list of dicts with the keys ``code_id`` (index of the solution
        within the problem's selected solutions), ``code``, ``time_limit``,
        ``memory_limit``, ``compileAndRunOptions``, ``test_cases``,
        ``save_path`` and ``problem_id``.

    Raises:
        ValueError: If ``name`` is not ``"tcb"``.
    """
    import random

    # Fail with a clear message instead of an UnboundLocalError further down.
    if name != "tcb":
        raise ValueError(f"unsupported dataset name: {name!r}")

    # Context manager so the dataset file handle is closed deterministically.
    with open(f"{base_dir}/data/Ours/TestcaseBench-v28.json", "r", encoding="utf-8") as f:
        ds = json.load(f)

    subsets = ['秘密袭击', '最大公约数', '种树 Growing Trees', '反质数序列', '小 Y 和恐怖的奴隶主', '混合果汁', '迷宫探险', '儒略日', '拼图', 'DFS 序 2', '转圈游戏', '城池攻占', '潜入行动', '简单算术', '基因串', '最小公倍数', '数三角形', '战争调度', '与众不同', '钓鱼', '对称轴 Axes of Symmetry', '唱、跳、rap 和篮球', '降雨量', '餐巾计划', 'LJJ 的字符串', '无源汇有上下界可行流', '最短不公共子串', '任务安排 1', '道路堵塞', '聪明的燕姿', '你的名字', '塔', '遗失的答案', '字母 Letters', '林克卡特树', '石头花园 Rock Garden', '分配问题', 'Seek the Name, Seek the Fame', '炮兵阵地', 'xor', '最大连续和', '树上询问', '转化', '小奇采药', '数列递推', 'Divide', '崂山白花蛇草水', '猜数游戏', '花匠', '小 Q 的草稿', '架设电话线', '格雷码', 'Minimax', '喷水装置', '画框', '人造情感', 'Identity Theft', 'A + B 问题', 'Sim', '回文子串', '修剪草坪', '生日礼物', '填树', '题', '不同的最小割', '勘破神机', '地铁交通', '书法家', '数列互质', 'Sumdiv', '舞会', '动态图连通性', '伪光滑数', '庆典', '货车运输', '数的划分', '老 C 的任务', '吃', 'Transport', '网络协议', '活动安排', "Bessie's Snow Cow", '旅行者', '三元组', '普通平衡树', '取石子游戏 2', '领导集团问题', '滚榜', '飞镖', '抛硬币', '逛公园', '棘手的操作', '大工程', '镜面通道', '炸弹攻击 2', '找爸爸', '线性代数', '小凸玩密室', 'Circus', 'A 的 B 次方']
    # Hash-based membership instead of scanning the list for every problem.
    selected_ids = frozenset(subsets)

    max_solutions = 8
    res = []
    for item in ds:
        if item['tcb_id'] not in selected_ids:
            continue
        solutions = item['solutions']
        # random.sample raises ValueError when asked for more elements than
        # exist, so only sample when there are enough solutions.
        if len(solutions) >= max_solutions:
            solutions = random.sample(solutions, max_solutions)
        for i, c in enumerate(solutions):
            res.append({
                "code_id": i,
                "code": c['code'],
                "time_limit": item["runtime_limit"],
                "memory_limit": item["memory_limit"],
                "compileAndRunOptions": c["compileAndRunOptions"],
                "test_cases": prefix_dir.format(item['tcb_id']),
                "save_path": save_dir.format(item['tcb_id']),
                "problem_id": item['tcb_id'],
            })
    return res
if __name__ == "__main__":
    # Earlier experiment, kept for reference:
    # ds = json.load(open("/home/i-luoxianzhen/yang/eval_wrong_code/results/all_results.json"))
    # save_back_results(ds, name="codeforces")
    # print("Data loaded and saved back successfully.")

    # Smoke run: build the job list for one model / test-case algorithm pair
    # and report how many jobs were produced.
    model_name = "deepseek-v3"
    testcase_alg = "crux"

    # Directory holding the generated tests, and its "-fliter" counterpart
    # where outputs are written.
    generated_root = f"{base_dir}/save_tests_{model_name}/{testcase_alg}/"
    filtered_root = f"{base_dir}/save_tests_{model_name}-fliter/{testcase_alg}/"

    pass_rate_save_file = filtered_root + "test_pass_rate.jsonl"
    data = get_data(
        name="tcb",
        prefix_dir=generated_root + "tests-{}.jsonl",
        save_dir=filtered_root + "tests-{}.jsonl",
        testcase_alg=testcase_alg,
        pass_rate_save_file=pass_rate_save_file,
    )
    print(len(data))

Xet Storage Details

Size:
6.73 kB
·
Xet hash:
372cd038b0b91a52d607d77b44fc75614dbc0d7ebf141dbc6cb3e3e5e0849665

Xet efficiently stores files, intelligently splitting them into unique chunks and accelerating uploads and downloads. More info.