Tsukihjy/testcase / testcase-data /Ours /right_code_distribute.py
Tsukihjy's picture
download
raw
6.53 kB
import json
# -*- coding: utf-8 -*-
import numpy as np
import matplotlib.pyplot as plt
import os
def plot_columns_with_and_without_normalization(arr: np.ndarray,
                                                out_dir: str = "./output") -> np.ndarray:
    """Scatter-plot every column of a 2-D array, raw and min-max normalized.

    Two figures are written to *out_dir*:
      1. ``original_scatter.png``  - raw values, one x-position per column.
      2. ``normalized_scatter.png`` - values after per-column min-max scaling.

    Args:
        arr: 2-D array-like of shape (N, M); anything convertible via
            ``np.array(..., dtype=float)`` is accepted.
        out_dir: directory the PNG files are saved into (created if missing).

    Returns:
        The (N, M) array after per-column min-max normalization.

    Raises:
        ValueError: if ``arr`` is not two-dimensional.
    """
    if not isinstance(arr, np.ndarray):
        arr = np.array(arr, dtype=float)
    if arr.ndim != 2:
        raise ValueError("arr 必须是二维数组")
    os.makedirs(out_dir, exist_ok=True)
    n_rows, n_cols = arr.shape

    # ---- Per-column min-max normalization ----
    col_min = arr.min(axis=0, keepdims=True)
    col_max = arr.max(axis=0, keepdims=True)
    # A constant column would divide by zero; substitute 1 so it maps to 0.
    denom = np.where((col_max - col_min) == 0, 1.0, (col_max - col_min))
    norm_arr = (arr - col_min) / denom

    # ---- 1) Raw-value scatter plot ----
    plt.figure(figsize=(8, 6))
    for j in range(n_cols):
        plt.scatter([j] * n_rows, arr[:, j], s=20, alpha=0.7)
    plt.title("point")
    plt.xlabel("x")
    plt.ylabel("y")
    plt.xticks(range(n_cols))
    plt.grid(True, linestyle="--", alpha=0.5)
    plt.tight_layout()
    plt.savefig(os.path.join(out_dir, "original_scatter.png"), dpi=300)
    plt.close()

    # ---- 2) Normalized scatter plot ----
    plt.figure(figsize=(8, 6))
    for j in range(n_cols):
        plt.scatter([j] * n_rows, norm_arr[:, j], s=20, alpha=0.7)
    plt.title("y")
    plt.xlabel("x")
    plt.ylabel(" (0~1)")
    plt.xticks(range(n_cols))
    plt.grid(True, linestyle="--", alpha=0.5)
    plt.tight_layout()
    plt.savefig(os.path.join(out_dir, "normalized_scatter.png"), dpi=300)
    plt.close()

    # BUG FIX: the docstring promised norm_arr, but the original fell off the
    # end returning None (it only computed an unused norm_arr.flatten()).
    return norm_arr
# -*- coding: utf-8 -*-
import numpy as np
import matplotlib.pyplot as plt
import os
def plot_bin_counts(arr: np.ndarray,
                    out_path: str = "./output/bin_counts.png",
                    bins: int = 40) -> np.ndarray:
    """Min-max normalize each column of a 2-D array, then plot the cumulative
    probability distribution of all pooled normalized values over [0, 1].

    Args:
        arr: 2-D array-like of shape (N, M); anything convertible via
            ``np.array(..., dtype=float)`` is accepted.
        out_path: path the PNG figure is saved to (parent dir created if needed).
        bins: number of histogram bins over the [0, 1] range.

    Returns:
        The (N, M) array after per-column min-max normalization.

    Raises:
        ValueError: if ``arr`` is not two-dimensional.
    """
    if not isinstance(arr, np.ndarray):
        arr = np.array(arr, dtype=float)
    if arr.ndim != 2:
        raise ValueError("arr 必须是二维数组")
    # BUG FIX: os.path.dirname(out_path) is "" for a bare filename and
    # os.makedirs("") raises FileNotFoundError; only create a real parent dir.
    parent = os.path.dirname(out_path)
    if parent:
        os.makedirs(parent, exist_ok=True)

    # ---- Per-column min-max normalization ----
    col_min = arr.min(axis=0, keepdims=True)
    col_max = arr.max(axis=0, keepdims=True)
    # A constant column would divide by zero; substitute 1 so it maps to 0.
    denom = np.where((col_max - col_min) == 0, 1.0, (col_max - col_min))
    norm_arr = (arr - col_min) / denom
    # Per-column count of values below 0.2 (reported via print at the end).
    cn = (norm_arr < 0.2).sum(axis=0)

    # ---- Pool all normalized values and histogram them over [0, 1] ----
    values = norm_arr.flatten()
    counts, bin_edges = np.histogram(values, bins=bins, range=(0, 1))
    cumulative_counts = np.cumsum(counts)
    cumulative_prob = cumulative_counts / cumulative_counts[-1]
    print(cumulative_prob)
    # Prepend 0 so the curve starts at (0, 0); the bins+1 edges below then
    # align one-to-one with the bins+1 cumulative probabilities.
    cumulative_prob = np.insert(cumulative_prob, 0, 0)
    new_bin_edges = np.linspace(0, 1, bins + 1)

    # ---- Cumulative probability plot ----
    plt.figure(figsize=(8, 6))
    # plt.step(new_bin_edges, cumulative_prob, where='post', label='Cumulative Probability')
    plt.plot(new_bin_edges, cumulative_prob, label='Cumulative Probability')
    plt.xlabel('Value')
    plt.ylabel('Cumulative Probability')
    plt.title('Cumulative Probability Distribution')
    plt.grid(True)
    plt.xticks(np.linspace(0, 1, 11))
    plt.yticks(np.linspace(0, 1, 11))
    plt.legend()
    # BUG FIX: the original saved to a hard-coded "./output/TCB-Bench_count.png",
    # silently ignoring the out_path parameter; save to out_path as documented.
    plt.savefig(out_path, dpi=300)
    plt.close()
    print(f"count=== {cn}")
    return norm_arr
# ---- Load solution data ----------------------------------------------------
# NOTE(review): hard-coded absolute paths — this script only runs on the
# original author's machine; parameterize before reuse.
tcb_bench = json.load(open("/home/luoxianzhen/yang/data/Ours/TestcaseBench-v22.json", "r", encoding="utf-8"))
# add_data = json.load(open("/home/luoxianzhen/yang/data/Ours/correct.json", "r", encoding="utf-8"))
add_data = json.load(open("/home/luoxianzhen/yang/data/Ours/saves_correct_code.json", "r", encoding="utf-8"))
# One row per problem: the 'time' values of 100 randomly sampled solutions.
times_distribute = []
# Earlier variant that read from tcb_bench (kept for reference):
# for item in tcb_bench:
#     right_code_list = item['solutions']
#     if len(right_code_list) < 30:
#         continue
#     times_distribute.append([])
#     for code in right_code_list:
#         times_distribute[-1].append(code['time'])
import random
for k, item in add_data.items():
    # Skip problems with fewer than 100 solutions so every row has the same
    # length (required for the rectangular np.array below).
    if len(item) < 100:
        continue
    times_distribute.append([])
    # NOTE(review): random.sample is unseeded — output is not reproducible.
    items = random.sample(item, 100)
    for code in items:
        # assumes each entry is a dict with a 'time' key — TODO confirm schema
        times_distribute[-1].append(code['time'])
import numpy as np
sample = np.array(times_distribute)  # shape: (n_problems, 100)
print(sample.shape)
# Run the helper defined above and produce the plot (disabled):
# norm_arr = plot_bin_counts(sample.T)
import matplotlib.pyplot as plt
import numpy as np
from sklearn.preprocessing import MinMaxScaler
import random
# Take the first (up to) 100 problem rows as the working data set.
data_2d = np.array(sample[0:100])
# Min-max scale each column to [0, 1].
# NOTE(review): columns here are sample indices, not problems — confirm that
# per-column (rather than per-row) scaling is the intended axis.
scaler = MinMaxScaler()
data_normalized = scaler.fit_transform(data_2d)
# ---- Scatter plot of every column after normalization ----------------------
plt.figure(figsize=(10, 6))
# Earlier single-color variant (kept for reference):
# for i in range(data_normalized.shape[1]):
#     plt.scatter(np.arange(data_normalized.shape[0]), data_normalized[:, i], label=f'Column {i+1}')
for i in range(data_normalized.shape[1]):
    # Highlight points at or below 0.2 in red (#FF7A8C), the rest in yellow.
    colors = np.where(data_normalized[:, i] <= 0.2, '#FF7A8C', '#FECB71')
    plt.scatter(np.arange(data_normalized.shape[0]), data_normalized[:, i], color=colors, label=f'Column {i+1}')
plt.title("Scatter Plot of Each Column After Min-Max Normalization")
plt.xlabel("Index")
plt.ylabel("Normalized Value")
plt.grid(True)
# NOTE(review): assumes ./output already exists — savefig will fail otherwise.
plt.savefig("./output/right_code_distribute-100.png", dpi=800)
import pandas as pd
import numpy as np
# data_2d = np.array(data_normalized)
# Export the normalized matrix to Excel, one named column per sample index.
df = pd.DataFrame(data_normalized, columns=[f'Column {i+1}' for i in range(data_2d.shape[1])])
df.to_excel('normalized_data.xlsx', index=False)

Xet Storage Details

Size:
6.53 kB
·
Xet hash:
41caafb42980b21ce39a1b7fca237438ca5148779cfb5ef15d371914d3ac7e20

Xet efficiently stores files, intelligently splitting them into unique chunks and accelerating uploads and downloads. More info.