| import json | |
| # -*- coding: utf-8 -*- | |
| import numpy as np | |
| import matplotlib.pyplot as plt | |
| import os | |
def plot_columns_with_and_without_normalization(arr: np.ndarray,
                                                out_dir: str = "./output"):
    """
    Draw two scatter plots of a 2-D array, one column per x position:
      1) the raw values            -> <out_dir>/original_scatter.png
      2) per-column Min-Max values -> <out_dir>/normalized_scatter.png

    Parameters:
        arr: array-like of shape (N, M); coerced to float ndarray if needed
        out_dir: directory the PNG files are written to (created if missing)

    Returns:
        norm_arr: the column-wise Min-Max normalized array, shape (N, M)

    Raises:
        ValueError: if `arr` is not 2-dimensional.
    """
    if not isinstance(arr, np.ndarray):
        arr = np.array(arr, dtype=float)
    if arr.ndim != 2:
        raise ValueError("arr 必须是二维数组")
    os.makedirs(out_dir, exist_ok=True)
    n_rows, n_cols = arr.shape

    # ---- per-column Min-Max normalization ----
    col_min = arr.min(axis=0, keepdims=True)
    col_max = arr.max(axis=0, keepdims=True)
    # A constant column would divide by zero; use 1.0 so it maps to all zeros.
    denom = np.where((col_max - col_min) == 0, 1.0, (col_max - col_min))
    norm_arr = (arr - col_min) / denom

    # ---- 1) scatter plot of the raw values ----
    plt.figure(figsize=(8, 6))
    for j in range(n_cols):
        plt.scatter([j] * n_rows, arr[:, j], s=20, alpha=0.7)
    plt.title("point")
    plt.xlabel("x")
    plt.ylabel("y")
    plt.xticks(range(n_cols))
    plt.grid(True, linestyle="--", alpha=0.5)
    plt.tight_layout()
    plt.savefig(os.path.join(out_dir, "original_scatter.png"), dpi=300)
    plt.close()

    # ---- 2) scatter plot of the normalized values ----
    plt.figure(figsize=(8, 6))
    for j in range(n_cols):
        plt.scatter([j] * n_rows, norm_arr[:, j], s=20, alpha=0.7)
    plt.title("y")
    plt.xlabel("x")
    plt.ylabel(" (0~1)")
    plt.xticks(range(n_cols))
    plt.grid(True, linestyle="--", alpha=0.5)
    plt.tight_layout()
    plt.savefig(os.path.join(out_dir, "normalized_scatter.png"), dpi=300)
    plt.close()

    # BUG FIX: the original computed an unused `values = norm_arr.flatten()`
    # and then fell off the end, returning None despite the documented
    # contract. Return the normalized array as promised.
    return norm_arr
| # -*- coding: utf-8 -*- | |
| import numpy as np | |
| import matplotlib.pyplot as plt | |
| import os | |
def plot_bin_counts(arr: np.ndarray,
                    out_path: str = "./output/bin_counts.png",
                    bins: int = 40):
    """
    Min-Max normalize each column of a 2-D array, pool all normalized values,
    and plot their cumulative probability distribution over `bins` equal-width
    bins on [0, 1], saving the figure to `out_path`.

    Parameters:
        arr: array-like of shape (N, M); coerced to float ndarray if needed
        out_path: path of the PNG file to save (parent dir created if needed)
        bins: number of histogram bins

    Returns:
        norm_arr: the column-wise Min-Max normalized array, shape (N, M)

    Raises:
        ValueError: if `arr` is not 2-dimensional.
    """
    if not isinstance(arr, np.ndarray):
        arr = np.array(arr, dtype=float)
    if arr.ndim != 2:
        raise ValueError("arr 必须是二维数组")
    # BUG FIX: os.makedirs("") raises FileNotFoundError when out_path has no
    # directory component — only create the directory when there is one.
    out_dir = os.path.dirname(out_path)
    if out_dir:
        os.makedirs(out_dir, exist_ok=True)

    # ---- per-column Min-Max normalization ----
    col_min = arr.min(axis=0, keepdims=True)
    col_max = arr.max(axis=0, keepdims=True)
    # A constant column would divide by zero; use 1.0 so it maps to all zeros.
    denom = np.where((col_max - col_min) == 0, 1.0, (col_max - col_min))
    norm_arr = (arr - col_min) / denom
    # Per-column count of "fast" values (< 0.2 after normalization),
    # reported on stdout at the end.
    cn = (norm_arr < 0.2).sum(axis=0)

    # ---- histogram over all values pooled together ----
    values = norm_arr.flatten()
    counts, bin_edges = np.histogram(values, bins=bins, range=(0, 1))
    cumulative_counts = np.cumsum(counts)
    cumulative_prob = cumulative_counts / cumulative_counts[-1]
    print(cumulative_prob)
    # Prepend a 0 so the curve starts at (0, 0); the bins+1 edge array below
    # then aligns 1:1 with the cumulative probabilities.
    cumulative_prob = np.insert(cumulative_prob, 0, 0)
    new_bin_edges = np.linspace(0, 1, bins + 1)

    # ---- cumulative probability plot ----
    plt.figure(figsize=(8, 6))
    plt.plot(new_bin_edges, cumulative_prob, label='Cumulative Probability')
    plt.xlabel('Value')
    plt.ylabel('Cumulative Probability')
    plt.title('Cumulative Probability Distribution')
    plt.grid(True)
    plt.xticks(np.linspace(0, 1, 11))
    plt.yticks(np.linspace(0, 1, 11))
    plt.legend()
    # BUG FIX: the original ignored `out_path` entirely and always saved to a
    # hard-coded "./output/TCB-Bench_count.png".
    plt.savefig(out_path, dpi=300)
    plt.close()
    print(f"count=== {cn}")
    return norm_arr
# ---------------------------------------------------------------------------
# Script: sample per-problem solution runtimes, Min-Max normalize them
# column-wise, draw a threshold-colored scatter plot, and export the
# normalized table to Excel.
# ---------------------------------------------------------------------------
import random

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.preprocessing import MinMaxScaler

# Load the benchmark and the per-problem correct-solution records.
tcb_bench = json.load(open("/home/luoxianzhen/yang/data/Ours/TestcaseBench-v22.json", "r", encoding="utf-8"))
add_data = json.load(open("/home/luoxianzhen/yang/data/Ours/saves_correct_code.json", "r", encoding="utf-8"))

# For every problem that has at least 100 correct solutions, keep the
# runtimes of a random sample of exactly 100 of them.
times_distribute = []
for problem_id, solutions in add_data.items():
    if len(solutions) < 100:
        continue
    picked = random.sample(solutions, 100)
    times_distribute.append([entry['time'] for entry in picked])

sample = np.array(times_distribute)
print(sample.shape)

# Take (up to) the first 100 problems as the working matrix.
data_2d = np.array(sample[0:100])

# Min-Max normalize every column independently to [0, 1].
scaler = MinMaxScaler()
data_normalized = scaler.fit_transform(data_2d)

# Scatter plot: values <= 0.2 after normalization in one color,
# the rest in another.
plt.figure(figsize=(10, 6))
for col in range(data_normalized.shape[1]):
    colors = np.where(data_normalized[:, col] <= 0.2, '#FF7A8C', '#FECB71')
    plt.scatter(np.arange(data_normalized.shape[0]), data_normalized[:, col], color=colors, label=f'Column {col+1}')
plt.title("Scatter Plot of Each Column After Min-Max Normalization")
plt.xlabel("Index")
plt.ylabel("Normalized Value")
plt.grid(True)
plt.savefig("./output/right_code_distribute-100.png", dpi=800)

# Persist the normalized matrix, one named column per problem.
df = pd.DataFrame(data_normalized, columns=[f'Column {i+1}' for i in range(data_2d.shape[1])])
df.to_excel('normalized_data.xlsx', index=False)
Xet Storage Details
- Size:
- 6.53 kB
- Xet hash:
- 41caafb42980b21ce39a1b7fca237438ca5148779cfb5ef15d371914d3ac7e20
·
Xet efficiently stores files, intelligently splitting them into unique chunks and accelerating uploads and downloads. More info.