# Listing header retained from extraction: File size: 5,109 Bytes | 33569f9
import argparse
import json
import math
import os
import random
import numpy as np
import torch
def get_difficulty_safe(item):
    """Return the item's "difficulty" value as a finite float, or None.

    Args:
        item: Record to inspect. Anything exposing a dict-style ``get``
            method is accepted; other objects are treated as having no
            difficulty.

    Returns:
        The value stored under ``"difficulty"`` converted with ``float()``
        when it is present, convertible and finite. None when the item has
        no ``get`` method, the key is missing or None, the conversion fails,
        or the result is NaN or infinite.
    """
    try:
        difficulty = item.get("difficulty")
    except AttributeError:
        # Non-mapping entries (numbers, strings, lists, None) carry no
        # difficulty; report "missing" instead of crashing the caller.
        return None
    if difficulty is None:
        return None
    try:
        difficulty_float = float(difficulty)
    except (ValueError, TypeError, OverflowError):
        # OverflowError: integers too large to be represented as a float.
        return None
    return difficulty_float if math.isfinite(difficulty_float) else None
def save_json(data_list, output_path, description):
    """Write a list of records to ``output_path`` as indented UTF-8 JSON.

    If the first element is a dict containing a ``"data"`` key, every
    element is assumed to be such a wrapper and only its ``"data"`` payload
    is written; otherwise the list is written as given. Nothing is written
    (and nothing is printed) when there is nothing to save.

    Args:
        data_list: Records, or wrapper dicts holding the record under "data".
        output_path: Destination file; missing parent directories are created.
        description: Accepted for call-site symmetry; not used by this
            function.

    Raises:
        KeyError: If the first element has a "data" key but a later one
            does not.
        OSError: If the directory or file cannot be created or written.
    """
    if data_list and isinstance(data_list[0], dict) and "data" in data_list[0]:
        data_to_save = [item["data"] for item in data_list]
    else:
        data_to_save = data_list
    if not data_to_save:
        return
    output_dir = os.path.dirname(output_path)
    if output_dir:
        # exist_ok avoids the check-then-create race of testing
        # os.path.exists() before calling makedirs().
        os.makedirs(output_dir, exist_ok=True)
    with open(output_path, "w", encoding="utf-8") as f:
        json.dump(data_to_save, f, indent=4, ensure_ascii=False)
    print(f"save to: {output_path}")
def random_sample(data_list, k, output_path, description):
    """Save up to ``k`` uniformly sampled items from ``data_list``.

    The sample is drawn without replacement via ``random.sample`` and is
    not sorted afterwards. When ``k`` is at least the list length the whole
    list is saved in its existing order.

    Args:
        data_list: List of items to sample from.
        k: Maximum number of items to keep.
        output_path: Destination passed on to ``save_json``.
        description: Label used in the error message and passed on to
            ``save_json``.

    Returns:
        None. Prints an error and returns if ``data_list`` is not a list;
        returns silently when the list is empty or ``k`` is not positive.
    """
    if not isinstance(data_list, list):
        print(f"Error ({description})")
        return
    # Same guard as the other samplers: a negative k would otherwise make
    # random.sample raise ValueError, and empty input has nothing to save.
    if not data_list or k <= 0:
        return
    n = len(data_list)
    sampled = data_list if k >= n else random.sample(data_list, k)
    save_json(
        sampled,
        output_path,
        f"{description} (random sample: {len(sampled)})",
    )
def difficulty_sorted_sample(data_list, k, output_path, description):
    """Save up to ``k`` items spread evenly across the difficulty ranking.

    Items are ranked by their ``"difficulty_float"`` value, highest first.
    If ``k`` covers the whole list, the full ranking is saved; otherwise
    ``k`` rank positions are taken from ``torch.linspace`` over
    ``[0, n - 1]``, rounded to integers, clamped and de-duplicated, and the
    items at those ranks are saved.

    Args:
        data_list: Items, each a mapping with a "difficulty_float" entry.
        k: Maximum number of items to keep.
        output_path: Destination passed on to ``save_json``.
        description: Label passed on to ``save_json``.

    Returns:
        None. Does nothing when ``data_list`` is empty or ``k`` is not
        positive.
    """
    if not data_list or k <= 0:
        return

    total = len(data_list)
    wanted = min(total, k)

    ranked = list(data_list)
    ranked.sort(key=lambda entry: entry["difficulty_float"], reverse=True)

    if wanted >= total:
        picked = ranked
    else:
        # Evenly spaced rank positions from the hardest (0) to the easiest.
        positions = torch.linspace(0, total - 1, steps=wanted).round().long()
        positions = torch.unique(torch.clamp(positions, 0, total - 1))
        picked = [ranked[pos] for pos in positions.tolist()]

    save_json(picked, output_path, f"{description}")
def gaussian_sample(data_list, k, output_path, description, center=0.3, std_dev=0.2):
    """Save up to ``k`` items drawn with Gaussian weights around ``center``.

    Each item's ``"difficulty_float"`` is divided by 100 and weighted by
    ``exp(-(x - center)**2 / (2 * std_dev**2))``. Items are then drawn
    without replacement with probabilities proportional to those weights.

    Args:
        data_list: Items, each a mapping with a "difficulty_float" entry.
        k: Maximum number of items to keep; capped at ``len(data_list)``.
        output_path: Destination passed on to ``save_json``.
        description: Label passed on to ``save_json``.
        center: Centre of the weighting curve, on the divided-by-100 scale.
        std_dev: Standard deviation of the weighting curve.

    Returns:
        None. Does nothing when ``data_list`` is empty or ``k`` is not
        positive. If NumPy rejects the draw (for example too few items with
        non-zero weight), the error is printed and nothing is saved.
    """
    if not data_list or k <= 0:
        return
    n = len(data_list)
    # Sampling without replacement cannot return more items than exist.
    actual_k = min(n, k)
    difficulties = np.array([item["difficulty_float"] / 100.0 for item in data_list])
    weights = np.exp(-((difficulties - center) ** 2) / (2 * std_dev**2))
    probs = weights / np.sum(weights)  # Normalize to sum to 1
    try:
        chosen = np.random.choice(n, actual_k, replace=False, p=probs)
    except ValueError as e:
        print(f"{e}")
        return
    sampled = [data_list[i] for i in chosen]
    save_json(
        sampled,
        output_path,
        f"{description} (gaussian,mean: {center}, var:{std_dev})",
    )
def process_ddata(input_json_path, output_prefix, task, k=2500):
    """Load records from a JSON file and write one sampled subset.

    Records that are not dicts, or whose "difficulty" cannot be read as a
    finite number, are dropped. Each kept record is wrapped together with
    its difficulty and ``p_value`` (difficulty divided by 100), then handed
    to the sampler selected by ``task``:

    * ``"0070_all"``: records with ``0 < p_value <= 0.7``, evenly spaced
      over the difficulty ranking, written to ``<prefix>_0070_all.json``.
    * ``"gaussian_03"``: records with ``p_value > 0``, Gaussian-weighted,
      written to ``<prefix>_gaussian_03.json``.
    * ``"random_sample"``: all valid records, uniformly sampled, written
      to ``<prefix>_random.json``.

    Args:
        input_json_path: Path of the JSON file; its root must be an array.
        output_prefix: Prefix for the output file name.
        task: Name of the sampling task (see above).
        k: Maximum number of records to keep.

    Returns:
        None. Load failures and a non-array root are printed and abort the
        run; an input without any valid record returns silently.
    """
    try:
        with open(input_json_path, "r", encoding="utf-8") as f:
            data = json.load(f)
    except (OSError, ValueError, TypeError) as E:
        # OSError: unreadable file; ValueError: malformed JSON or bad
        # encoding; TypeError: path of an unsupported type such as None.
        print(f"{E}")
        return
    if not isinstance(data, list):
        print(f"expected a JSON array of records, got {type(data).__name__}")
        return
    valid_items = []
    for item in data:
        # Check the type first so non-dict entries are skipped before the
        # difficulty lookup is attempted on them.
        if not isinstance(item, dict):
            continue
        d = get_difficulty_safe(item)
        if d is not None:
            valid_items.append(
                {"difficulty_float": d, "p_value": d / 100.0, "data": item}
            )
    if not valid_items:
        return
    print(f"valid data: {len(valid_items)}条 (original: {len(data)}条)")
    if task == "0070_all":
        subset = [item for item in valid_items if 0 < item["p_value"] <= 0.7]
        difficulty_sorted_sample(
            subset,
            k,
            f"{output_prefix}_0070_all.json",
            "(0 < p <= 0.7)",
        )
    elif task == "gaussian_03":
        subset = [item for item in valid_items if item["p_value"] > 0]
        gaussian_sample(
            subset,
            k,
            f"{output_prefix}_gaussian_03.json",
            "gaussian: 0.3 center, 0.2 variance",
        )
    elif task == "random_sample":
        random_sample(valid_items, k, f"{output_prefix}_random.json", "random_sample")
    else:
        # Make a mistyped or missing task visible instead of silently
        # producing no output.
        print(f"unknown task: {task!r}")
    print("\n finished")
if __name__ == "__main__":
    # Command-line entry point: parse the options and run one sampling task.
    parser = argparse.ArgumentParser()
    # Required: without an input file there is nothing to process, and the
    # default output prefix is derived from this path.
    parser.add_argument("--input_json", required=True)
    parser.add_argument(
        "-o",
        "--output_prefix",
        default="",
    )
    parser.add_argument("-t", "--task", default="")
    parser.add_argument(
        "-k",
        "--k_dynamic_total",
        type=int,
        default=2500,
    )
    args = parser.parse_args()
    if not args.output_prefix:
        # Drop the file extension (e.g. ".json") rather than assuming it is
        # exactly five characters long.
        args.output_prefix = os.path.splitext(args.input_json)[0]
        print(f"prefix: {args.output_prefix}")
    process_ddata(args.input_json, args.output_prefix, args.task, args.k_dynamic_total)
|