import json
import glob
import sys
from collections import defaultdict
from statistics import mean, variance

TARGET_LANGS = {
    "python",
    "java",
    "c/c++",
    "fortran",
    "r",
    "matlab",
    "shell",
    "rust",
    "go",
}

# ------------------------
# Load all JSONL data
# ------------------------
def load_jsonl_data(pattern="*.jsonl"):
    JSONL_FILES = glob.glob(pattern)
    # print("=====")
    # print(JSONL_FILES)
    language_count = defaultdict(int)
    field_data = defaultdict(list)  # all values observed for each field
    field_data_by_lang = defaultdict(lambda: defaultdict(list))
    for filename in JSONL_FILES:
        with open(filename, "r", encoding="utf-8") as f:
            for line in f:
                line = line.strip()
                if not line:
                    continue
                obj = json.loads(line)
                lang = obj.get("language", "unknown")
                language_count[lang] += 1
                # record the values of every numeric field
                for k, v in obj.items():
                    if isinstance(v, (int, float)):
                        field_data[k].append(v)
                        field_data_by_lang[lang][k].append(v)
    return language_count, field_data, field_data_by_lang
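
# Illustrative shape of an input JSONL record (an assumption; the actual files may
# carry other numeric fields alongside "language"):
#   {"language": "python", "total_lines": 120, "comment_lines": 15, "tokens": 800}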

# ------------------------
# Print the language distribution
# ------------------------
def print_language_distribution(language_count):
    print("\n========== Language distribution (counts & percentage) ==========")
    total_items = sum(language_count.values())
    for lang, count in sorted(language_count.items(), key=lambda x: -x[1]):
        pct = count / total_items * 100
        print(f"{lang}: {count} ({pct:.2f}%)")

# ------------------------
# Bin statistics (generic)
# ------------------------
def compute_bins(start, end, step):
    bins = list(range(start, end + step, step))
    labels = [f"{bins[i]}-{bins[i+1]}" for i in range(len(bins) - 1)]
    labels.append(f"{end}+")
    return bins, labels
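
# For example, compute_bins(0, 60, 20) returns
#   bins   = [0, 20, 40, 60]
#   labels = ["0-20", "20-40", "40-60", "60+"]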

def compute_distribution(values, bins, labels):
    dist = {label: 0 for label in labels}
    for v in values:
        placed = False
        for i in range(len(bins) - 1):
            if bins[i] <= v < bins[i + 1]:
                dist[labels[i]] += 1
                placed = True
                break
        if not placed:
            dist[labels[-1]] += 1
    return dist
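
# With the example bins/labels above, compute_distribution([5, 25, 130], bins, labels)
# yields {"0-20": 1, "20-40": 1, "40-60": 0, "60+": 1}; values past the last bin edge
# fall into the open-ended "60+" bucket.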

def print_distribution(title, dist, total_count):
    print(title)
    for label, count in dist.items():
        pct = count / total_count * 100
        print(f" {label}: {count} ({pct:.2f}%)")

# ------------------------
# Analyze the distribution of one field
# ------------------------
def analyze_field_distribution(jsonl_dir, field, start, end, step):
    print(f"\n================= Analyzing field: {field} =================")
    # load the data
    language_count, field_data, field_data_by_lang = load_jsonl_data(jsonl_dir)
    # print the language distribution
    print_language_distribution(language_count)
    # check that the field exists
    if field not in field_data:
        print(f"\nField '{field}' does not exist in the data!")
        return
    values = []
    for lang in TARGET_LANGS:
        values.extend(field_data_by_lang.get(lang, {}).get(field, []))
    if not values:
        print(f"\nField '{field}' has no values for the target languages!")
        return
    print(f"\n========== Overall statistics for {field} ==========")
    print(f"Count: {len(values)}")
    print(f"Min: {min(values)}")
    print(f"Max: {max(values)}")
    print(f"Mean: {mean(values):.2f}")
    if len(values) >= 2:
        print(f"Variance: {variance(values):.2f}")
    else:
        print("Variance: N/A")
    # compute the bins
    bins, labels = compute_bins(start, end, step)
    # overall bin distribution
    overall_dist = compute_distribution(values, bins, labels)
    print_distribution("Bin distribution:", overall_dist, len(values))
    # -------- per-language statistics --------
    print(f"\n========== Per-language statistics for {field} ==========")
    for lang in TARGET_LANGS:
        fields = field_data_by_lang.get(lang)
        if not fields or field not in fields:
            continue
        vals = fields[field]
        print(f"\n--- {lang} ---")
        print(f"Count: {len(vals)}")
        print(f"Min: {min(vals)}")
        print(f"Max: {max(vals)}")
        print(f"Mean: {mean(vals):.2f}")
        if len(vals) >= 2:
            print(f"Variance: {variance(vals):.2f}")
        else:
            print("Variance: N/A")
        # per-language bin distribution
        dist = compute_distribution(vals, bins, labels)
        print_distribution("Bin distribution:", dist, len(vals))

# ------------------------
if __name__ == "__main__":
    jsonl_dir = "/home/weifengsun/tangou1/domain_code/src/datasets/analysis2/*.jsonl"
    # jsonl_dir = "data/*.jsonl"
    # total_lines, comment_lines, comment_tokens, empty_lines, code_lines, tokens, functions, parameters
    field = "comment_lines"
    start = 0
    end = 200
    step = 20
    analyze_field_distribution(jsonl_dir, field, start, end, step)