dataset-builder / data1 /compute_stars_keywords.py

DouDou

Upload data1/compute_stars_keywords.py with huggingface_hub

284b084 verified 24 days ago

2.72 kB

	import csv
	import sys
	from collections import defaultdict
	from statistics import mean, variance


	# ------------------------
	# Read the CSV
	# ------------------------
	def load_csv(path):
	    """Read the ``keyword`` and ``stars`` columns from a CSV file.

	    The file is parsed with ``csv.DictReader``, so its first row is used as
	    the header. Bytes that are not valid UTF-8 are replaced instead of
	    raising.

	    Args:
	        path: Location of the CSV file to read.

	    Returns:
	        A ``(keywords, stars)`` tuple of lists. ``keywords`` holds the raw
	        ``keyword`` value of every row that has that column. ``stars`` holds
	        the ``stars`` values that parse as integers. The two lists are
	        filled independently, so they may differ in length.
	    """
	    keywords = []
	    stars = []

	    # newline="" hands line-ending handling to the csv module, so quoted
	    # fields containing embedded newlines are read back unchanged.
	    with open(path, "r", encoding="utf-8", errors="replace", newline="") as f:
	        for row in csv.DictReader(f):
	            if "keyword" in row:
	                keywords.append(row["keyword"])

	            if "stars" in row:
	                try:
	                    stars.append(int(row["stars"]))
	                except (TypeError, ValueError):
	                    # Skip non-numeric text (ValueError) and values missing
	                    # from a short row (None -> TypeError).
	                    pass

	    return keywords, stars


	# ------------------------
	# Bin construction and counting
	# ------------------------
	def make_bins(start, end, step):
	    """Build histogram bin edges and their display labels.

	    Args:
	        start: Lower edge of the first bin.
	        end: Upper edge of the last bounded bin; the final label is the
	            open-ended ``"{end}+"`` bucket.
	        step: Width of each bin; must be a positive integer.

	    Returns:
	        A ``(bins, labels)`` tuple. ``bins`` is the list of edges stepping
	        from ``start`` and always finishing with ``end``. ``labels`` has one
	        ``"low-high"`` entry per adjacent pair of edges, followed by the
	        ``"{end}+"`` label.

	    Raises:
	        ValueError: If ``step`` is not positive.
	    """
	    if step <= 0:
	        raise ValueError("step must be a positive integer")

	    # Cap the last edge at ``end``: when (end - start) is not a multiple of
	    # ``step`` the final bounded bin is simply narrower, instead of
	    # overshooting ``end`` and overlapping the "{end}+" bucket.
	    bins = list(range(start, end, step))
	    bins.append(end)

	    labels = [f"{low}-{high}" for low, high in zip(bins, bins[1:])]
	    labels.append(f"{end}+")
	    return bins, labels


	def distribute(values, bins, labels):
	    """Count how many values fall into each labelled bin.

	    Args:
	        values: Iterable of numbers to classify.
	        bins: Bin edges; ``bins[i]`` and ``bins[i + 1]`` bound the half-open
	            interval ``[bins[i], bins[i + 1])`` named by ``labels[i]``.
	        labels: One label per interval, plus a final label used as the
	            overflow bucket.

	    Returns:
	        A dict mapping every label to its count. A value that matches no
	        interval is counted under the last label, unless it lies below
	        ``bins[0]``: such values are counted under an extra
	        ``"<{bins[0]}"`` key, which is only present when at least one such
	        value was seen.
	    """
	    dist = {label: 0 for label in labels}
	    # Pair each bounded interval with its label once, outside the loop.
	    intervals = list(zip(bins, bins[1:], labels))

	    for v in values:
	        for low, high, label in intervals:
	            if low <= v < high:
	                dist[label] += 1
	                break
	        else:
	            if bins and v < bins[0]:
	                # Below the first edge: keep it out of the "end+" bucket,
	                # whose label promises values at or above the last edge.
	                underflow = f"<{bins[0]}"
	                dist[underflow] = dist.get(underflow, 0) + 1
	            else:
	                dist[labels[-1]] += 1

	    return dist


	# ------------------------
	# Main analysis function
	# ------------------------
	def analyze_csv(path, start, end, step):
	    """Print keyword frequencies and star statistics for one CSV file.

	    The file is loaded with ``load_csv``. The report, written to standard
	    output, lists each keyword with its count and share (most frequent
	    first), then the count, minimum, maximum and mean of the star values,
	    their variance when there are at least two of them, and their
	    distribution over the bins built by ``make_bins``.

	    Args:
	        path: CSV file to analyse.
	        start: Lower edge of the first star bin.
	        end: Upper edge of the last bounded star bin.
	        step: Width of each star bin.

	    Returns:
	        None.
	    """
	    print(f"\n========== 分析 CSV 文件：{path} ==========")

	    keywords, stars = load_csv(path)

	    print("\n========== keyword 分布 ==========")
	    occurrences = {}
	    for word in keywords:
	        occurrences[word] = occurrences.get(word, 0) + 1

	    keyword_total = len(keywords)
	    # The sort is stable, so keywords with equal counts keep the order in
	    # which they were first seen.
	    ranked = sorted(occurrences.items(), key=lambda pair: pair[1], reverse=True)
	    for word, count in ranked:
	        share = count / keyword_total * 100
	        print(f"{word}: {count} ({share:.2f}%)")

	    print("\n========== stars 统计 ==========")
	    if not stars:
	        print("没有 stars 字段或无有效数据")
	        return

	    star_total = len(stars)
	    print(f"个数: {star_total}")
	    print(f"最小值: {min(stars)}")
	    print(f"最大值: {max(stars)}")
	    print(f"均值: {mean(stars):.2f}")
	    # Sample variance needs at least two data points.
	    if star_total >= 2:
	        print(f"方差: {variance(stars):.2f}")

	    # Bin the star counts and report each bin's share.
	    bins, labels = make_bins(start, end, step)
	    histogram = distribute(stars, bins, labels)

	    print("区间分布")
	    for label in labels:
	        count = histogram[label]
	        share = count / star_total * 100
	        print(f"{label}: {count} ({share:.2f}%)")


	if __name__ == "__main__":
	    # Usage: compute_stars_keywords.py [CSV_PATH [START END STEP]]
	    # With no arguments the script analyses the default dataset using bins
	    # from 0 to 200 in steps of 20.
	    cli_args = sys.argv[1:]

	    path = "/home/weifengsun/tangou1/domain_code/src/workdir/repos_checked.csv"
	    start = 0
	    end = 200
	    step = 20

	    if cli_args:
	        path = cli_args[0]
	    # Bin settings are only overridden when all three are supplied.
	    if len(cli_args) >= 4:
	        start, end, step = (int(arg) for arg in cli_args[1:4])

	    analyze_csv(path, start, end, step)