dataset-builder / data1 /compute_stars_keywords.py
DouDou
Upload data1/compute_stars_keywords.py with huggingface_hub
284b084 verified
import csv
import sys
from collections import defaultdict
from statistics import mean, variance
# ------------------------
# 读取 CSV
# ------------------------
def load_csv(path):
    """Read the ``keyword`` and ``stars`` columns from a CSV file.

    The file is decoded as UTF-8; undecodable bytes are replaced rather than
    raising, so a partially corrupted file can still be analysed.

    Args:
        path: Location of a CSV file whose first row is a header.

    Returns:
        A ``(keywords, stars)`` tuple of two lists. ``keywords`` holds the raw
        ``keyword`` cell of every row (when the column exists). ``stars``
        holds the ``stars`` cell of every row converted to ``int``; cells that
        cannot be converted are skipped, so the two lists are independent and
        are not guaranteed to have the same length.
    """
    keywords = []
    stars = []
    # newline="" lets the csv module do its own line-ending handling, which is
    # required for quoted fields that contain embedded newlines.
    with open(path, "r", encoding="utf-8", errors="replace", newline="") as f:
        for row in csv.DictReader(f):
            if "keyword" in row:
                keywords.append(row["keyword"])
            if "stars" in row:
                try:
                    stars.append(int(row["stars"]))
                except (ValueError, TypeError):
                    # ValueError: non-numeric text; TypeError: the cell is
                    # missing (short row -> None). Skip either way, but let
                    # unrelated errors such as KeyboardInterrupt propagate.
                    pass
    return keywords, stars
# ------------------------
# 区间构造与统计
# ------------------------
def make_bins(start, end, step):
    """Build histogram bin edges and their display labels.

    Edges run from ``start`` in increments of ``step`` and always finish at
    exactly ``end``. When ``end - start`` is not a multiple of ``step`` the
    last bounded interval is therefore shorter than ``step``; this keeps the
    final bounded interval from overlapping the open-ended ``"{end}+"`` label.

    Args:
        start: Lower edge of the first interval.
        end: Upper edge of the last bounded interval.
        step: Width of each interval.

    Returns:
        A ``(bins, labels)`` tuple. ``bins`` is the list of edges; ``labels``
        has one ``"low-high"`` entry per adjacent edge pair followed by a
        final ``"{end}+"`` entry.
    """
    # Stop the range before `end` and append `end` explicitly so the last edge
    # never overshoots it.
    bins = [*range(start, end, step), end]
    labels = [f"{low}-{high}" for low, high in zip(bins, bins[1:])]
    labels.append(f"{end}+")
    return bins, labels
def distribute(values, bins, labels):
    """Count how many values fall into each labelled interval.

    Interval ``i`` is the half-open range ``[bins[i], bins[i + 1])`` and is
    reported under ``labels[i]``. Any value that matches no interval is
    counted under the last label.

    NOTE: "matches no interval" includes values smaller than ``bins[0]``, so
    those are also added to the last label's count.

    Args:
        values: Iterable of numbers to classify.
        bins: Sequence of interval edges.
        labels: Sequence of labels; the last one is the catch-all bucket.

    Returns:
        A dict mapping every label to its count (zero when nothing matched).
    """
    counts = dict.fromkeys(labels, 0)
    edges = list(zip(bins, bins[1:]))
    for value in values:
        for idx, (low, high) in enumerate(edges):
            if low <= value < high:
                counts[labels[idx]] += 1
                break
        else:
            # No bounded interval matched: use the catch-all bucket.
            counts[labels[-1]] += 1
    return counts
# ------------------------
# 主统计函数
# ------------------------
def analyze_csv(path, start, end, step):
    """Print a keyword frequency table and star statistics for a CSV file.

    The report is written to standard output and has three parts: the count
    and percentage of each keyword (most frequent first), summary statistics
    of the star values (count, min, max, mean, and variance when there are at
    least two values), and the distribution of star values over the intervals
    produced by ``make_bins(start, end, step)``.

    Args:
        path: CSV file to analyse; passed to ``load_csv``.
        start: Lower edge of the first star interval.
        end: Upper edge of the last bounded star interval.
        step: Width of each star interval.

    Returns:
        None.
    """
    print(f"\n========== 分析 CSV 文件:{path} ==========")
    keywords, stars = load_csv(path)

    # Keyword frequency table, most frequent first (ties keep first-seen order).
    print("\n========== keyword 分布 ==========")
    tally = defaultdict(int)
    for keyword in keywords:
        tally[keyword] += 1
    n_keywords = len(keywords)
    ranked = sorted(tally.items(), key=lambda item: item[1], reverse=True)
    for keyword, count in ranked:
        share = count / n_keywords * 100
        print(f"{keyword}: {count} ({share:.2f}%)")

    print("\n========== stars 统计 ==========")
    if not stars:
        # Nothing to summarise; also avoids dividing by zero below.
        print("没有 stars 字段或无有效数据")
        return

    n_stars = len(stars)
    print(f"个数: {n_stars}")
    print(f"最小值: {min(stars)}")
    print(f"最大值: {max(stars)}")
    print(f"均值: {mean(stars):.2f}")
    if n_stars >= 2:
        # Sample variance is undefined for a single data point.
        print(f"方差: {variance(stars):.2f}")

    # Per-interval distribution of the star values.
    bins, labels = make_bins(start, end, step)
    histogram = distribute(stars, bins, labels)
    print("区间分布")
    for label in labels:
        count = histogram[label]
        share = count / n_stars * 100
        print(f"{label}: {count} ({share:.2f}%)")
if __name__ == "__main__":
    # Script entry point. The CSV to analyse may be given as the first
    # command-line argument; with no argument the original hard-coded
    # location is used, so existing invocations behave as before.
    default_path = "/home/weifengsun/tangou1/domain_code/src/workdir/repos_checked.csv"
    path = sys.argv[1] if len(sys.argv) > 1 else default_path
    # Star intervals: 0-20, 20-40, ..., 180-200, then an open-ended 200+.
    start = 0
    end = 200
    step = 20
    analyze_csv(path, start, end, step)