"""Print summary statistics for the ``keyword`` and ``stars`` columns of a CSV.

The script reports how often each keyword occurs and, for the integer
``stars`` values, the count, minimum, maximum, mean, sample variance and a
histogram over fixed-width intervals.
"""
import csv
import sys
from collections import defaultdict
from statistics import mean, variance


# ------------------------
# CSV loading
# ------------------------
def load_csv(path):
    """Read the ``keyword`` and ``stars`` columns from a CSV file.

    Args:
        path: Path of a CSV file with a header row.

    Returns:
        A ``(keywords, stars)`` tuple of lists. ``keywords`` holds the raw
        ``keyword`` cell of every row; ``stars`` holds only the ``stars``
        cells that parse as integers. The two lists are filled independently,
        so they may differ in length.
    """
    keywords = []
    stars = []
    # newline="" leaves line breaks inside quoted fields to the csv parser;
    # errors="replace" substitutes undecodable bytes instead of raising.
    with open(path, "r", encoding="utf-8", errors="replace", newline="") as f:
        for row in csv.DictReader(f):
            if "keyword" in row:
                keywords.append(row["keyword"])
            if "stars" in row:
                try:
                    stars.append(int(row["stars"]))
                except (TypeError, ValueError):
                    # ValueError: non-numeric text. TypeError: the cell is
                    # None because the row is shorter than the header.
                    pass
    return keywords, stars


# ------------------------
# Interval construction and counting
# ------------------------
def make_bins(start, end, step):
    """Build histogram edges from ``start`` to ``end`` and their labels.

    Args:
        start: Lower edge of the first interval.
        end: Upper edge of the last bounded interval.
        step: Interval width; must be positive.

    Returns:
        A ``(bins, labels)`` tuple. ``bins`` is the ascending list of edges,
        always ending exactly at ``end``. ``labels`` has one ``"low-high"``
        entry per pair of adjacent edges followed by the overflow label
        ``"{end}+"``.

    Raises:
        ValueError: If ``step`` is not positive or ``start`` exceeds ``end``.
    """
    if step <= 0:
        raise ValueError(f"step must be positive, got {step}")
    if start > end:
        raise ValueError(f"start ({start}) must not exceed end ({end})")

    # Clamp the final edge to `end`. Stepping past it (when end - start is not
    # a multiple of step) would create an interval overlapping the "end+"
    # overflow bucket.
    bins = list(range(start, end, step)) + [end]
    labels = [f"{low}-{high}" for low, high in zip(bins, bins[1:])]
    labels.append(f"{end}+")
    return bins, labels


def distribute(values, bins, labels):
    """Count how many values fall into each half-open interval.

    Args:
        values: Iterable of numbers to classify.
        bins: Ascending interval edges; interval ``i`` is
            ``bins[i] <= v < bins[i + 1]``.
        labels: One label per interval plus a final overflow label.

    Returns:
        A dict mapping each label to its count. Values at or above the last
        edge are counted under the final label. Values below the first edge
        are counted under an extra ``"<{bins[0]}"`` key, which is placed
        first and is present only when such values exist.
    """
    dist = {label: 0 for label in labels}
    below = 0
    for v in values:
        if bins and v < bins[0]:
            # Too small for any interval: keep it out of the overflow bucket.
            below += 1
            continue
        for low, high, label in zip(bins, bins[1:], labels):
            if low <= v < high:
                dist[label] += 1
                break
        else:
            # No bounded interval matched, so the value is >= the last edge.
            dist[labels[-1]] += 1
    if below:
        dist = {f"<{bins[0]}": below, **dist}
    return dist


# ------------------------
# Main report
# ------------------------
def analyze_csv(path, start, end, step):
    """Print the keyword distribution and stars statistics of a CSV file.

    Args:
        path: Path of the CSV file to analyse.
        start: Lower edge of the first stars interval.
        end: Upper edge of the last bounded stars interval.
        step: Width of each stars interval.

    Returns:
        None. All results are written to standard output.
    """
    print(f"\n========== 分析 CSV 文件:{path} ==========")
    keywords, stars = load_csv(path)

    print("\n========== keyword 分布 ==========")
    keyword_count = defaultdict(int)
    for kw in keywords:
        keyword_count[kw] += 1
    total_keywords = len(keywords)
    # Most frequent keyword first; ties keep first-seen order (stable sort).
    for kw, cnt in sorted(keyword_count.items(), key=lambda item: -item[1]):
        pct = cnt / total_keywords * 100
        print(f"{kw}: {cnt} ({pct:.2f}%)")

    print("\n========== stars 统计 ==========")
    if not stars:
        print("没有 stars 字段或无有效数据")
        return

    total_stars = len(stars)
    print(f"个数: {total_stars}")
    print(f"最小值: {min(stars)}")
    print(f"最大值: {max(stars)}")
    print(f"均值: {mean(stars):.2f}")
    if total_stars >= 2:
        # statistics.variance needs at least two data points.
        print(f"方差: {variance(stars):.2f}")

    # Interval statistics
    bins, labels = make_bins(start, end, step)
    dist = distribute(stars, bins, labels)
    print("区间分布")
    # Iterate the result itself so an under-range bucket is reported too.
    for lab, cnt in dist.items():
        pct = cnt / total_stars * 100
        print(f"{lab}: {cnt} ({pct:.2f}%)")


if __name__ == "__main__":
    # With no command-line arguments this runs with the built-in defaults;
    # optional positional arguments override path, start, end and step.
    cli_args = sys.argv[1:]
    path = (
        cli_args[0]
        if len(cli_args) > 0
        else "/home/weifengsun/tangou1/domain_code/src/workdir/repos_checked.csv"
    )
    start = int(cli_args[1]) if len(cli_args) > 1 else 0
    end = int(cli_args[2]) if len(cli_args) > 2 else 200
    step = int(cli_args[3]) if len(cli_args) > 3 else 20
    analyze_csv(path, start, end, step)