| | import csv |
| | import sys |
| | from collections import defaultdict |
| | from statistics import mean, variance |
| |
|
| |
|
| | |
| | |
| | |
def load_csv(path):
    """Read the ``keyword`` and ``stars`` columns from a CSV file.

    Args:
        path: Path of the CSV file; its first row is used as the header.

    Returns:
        A ``(keywords, stars)`` tuple of lists. ``keywords`` holds the
        ``keyword`` cell of every row in which that column is present.
        ``stars`` holds the ``stars`` cells that parse as integers; cells
        that are missing or are not integer text are skipped.
    """
    keywords = []
    stars = []

    # newline="" hands line-ending handling to the csv module, so newlines
    # embedded in quoted fields are not rewritten by the text layer.
    with open(path, "r", encoding="utf-8", errors="replace", newline="") as f:
        for row in csv.DictReader(f):
            if "keyword" in row:
                keywords.append(row["keyword"])

            if "stars" in row:
                try:
                    stars.append(int(row["stars"]))
                except (ValueError, TypeError):
                    # ValueError: the cell is not integer text.
                    # TypeError: the row is short and DictReader filled the
                    # missing cell with None.
                    pass

    return keywords, stars
| |
|
| |
|
| | |
| | |
| | |
def make_bins(start, end, step):
    """Build histogram bin edges and their display labels.

    Args:
        start: First bin edge.
        end: Requested upper bound of the binned range.
        step: Bin width; must be positive.

    Returns:
        A ``(bins, labels)`` tuple. ``bins`` is
        ``list(range(start, end + step, step))``. ``labels`` has one
        ``"lo-hi"`` entry per adjacent pair of edges, followed by a trailing
        ``"N+"`` overflow label where ``N`` is the last edge (``end`` itself
        when no edge was generated).

    Raises:
        ValueError: If ``step`` is not positive.
    """
    if step <= 0:
        raise ValueError(f"step must be positive, got {step}")

    bins = list(range(start, end + step, step))
    labels = [f"{low}-{high}" for low, high in zip(bins, bins[1:])]

    # When (end - start) is not a multiple of step the last edge overshoots
    # ``end``; naming the overflow bucket after the real last edge keeps it
    # from overlapping the final "lo-hi" label.
    overflow_edge = bins[-1] if bins else end
    labels.append(f"{overflow_edge}+")
    return bins, labels
| |
|
| |
|
def distribute(values, bins, labels):
    """Count how many values fall under each label.

    ``labels[i]`` covers the half-open interval ``[bins[i], bins[i + 1])``.
    Any value that matches none of those intervals -- at or above the last
    edge, but also below the first one -- is counted under the last label.

    Args:
        values: Iterable of numbers to classify.
        bins: Sequence of bin edges.
        labels: One label per interval, plus a final catch-all label.

    Returns:
        A dict mapping every label to its count.
    """
    counts = dict.fromkeys(labels, 0)
    edge_pairs = list(zip(bins, bins[1:]))

    for value in values:
        for idx, (low, high) in enumerate(edge_pairs):
            if low <= value < high:
                counts[labels[idx]] += 1
                break
        else:
            # No interval matched: fall through to the catch-all label.
            counts[labels[-1]] += 1

    return counts
| |
|
| |
|
| | |
| | |
| | |
def analyze_csv(path, start, end, step):
    """Print keyword frequencies and ``stars`` statistics for a CSV file.

    The report has three parts: how often each ``keyword`` value occurs
    (most frequent first), summary statistics of the ``stars`` values, and
    the distribution of ``stars`` over the bins built from ``start``,
    ``end`` and ``step``. The last two parts are replaced by a short notice
    when the file yields no usable ``stars`` value.

    Args:
        path: CSV file to analyse.
        start: First bin edge passed to ``make_bins``.
        end: Upper bound passed to ``make_bins``.
        step: Bin width passed to ``make_bins``.
    """
    print(f"\n========== 分析 CSV 文件:{path} ==========")

    keywords, stars = load_csv(path)

    print("\n========== keyword 分布 ==========")
    keyword_count = defaultdict(int)
    for kw in keywords:
        keyword_count[kw] += 1

    total_keywords = len(keywords)

    # sorted() is stable, so equally frequent keywords keep first-seen order.
    for kw, cnt in sorted(keyword_count.items(), key=lambda item: -item[1]):
        pct = cnt / total_keywords * 100
        print(f"{kw}: {cnt} ({pct:.2f}%)")

    print("\n========== stars 统计 ==========")
    if not stars:
        print("没有 stars 字段或无有效数据")
        # Nothing to bin; returning here also keeps the percentage
        # computation below from dividing by zero.
        return

    total_stars = len(stars)
    print(f"个数: {total_stars}")
    print(f"最小值: {min(stars)}")
    print(f"最大值: {max(stars)}")
    print(f"均值: {mean(stars):.2f}")
    if total_stars >= 2:
        # statistics.variance needs at least two data points.
        print(f"方差: {variance(stars):.2f}")

    bins, labels = make_bins(start, end, step)
    dist = distribute(stars, bins, labels)

    print("区间分布")
    for lab in labels:
        cnt = dist[lab]
        pct = cnt / total_stars * 100
        print(f"{lab}: {cnt} ({pct:.2f}%)")
| |
|
| |
|
if __name__ == "__main__":
    # Optional positional overrides: [path [start [end [step]]]].
    # With no command-line arguments the defaults below are used.
    cli_args = sys.argv[1:]

    path = (
        cli_args[0]
        if cli_args
        else "/home/weifengsun/tangou1/domain_code/src/workdir/repos_checked.csv"
    )
    start = int(cli_args[1]) if len(cli_args) > 1 else 0
    end = int(cli_args[2]) if len(cli_args) > 2 else 200
    step = int(cli_args[3]) if len(cli_args) > 3 else 20

    analyze_csv(path, start, end, step)
| |
|