File size: 2,715 Bytes
284b084
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
import csv
import sys
from collections import defaultdict
from statistics import mean, variance


# ------------------------
# 读取 CSV
# ------------------------
def load_csv(path):
    """Read the ``keyword`` and ``stars`` columns from a CSV file.

    Args:
        path: Path of a CSV file whose first row is the header.

    Returns:
        A ``(keywords, stars)`` tuple of lists. ``keywords`` holds the raw
        ``keyword`` cell of every data row. ``stars`` holds only the
        ``stars`` cells that parse as integers. The two lists are filled
        independently, so they may differ in length, and either one is empty
        when its column is absent from the header.
    """
    keywords = []
    stars = []

    # newline="" hands line-ending handling to the csv module, so line breaks
    # embedded in quoted fields are preserved instead of being translated.
    with open(path, "r", encoding="utf-8", errors="replace", newline="") as f:
        for row in csv.DictReader(f):
            if "keyword" in row:
                keywords.append(row["keyword"])

            if "stars" in row:
                try:
                    stars.append(int(row["stars"]))
                except (ValueError, TypeError):
                    # ValueError: the cell is not an integer literal.
                    # TypeError: the cell is missing from a short row (None).
                    # Both are skipped; anything else is a real error and
                    # is allowed to propagate.
                    pass

    return keywords, stars


# ------------------------
# 区间构造与统计
# ------------------------
def make_bins(start, end, step):
    """Build histogram bin edges and matching display labels.

    Edges run from ``start`` in increments of ``step`` and always finish
    exactly at ``end``. When ``end - start`` is not a multiple of ``step``
    the final closed interval is shortened rather than overshooting ``end``,
    so the trailing open-ended ``"{end}+"`` label never overlaps it.

    Args:
        start: First bin edge.
        end: Last bin edge and lower bound of the open-ended overflow label.
        step: Distance between consecutive edges. ``0`` makes ``range``
            raise ``ValueError``.

    Returns:
        A ``(bins, labels)`` tuple. ``bins`` is the list of edges. ``labels``
        has one ``"low-high"`` entry per consecutive edge pair followed by
        the ``"{end}+"`` overflow label, so ``len(labels) == len(bins)``.
    """
    # Stop the stepped edges short of ``end`` and append ``end`` itself; this
    # equals the evenly divisible case and clips the last bin otherwise.
    bins = list(range(start, end, step))
    bins.append(end)

    labels = [f"{low}-{high}" for low, high in zip(bins, bins[1:])]
    labels.append(f"{end}+")
    return bins, labels


def distribute(values, bins, labels):
    """Count how many values fall into each labelled interval.

    Interval ``i`` is the half-open range ``bins[i] <= v < bins[i + 1]`` and
    is reported under ``labels[i]``. A value that matches no interval is
    counted under the last label; this includes values at or above the last
    edge and values below the first edge.

    Args:
        values: Iterable of numbers to classify.
        bins: Sequence of bin edges.
        labels: Sequence of label strings; the last one is the catch-all.

    Returns:
        A dict mapping every label to its count, in ``labels`` order.
    """
    counts = dict.fromkeys(labels, 0)
    intervals = list(zip(bins, bins[1:]))

    for value in values:
        # Index of the first interval containing the value, or -1 so that
        # the lookup below resolves to the catch-all last label.
        slot = next(
            (
                idx
                for idx, (low, high) in enumerate(intervals)
                if low <= value < high
            ),
            -1,
        )
        counts[labels[slot]] += 1

    return counts


# ------------------------
# 主统计函数
# ------------------------
def analyze_csv(path, start, end, step):
    """Print keyword frequencies and star statistics for one CSV file.

    Args:
        path: CSV file passed to ``load_csv``.
        start: Forwarded to ``make_bins`` as the first bin edge.
        end: Forwarded to ``make_bins`` as the overflow threshold.
        step: Forwarded to ``make_bins`` as the bin width.

    Returns:
        None. Every result is written to standard output.
    """
    print(f"\n========== 分析 CSV 文件:{path} ==========")

    keywords, stars = load_csv(path)

    # Keyword frequency table, most frequent first; ties keep first-seen order.
    print("\n========== keyword 分布 ==========")
    tally = defaultdict(int)
    for word in keywords:
        tally[word] += 1

    keyword_total = len(keywords)
    ranked = sorted(tally.items(), key=lambda item: item[1], reverse=True)
    for word, count in ranked:
        share = count / keyword_total * 100
        print(f"{word}: {count} ({share:.2f}%)")

    print("\n========== stars 统计 ==========")
    star_total = len(stars)
    if star_total == 0:
        print("没有 stars 字段或无有效数据")
        return

    print(f"个数: {star_total}")
    print(f"最小值: {min(stars)}")
    print(f"最大值: {max(stars)}")
    print(f"均值: {mean(stars):.2f}")
    # Sample variance needs at least two data points.
    if star_total >= 2:
        print(f"方差: {variance(stars):.2f}")

    # Histogram of star counts over the requested bins.
    bins, labels = make_bins(start, end, step)
    histogram = distribute(stars, bins, labels)

    print("区间分布")
    for label in labels:
        count = histogram[label]
        share = count / star_total * 100
        print(f"{label}: {count} ({share:.2f}%)")


if __name__ == "__main__":
    # Usage: script.py [path [start [end [step]]]]
    # Every positional argument is optional; omitted ones fall back to the
    # original hard-coded values, so a bare invocation behaves as before.
    cli_args = sys.argv[1:]
    path = (
        cli_args[0]
        if len(cli_args) > 0
        else "/home/weifengsun/tangou1/domain_code/src/workdir/repos_checked.csv"
    )
    start = int(cli_args[1]) if len(cli_args) > 1 else 0
    end = int(cli_args[2]) if len(cli_args) > 2 else 200
    step = int(cli_args[3]) if len(cli_args) > 3 else 20
    analyze_csv(path, start, end, step)