DouDou committed on
Commit
156430e
·
verified ·
1 Parent(s): 9f60e31

Upload data1/reporting/stage_b_stats.py with huggingface_hub

Browse files
Files changed (1) hide show
  1. data1/reporting/stage_b_stats.py +302 -0
data1/reporting/stage_b_stats.py ADDED
@@ -0,0 +1,302 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ Stage B: 统计 repos_check_history.csv 的过滤效果
3
+ YES/NO、按keyword通过率、reason长度与Top词/短语
4
+ """
5
+ import csv
6
+ import sys
7
+ from collections import defaultdict, Counter
8
+ from tqdm import tqdm
9
+ import statistics
10
+ import re
11
+ from pathlib import Path
12
+
13
+ csv.field_size_limit(sys.maxsize)
14
+
15
+
16
class StageBStats:
    """Stage B: measure the filtering effect recorded in repos_check_history.csv.

    Rows are consumed one at a time and folded into ``self.stats``; the
    ``save_*`` methods then write report files into ``output_dir``:

    * ``filter_summary.json`` - YES/NO totals and rates, reason-length
      statistics, multi-keyword consistency, topics/description presence.
    * ``filter_by_keyword.csv`` - YES/NO counts and YES rate per keyword.
    * ``reason_terms_yes_no.csv`` / ``reason_phrases_yes_no.csv`` - most
      common words / bigrams found in the ``reason`` column, per label.
    * ``reason_length_distribution.csv`` - one row per non-empty reason.

    The columns read from each row are ``keyword``, ``full_name``,
    ``is_relevant``, ``reason``, ``topics`` and ``description``.
    """

    # Compiled once instead of on every extract_words() call.
    _WORD_RE = re.compile(r'\b[a-z]+\b')

    # English stop words dropped from the reason word/phrase statistics.
    _STOP_WORDS = frozenset({
        'the', 'a', 'an', 'and', 'or', 'but', 'in', 'on', 'at', 'to', 'for',
        'of', 'with', 'by', 'is', 'are', 'was', 'were', 'be', 'been', 'being',
        'have', 'has', 'had', 'do', 'does', 'did', 'will', 'would', 'should',
        'could', 'may', 'might', 'must', 'can', 'this', 'that', 'these', 'those',
        'it', 'its', 'they', 'them', 'their', 'we', 'our', 'you', 'your',
        'not', 'no', 'yes', 'if', 'as', 'from', 'which', 'what', 'when', 'where',
        'why', 'how', 'who', 'whom', 'whose', 'about', 'into', 'through', 'during',
    })

    def __init__(self, csv_path, output_dir):
        """Remember the input path, create ``output_dir`` and reset counters.

        Args:
            csv_path: Path of the CSV file read by ``process_csv``.
            output_dir: Directory for the report files; created (including
                parents) if it does not exist yet.
        """
        self.csv_path = csv_path
        self.output_dir = Path(output_dir)
        self.output_dir.mkdir(parents=True, exist_ok=True)

        self.stats = {
            'total': 0,
            'yes': 0,
            'no': 0,
            'by_keyword': defaultdict(lambda: {'yes': 0, 'no': 0}),
            'reason_lengths_yes': [],
            'reason_lengths_no': [],
            'reason_texts_yes': [],
            'reason_texts_no': [],
            # full_name -> {keyword -> verdict}: results of one repository
            # under different keywords.
            'full_name_keyword_map': defaultdict(lambda: defaultdict(str)),
            # For each label: how many rows had ('yes') / lacked ('no') the field.
            'has_topics_yes': {'yes': 0, 'no': 0},
            'has_topics_no': {'yes': 0, 'no': 0},
            'has_description_yes': {'yes': 0, 'no': 0},
            'has_description_no': {'yes': 0, 'no': 0},
        }

    def extract_words(self, text):
        """Tokenize ``text`` into lowercase ASCII words (English only).

        Words of one or two letters and stop words are dropped.

        Returns:
            The remaining words in order of appearance; ``[]`` for empty or
            ``None`` input.
        """
        if not text:
            return []
        return [
            word
            for word in self._WORD_RE.findall(text.lower())
            if len(word) > 2 and word not in self._STOP_WORDS
        ]

    def extract_phrases(self, text, n=2):
        """Return the space-joined n-grams of the words extracted from ``text``.

        Returns ``[]`` when the text yields fewer than ``n`` words.
        """
        words = self.extract_words(text)
        if len(words) < n:
            return []
        return [' '.join(words[i:i + n]) for i in range(len(words) - n + 1)]

    def is_empty(self, val):
        """Return True if ``val`` is None, blank, or the literal text 'none'."""
        if val is None:
            return True
        text = str(val).strip()
        return text == '' or text.lower() == 'none'

    @staticmethod
    def _field(row, name):
        """Return ``row[name]`` as a stripped string; missing or None -> ''.

        csv.DictReader fills absent trailing columns with None, so a plain
        ``row.get(name, '').strip()`` would raise on short rows.
        """
        value = row.get(name)
        return '' if value is None else str(value).strip()

    def process_row(self, row):
        """Fold one CSV row (a mapping of column name to value) into the stats.

        Every row increments ``total``; only rows whose ``is_relevant`` is
        YES or NO (case-insensitive, surrounding whitespace ignored)
        contribute to the remaining counters.
        """
        stats = self.stats
        stats['total'] += 1

        keyword = self._field(row, 'keyword')
        full_name = self._field(row, 'full_name')
        is_relevant = self._field(row, 'is_relevant').upper()
        reason = self._field(row, 'reason')
        has_topics = not self.is_empty(self._field(row, 'topics'))
        has_description = not self.is_empty(self._field(row, 'description'))

        # Consistency bookkeeping records every verdict, even unrecognised ones.
        if full_name:
            stats['full_name_keyword_map'][full_name][keyword] = is_relevant

        if is_relevant not in ('YES', 'NO'):
            return
        label = is_relevant.lower()

        # YES/NO totals and reason samples.
        stats[label] += 1
        if reason:
            stats[f'reason_lengths_{label}'].append(len(reason))
            stats[f'reason_texts_{label}'].append(reason)

        # Per-keyword counts.
        if keyword:
            stats['by_keyword'][keyword][label] += 1

        # Relationship between available metadata and the verdict.
        stats[f'has_topics_{label}']['yes' if has_topics else 'no'] += 1
        stats[f'has_description_{label}']['yes' if has_description else 'no'] += 1

    def analyze_consistency(self):
        """Check whether one repository gets the same verdict for all keywords.

        Returns:
            A dict with ``total_multi_keyword_repos`` (repositories seen with
            more than one keyword), ``conflicts`` (those having both a YES and
            a NO verdict) and ``conflict_rate`` in percent (0 when there are
            no multi-keyword repositories).
        """
        conflicts = 0
        total_multi_keyword = 0

        for keyword_results in self.stats['full_name_keyword_map'].values():
            if len(keyword_results) <= 1:
                continue
            total_multi_keyword += 1
            verdicts = set(keyword_results.values())
            if 'YES' in verdicts and 'NO' in verdicts:
                conflicts += 1

        return {
            'total_multi_keyword_repos': total_multi_keyword,
            'conflicts': conflicts,
            'conflict_rate': self._percent(conflicts, total_multi_keyword),
        }

    def process_csv(self):
        """Stream ``self.csv_path`` through ``process_row``."""
        print(f"Processing {self.csv_path}...")

        # newline='' lets the csv module handle line endings inside quoted
        # fields itself, as recommended by the csv documentation.
        with open(self.csv_path, 'r', newline='', encoding='utf-8', errors='replace') as f:
            reader = csv.DictReader(f)
            for row in tqdm(reader, desc="Processing repos_check_history.csv"):
                self.process_row(row)

    @staticmethod
    def _percent(part, whole):
        """Return ``part`` as a percentage of ``whole``; 0 when ``whole`` is 0."""
        return part / whole * 100 if whole > 0 else 0

    @staticmethod
    def _length_summary(lengths):
        """Return mean/median/min/max of a non-empty list of lengths."""
        return {
            'mean': statistics.mean(lengths),
            'median': statistics.median(lengths),
            'min': min(lengths),
            'max': max(lengths),
        }

    def save_summary(self):
        """Write the overall summary to ``filter_summary.json``."""
        import json

        stats = self.stats
        total = stats['total']
        summary = {
            'total': total,
            'yes': stats['yes'],
            'no': stats['no'],
            'yes_rate': self._percent(stats['yes'], total),
            'no_rate': self._percent(stats['no'], total),
        }

        # Reason-length statistics; omitted for a label without any reasons.
        for label in ('yes', 'no'):
            lengths = stats[f'reason_lengths_{label}']
            if lengths:
                summary[f'reason_length_{label}'] = self._length_summary(lengths)

        # Consistency of verdicts across keywords.
        summary['consistency'] = self.analyze_consistency()

        # Share of YES rows (resp. NO rows) that carry topics / a description.
        info_analysis = {}
        for name in ('has_topics_yes', 'has_description_yes',
                     'has_topics_no', 'has_description_no'):
            counts = stats[name]
            info_analysis[f'{name}_rate'] = self._percent(
                counts['yes'], counts['yes'] + counts['no'])
        summary['info_analysis'] = info_analysis

        with open(self.output_dir / 'filter_summary.json', 'w', encoding='utf-8') as f:
            json.dump(summary, f, indent=2, ensure_ascii=False)

    def save_by_keyword(self):
        """Write per-keyword YES/NO counts to ``filter_by_keyword.csv``.

        Rows are sorted by total count, descending.
        """
        import pandas as pd

        rows = []
        for kw, data in self.stats['by_keyword'].items():
            total = data['yes'] + data['no']
            rows.append({
                'keyword': kw,
                'yes': data['yes'],
                'no': data['no'],
                'total': total,
                'yes_rate': self._percent(data['yes'], total),
            })

        # Explicit columns keep sort_values() and the CSV header working even
        # when no keyword was seen at all.
        df = pd.DataFrame(rows, columns=['keyword', 'yes', 'no', 'total', 'yes_rate'])
        df = df.sort_values('total', ascending=False)
        df.to_csv(self.output_dir / 'filter_by_keyword.csv', index=False)

    def _count_reason_terms(self, texts):
        """Count words and bigrams over ``texts``, tokenizing each text once."""
        word_counts = Counter()
        bigram_counts = Counter()
        for text in texts:
            words = self.extract_words(text)
            word_counts.update(words)
            bigram_counts.update(' '.join(pair) for pair in zip(words, words[1:]))
        return word_counts, bigram_counts

    def save_reason_terms(self):
        """Write the most common reason words and bigrams per label.

        ``reason_terms_yes_no.csv`` holds the top 50 words and
        ``reason_phrases_yes_no.csv`` the top 30 bigrams, YES rows first.
        """
        import pandas as pd

        yes_words, yes_phrases = self._count_reason_terms(self.stats['reason_texts_yes'])
        no_words, no_phrases = self._count_reason_terms(self.stats['reason_texts_no'])

        # Top words.
        word_rows = [
            {'term': term, 'count': count, 'type': 'word', 'label': label}
            for label, counter in (('YES', yes_words), ('NO', no_words))
            for term, count in counter.most_common(50)
        ]
        # Explicit columns so an empty result still produces a header row.
        words_df = pd.DataFrame(word_rows, columns=['term', 'count', 'type', 'label'])
        words_df.to_csv(self.output_dir / 'reason_terms_yes_no.csv', index=False)

        # Top phrases (bigrams).
        phrase_rows = [
            {'phrase': phrase, 'count': count, 'label': label}
            for label, counter in (('YES', yes_phrases), ('NO', no_phrases))
            for phrase, count in counter.most_common(30)
        ]
        phrases_df = pd.DataFrame(phrase_rows, columns=['phrase', 'count', 'label'])
        phrases_df.to_csv(self.output_dir / 'reason_phrases_yes_no.csv', index=False)

    def save_reason_length_distribution(self):
        """Write every reason length with its label to
        ``reason_length_distribution.csv`` (YES rows first)."""
        import pandas as pd

        yes_lengths = self.stats['reason_lengths_yes']
        no_lengths = self.stats['reason_lengths_no']
        # Built as one frame so an empty label cannot change the dtype of the
        # integer lengths during concatenation.
        df = pd.DataFrame({
            'length': yes_lengths + no_lengths,
            'label': ['YES'] * len(yes_lengths) + ['NO'] * len(no_lengths),
        })
        df.to_csv(self.output_dir / 'reason_length_distribution.csv', index=False)

    def run(self):
        """Run the full pipeline: read the CSV, then write every report."""
        print("Stage B: Processing repos_check_history.csv...")
        self.process_csv()
        print("Saving results...")
        self.save_summary()
        self.save_by_keyword()
        self.save_reason_terms()
        self.save_reason_length_distribution()
        print(f"Stage B complete! Results saved to {self.output_dir}")
295
+
296
+
297
if __name__ == "__main__":
    # Usage: stage_b_stats.py [csv_path [output_dir]]
    # Both arguments are optional; without them the script falls back to the
    # original hard-coded locations below.
    default_csv_path = "/home/weifengsun/tangou1/domain_code/src/workdir/repos_check_history.csv"
    default_output_dir = "/home/weifengsun/tangou1/domain_code/src/workdir/reporting/stage_b"
    cli_args = sys.argv[1:]
    csv_path = cli_args[0] if len(cli_args) > 0 else default_csv_path
    output_dir = cli_args[1] if len(cli_args) > 1 else default_output_dir
    stats = StageBStats(csv_path, output_dir)
    stats.run()
302
+