SciCode
/

dataset-builder

Model card Files Files and versions

xet

Community

DouDou committed on Feb 19

Commit

156430e

verified ·

1 Parent(s): 9f60e31

Upload data1/reporting/stage_b_stats.py with huggingface_hub

Browse files

Files changed (1) hide show

data1/reporting/stage_b_stats.py +302 -0

data1/reporting/stage_b_stats.py ADDED Viewed

	@@ -0,0 +1,302 @@

+"""
+Stage B: 统计 repos_check_history.csv 的过滤效果
+YES/NO、按keyword通过率、reason长度与Top词/短语
+"""
+import csv
+import sys
+from collections import defaultdict, Counter
+from tqdm import tqdm
+import statistics
+import re
+from pathlib import Path
+csv.field_size_limit(sys.maxsize)
+class StageBStats:
+    def __init__(self, csv_path, output_dir):
+        self.csv_path = csv_path
+        self.output_dir = Path(output_dir)
+        self.output_dir.mkdir(parents=True, exist_ok=True)
+        self.stats = {
+            'total': 0,
+            'yes': 0,
+            'no': 0,
+            'by_keyword': defaultdict(lambda: {'yes': 0, 'no': 0}),
+            'reason_lengths_yes': [],
+            'reason_lengths_no': [],
+            'reason_texts_yes': [],
+            'reason_texts_no': [],
+            'full_name_keyword_map': defaultdict(lambda: defaultdict(str)),  # 同一仓库不同keyword的结果
+            'has_topics_yes': {'yes': 0, 'no': 0},
+            'has_topics_no': {'yes': 0, 'no': 0},
+            'has_description_yes': {'yes': 0, 'no': 0},
+            'has_description_no': {'yes': 0, 'no': 0},
+        }
+    def extract_words(self, text):
+        """提取词（简单分词，支持英文）"""
+        if not text:
+            return []
+        # 转小写，提取单词
+        words = re.findall(r'\b[a-z]+\b', text.lower())
+        # 过滤停用词
+        stop_words = {'the', 'a', 'an', 'and', 'or', 'but', 'in', 'on', 'at', 'to', 'for',
+                     'of', 'with', 'by', 'is', 'are', 'was', 'were', 'be', 'been', 'being',
+                     'have', 'has', 'had', 'do', 'does', 'did', 'will', 'would', 'should',
+                     'could', 'may', 'might', 'must', 'can', 'this', 'that', 'these', 'those',
+                     'it', 'its', 'they', 'them', 'their', 'we', 'our', 'you', 'your',
+                     'not', 'no', 'yes', 'if', 'as', 'from', 'which', 'what', 'when', 'where',
+                     'why', 'how', 'who', 'whom', 'whose', 'about', 'into', 'through', 'during'}
+        return [w for w in words if len(w) > 2 and w not in stop_words]
+    def extract_phrases(self, text, n=2):
+        """提取n-gram短语"""
+        words = self.extract_words(text)
+        if len(words) < n:
+            return []
+        phrases = []
+        for i in range(len(words) - n + 1):
+            phrases.append(' '.join(words[i:i+n]))
+        return phrases
+    def is_empty(self, val):
+        """判断字段是否为空"""
+        if val is None:
+            return True
+        val = str(val).strip()
+        return val == '' or val.lower() == 'none'
+    def process_row(self, row):
+        """处理单行数据"""
+        self.stats['total'] += 1
+        keyword = row.get('keyword', '').strip()
+        full_name = row.get('full_name', '').strip()
+        is_relevant = row.get('is_relevant', '').strip().upper()
+        reason = row.get('reason', '').strip()
+        topics = row.get('topics', '').strip()
+        description = row.get('description', '').strip()
+        # YES/NO统计
+        if is_relevant == 'YES':
+            self.stats['yes'] += 1
+            if reason:
+                self.stats['reason_lengths_yes'].append(len(reason))
+                self.stats['reason_texts_yes'].append(reason)
+        elif is_relevant == 'NO':
+            self.stats['no'] += 1
+            if reason:
+                self.stats['reason_lengths_no'].append(len(reason))
+                self.stats['reason_texts_no'].append(reason)
+        # 按keyword统计
+        if keyword:
+            if is_relevant == 'YES':
+                self.stats['by_keyword'][keyword]['yes'] += 1
+            elif is_relevant == 'NO':
+                self.stats['by_keyword'][keyword]['no'] += 1
+        # 同一仓库多keyword的一致性检查
+        if full_name:
+            self.stats['full_name_keyword_map'][full_name][keyword] = is_relevant
+        # 信息量与YES率的关系
+        has_topics = not self.is_empty(topics)
+        has_description = not self.is_empty(description)
+        if is_relevant == 'YES':
+            if has_topics:
+                self.stats['has_topics_yes']['yes'] += 1
+            else:
+                self.stats['has_topics_yes']['no'] += 1
+            if has_description:
+                self.stats['has_description_yes']['yes'] += 1
+            else:
+                self.stats['has_description_yes']['no'] += 1
+        elif is_relevant == 'NO':
+            if has_topics:
+                self.stats['has_topics_no']['yes'] += 1
+            else:
+                self.stats['has_topics_no']['no'] += 1
+            if has_description:
+                self.stats['has_description_no']['yes'] += 1
+            else:
+                self.stats['has_description_no']['no'] += 1
+    def analyze_consistency(self):
+        """分析同一仓库多keyword结果的一致性"""
+        conflicts = 0
+        total_multi_keyword = 0
+        for full_name, keyword_results in self.stats['full_name_keyword_map'].items():
+            if len(keyword_results) > 1:
+                total_multi_keyword += 1
+                values = set(keyword_results.values())
+                if 'YES' in values and 'NO' in values:
+                    conflicts += 1
+        return {
+            'total_multi_keyword_repos': total_multi_keyword,
+            'conflicts': conflicts,
+            'conflict_rate': conflicts / total_multi_keyword * 100 if total_multi_keyword > 0 else 0
+        }
+    def process_csv(self):
+        """处理CSV文件"""
+        print(f"Processing {self.csv_path}...")
+        with open(self.csv_path, 'r', encoding='utf-8', errors='replace') as f:
+            reader = csv.DictReader(f)
+            for row in tqdm(reader, desc="Processing repos_check_history.csv"):
+                self.process_row(row)
+    def save_summary(self):
+        """保存汇总"""
+        summary = {
+            'total': self.stats['total'],
+            'yes': self.stats['yes'],
+            'no': self.stats['no'],
+            'yes_rate': self.stats['yes'] / self.stats['total'] * 100 if self.stats['total'] > 0 else 0,
+            'no_rate': self.stats['no'] / self.stats['total'] * 100 if self.stats['total'] > 0 else 0,
+        }
+        # reason长度统计
+        if self.stats['reason_lengths_yes']:
+            summary['reason_length_yes'] = {
+                'mean': statistics.mean(self.stats['reason_lengths_yes']),
+                'median': statistics.median(self.stats['reason_lengths_yes']),
+                'min': min(self.stats['reason_lengths_yes']),
+                'max': max(self.stats['reason_lengths_yes']),
+            }
+        if self.stats['reason_lengths_no']:
+            summary['reason_length_no'] = {
+                'mean': statistics.mean(self.stats['reason_lengths_no']),
+                'median': statistics.median(self.stats['reason_lengths_no']),
+                'min': min(self.stats['reason_lengths_no']),
+                'max': max(self.stats['reason_lengths_no']),
+            }
+        # 一致性分析
+        consistency = self.analyze_consistency()
+        summary['consistency'] = consistency
+        # 信息量分析
+        summary['info_analysis'] = {
+            'has_topics_yes_rate': self.stats['has_topics_yes']['yes'] / (self.stats['has_topics_yes']['yes'] + self.stats['has_topics_yes']['no']) * 100 if (self.stats['has_topics_yes']['yes'] + self.stats['has_topics_yes']['no']) > 0 else 0,
+            'has_description_yes_rate': self.stats['has_description_yes']['yes'] / (self.stats['has_description_yes']['yes'] + self.stats['has_description_yes']['no']) * 100 if (self.stats['has_description_yes']['yes'] + self.stats['has_description_yes']['no']) > 0 else 0,
+            'has_topics_no_rate': self.stats['has_topics_no']['yes'] / (self.stats['has_topics_no']['yes'] + self.stats['has_topics_no']['no']) * 100 if (self.stats['has_topics_no']['yes'] + self.stats['has_topics_no']['no']) > 0 else 0,
+            'has_description_no_rate': self.stats['has_description_no']['yes'] / (self.stats['has_description_no']['yes'] + self.stats['has_description_no']['no']) * 100 if (self.stats['has_description_no']['yes'] + self.stats['has_description_no']['no']) > 0 else 0,
+        }
+        import json
+        with open(self.output_dir / 'filter_summary.json', 'w', encoding='utf-8') as f:
+            json.dump(summary, f, indent=2, ensure_ascii=False)
+    def save_by_keyword(self):
+        """保存按keyword的统计"""
+        rows = []
+        for kw, data in self.stats['by_keyword'].items():
+            total = data['yes'] + data['no']
+            rows.append({
+                'keyword': kw,
+                'yes': data['yes'],
+                'no': data['no'],
+                'total': total,
+                'yes_rate': data['yes'] / total * 100 if total > 0 else 0,
+            })
+        import pandas as pd
+        df = pd.DataFrame(rows)
+        df = df.sort_values('total', ascending=False)
+        df.to_csv(self.output_dir / 'filter_by_keyword.csv', index=False)
+    def save_reason_terms(self):
+        """保存reason的词频统计"""
+        # YES的Top词
+        yes_words = []
+        for text in self.stats['reason_texts_yes']:
+            yes_words.extend(self.extract_words(text))
+        yes_word_counter = Counter(yes_words)
+        # NO的Top词
+        no_words = []
+        for text in self.stats['reason_texts_no']:
+            no_words.extend(self.extract_words(text))
+        no_word_counter = Counter(no_words)
+        # YES的Top短语（bigram）
+        yes_phrases = []
+        for text in self.stats['reason_texts_yes']:
+            yes_phrases.extend(self.extract_phrases(text, n=2))
+        yes_phrase_counter = Counter(yes_phrases)
+        # NO的Top短语
+        no_phrases = []
+        for text in self.stats['reason_texts_no']:
+            no_phrases.extend(self.extract_phrases(text, n=2))
+        no_phrase_counter = Counter(no_phrases)
+        import pandas as pd
+        # 保存Top词
+        yes_df = pd.DataFrame([
+            {'term': term, 'count': count, 'type': 'word', 'label': 'YES'}
+            for term, count in yes_word_counter.most_common(50)
+        ])
+        no_df = pd.DataFrame([
+            {'term': term, 'count': count, 'type': 'word', 'label': 'NO'}
+            for term, count in no_word_counter.most_common(50)
+        ])
+        words_df = pd.concat([yes_df, no_df], ignore_index=True)
+        words_df.to_csv(self.output_dir / 'reason_terms_yes_no.csv', index=False)
+        # 保存Top短语
+        yes_phrase_df = pd.DataFrame([
+            {'phrase': phrase, 'count': count, 'label': 'YES'}
+            for phrase, count in yes_phrase_counter.most_common(30)
+        ])
+        no_phrase_df = pd.DataFrame([
+            {'phrase': phrase, 'count': count, 'label': 'NO'}
+            for phrase, count in no_phrase_counter.most_common(30)
+        ])
+        phrases_df = pd.concat([yes_phrase_df, no_phrase_df], ignore_index=True)
+        phrases_df.to_csv(self.output_dir / 'reason_phrases_yes_no.csv', index=False)
+    def save_reason_length_distribution(self):
+        """保存reason长度分布"""
+        import pandas as pd
+        yes_df = pd.DataFrame({
+            'length': self.stats['reason_lengths_yes'],
+            'label': 'YES'
+        })
+        no_df = pd.DataFrame({
+            'length': self.stats['reason_lengths_no'],
+            'label': 'NO'
+        })
+        df = pd.concat([yes_df, no_df], ignore_index=True)
+        df.to_csv(self.output_dir / 'reason_length_distribution.csv', index=False)
+    def run(self):
+        """执行完整流程"""
+        print("Stage B: Processing repos_check_history.csv...")
+        self.process_csv()
+        print("Saving results...")
+        self.save_summary()
+        self.save_by_keyword()
+        self.save_reason_terms()
+        self.save_reason_length_distribution()
+        print(f"Stage B complete! Results saved to {self.output_dir}")
+if __name__ == "__main__":
+    csv_path = "/home/weifengsun/tangou1/domain_code/src/workdir/repos_check_history.csv"
+    output_dir = "/home/weifengsun/tangou1/domain_code/src/workdir/reporting/stage_b"
+    stats = StageBStats(csv_path, output_dir)
+    stats.run()