SciCode
/

dataset-builder

Model card Files Files and versions

xet

Community

DouDou committed on Feb 19

Commit

d50199f

verified ·

1 Parent(s): ec67b61

Upload data1/reporting/main.py with huggingface_hub

Browse files

Files changed (1) hide show

data1/reporting/main.py +187 -0

data1/reporting/main.py ADDED Viewed

	@@ -0,0 +1,187 @@

+"""
+主入口脚本：执行完整的统计报表流程
+"""
+import argparse
+from pathlib import Path
+import sys
+# 导入各模块
+from stage_a_stats import StageAStats
+from stage_b_stats import StageBStats
+from repo_meta_scan import RepoMetaScan
+from code_file_stats import CodeFileStats
+from code_file_stats_fast import CodeFileStatsFast  # 优化版本
+from visualization import generate_all_visualizations
+from join_insights import JoinInsights
+def main():
+    parser = argparse.ArgumentParser(description='生成数据统计报表')
+    parser.add_argument('--repos-searched', type=str,
+                       default='/home/weifengsun/tangou1/domain_code/src/workdir/repos_searched.csv',
+                       help='repos_searched.csv路径')
+    parser.add_argument('--repos-check-history', type=str,
+                       default='/home/weifengsun/tangou1/domain_code/src/workdir/repos_check_history.csv',
+                       help='repos_check_history.csv路径')
+    parser.add_argument('--repos-filtered', type=str,
+                       default='/home/weifengsun/tangou1/domain_code/src/workdir/repos_filtered',
+                       help='repos_filtered目录路径')
+    parser.add_argument('--output-dir', type=str,
+                       default='/home/weifengsun/tangou1/domain_code/src/workdir/reporting',
+                       help='输出目录')
+    parser.add_argument('--top-n', type=int, default=None,
+                       help='分析的仓库数量（字典序前N个，None表示所有）')
+    parser.add_argument('--workers', type=int, default=8,
+                       help='代码文件统计的并行worker数（默认CPU-1）')
+    parser.add_argument('--stage-a', action='store_true',
+                       help='运行Stage A（搜索阶段统计）')
+    parser.add_argument('--stage-b', action='store_true',
+                       help='运行Stage B（过滤阶段统计）')
+    parser.add_argument('--repo-meta', action='store_true',
+                       help='运行仓库元画像扫描')
+    parser.add_argument('--code-stats', action='store_true',
+                       help='运行代码文件级统计')
+    parser.add_argument('--code-stats-fast', action='store_true',
+                       help='运行代码文件级统计（快速版本，约提速10-20倍）')
+    parser.add_argument('--visualization', action='store_true',
+                       help='生成图表（需要先有stage-a, stage-b, repo-meta, code-stats的数据）')
+    parser.add_argument('--insights', action='store_true',
+                       help='运行关联分析（需要先有stage-a, code-stats, stage-b的数据）')
+    args = parser.parse_args()
+    output_dir = Path(args.output_dir)
+    output_dir.mkdir(parents=True, exist_ok=True)
+    print("=" * 80)
+    print("数据统计报表生成系统")
+    print("=" * 80)
+    print(f"输出目录: {output_dir}")
+    print(f"分析仓库数: {args.top_n if args.top_n else '所有'}")
+    # 检查是否有指定任何阶段
+    has_stage = any([
+        args.stage_a, args.stage_b, args.repo_meta,
+        args.code_stats, args.code_stats_fast, args.visualization, args.insights
+    ])
+    if not has_stage:
+        print("\n错误: 请至少指定一个要运行的阶段！")
+        print("可用选项:")
+        print("  --stage-a          运行Stage A（搜索阶段统计）")
+        print("  --stage-b          运行Stage B（过滤阶段统计）")
+        print("  --repo-meta        运行仓库元画像扫描")
+        print("  --code-stats       运行代码文件级统计")
+        print("  --code-stats-fast  运行代码文件级统计（快速版本，推荐）")
+        print("  --visualization    生成图表")
+        print("  --insights         运行关联分析")
+        print("\n示例: python main.py --stage-a --stage-b")
+        return
+    print()
+    # 定义输出目录路径（即使不运行也需要，因为可能被其他阶段使用）
+    stage_a_dir = output_dir / 'stage_a'
+    stage_b_dir = output_dir / 'stage_b'
+    repo_meta_dir = output_dir / 'repo_meta'
+    code_stats_dir = output_dir / 'code_stats'
+    # Stage A: 搜索阶段统计
+    if args.stage_a:
+        print("\n" + "=" * 80)
+        print("Stage A: 搜索阶段统计 (repos_searched.csv)")
+        print("=" * 80)
+        stage_a_stats = StageAStats(args.repos_searched, stage_a_dir)
+        stage_a_stats.run()
+    # Stage B: 过滤阶段统计
+    if args.stage_b:
+        print("\n" + "=" * 80)
+        print("Stage B: 过滤阶段统计 (repos_check_history.csv)")
+        print("=" * 80)
+        stage_b_stats = StageBStats(args.repos_check_history, stage_b_dir)
+        stage_b_stats.run()
+    # 仓库元画像扫描
+    if args.repo_meta:
+        print("\n" + "=" * 80)
+        print("仓库元画像扫描 (repos_filtered)")
+        print("=" * 80)
+        repo_meta_scanner = RepoMetaScan(args.repos_filtered, repo_meta_dir, top_n=args.top_n)
+        repo_meta_scanner.run()
+    # Stage C: 代码文件级统计
+    if args.code_stats:
+        print("\n" + "=" * 80)
+        print("Stage C: 代码文件级统计（原版）")
+        print("=" * 80)
+        code_stats = CodeFileStats(args.repos_filtered, code_stats_dir,
+                                   top_n=args.top_n)
+        code_stats.run(num_workers=args.workers)
+    # Stage C: 代码文件级统计（快速版本）
+    if args.code_stats_fast:
+        print("\n" + "=" * 80)
+        print("Stage C: 代码文件级统计（快速版本）")
+        print("=" * 80)
+        code_stats_fast = CodeFileStatsFast(
+            args.repos_filtered,
+            code_stats_dir,
+            top_n=args.top_n,
+            max_file_size_mb=2,
+            max_files_per_repo=500  # 限制每个仓库最多500个文件
+        )
+        code_stats_fast.run(num_workers=args.workers if args.workers else 48)
+    # 图表生成（需要前面的数据）
+    if args.visualization:
+        print("\n" + "=" * 80)
+        print("生成图表")
+        print("=" * 80)
+        # 检查必要的数据是否存在
+        required_dirs = [stage_a_dir, stage_b_dir, repo_meta_dir, code_stats_dir]
+        missing_dirs = [d for d in required_dirs if not d.exists()]
+        if missing_dirs:
+            print(f"警告: 以下目录不存在，图表生成可能不完整: {[str(d) for d in missing_dirs]}")
+        generate_all_visualizations(
+            str(stage_a_dir),
+            str(stage_b_dir),
+            str(repo_meta_dir),
+            str(code_stats_dir),
+            args.repos_searched,
+            top_n=args.top_n
+        )
+    # 关联分析（需要前面的数据）
+    if args.insights:
+        print("\n" + "=" * 80)
+        print("关联分析与洞察")
+        print("=" * 80)
+        # 检查必要的数据是否存在（动态文件名）
+        top_n_suffix = f"_top{args.top_n}" if args.top_n else ""
+        repo_level_csv = code_stats_dir / f'repo_level_metrics{top_n_suffix}.csv'
+        if not repo_level_csv.exists():
+            print(f"错误: 代码统计文件不存在: {repo_level_csv}")
+            print("请先运行 --code-stats")
+            return
+        insights_dir = output_dir / 'insights'
+        join_insights = JoinInsights(
+            args.repos_searched,
+            str(repo_level_csv),
+            args.repos_check_history,
+            str(insights_dir)
+        )
+        join_insights.run()
+    print("\n" + "=" * 80)
+    print("完成！所有结果已保存到:")
+    print(f"  - 数据表格: {output_dir}")
+    print(f"  - 图表: {output_dir / 'figures'}")
+    print("=" * 80)
+# Run the reporting pipeline only when this file is executed as a script,
+# not when it is imported as a module.
+if __name__ == "__main__":
+    main()