DouDou committed on
Commit
880e02b
·
verified ·
1 Parent(s): e5d8191

Upload data1/analysis.py with huggingface_hub

Browse files
Files changed (1) hide show
  1. data1/analysis.py +353 -0
data1/analysis.py ADDED
@@ -0,0 +1,353 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import csv
2
+ import re
3
+ import tokenize
4
+ from io import StringIO
5
+ import os
6
+ from tqdm import tqdm
7
+ import json
8
+ import sys
9
+ from functools import lru_cache
10
+
11
# Allow arbitrarily large CSV fields (whole source files live in one cell).
csv.field_size_limit(sys.maxsize)

# ============== Precompiled regexes for performance ==============

# Line-comment rules (precompiled).
# group(1) captures the comment text after the marker.
_LINE_COMMENT_PATTERNS = {
    "python": re.compile(r"#(.*)$"),
    "shell": re.compile(r"#(.*)$"),
    "r": re.compile(r"#(.*)$"),
    "matlab": re.compile(r"%(.*)$"),
    "fortran": re.compile(r"!(.*)$"),
    "c/c++": re.compile(r"//(.*)$"),
    "java": re.compile(r"//(.*)$"),
    "go": re.compile(r"//(.*)$"),
    "rust": re.compile(r"//(.*)$"),
}
27
+
28
# Block-comment rules (precompiled). Languages with no block-comment
# syntax (shell, r, fortran) are intentionally absent from this table.
_BLOCK_COMMENT_PATTERNS = {
    # Python: the whole triple-quoted literal, quotes included, is group(1).
    "python": re.compile(r'("""[\s\S]*?"""|\'\'\'[\s\S]*?\'\'\')'),
    # C-family: group(1) is the comment body between /* and */.
    "c/c++": re.compile(r"/\*([\s\S]*?)\*/"),
    "java": re.compile(r"/\*([\s\S]*?)\*/"),
    "rust": re.compile(r"/\*([\s\S]*?)\*/"),
    "go": re.compile(r"/\*([\s\S]*?)\*/"),
    # MATLAB: %{ ... %} block comments.
    "matlab": re.compile(r"%\{([\s\S]*?)%\}"),
}
37
+
38
# Function-definition matching rules (precompiled).
# Group layout varies by language, and count_functions_and_parameters
# indexes into the findall() tuples accordingly:
#   most languages -> (name, params)
#   matlab         -> (name_with_return, bare_name, params)  3 groups
#   fortran        -> (keyword, name, params)                3 groups
#   shell          -> (name,) only -- shell defs declare no params
_FUNCTION_PATTERNS = {
    "python": re.compile(r"^[ \t]*def\s+(\w+)\s*\(([^)]*)\)", re.MULTILINE),
    "java": re.compile(r"""
        (?:public|protected|private|static|final|native|synchronized|abstract|\s)*
        \s*
        (?:[\w\<\>\[\],\s]+)            # return type (not captured)
        \s+
        (\w+)                           # method name
        \s*\(([^)]*)\)                  # parameter list
        (?:\s*throws\s+[\w,\s]+)?
        \s*\{
    """, re.MULTILINE | re.VERBOSE),
    "c/c++": re.compile(r"""
        ^[ \t]*
        (?!.*typedef)                   # skip typedefs
        (?!.*\#)                        # skip preprocessor lines
        (?:[\w\*\s&]+)                  # return type (not captured)
        \b(\w+)\s*                      # function name
        \(([^)]*)\)                     # parameter list
        \s*(?:const)?
        \s*(?:override)?
        \s*(?:noexcept)?
        \s*\{
    """, re.MULTILINE | re.VERBOSE),
    # Go: optionally skip a method receiver "(r T)".
    "go": re.compile(r"\bfunc\s+(?:\([^)]+\)\s*)?(\w+)\s*\(([^)]*)\)", re.MULTILINE),
    # Rust: optional pub/async and generic parameters before the arg list.
    "rust": re.compile(r"\b(?:pub\s+)?(?:async\s+)?fn\s+(\w+)\s*(?:<[^>]*>)?\s*\(([^)]*)\)", re.MULTILINE),
    # R: both `f <- function(...)` and `f = function(...)`.
    "r": re.compile(r"(\w+)\s*(?:<-|=)\s*function\s*\(([^)]*)\)", re.MULTILINE),
    # MATLAB: optional "[out1, out2] =" return list before the name.
    "matlab": re.compile(r"^[ \t]*function\s+(?:(?:\[?[\w,\s]*\]?\s*=\s*)?(\w+)|(\w+))\s*\(([^)]*)\)", re.MULTILINE),
    "shell": re.compile(r"^[ \t]*(?:function\s+)?(\w+)\s*\(\)\s*\{", re.MULTILINE),
    "fortran": re.compile(r"""
        (?i)                            # Fortran keywords are case-insensitive
        ^[ \t]*
        (?:recursive\s+)?
        (?:pure\s+)?
        (?:elemental\s+)?
        (?:[\w\*]+(?:\s*\([^)]*\))?\s+)? # optional result type, e.g. real(8)
        (function|subroutine)\s+
        (\w+)\s*
        \(([^)]*)\)
    """, re.MULTILINE | re.VERBOSE),
}
80
+
81
# Comment-stripping regexes used by _remove_comments (precompiled).
# Line patterns delete to end-of-line; block patterns match the whole
# comment so the caller can replace it while keeping line counts.
_REMOVE_COMMENT_PATTERNS = {
    "python_line": re.compile(r'#.*$', re.MULTILINE),
    "python_triple_dq": re.compile(r'"""[\s\S]*?"""'),
    "python_triple_sq": re.compile(r"'''[\s\S]*?'''"),
    "c_line": re.compile(r'//.*$', re.MULTILINE),
    "c_block": re.compile(r'/\*[\s\S]*?\*/'),
    "shell_line": re.compile(r'#.*$', re.MULTILINE),
    "matlab_line": re.compile(r'%.*$', re.MULTILINE),
    "matlab_block": re.compile(r'%\{[\s\S]*?%\}'),
    "fortran_line": re.compile(r'!.*$', re.MULTILINE),
}
93
+
94
def detect_language(file_path: str) -> str:
    """Guess the programming language purely from the file extension.

    Args:
        file_path: any path or filename; only the extension matters.

    Returns:
        One of the language keys used by the pattern tables above, or the
        lowercased extension itself when it is not recognised (callers use
        that fallback to surface unknown extensions, so unknowns are NOT
        collapsed to a sentinel like "unknown").
    """
    ext_map = {
        ".py": "python",
        ".java": "java",
        # C and C++ share one rule set.
        ".c": "c/c++",
        ".h": "c/c++",
        ".hh": "c/c++",
        ".hpp": "c/c++",
        ".cpp": "c/c++",
        ".cc": "c/c++",
        ".cxx": "c/c++",
        ".c++": "c/c++",
        # Extensions are lowercased before lookup, so the single lowercase
        # ".f" entry also covers ".F" (the old uppercase ".F" key was
        # unreachable dead data and has been removed).
        ".f": "fortran",
        ".f90": "fortran",
        ".f95": "fortran",
        ".r": "r",
        ".m": "matlab",  # MATLAB / Octave
        ".sh": "shell",
        ".bash": "shell",
        ".rs": "rust",
        ".go": "go",
    }

    ext = os.path.splitext(file_path)[1].lower().strip()
    return ext_map.get(ext, ext)
134
+
135
+
136
def count_comments(code: str, lang: str):
    """Count comment lines and comment tokens.

    Supports Python/Java/C++/Fortran/Matlab/R/Shell/Rust/Go/Jupyter and
    uses the precompiled regexes above for speed.

    Returns:
        (comment_lines, comment_token_count) — token count is a simple
        whitespace split of the comment text.
    """

    # Jupyter notebooks follow Python's comment rules.
    if lang == "jupyter":
        lang = "python"

    comment_lines = 0
    comment_tokens = []
    lines = code.splitlines()

    # Line indices already covered by a block comment, so the line-comment
    # pass below does not double count them.
    block_comment_line_indices = set()

    # ---------- B. Block comments first (recording their line numbers) ----------
    if lang in _BLOCK_COMMENT_PATTERNS:
        patt = _BLOCK_COMMENT_PATTERNS[lang]

        if lang == "python":
            # Python triple-quoted strings need special treatment: only
            # count them when they look like docstrings/comments.
            for match in patt.finditer(code):
                start_pos = match.start()
                end_pos = match.end()

                # 0-based start and end line numbers of the match.
                start_line = code[:start_pos].count('\n')
                end_line = code[:end_pos].count('\n')

                # Heuristic docstring check: a triple-quote preceded by '='
                # (within the last 20 chars) is an assigned string literal,
                # i.e. data rather than a comment — skip it.
                prefix = code[max(0, start_pos-20):start_pos].strip()
                if not prefix.endswith('='):
                    for line_idx in range(start_line, end_line + 1):
                        block_comment_line_indices.add(line_idx)

                    block_content = match.group(1)
                    # Strip the surrounding quotes. NOTE(review): both
                    # branches strip 3 chars, so the if/else is redundant;
                    # kept as-is to preserve the original code exactly.
                    if block_content.startswith('"""'):
                        block_content = block_content[3:-3]
                    else:
                        block_content = block_content[3:-3]

                    for b in block_content.splitlines():
                        comment_lines += 1
                        if b.strip():
                            comment_tokens.extend(b.strip().split())
        else:
            for match in patt.finditer(code):
                start_pos = match.start()
                end_pos = match.end()

                start_line = code[:start_pos].count('\n')
                end_line = code[:end_pos].count('\n')

                for line_idx in range(start_line, end_line + 1):
                    block_comment_line_indices.add(line_idx)

                # Use the captured body when the pattern has a group,
                # otherwise the whole match.
                block_content = match.group(1) if match.lastindex else match.group(0)
                for b in block_content.splitlines():
                    comment_lines += 1
                    if b.strip():
                        comment_tokens.extend(b.strip().split())

    # ---------- A. Line comments (skipping lines inside block comments) ----------
    if lang in _LINE_COMMENT_PATTERNS:
        patt = _LINE_COMMENT_PATTERNS[lang]
        for line_idx, line in enumerate(lines):
            if line_idx in block_comment_line_indices:
                continue

            m = patt.search(line)
            if m:
                prefix = line[:m.start()]
                # Crude "are we inside a string literal?" check: an odd
                # number of unescaped quotes before the marker means the
                # marker sits inside a string, not a comment.
                single_quotes = prefix.count("'") - prefix.count("\\'")
                double_quotes = prefix.count('"') - prefix.count('\\"')

                if single_quotes % 2 == 0 and double_quotes % 2 == 0:
                    comment_lines += 1
                    text = m.group(1)
                    if text:
                        comment_tokens.extend(text.strip().split())

    return comment_lines, len(comment_tokens)
220
+
221
+
222
def count_functions_and_parameters(code: str, lang: str):
    """Count function definitions and their parameters.

    Supports the languages in _FUNCTION_PATTERNS (incl. Fortran
    subroutine/function) and uses the precompiled regexes for speed.

    Returns:
        (function_count, parameter_count) — parameter_count is summed
        over all matched definitions by splitting the arg list on commas.
    """

    # Jupyter notebooks follow Python's rules.
    if lang == "jupyter":
        lang = "python"

    patt = _FUNCTION_PATTERNS.get(lang)
    if not patt:
        # Unsupported language (detect_language may return a raw extension).
        return 0, 0

    # Strip comments first so commented-out definitions are not matched.
    code_no_comments = _remove_comments(code, lang)

    # findall() yields tuples (or bare strings for single-group patterns).
    matches = patt.findall(code_no_comments)

    function_count = len(matches)

    parameter_count = 0
    for m in matches:
        if lang == "fortran":
            params = m[2]  # groups: (keyword, name, params)
        elif lang == "matlab":
            params = m[2] if len(m) > 2 else ""  # groups: (ret_name, name, params)
        else:
            # Most patterns yield (name, params); single-group patterns
            # (e.g. shell) yield a plain string, which has no params.
            params = m[1] if isinstance(m, tuple) and len(m) > 1 else ""

        params = params.strip() if params else ""
        if params:
            items = [p.strip() for p in params.split(",") if p.strip()]
            parameter_count += len(items)

    return function_count, parameter_count
259
+
260
+
261
def _remove_comments(code: str, lang: str) -> str:
    """Strip comments from *code* before function-definition matching.

    Line comments are deleted outright; block comments are replaced with
    an equal number of newlines so line numbering stays stable. Unknown
    languages are returned unchanged. Uses the precompiled patterns.
    """

    def _keep_newlines(m):
        # Preserve the line count of the removed block comment.
        return '\n' * m.group(0).count('\n')

    _DROP = ''

    # Per-language substitution plan: (pattern key, replacement), applied
    # in order. R reuses the '#' line rule; jupyter reuses Python's rules.
    py_plan = [
        ("python_line", _DROP),
        ("python_triple_dq", _keep_newlines),
        ("python_triple_sq", _keep_newlines),
    ]
    c_plan = [("c_line", _DROP), ("c_block", _keep_newlines)]
    plans = {
        "python": py_plan,
        "jupyter": py_plan,
        "c/c++": c_plan,
        "java": c_plan,
        "rust": c_plan,
        "go": c_plan,
        "shell": [("shell_line", _DROP)],
        "r": [("shell_line", _DROP)],  # R also uses '#'
        "matlab": [("matlab_line", _DROP), ("matlab_block", _keep_newlines)],
        "fortran": [("fortran_line", _DROP)],
    }

    for key, repl in plans.get(lang, ()):
        code = _REMOVE_COMMENT_PATTERNS[key].sub(repl, code)

    return code
287
+
288
+
289
def count_tokens(code: str) -> int:
    """Count tokens in *code*.

    Tries the Python tokenizer first; when the source is not tokenizable
    (non-Python code, unterminated strings, bad indentation, ...) it
    falls back to a simple whitespace split.
    """
    try:
        return len(list(tokenize.generate_tokens(StringIO(code).readline)))
    except Exception:
        # Was a bare `except:`, which also swallowed KeyboardInterrupt and
        # SystemExit; Exception keeps the best-effort fallback without that.
        return len(code.split())
295
+
296
+
297
def analyze_code(code_str, code_path):
    """Compute per-file statistics for one source file.

    Args:
        code_str: full text of the source file.
        code_path: its path; only the extension is used, for language
            detection.

    Returns:
        A dict of counters; "idx" is left as None for the caller to fill.
    """
    lang = detect_language(code_path)

    # NOTE: a trailing newline counts as starting one extra (empty) line,
    # unlike splitlines(); kept so the reported numbers stay comparable
    # with previously generated output.
    lines = code_str.count("\n") + 1
    empty_lines = sum(1 for line in code_str.splitlines() if not line.strip())
    comment_lines, comment_token_count = count_comments(code_str, lang)
    functions, parameters = count_functions_and_parameters(code_str, lang)
    tokens = count_tokens(code_str)

    return {
        "idx": None,  # filled in by the caller
        "language": lang,
        "total_lines": lines,
        "comment_lines": comment_lines,
        # "comment_tokenst" is a historical typo; it is kept so existing
        # consumers of the JSONL keep working, and the corrected
        # "comment_tokens" key is emitted alongside it.
        "comment_tokenst": comment_token_count,
        "comment_tokens": comment_token_count,
        "empty_lines": empty_lines,
        "code_lines": lines - empty_lines - comment_lines,
        "tokens": tokens,
        "functions": functions,
        "parameters": parameters,
    }
322
+
323
+
324
if __name__ == "__main__":
    input_dir = "/home/weifengsun/tangou1/domain_code/src/datasets/data_merged"
    output_dir = "/home/weifengsun/tangou1/domain_code/src/datasets/analysis2"

    # Create the output directory up front instead of crashing on the
    # first open() when it does not exist yet.
    os.makedirs(output_dir, exist_ok=True)

    # Process shards 110..119: <input_dir>/NNN.csv -> <output_dir>/NNN.jsonl
    for i in range(110, 120):
        input_path = os.path.join(input_dir, f"{i:03}.csv")
        output_path = os.path.join(output_dir, f"{i:03}.jsonl")

        results = []

        with open(input_path, "r", encoding="utf-8", errors="replace") as f:
            # Strip NUL bytes, which the csv module refuses to parse.
            filtered = (line.replace('\0', '') for line in f)
            reader = csv.DictReader(filtered)

            for idx, row in tqdm(enumerate(reader)):
                code_str = row.get("text")
                # Some shards name the path column "repo_path", others "path".
                code_path = row.get("repo_path") or row.get("path")

                # Skip malformed rows (missing code or path) instead of
                # crashing the whole shard inside analyze_code.
                if code_str is None or not code_path:
                    continue

                result = analyze_code(code_str, code_path)
                result["idx"] = f"{i:03}-{idx}"
                results.append(result)

        with open(output_path, "w", encoding="utf-8") as f:
            for r in tqdm(results):
                f.write(json.dumps(r) + "\n")