DouDou committed on
Commit
880e02b
·
verified ·
1 Parent(s): e5d8191

Upload data1/analysis.py with huggingface_hub

Browse files
Files changed (1) hide show
  1. data1/analysis.py +353 -0
data1/analysis.py ADDED
@@ -0,0 +1,353 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import csv
2
+ import re
3
+ import tokenize
4
+ from io import StringIO
5
+ import os
6
+ from tqdm import tqdm
7
+ import json
8
+ import sys
9
+ from functools import lru_cache
10
+
11
# Allow arbitrarily large CSV fields (whole source files live in one cell).
csv.field_size_limit(sys.maxsize)

# ============== Precompiled regexes for performance ==============

# Line-comment rules (precompiled).
# group(1) captures the comment text after the marker.
_LINE_COMMENT_PATTERNS = {
    "python": re.compile(r"#(.*)$"),
    "shell": re.compile(r"#(.*)$"),
    "r": re.compile(r"#(.*)$"),
    "matlab": re.compile(r"%(.*)$"),
    "fortran": re.compile(r"!(.*)$"),
    "c/c++": re.compile(r"//(.*)$"),
    "java": re.compile(r"//(.*)$"),
    "go": re.compile(r"//(.*)$"),
    "rust": re.compile(r"//(.*)$"),
}
27
+
28
# Block-comment rules (precompiled). Languages with no block-comment
# syntax (shell, r, fortran) are intentionally absent from this table.
_BLOCK_COMMENT_PATTERNS = {
    # Python: the whole triple-quoted literal, quotes included, is group(1).
    "python": re.compile(r'("""[\s\S]*?"""|\'\'\'[\s\S]*?\'\'\')'),
    # C-family: group(1) is the comment body between /* and */.
    "c/c++": re.compile(r"/\*([\s\S]*?)\*/"),
    "java": re.compile(r"/\*([\s\S]*?)\*/"),
    "rust": re.compile(r"/\*([\s\S]*?)\*/"),
    "go": re.compile(r"/\*([\s\S]*?)\*/"),
    # MATLAB: %{ ... %} block comments.
    "matlab": re.compile(r"%\{([\s\S]*?)%\}"),
}
37
+
38
# Function-definition matching rules (precompiled).
# Group layout varies by language, and count_functions_and_parameters
# indexes into the findall() tuples accordingly:
#   most languages -> (name, params)
#   matlab         -> (name_with_return, bare_name, params)  3 groups
#   fortran        -> (keyword, name, params)                3 groups
#   shell          -> (name,) only -- shell defs declare no params
_FUNCTION_PATTERNS = {
    "python": re.compile(r"^[ \t]*def\s+(\w+)\s*\(([^)]*)\)", re.MULTILINE),
    "java": re.compile(r"""
        (?:public|protected|private|static|final|native|synchronized|abstract|\s)*
        \s*
        (?:[\w\<\>\[\],\s]+)            # return type (not captured)
        \s+
        (\w+)                           # method name
        \s*\(([^)]*)\)                  # parameter list
        (?:\s*throws\s+[\w,\s]+)?
        \s*\{
    """, re.MULTILINE | re.VERBOSE),
    "c/c++": re.compile(r"""
        ^[ \t]*
        (?!.*typedef)                   # skip typedefs
        (?!.*\#)                        # skip preprocessor lines
        (?:[\w\*\s&]+)                  # return type (not captured)
        \b(\w+)\s*                      # function name
        \(([^)]*)\)                     # parameter list
        \s*(?:const)?
        \s*(?:override)?
        \s*(?:noexcept)?
        \s*\{
    """, re.MULTILINE | re.VERBOSE),
    # Go: optionally skip a method receiver "(r T)".
    "go": re.compile(r"\bfunc\s+(?:\([^)]+\)\s*)?(\w+)\s*\(([^)]*)\)", re.MULTILINE),
    # Rust: optional pub/async and generic parameters before the arg list.
    "rust": re.compile(r"\b(?:pub\s+)?(?:async\s+)?fn\s+(\w+)\s*(?:<[^>]*>)?\s*\(([^)]*)\)", re.MULTILINE),
    # R: both `f <- function(...)` and `f = function(...)`.
    "r": re.compile(r"(\w+)\s*(?:<-|=)\s*function\s*\(([^)]*)\)", re.MULTILINE),
    # MATLAB: optional "[out1, out2] =" return list before the name.
    "matlab": re.compile(r"^[ \t]*function\s+(?:(?:\[?[\w,\s]*\]?\s*=\s*)?(\w+)|(\w+))\s*\(([^)]*)\)", re.MULTILINE),
    "shell": re.compile(r"^[ \t]*(?:function\s+)?(\w+)\s*\(\)\s*\{", re.MULTILINE),
    "fortran": re.compile(r"""
        (?i)                            # Fortran keywords are case-insensitive
        ^[ \t]*
        (?:recursive\s+)?
        (?:pure\s+)?
        (?:elemental\s+)?
        (?:[\w\*]+(?:\s*\([^)]*\))?\s+)? # optional result type, e.g. real(8)
        (function|subroutine)\s+
        (\w+)\s*
        \(([^)]*)\)
    """, re.MULTILINE | re.VERBOSE),
}
80
+
81
# Comment-stripping regexes used by _remove_comments (precompiled).
# Line patterns delete to end-of-line; block patterns match the whole
# comment so the caller can replace it while keeping line counts.
_REMOVE_COMMENT_PATTERNS = {
    "python_line": re.compile(r'#.*$', re.MULTILINE),
    "python_triple_dq": re.compile(r'"""[\s\S]*?"""'),
    "python_triple_sq": re.compile(r"'''[\s\S]*?'''"),
    "c_line": re.compile(r'//.*$', re.MULTILINE),
    "c_block": re.compile(r'/\*[\s\S]*?\*/'),
    "shell_line": re.compile(r'#.*$', re.MULTILINE),
    "matlab_line": re.compile(r'%.*$', re.MULTILINE),
    "matlab_block": re.compile(r'%\{[\s\S]*?%\}'),
    "fortran_line": re.compile(r'!.*$', re.MULTILINE),
}
93
+
94
def detect_language(file_path: str) -> str:
    """Guess the programming language purely from the file extension.

    Args:
        file_path: any path or filename; only the extension matters.

    Returns:
        One of the language keys used by the pattern tables above, or the
        lowercased extension itself when it is not recognised (callers use
        that fallback to surface unknown extensions, so unknowns are NOT
        collapsed to a sentinel like "unknown").
    """
    ext_map = {
        ".py": "python",
        ".java": "java",
        # C and C++ share one rule set.
        ".c": "c/c++",
        ".h": "c/c++",
        ".hh": "c/c++",
        ".hpp": "c/c++",
        ".cpp": "c/c++",
        ".cc": "c/c++",
        ".cxx": "c/c++",
        ".c++": "c/c++",
        # Extensions are lowercased before lookup, so the single lowercase
        # ".f" entry also covers ".F" (the old uppercase ".F" key was
        # unreachable dead data and has been removed).
        ".f": "fortran",
        ".f90": "fortran",
        ".f95": "fortran",
        ".r": "r",
        ".m": "matlab",  # MATLAB / Octave
        ".sh": "shell",
        ".bash": "shell",
        ".rs": "rust",
        ".go": "go",
    }

    ext = os.path.splitext(file_path)[1].lower().strip()
    return ext_map.get(ext, ext)
134
+
135
+
136
def count_comments(code: str, lang: str):
    """Count comment lines and comment tokens.

    Supports Python/Java/C++/Fortran/Matlab/R/Shell/Rust/Go/Jupyter and
    uses the precompiled regexes above for speed.

    Returns:
        (comment_lines, comment_token_count) — token count is a simple
        whitespace split of the comment text.
    """

    # Jupyter notebooks follow Python's comment rules.
    if lang == "jupyter":
        lang = "python"

    comment_lines = 0
    comment_tokens = []
    lines = code.splitlines()

    # Line indices already covered by a block comment, so the line-comment
    # pass below does not double count them.
    block_comment_line_indices = set()

    # ---------- B. Block comments first (recording their line numbers) ----------
    if lang in _BLOCK_COMMENT_PATTERNS:
        patt = _BLOCK_COMMENT_PATTERNS[lang]

        if lang == "python":
            # Python triple-quoted strings need special treatment: only
            # count them when they look like docstrings/comments.
            for match in patt.finditer(code):
                start_pos = match.start()
                end_pos = match.end()

                # 0-based start and end line numbers of the match.
                start_line = code[:start_pos].count('\n')
                end_line = code[:end_pos].count('\n')

                # Heuristic docstring check: a triple-quote preceded by '='
                # (within the last 20 chars) is an assigned string literal,
                # i.e. data rather than a comment — skip it.
                prefix = code[max(0, start_pos-20):start_pos].strip()
                if not prefix.endswith('='):
                    for line_idx in range(start_line, end_line + 1):
                        block_comment_line_indices.add(line_idx)

                    block_content = match.group(1)
                    # Strip the surrounding quotes. NOTE(review): both
                    # branches strip 3 chars, so the if/else is redundant;
                    # kept as-is to preserve the original code exactly.
                    if block_content.startswith('"""'):
                        block_content = block_content[3:-3]
                    else:
                        block_content = block_content[3:-3]

                    for b in block_content.splitlines():
                        comment_lines += 1
                        if b.strip():
                            comment_tokens.extend(b.strip().split())
        else:
            for match in patt.finditer(code):
                start_pos = match.start()
                end_pos = match.end()

                start_line = code[:start_pos].count('\n')
                end_line = code[:end_pos].count('\n')

                for line_idx in range(start_line, end_line + 1):
                    block_comment_line_indices.add(line_idx)

                # Use the captured body when the pattern has a group,
                # otherwise the whole match.
                block_content = match.group(1) if match.lastindex else match.group(0)
                for b in block_content.splitlines():
                    comment_lines += 1
                    if b.strip():
                        comment_tokens.extend(b.strip().split())

    # ---------- A. Line comments (skipping lines inside block comments) ----------
    if lang in _LINE_COMMENT_PATTERNS:
        patt = _LINE_COMMENT_PATTERNS[lang]
        for line_idx, line in enumerate(lines):
            if line_idx in block_comment_line_indices:
                continue

            m = patt.search(line)
            if m:
                prefix = line[:m.start()]
                # Crude "are we inside a string literal?" check: an odd
                # number of unescaped quotes before the marker means the
                # marker sits inside a string, not a comment.
                single_quotes = prefix.count("'") - prefix.count("\\'")
                double_quotes = prefix.count('"') - prefix.count('\\"')

                if single_quotes % 2 == 0 and double_quotes % 2 == 0:
                    comment_lines += 1
                    text = m.group(1)
                    if text:
                        comment_tokens.extend(text.strip().split())

    return comment_lines, len(comment_tokens)
220
+
221
+
222
def count_functions_and_parameters(code: str, lang: str):
    """Count function definitions and their parameters.

    Supports the languages in _FUNCTION_PATTERNS (incl. Fortran
    subroutine/function) and uses the precompiled regexes for speed.

    Returns:
        (function_count, parameter_count) — parameter_count is summed
        over all matched definitions by splitting the arg list on commas.
    """

    # Jupyter notebooks follow Python's rules.
    if lang == "jupyter":
        lang = "python"

    patt = _FUNCTION_PATTERNS.get(lang)
    if not patt:
        # Unsupported language (detect_language may return a raw extension).
        return 0, 0

    # Strip comments first so commented-out definitions are not matched.
    code_no_comments = _remove_comments(code, lang)

    # findall() yields tuples (or bare strings for single-group patterns).
    matches = patt.findall(code_no_comments)

    function_count = len(matches)

    parameter_count = 0
    for m in matches:
        if lang == "fortran":
            params = m[2]  # groups: (keyword, name, params)
        elif lang == "matlab":
            params = m[2] if len(m) > 2 else ""  # groups: (ret_name, name, params)
        else:
            # Most patterns yield (name, params); single-group patterns
            # (e.g. shell) yield a plain string, which has no params.
            params = m[1] if isinstance(m, tuple) and len(m) > 1 else ""

        params = params.strip() if params else ""
        if params:
            items = [p.strip() for p in params.split(",") if p.strip()]
            parameter_count += len(items)

    return function_count, parameter_count
259
+
260
+
261
def _remove_comments(code: str, lang: str) -> str:
    """Strip comments from *code* before function-definition matching.

    Line comments are deleted outright; block comments are replaced with
    an equal number of newlines so line numbering stays stable. Unknown
    languages are returned unchanged. Uses the precompiled patterns.
    """

    def _keep_newlines(m):
        # Preserve the line count of the removed block comment.
        return '\n' * m.group(0).count('\n')

    _DROP = ''

    # Per-language substitution plan: (pattern key, replacement), applied
    # in order. R reuses the '#' line rule; jupyter reuses Python's rules.
    py_plan = [
        ("python_line", _DROP),
        ("python_triple_dq", _keep_newlines),
        ("python_triple_sq", _keep_newlines),
    ]
    c_plan = [("c_line", _DROP), ("c_block", _keep_newlines)]
    plans = {
        "python": py_plan,
        "jupyter": py_plan,
        "c/c++": c_plan,
        "java": c_plan,
        "rust": c_plan,
        "go": c_plan,
        "shell": [("shell_line", _DROP)],
        "r": [("shell_line", _DROP)],  # R also uses '#'
        "matlab": [("matlab_line", _DROP), ("matlab_block", _keep_newlines)],
        "fortran": [("fortran_line", _DROP)],
    }

    for key, repl in plans.get(lang, ()):
        code = _REMOVE_COMMENT_PATTERNS[key].sub(repl, code)

    return code
287
+
288
+
289
def count_tokens(code: str) -> int:
    """Count tokens in *code*.

    Tries the Python tokenizer first; when the source is not tokenizable
    (non-Python code, unterminated strings, bad indentation, ...) it
    falls back to a simple whitespace split.
    """
    try:
        return len(list(tokenize.generate_tokens(StringIO(code).readline)))
    except Exception:
        # Was a bare `except:`, which also swallowed KeyboardInterrupt and
        # SystemExit; Exception keeps the best-effort fallback without that.
        return len(code.split())
295
+
296
+
297
def analyze_code(code_str, code_path):
    """Compute per-file statistics for one source file.

    Args:
        code_str: full text of the source file.
        code_path: its path; only the extension is used, for language
            detection.

    Returns:
        A dict of counters; "idx" is left as None for the caller to fill.
    """
    lang = detect_language(code_path)

    # NOTE: a trailing newline counts as starting one extra (empty) line,
    # unlike splitlines(); kept so the reported numbers stay comparable
    # with previously generated output.
    lines = code_str.count("\n") + 1
    empty_lines = sum(1 for line in code_str.splitlines() if not line.strip())
    comment_lines, comment_token_count = count_comments(code_str, lang)
    functions, parameters = count_functions_and_parameters(code_str, lang)
    tokens = count_tokens(code_str)

    return {
        "idx": None,  # filled in by the caller
        "language": lang,
        "total_lines": lines,
        "comment_lines": comment_lines,
        # "comment_tokenst" is a historical typo; it is kept so existing
        # consumers of the JSONL keep working, and the corrected
        # "comment_tokens" key is emitted alongside it.
        "comment_tokenst": comment_token_count,
        "comment_tokens": comment_token_count,
        "empty_lines": empty_lines,
        "code_lines": lines - empty_lines - comment_lines,
        "tokens": tokens,
        "functions": functions,
        "parameters": parameters,
    }
322
+
323
+
324
if __name__ == "__main__":
    input_dir = "/home/weifengsun/tangou1/domain_code/src/datasets/data_merged"
    output_dir = "/home/weifengsun/tangou1/domain_code/src/datasets/analysis2"

    # Create the output directory up front instead of crashing on the
    # first open() when it does not exist yet.
    os.makedirs(output_dir, exist_ok=True)

    # Process shards 110..119: <input_dir>/NNN.csv -> <output_dir>/NNN.jsonl
    for i in range(110, 120):
        input_path = os.path.join(input_dir, f"{i:03}.csv")
        output_path = os.path.join(output_dir, f"{i:03}.jsonl")

        results = []

        with open(input_path, "r", encoding="utf-8", errors="replace") as f:
            # Strip NUL bytes, which the csv module refuses to parse.
            filtered = (line.replace('\0', '') for line in f)
            reader = csv.DictReader(filtered)

            for idx, row in tqdm(enumerate(reader)):
                code_str = row.get("text")
                # Some shards name the path column "repo_path", others "path".
                code_path = row.get("repo_path") or row.get("path")

                # Skip malformed rows (missing code or path) instead of
                # crashing the whole shard inside analyze_code.
                if code_str is None or not code_path:
                    continue

                result = analyze_code(code_str, code_path)
                result["idx"] = f"{i:03}-{idx}"
                results.append(result)

        with open(output_path, "w", encoding="utf-8") as f:
            for r in tqdm(results):
                f.write(json.dumps(r) + "\n")