DouDou committed on
Commit
5c9df83
·
verified ·
1 Parent(s): a38d186

Upload data2/step22/depend_analysis.py with huggingface_hub

Browse files
Files changed (1) hide show
  1. data2/step22/depend_analysis.py +226 -0
data2/step22/depend_analysis.py ADDED
@@ -0,0 +1,226 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import traceback
2
+ import os
3
+ import json
4
+ from concurrent.futures import ProcessPoolExecutor, as_completed
5
+ from tree_sitter import Language, Parser
6
+ import tree_sitter_c
7
+ import tree_sitter_cpp
8
+ import tree_sitter_java
9
+ import tree_sitter_go
10
+ import tree_sitter_rust
11
+ import tree_sitter_julia
12
+ import tree_sitter_python
13
+
14
+ ########################################
15
+ # 配置区
16
+ ########################################
17
+
18
# Root directory for results; process_project_calls writes
# "<OUTPUT_DIR>/<project name>/calls.jsonl" beneath it.
OUTPUT_DIR = "/home/weifengsun/tangou1/step2/step22/dataset_calls"

# Default minimum "score" a record needs to be kept by
# load_high_score_functions (comparison is >=).
SCORE_THRESHOLD = 0.92

# Directory names to exclude.
# NOTE(review): not referenced anywhere in the visible part of this file.
EXCLUDE_DIRS = {
    ".git",
    "node_modules",
    "vendor",
    "third_party",
    "build",
    "dist",
    "target",
    "__pycache__",
}

# Per-language settings, keyed by the language name used in the function
# records:
#   ext            - file extensions belonging to the language
#   function_nodes - tree-sitter node types treated as function definitions
#   call_nodes     - tree-sitter node types treated as call sites
#   name_field     - NOTE(review): not read by the visible code; presumably the
#                    grammar field holding the function name - confirm.
LANGUAGE_CONFIG = {
    "python": {
        "ext": [".py"],
        "function_nodes": ["function_definition"],
        "call_nodes": ["call"],
        "name_field": "name",
    },
    "c": {
        "ext": [".c", ".h"],
        "function_nodes": ["function_definition"],
        "call_nodes": ["call_expression"],
        "name_field": "declarator",
    },
    "cpp": {
        "ext": [".cpp", ".cc", ".cxx", ".hpp", ".hh"],
        "function_nodes": ["function_definition"],
        "call_nodes": ["call_expression"],
        "name_field": "declarator",
    },
    "java": {
        "ext": [".java"],
        "function_nodes": ["method_declaration", "constructor_declaration"],
        "call_nodes": ["method_invocation", "call_expression"],
        "name_field": "name",
    },
    "go": {
        "ext": [".go"],
        "function_nodes": ["function_declaration", "method_declaration"],
        "call_nodes": ["call_expression"],
        "name_field": "name",
    },
    "rust": {
        "ext": [".rs"],
        "function_nodes": ["function_item"],
        "call_nodes": ["call_expression"],
        "name_field": "name",
    },
    "julia": {
        "ext": [".jl"],
        "function_nodes": ["function_definition"],
        "call_nodes": ["call_expression"],
        "name_field": "name",
    },
}

# Reverse lookup: file extension -> language key of LANGUAGE_CONFIG.
# Built in LANGUAGE_CONFIG order, so a later language would win on a clash.
EXT_TO_LANG = {
    extension: language
    for language, settings in LANGUAGE_CONFIG.items()
    for extension in settings["ext"]
}
75
+
76
+ ########################################
77
+ # worker 初始化
78
+ ########################################
79
+
80
# Grammar modules paired with the language key used throughout this file.
_GRAMMAR_MODULES = (
    ("python", tree_sitter_python),
    ("c", tree_sitter_c),
    ("cpp", tree_sitter_cpp),
    ("java", tree_sitter_java),
    ("go", tree_sitter_go),
    ("rust", tree_sitter_rust),
    ("julia", tree_sitter_julia),
)

# tree-sitter Language objects, built once at import time and looked up by
# language key when init_worker creates the per-process parsers.
LANGUAGES = {
    key: Language(grammar.language())
    for key, grammar in _GRAMMAR_MODULES
}
89
+
90
def init_worker():
    """Create one tree-sitter parser per configured language.

    Intended as the ``initializer`` of the process pool: it (re)binds the
    module-level ``PARSERS`` dict, mapping each key of ``LANGUAGE_CONFIG`` to
    a ready-to-use ``Parser``.  A language whose parser cannot be created is
    reported on stdout and left out of ``PARSERS``; it never raises.
    """
    global PARSERS
    PARSERS = {}
    for lang in LANGUAGE_CONFIG:
        try:
            language = LANGUAGES[lang]
            try:
                # Current py-tree-sitter: the language is a constructor
                # argument and Parser.set_language() no longer exists.
                parser = Parser(language)
            except TypeError:
                # Older bindings: Parser() takes no arguments.
                parser = Parser()
                parser.set_language(language)
            PARSERS[lang] = parser
        except Exception:
            # Best effort: keep the remaining languages usable.
            print(f"Failed to load parser for {lang}")
            print(traceback.format_exc())
102
+
103
+ ########################################
104
+ # jsonl 读取
105
+ ########################################
106
+
107
def load_high_score_functions(jsonl_path, score_threshold=SCORE_THRESHOLD):
    """Read a JSON-lines file and keep the records scoring at least the threshold.

    Args:
        jsonl_path: Path of a UTF-8 file holding one JSON object per line.
        score_threshold: Minimum value of a record's ``"score"`` (inclusive).

    Returns:
        ``(funcs, names)``: the list of kept records in file order, and the
        set of their ``"name"`` values.

    Raises:
        OSError: If the file cannot be opened.
        json.JSONDecodeError: If a non-blank line is not valid JSON.
        KeyError: If a kept record has no ``"name"`` key.
    """
    funcs = []
    names = set()
    with open(jsonl_path, "r", encoding="utf-8") as f:
        for line in f:
            line = line.strip()
            if not line:
                # Blank or trailing empty lines are not records.
                continue
            obj = json.loads(line)
            # A missing or null score counts as 0 instead of raising TypeError.
            score = obj.get("score") or 0
            if score >= score_threshold:
                funcs.append(obj)
                names.add(obj["name"])
    return funcs, names
117
+
118
+ ########################################
119
+ # Tree-sitter 提取调用
120
+ ########################################
121
+
122
def extract_calls_from_node(node, src, call_types):
    """Collect the names called anywhere inside ``node``.

    Every descendant (and ``node`` itself) whose ``type`` is in ``call_types``
    contributes the text of each of its *direct* children of type
    ``"identifier"``.

    Args:
        node: Root of the subtree to scan; nodes expose ``type``, ``children``,
            ``start_byte`` and ``end_byte``.
        src: The source as ``bytes``; node byte offsets index into it.
        call_types: Container of node type names that count as call sites.

    Returns:
        A set of identifier strings.
    """
    calls = set()
    # Explicit stack instead of recursion: deeply nested syntax trees would
    # otherwise exceed Python's recursion limit.
    pending = [node]
    while pending:
        current = pending.pop()
        children = current.children
        if current.type in call_types:
            for child in children:
                if child.type == "identifier":
                    text = src[child.start_byte:child.end_byte]
                    # Source files are not guaranteed to be valid UTF-8.
                    calls.add(text.decode("utf-8", errors="replace"))
        pending.extend(children)
    return calls
133
+
134
def find_function_node_by_line(tree, start_line, func_nodes):
    """Find the function node that begins on a given 1-based line.

    The tree is searched depth-first in document order and the first node
    whose ``type`` is in ``func_nodes`` and whose start row (0-based
    ``start_point[0]``) plus one equals ``start_line`` is returned.

    Args:
        tree: Parsed tree exposing ``root_node``.
        start_line: 1-based line number on which the function starts.
        func_nodes: Container of node type names that count as functions.

    Returns:
        The matching node, or ``None`` when nothing matches.
    """
    # Explicit stack instead of recursion so deeply nested trees cannot hit
    # Python's recursion limit.
    pending = [tree.root_node]
    while pending:
        node = pending.pop()
        if node.type in func_nodes and node.start_point[0] + 1 == start_line:
            return node
        # Reversed so that children are popped in their original order.
        pending.extend(reversed(node.children))
    return None
147
+
148
+ ########################################
149
+ # 匹配函数名
150
+ ########################################
151
+
152
def match_names(called_names, indexed_names):
    """Map each called name to the indexed names it loosely matches.

    A called name matches an indexed name when either string is a prefix of
    the other (which includes exact equality).

    Args:
        called_names: Iterable of names found at call sites.
        indexed_names: Iterable of known function names.

    Returns:
        A dict ``{called name: sorted list of matching indexed names}``;
        called names without any match are omitted.  Keys are inserted in
        sorted order.
    """
    matches = {}
    # Both inputs are typically sets; sorting keeps the result (and anything
    # serialised from it) identical from run to run.
    for call in sorted(called_names):
        hits = sorted(
            name
            for name in indexed_names
            if name.startswith(call) or call.startswith(name)
        )
        if hits:
            matches[call] = hits
    return matches
159
+
160
+ ########################################
161
+ # 单项目分析
162
+ ########################################
163
+
164
def process_project_calls(project_path):
    """Extract the calls made by one project's high-score functions.

    Reads ``<project_path>/functions.jsonl``, and for every kept record parses
    the record's source file, locates the function starting at the record's
    ``start_line`` and writes one JSON line describing its calls to
    ``<OUTPUT_DIR>/<project name>/calls.jsonl``.

    Records whose language has no parser, or whose function node cannot be
    found, are skipped.  A record that fails for any other reason is reported
    on stdout and skipped, so one bad record does not abort the project.

    Args:
        project_path: Directory of the project; its base name is the project
            name.

    Returns:
        The project name.

    Raises:
        OSError, json.JSONDecodeError, KeyError: Propagated when
            ``functions.jsonl`` cannot be loaded or the output file cannot be
            created.
    """
    project_name = os.path.basename(project_path.rstrip("/"))
    # Each project is assumed to keep its function index in its own directory.
    input_jsonl = os.path.join(project_path, "functions.jsonl")
    output_path = os.path.join(OUTPUT_DIR, project_name, "calls.jsonl")
    os.makedirs(os.path.dirname(output_path), exist_ok=True)

    funcs, name_set = load_high_score_functions(input_jsonl)

    with open(output_path, "w", encoding="utf-8") as out:
        for f in funcs:
            lang = f.get("language")
            if not lang or lang not in PARSERS:
                continue
            try:
                # Close the source file promptly instead of leaking the handle.
                with open(f["file"], "rb") as src_file:
                    src = src_file.read()
                tree = PARSERS[lang].parse(src)
                cfg = LANGUAGE_CONFIG[lang]
                node = find_function_node_by_line(tree, f["start_line"], cfg["function_nodes"])
                if not node:
                    continue
                calls = extract_calls_from_node(node, src, cfg["call_nodes"])
                matched = match_names(calls, name_set)
                record = {
                    "function": f["name"],
                    "qualified_name": f.get("qualified_name", f["name"]),
                    "calls": sorted(calls),
                    "matched": matched,
                    "file": f["file"],
                    "start_line": f["start_line"],
                    "end_line": f["end_line"],
                    "score": f.get("score", 0),
                    "language": lang,
                }
                out.write(json.dumps(record, ensure_ascii=False) + "\n")
            except Exception as exc:
                # Best effort per record, but leave a trace of what was dropped.
                print(f"[WARN] {project_name}: skipped {f.get('name')!r}: {exc!r}")
                continue
    return project_name
201
+
202
+ ########################################
203
+ # 主入口
204
+ ########################################
205
+
206
def load_projects(root):
    """Return the paths of the immediate sub-directories of ``root``.

    Args:
        root: Directory whose children are project directories.

    Returns:
        A list of ``os.path.join(root, name)`` paths, sorted by name so the
        processing order is reproducible (``os.listdir`` order is arbitrary).
        Non-directory entries are ignored.

    Raises:
        OSError: If ``root`` cannot be listed.
    """
    candidates = (os.path.join(root, name) for name in sorted(os.listdir(root)))
    return [path for path in candidates if os.path.isdir(path)]
208
+
209
def main():
    """Run the call analysis for every project directory, in parallel.

    Each sub-directory of the hard-coded projects root is submitted to a
    process pool whose workers are prepared by ``init_worker``.  One
    ``[OK]`` or ``[FAIL]`` line is printed per project as it finishes; a
    failing project does not stop the others.
    """
    projects_root = "/home/weifengsun/tangou1/domain_code/src/workdir/repos_filtered"
    os.makedirs(OUTPUT_DIR, exist_ok=True)

    projects = load_projects(projects_root)

    # os.cpu_count() may return None when the count cannot be determined.
    worker_count = min(os.cpu_count() or 1, 32)

    with ProcessPoolExecutor(max_workers=worker_count, initializer=init_worker) as pool:
        futures = {pool.submit(process_project_calls, p): p for p in projects}
        for future in as_completed(futures):
            proj = futures[future]
            try:
                future.result()
                print(f"[OK] {proj}")
            except Exception as e:
                print(f"[FAIL] {proj}: {e}")


if __name__ == "__main__":
    main()