"""Extract structural AST features from Python and Java code snippets.

For every row of a dataset the script parses the snippet with the parser
matching its language and records five integer features: total node count,
maximum nesting depth, and the number of functions/methods, loops and
``if`` statements. Rows that cannot be parsed, have no code, or use another
language get all-zero features.
"""

import ast
import os
from typing import Dict, Optional

import pandas as pd

try:
    import javalang
except ImportError:
    # javalang is only needed once a Java snippet is actually parsed, so a
    # missing install must not break Python-only use of this module.
    javalang = None

# Feature names, in the column order of the returned DataFrame / written CSV.
_FEATURE_NAMES = (
    "ast_node_count",
    "ast_max_depth",
    "num_functions",
    "num_loops",
    "num_conditionals",
)


class PythonASTAnalyzer(ast.NodeVisitor):
    """Visitor that tallies size, depth and construct counts of a Python AST.

    After ``visit(tree)`` the public counters hold:

    * ``node_count`` -- number of AST nodes visited (context nodes such as
      ``Load``/``Store`` are AST nodes and are counted too).
    * ``max_depth`` -- deepest nesting level reached; the root counts as 1.
    * ``num_functions`` -- ``def`` and ``async def`` statements.
    * ``num_loops`` -- ``for``, ``async for`` and ``while`` statements.
    * ``num_conditionals`` -- ``if`` statements.

    ``current_depth`` is traversal bookkeeping and is back at 0 once a
    visit finishes normally.
    """

    def __init__(self):
        super().__init__()
        self.node_count = 0
        self.max_depth = 0
        self.current_depth = 0
        self.num_functions = 0
        self.num_loops = 0
        self.num_conditionals = 0

    def generic_visit(self, node):
        """Count *node*, update the depth statistics and visit its children."""
        self.node_count += 1
        self.current_depth += 1
        self.max_depth = max(self.max_depth, self.current_depth)
        super().generic_visit(node)
        self.current_depth -= 1

    def visit_FunctionDef(self, node):
        """Count a function definition, then traverse it like any node."""
        self.num_functions += 1
        self.generic_visit(node)

    def visit_AsyncFunctionDef(self, node):
        """Count an ``async def`` exactly like a plain ``def``."""
        self.num_functions += 1
        self.generic_visit(node)

    def visit_For(self, node):
        """Count a ``for`` loop, then traverse it like any node."""
        self.num_loops += 1
        self.generic_visit(node)

    def visit_AsyncFor(self, node):
        """Count an ``async for`` exactly like a plain ``for``."""
        self.num_loops += 1
        self.generic_visit(node)

    def visit_While(self, node):
        """Count a ``while`` loop, then traverse it like any node."""
        self.num_loops += 1
        self.generic_visit(node)

    def visit_If(self, node):
        """Count an ``if`` statement, then traverse it like any node."""
        self.num_conditionals += 1
        self.generic_visit(node)


def _is_missing(value):
    """Return True when a cell holds a missing scalar (None / NaN / NA)."""
    if isinstance(value, str):
        return False
    try:
        return bool(pd.isna(value))
    except (TypeError, ValueError):
        # Array-like cell: its truth value is ambiguous, so it is not a
        # missing scalar.
        return False


def extract_python_ast_features(code) -> Optional[Dict[str, int]]:
    """Parse Python source and return its AST features.

    Args:
        code: Python source text.

    Returns:
        A dict keyed by the five feature names, or ``None`` when the source
        cannot be parsed or traversed.
    """
    try:
        tree = ast.parse(code)
        analyzer = PythonASTAnalyzer()
        analyzer.visit(tree)
    except Exception:
        # Deliberate best-effort: dataset snippets are frequently not valid
        # Python (or nest too deeply to traverse); the caller turns ``None``
        # into all-zero features instead of aborting the whole run.
        return None
    return {
        "ast_node_count": analyzer.node_count,
        "ast_max_depth": analyzer.max_depth,
        "num_functions": analyzer.num_functions,
        "num_loops": analyzer.num_loops,
        "num_conditionals": analyzer.num_conditionals,
    }


def extract_java_ast_features(code) -> Optional[Dict[str, int]]:
    """Parse Java source with javalang and return its AST features.

    Depth is counted from 0 at the compilation unit (the Python extractor
    counts its root as 1). Methods are ``MethodDeclaration`` nodes, loops
    are ``for``/``while``/``do`` statements, conditionals are ``if``
    statements.

    Args:
        code: Java source text.

    Returns:
        A dict keyed by the five feature names, or ``None`` when the source
        cannot be parsed or traversed.

    Raises:
        ImportError: If the ``javalang`` package is not installed.
    """
    if javalang is None:
        raise ImportError(
            "javalang is required to extract Java AST features"
        )
    try:
        tree = javalang.parse.parse(code)
        loop_types = (
            javalang.tree.ForStatement,
            javalang.tree.WhileStatement,
            javalang.tree.DoStatement,
        )
        node_count = 0
        max_depth = 0
        num_methods = 0
        num_loops = 0
        num_conditionals = 0

        # Explicit stack instead of recursion; visiting order does not
        # affect any of the counters.
        pending = [(tree, 0)]
        while pending:
            item, depth = pending.pop()
            if isinstance(item, (list, tuple)):
                # Containers are transparent: their members sit at the same
                # depth, and a container may itself hold further containers
                # of nodes, so they are unpacked rather than skipped.
                pending.extend((member, depth) for member in item)
                continue
            if not isinstance(item, javalang.ast.Node):
                # Strings, sets of modifiers, None, booleans, ...
                continue

            node_count += 1
            max_depth = max(max_depth, depth)
            if isinstance(item, javalang.tree.MethodDeclaration):
                num_methods += 1
            if isinstance(item, loop_types):
                num_loops += 1
            if isinstance(item, javalang.tree.IfStatement):
                num_conditionals += 1
            pending.extend((child, depth + 1) for child in item.children)
    except Exception:
        # Deliberate best-effort, same contract as the Python extractor:
        # unparseable snippets yield ``None`` rather than stopping the run.
        return None
    return {
        "ast_node_count": node_count,
        "ast_max_depth": max_depth,
        "num_functions": num_methods,
        "num_loops": num_loops,
        "num_conditionals": num_conditionals,
    }


def extract_ast_features(df):
    """Compute AST features for every row of *df*.

    Args:
        df: DataFrame with a ``normalized_code`` column (source text) and a
            ``Language`` column; the language is matched case-insensitively
            against ``"python"`` and ``"java"``.

    Returns:
        A DataFrame with one row per input row, in input order, a fresh
        0..n-1 index and the five feature columns. Rows with missing code,
        an unsupported language, or unparseable code are all zeros.

    Raises:
        KeyError: If a required column is absent.
        ImportError: If a Java row is encountered without javalang installed.
    """
    rows = []
    for code, language in zip(df["normalized_code"], df["Language"]):
        feats = None
        # A missing cell must not be stringified: "nan" is a valid Python
        # expression and would yield non-zero features.
        if not _is_missing(code):
            lang = str(language).lower()
            if lang == "python":
                feats = extract_python_ast_features(str(code))
            elif lang == "java":
                feats = extract_java_ast_features(str(code))
        if feats is None:
            feats = dict.fromkeys(_FEATURE_NAMES, 0)
        rows.append(feats)
    # Explicit columns keep the schema intact even for an empty input.
    return pd.DataFrame(rows, columns=list(_FEATURE_NAMES))


def main():
    """Extract AST features for the train/val/test splits and write CSVs."""
    # NOTE(review): nothing in this file writes to "ast_features"; the call
    # is kept only so the script's existing side effects are unchanged.
    os.makedirs("ast_features", exist_ok=True)

    for split in ["train", "val", "test"]:
        input_path = f"dataset/processed/dataset_{split}.csv"
        df = pd.read_csv(input_path)

        if "Label (0- HUMAN, 1-AI)" in df.columns:
            df = df.rename(columns={"Label (0- HUMAN, 1-AI)": "Label"})

        X_ast = extract_ast_features(df)
        # Positional assignment: the feature frame has its own 0..n-1 index
        # and must not be index-aligned against df.
        X_ast["Label"] = df["Label"].to_numpy()

        output_path = f"featureextraction/step2_ast_extraction/{split}_features.csv"
        # to_csv does not create parent directories.
        os.makedirs(os.path.dirname(output_path), exist_ok=True)
        X_ast.to_csv(output_path, index=False)

        print(f"AST features extracted for {split}")


if __name__ == "__main__":
    main()