# joshnavip's picture
# Initial commit: AI code detection project (without binary files)
# b144cb7
import pandas as pd
import ast
import javalang
import os
class PythonASTAnalyzer(ast.NodeVisitor):
    """Collect simple structural statistics while walking a Python AST.

    After ``visit(tree)`` the following attributes hold the results:

    * ``node_count`` -- number of AST nodes visited (every node passes
      through ``generic_visit``, including expression-context nodes).
    * ``max_depth`` -- deepest nesting level reached; the root counts as 1.
    * ``num_functions`` -- ``def`` and ``async def`` definitions.
    * ``num_loops`` -- ``for``, ``async for`` and ``while`` statements.
    * ``num_conditionals`` -- ``if`` statements.

    ``current_depth`` is traversal bookkeeping and is back at 0 once a
    complete walk has finished.
    """

    def __init__(self):
        self.node_count = 0
        self.max_depth = 0
        self.current_depth = 0
        self.num_functions = 0
        self.num_loops = 0
        self.num_conditionals = 0

    def generic_visit(self, node):
        """Count ``node``, track nesting depth, then descend into children."""
        self.node_count += 1
        self.current_depth += 1
        self.max_depth = max(self.max_depth, self.current_depth)
        super().generic_visit(node)
        # Leaving this node: restore the depth for its siblings.
        self.current_depth -= 1

    def visit_FunctionDef(self, node):
        """Count a function definition and keep walking its body."""
        self.num_functions += 1
        self.generic_visit(node)

    # ``async def`` is a distinct node type; without this alias coroutine
    # definitions would be walked but never counted as functions.
    visit_AsyncFunctionDef = visit_FunctionDef

    def visit_For(self, node):
        """Count a ``for`` loop and keep walking its body."""
        self.num_loops += 1
        self.generic_visit(node)

    # ``async for`` is likewise its own node type and is a loop as well.
    visit_AsyncFor = visit_For

    def visit_While(self, node):
        """Count a ``while`` loop and keep walking its body."""
        self.num_loops += 1
        self.generic_visit(node)

    def visit_If(self, node):
        """Count an ``if`` statement and keep walking its branches."""
        self.num_conditionals += 1
        self.generic_visit(node)
def extract_python_ast_features(code):
    """Parse Python source and summarise its AST as a feature dict.

    Args:
        code: Python source text handed to ``ast.parse``.

    Returns:
        A dict with the keys ``ast_node_count``, ``ast_max_depth``,
        ``num_functions``, ``num_loops`` and ``num_conditionals``, or
        ``None`` when parsing or traversal raises any ``Exception``
        (for example a ``SyntaxError`` on code that is not valid Python).
    """
    # Output key -> analyzer attribute that holds the value.
    feature_sources = (
        ("ast_node_count", "node_count"),
        ("ast_max_depth", "max_depth"),
        ("num_functions", "num_functions"),
        ("num_loops", "num_loops"),
        ("num_conditionals", "num_conditionals"),
    )
    try:
        module = ast.parse(code)
        visitor = PythonASTAnalyzer()
        visitor.visit(module)
        return {key: getattr(visitor, attr) for key, attr in feature_sources}
    except Exception:
        # Best effort: the caller treats None as "no features available".
        return None
def extract_java_ast_features(code):
    """Parse Java source with ``javalang`` and summarise its AST.

    Args:
        code: Java source text handed to ``javalang.parse.parse``.

    Returns:
        A dict with the keys ``ast_node_count``, ``ast_max_depth`` (the root
        node is at depth 0), ``num_functions`` (method declarations),
        ``num_loops`` (for / while / do statements) and ``num_conditionals``
        (if statements), or ``None`` when parsing or traversal raises any
        ``Exception``.
    """
    try:
        root = javalang.parse.parse(code)
        stats = {"nodes": 0, "depth": 0, "methods": 0, "loops": 0, "ifs": 0}
        loop_types = (
            javalang.tree.ForStatement,
            javalang.tree.WhileStatement,
            javalang.tree.DoStatement,
        )

        def descend(current, level=0):
            """Record ``current`` in ``stats`` and recurse into child nodes."""
            if not current:
                return

            stats["nodes"] += 1
            if level > stats["depth"]:
                stats["depth"] = level

            if isinstance(current, javalang.tree.MethodDeclaration):
                stats["methods"] += 1
            if isinstance(current, loop_types):
                stats["loops"] += 1
            if isinstance(current, javalang.tree.IfStatement):
                stats["ifs"] += 1

            for entry in current.children:
                # A child slot holds either a single value or a list of
                # values; only actual AST nodes are descended into.
                candidates = entry if isinstance(entry, list) else [entry]
                for candidate in candidates:
                    if isinstance(candidate, javalang.ast.Node):
                        descend(candidate, level + 1)

        descend(root)
        return {
            "ast_node_count": stats["nodes"],
            "ast_max_depth": stats["depth"],
            "num_functions": stats["methods"],
            "num_loops": stats["loops"],
            "num_conditionals": stats["ifs"],
        }
    except Exception:
        # Best effort: the caller treats None as "no features available".
        return None
def extract_ast_features(df):
    """Build a per-row table of AST features for a code dataset.

    Args:
        df: DataFrame with a ``normalized_code`` column (source text) and a
            ``Language`` column. Rows whose language, lower-cased, is
            ``"python"`` or ``"java"`` are parsed with the matching
            extractor.

    Returns:
        A DataFrame with one row per input row, in input order, and the
        columns ``ast_node_count``, ``ast_max_depth``, ``num_functions``,
        ``num_loops`` and ``num_conditionals``. Rows with an unsupported
        language, missing code, or code that fails to parse are all zeros.
        The columns are present even when ``df`` has no rows.
    """
    feature_columns = (
        "ast_node_count",
        "ast_max_depth",
        "num_functions",
        "num_loops",
        "num_conditionals",
    )

    rows = []
    for _, row in df.iterrows():
        raw_code = row["normalized_code"]
        language = str(row["Language"]).lower()

        if pd.isna(raw_code):
            # A missing cell would stringify to "nan", which is a valid
            # Python expression and would produce non-zero features.
            feats = None
        elif language == "python":
            feats = extract_python_ast_features(str(raw_code))
        elif language == "java":
            feats = extract_java_ast_features(str(raw_code))
        else:
            feats = None

        if feats is None:
            feats = dict.fromkeys(feature_columns, 0)
        rows.append(feats)

    # Naming the columns keeps the schema stable for an empty input.
    return pd.DataFrame(rows, columns=list(feature_columns))
if __name__ == "__main__":
    # Script entry point: for each dataset split, read the processed CSV,
    # extract AST features, attach the label column and write the result.
    output_dir = "featureextraction/step2_ast_extraction"

    os.makedirs("ast_features", exist_ok=True)
    # The feature CSVs are written under output_dir, and to_csv does not
    # create missing parent directories, so make sure it exists up front.
    os.makedirs(output_dir, exist_ok=True)

    for split in ["train", "val", "test"]:
        input_path = f"dataset/processed/dataset_{split}.csv"
        df = pd.read_csv(input_path)

        # Normalise the verbose label header to a plain "Label" column.
        if "Label (0- HUMAN, 1-AI)" in df.columns:
            df = df.rename(columns={"Label (0- HUMAN, 1-AI)": "Label"})

        X_ast = extract_ast_features(df)
        X_ast["Label"] = df["Label"]

        output_path = f"{output_dir}/{split}_features.csv"
        X_ast.to_csv(output_path, index=False)
        print(f"AST features extracted for {split}")