# joshnavip's picture
# Initial commit: AI code detection project (without binary files)
# b144cb7
import pandas as pd
import re
import os
import numpy as np
# Identifier-shaped token: a letter or underscore followed by any number of
# letters, digits or underscores, delimited by word boundaries.
IDENTIFIER_REGEX = re.compile(r"\b[a-zA-Z_][a-zA-Z0-9_]*\b")


def extract_identifiers(code):
    """Return every identifier-shaped token in *code*, in order of appearance.

    The scan is purely lexical: anything matching ``IDENTIFIER_REGEX`` is
    returned, which includes language keywords and words inside strings or
    comments. Repeated tokens are kept, so the result may contain duplicates.
    """
    return [match.group(0) for match in IDENTIFIER_REGEX.finditer(code)]
def is_snake_case(name):
    """Return True when *name* contains an underscore and no uppercase letters.

    The check is: at least one ``"_"`` is present and lower-casing the name
    leaves it unchanged.
    """
    if "_" not in name:
        return False
    return name == name.lower()
def is_camel_case(name):
    """Return True when *name* has no underscore and is not all lowercase.

    Any underscore-free name that changes when lower-cased qualifies, so
    ``"fooBar"``, ``"FooBar"`` and ``"HTML"`` all count.
    """
    if "_" in name:
        return False
    return name.lower() != name
def count_comments(code, language):
    """Count whole-line, single-line comments in *code*.

    A line counts as a comment when, after stripping surrounding whitespace,
    it starts with the single-line comment marker of *language* (``#`` for
    Python, ``//`` for Java). Trailing comments after code and block comments
    are not counted.

    Args:
        code: Source text; lines are separated by ``"\\n"``.
        language: Language name. Matching ignores case and surrounding
            whitespace, so ``"Python"`` and ``" java "`` are recognised.
            Any other language yields a comment count of 0.

    Returns:
        A ``(comment_line_count, total_line_count)`` tuple. ``total_line_count``
        is the number of ``"\\n"``-separated pieces, so it is 1 for empty text.
    """
    comment_markers = {"python": "#", "java": "//"}

    lines = code.split("\n")
    # Normalise the language name so callers need not lower-case it themselves;
    # previously "Python"/"Java" silently produced a count of zero.
    marker = comment_markers.get(str(language).strip().lower())
    if marker is None:
        return 0, len(lines)

    comment_lines = sum(1 for line in lines if line.strip().startswith(marker))
    return comment_lines, len(lines)
def avg_line_length(lines):
    """Return the mean character count per line, or 0.0 when *lines* is empty."""
    if not lines:
        return 0.0
    lengths = [len(text) for text in lines]
    return np.mean(lengths)
def std_line_length(lines):
    """Return the standard deviation of line lengths, or 0.0 for no lines.

    Uses ``np.std`` with its default settings.
    """
    if not lines:
        return 0.0
    lengths = [len(text) for text in lines]
    return np.std(lengths)
def blank_line_ratio(lines):
    """Return the fraction of *lines* that are empty or whitespace-only.

    An empty *lines* yields 0.0.
    """
    if not lines:
        return 0.0
    blank_lines = [text for text in lines if text.strip() == ""]
    return len(blank_lines) / len(lines)
def indentation_variance(lines):
    """Return the variance of leading-space counts across *lines*.

    Only leading space characters are measured (tabs are not stripped, so a
    tab-indented line has width 0). Blank lines take part with whatever
    leading spaces they contain. Returns 0.0 when there are no lines.
    """
    widths = [len(text) - len(text.lstrip(" ")) for text in lines]
    if not widths:
        return 0.0
    return np.var(widths)
def punctuation_density(code):
    """Return the share of characters in *code* that are structural punctuation.

    The counted characters are ``; : { } ( ) [ ]``. The denominator is the
    length of *code*, floored at 1 so empty text yields 0.0.
    """
    structural_marks = ";:{}()[]"
    hits = sum(1 for char in code if char in structural_marks)
    return hits / max(len(code), 1)
def _cell_to_text(value):
    """Return a table cell as text, mapping missing values to an empty string."""
    # A missing cell (None / NaN / NA) would otherwise become the literal text
    # "nan" under str() and be scored as if it were a one-identifier program.
    if pd.api.types.is_scalar(value) and pd.isna(value):
        return ""
    return str(value)


def _stylometry_row(code, language):
    """Compute the stylometry feature dict for one code sample."""
    lines = code.split("\n")

    identifiers = extract_identifiers(code)
    identifier_count = len(identifiers)

    if identifier_count > 0:
        unique_identifier_ratio = len(set(identifiers)) / identifier_count
        avg_identifier_length = np.mean([len(name) for name in identifiers])
        snake_case_ratio = (
            sum(is_snake_case(name) for name in identifiers) / identifier_count
        )
        camel_case_ratio = (
            sum(is_camel_case(name) for name in identifiers) / identifier_count
        )
    else:
        # No identifiers at all: every identifier-based feature is zero.
        unique_identifier_ratio = 0.0
        avg_identifier_length = 0.0
        snake_case_ratio = 0.0
        camel_case_ratio = 0.0

    comment_lines, total_lines = count_comments(code, language)
    comment_ratio = comment_lines / total_lines if total_lines > 0 else 0.0

    # Key order fixes the column order of the resulting DataFrame.
    return {
        "identifier_count": identifier_count,
        "unique_identifier_ratio": unique_identifier_ratio,
        "avg_identifier_length": avg_identifier_length,
        "comment_ratio": comment_ratio,
        "snake_case_ratio": snake_case_ratio,
        "camel_case_ratio": camel_case_ratio,
        "avg_line_length": avg_line_length(lines),
        "std_line_length": std_line_length(lines),
        "blank_line_ratio": blank_line_ratio(lines),
        "indentation_variance": indentation_variance(lines),
        "punctuation_density": punctuation_density(code),
    }


def extract_stylometry_features(df):
    """Build a table of stylometry features, one row per row of *df*.

    Args:
        df: DataFrame with a ``"normalized_code"`` column (source text) and a
            ``"Language"`` column (language name; lower-cased before use).
            Missing cells in either column are treated as empty text.

    Returns:
        A new DataFrame with one row per input row, in input order, and the
        columns ``identifier_count``, ``unique_identifier_ratio``,
        ``avg_identifier_length``, ``comment_ratio``, ``snake_case_ratio``,
        ``camel_case_ratio``, ``avg_line_length``, ``std_line_length``,
        ``blank_line_ratio``, ``indentation_variance`` and
        ``punctuation_density``. The result is built from a plain list of
        dicts, so it carries a fresh 0..n-1 index rather than the index of
        *df*. An input with no rows yields an empty DataFrame.

    Raises:
        KeyError: If *df* has rows but lacks one of the required columns.
    """
    if len(df) == 0:
        return pd.DataFrame([])

    # Walk the two needed columns directly instead of materialising a Series
    # per row with iterrows().
    features = [
        _stylometry_row(_cell_to_text(code), _cell_to_text(language).lower())
        for code, language in zip(df["normalized_code"], df["Language"])
    ]
    return pd.DataFrame(features)
if __name__ == "__main__":
    # Script entry point: for each dataset split, read the processed CSV,
    # compute stylometry features, attach the label column and write the
    # result to the step-3 output directory.
    os.makedirs("featureextraction/step3_stylometry_extraction", exist_ok=True)

    for split in ("train", "val", "test"):
        frame = pd.read_csv(f"dataset/processed/dataset_{split}.csv")

        # Shorten the verbose label header to "Label"; rename() leaves the
        # frame's columns untouched when that header is not present.
        frame = frame.rename(columns={"Label (0- HUMAN, 1-AI)": "Label"})

        style_features = extract_stylometry_features(frame)
        style_features["Label"] = frame["Label"]

        style_features.to_csv(
            f"featureextraction/step3_stylometry_extraction/{split}_features.csv",
            index=False,
        )
        print(f"Stylometry features extracted for {split}")