"""Stylometry feature extraction for code samples stored in CSV splits.

Run as a script, this reads ``dataset/processed/dataset_{split}.csv`` for the
train/val/test splits, computes per-sample style features and writes them,
together with the ``Label`` column, to
``featureextraction/step3_stylometry_extraction/{split}_features.csv``.
"""

import os
import re

import numpy as np
import pandas as pd

IDENTIFIER_REGEX = re.compile(r"\b[a-zA-Z_][a-zA-Z0-9_]*\b")

# Line-comment prefix recognised for each supported (lower-case) language.
_COMMENT_PREFIXES = {"python": "#", "java": "//"}

# Output schema; fixed here so that an empty input still yields every column.
_FEATURE_COLUMNS = [
    "identifier_count",
    "unique_identifier_ratio",
    "avg_identifier_length",
    "comment_ratio",
    "snake_case_ratio",
    "camel_case_ratio",
    "avg_line_length",
    "std_line_length",
    "blank_line_ratio",
    "indentation_variance",
    "punctuation_density",
]


def extract_identifiers(code):
    """Return every identifier-shaped token in *code*, in order of appearance.

    The regex runs on the raw text, so language keywords and words inside
    comments or string literals are returned as well.
    """
    return IDENTIFIER_REGEX.findall(code)


def is_snake_case(name):
    """Return True if *name* contains an underscore and has no uppercase letters."""
    return "_" in name and name.lower() == name


def is_camel_case(name):
    """Return True if *name* has an uppercase letter and no underscore.

    This also accepts PascalCase and ALLCAPS names, not only lowerCamelCase.
    """
    return name != name.lower() and "_" not in name


def count_comments(code, language):
    """Count lines of *code* that start with a line comment.

    Only lines whose first non-blank characters are the comment prefix are
    counted (``#`` for ``"python"``, ``//`` for ``"java"``); trailing comments
    and block comments are not. Any other *language* yields a count of 0.

    Returns:
        A ``(comment_line_count, total_line_count)`` tuple.
    """
    lines = code.split("\n")
    prefix = _COMMENT_PREFIXES.get(language)
    if prefix is None:
        return 0, len(lines)
    count = sum(1 for line in lines if line.strip().startswith(prefix))
    return count, len(lines)


def avg_line_length(lines):
    """Return the mean character length of *lines*, or 0.0 if there are none."""
    return np.mean([len(line) for line in lines]) if lines else 0.0


def std_line_length(lines):
    """Return the population standard deviation of line lengths, or 0.0 if empty."""
    return np.std([len(line) for line in lines]) if lines else 0.0


def blank_line_ratio(lines):
    """Return the fraction of *lines* that are empty or whitespace-only."""
    if not lines:
        return 0.0
    blanks = sum(1 for line in lines if not line.strip())
    return blanks / len(lines)


def indentation_variance(lines):
    """Return the variance of leading-space counts across *lines*.

    Only space characters are counted; a line indented with tabs contributes 0.
    """
    indents = [len(line) - len(line.lstrip(" ")) for line in lines]
    return np.var(indents) if indents else 0.0


def punctuation_density(code):
    """Return the share of characters in *code* that are one of ``;:{}()[]``."""
    punct = re.findall(r"[;:{}()\[\]]", code)
    # max(..., 1) avoids dividing by zero on empty input.
    return len(punct) / max(len(code), 1)


def _as_text(value):
    """Convert a cell to text, mapping missing values (None/NaN/NA) to ""."""
    if pd.api.types.is_scalar(value) and pd.isna(value):
        # str(nan) would give the literal "nan", which would then be measured
        # as if it were source code.
        return ""
    return str(value)


def _row_features(code, language):
    """Compute the stylometry feature dict for one code sample."""
    lines = code.split("\n")
    identifiers = extract_identifiers(code)
    identifier_count = len(identifiers)

    if identifier_count > 0:
        unique_identifier_ratio = len(set(identifiers)) / identifier_count
        avg_identifier_length = np.mean([len(name) for name in identifiers])
        snake_case_ratio = (
            sum(is_snake_case(name) for name in identifiers) / identifier_count
        )
        camel_case_ratio = (
            sum(is_camel_case(name) for name in identifiers) / identifier_count
        )
    else:
        unique_identifier_ratio = 0.0
        avg_identifier_length = 0.0
        snake_case_ratio = 0.0
        camel_case_ratio = 0.0

    comment_lines, total_lines = count_comments(code, language)
    comment_ratio = comment_lines / total_lines if total_lines > 0 else 0.0

    return {
        "identifier_count": identifier_count,
        "unique_identifier_ratio": unique_identifier_ratio,
        "avg_identifier_length": avg_identifier_length,
        "comment_ratio": comment_ratio,
        "snake_case_ratio": snake_case_ratio,
        "camel_case_ratio": camel_case_ratio,
        "avg_line_length": avg_line_length(lines),
        "std_line_length": std_line_length(lines),
        "blank_line_ratio": blank_line_ratio(lines),
        "indentation_variance": indentation_variance(lines),
        "punctuation_density": punctuation_density(code),
    }


def extract_stylometry_features(df):
    """Build a stylometry feature table for every row of *df*.

    Args:
        df: DataFrame with a ``normalized_code`` column (source text) and a
            ``Language`` column (matched case-insensitively).

    Returns:
        A DataFrame with one row per input row, in input order, on a fresh
        0..n-1 index, with the columns listed in ``_FEATURE_COLUMNS``. Missing
        code cells are treated as empty text. An empty input yields an empty
        frame that still has all feature columns.

    Raises:
        KeyError: if either required column is absent.
    """
    rows = [
        _row_features(_as_text(code), _as_text(language).lower())
        for code, language in zip(df["normalized_code"], df["Language"])
    ]
    return pd.DataFrame(rows, columns=_FEATURE_COLUMNS)


def main():
    """Extract and save stylometry features for the train, val and test splits."""
    os.makedirs("featureextraction/step3_stylometry_extraction", exist_ok=True)

    for split in ["train", "val", "test"]:
        input_path = f"dataset/processed/dataset_{split}.csv"
        df = pd.read_csv(input_path)

        if "Label (0- HUMAN, 1-AI)" in df.columns:
            df = df.rename(columns={"Label (0- HUMAN, 1-AI)": "Label"})

        X_style = extract_stylometry_features(df)
        # Attach labels by position: the feature frame has a fresh RangeIndex,
        # so index-based alignment is not relied upon.
        X_style["Label"] = df["Label"].to_numpy()

        output_path = (
            f"featureextraction/step3_stylometry_extraction/{split}_features.csv"
        )
        X_style.to_csv(output_path, index=False)

        print(f"Stylometry features extracted for {split}")


if __name__ == "__main__":
    main()