Spaces:
Runtime error
Runtime error
| import pandas as pd | |
| import re | |
| import os | |
| import numpy as np | |
# Word-bounded tokens that start with a letter or underscore and continue
# with letters, digits or underscores.
IDENTIFIER_REGEX = re.compile(r"\b[a-zA-Z_][a-zA-Z0-9_]*\b")


def extract_identifiers(code):
    """Return every identifier-like token found in ``code``, in order.

    Tokens are whatever ``IDENTIFIER_REGEX`` matches, so language keywords
    and words inside strings or comments are included; duplicates are kept.
    """
    return [match.group(0) for match in IDENTIFIER_REGEX.finditer(code)]
def is_snake_case(name):
    """Return True when ``name`` contains an underscore and no uppercase letters."""
    if "_" not in name:
        return False
    # Lower-casing must leave the name unchanged.
    return name == name.lower()
def is_camel_case(name):
    """Return True when ``name`` has no underscore and changes when lower-cased.

    Any name with at least one uppercase letter and no underscore qualifies,
    so PascalCase and all-caps names are counted as well.
    """
    unchanged_by_lowering = name == name.lower()
    return not (unchanged_by_lowering or "_" in name)
def count_comments(code, language):
    """Count whole-line comments in ``code``.

    A line counts when, after stripping surrounding whitespace, it starts
    with ``#`` (for ``language == "python"``) or ``//`` (for
    ``language == "java"``). Any other language value yields a count of 0.

    Returns:
        A ``(comment_line_count, total_line_count)`` tuple, where the total
        is the number of pieces produced by splitting ``code`` on newlines.
    """
    rows = code.split("\n")

    # Pick the single-line comment marker recognised for this language.
    if language == "python":
        marker = "#"
    elif language == "java":
        marker = "//"
    else:
        marker = None

    if marker is None:
        matched = 0
    else:
        matched = sum(1 for row in rows if row.strip().startswith(marker))

    return matched, len(rows)
def avg_line_length(lines):
    """Return the mean character length of ``lines``, or 0.0 when it is empty."""
    if not lines:
        return 0.0
    lengths = list(map(len, lines))
    return np.mean(lengths)
def std_line_length(lines):
    """Return the standard deviation (``np.std``) of line lengths, or 0.0 when empty."""
    if not lines:
        return 0.0
    lengths = list(map(len, lines))
    return np.std(lengths)
def blank_line_ratio(lines):
    """Return the fraction of ``lines`` that are empty or whitespace-only.

    An empty ``lines`` gives 0.0.
    """
    total = len(lines) if lines else 0
    if total == 0:
        return 0.0
    blank_rows = [row for row in lines if row.strip() == ""]
    return len(blank_rows) / total
def indentation_variance(lines):
    """Return the variance (``np.var``) of leading-space counts across ``lines``.

    Only leading space characters are counted as indentation; a leading tab
    contributes 0. An empty ``lines`` gives 0.0.
    """
    widths = []
    for row in lines:
        without_indent = row.lstrip(" ")
        widths.append(len(row) - len(without_indent))
    if not widths:
        return 0.0
    return np.var(widths)
def punctuation_density(code):
    """Return the share of characters in ``code`` that are structural punctuation.

    Counted characters are ``; : { } ( ) [ ]``. The divisor is the length of
    ``code``, floored at 1 so that an empty string yields 0.0.
    """
    hits = re.findall(r"[;:{}()\[\]]", code)
    divisor = max(len(code), 1)
    return len(hits) / divisor
def extract_stylometry_features(df):
    """Compute one row of stylometry features for every row of ``df``.

    Each row's ``"normalized_code"`` and ``"Language"`` values are read and
    converted with ``str()`` (the language is also lower-cased). Identifier,
    comment, line-shape and punctuation statistics are then derived with the
    helper functions in this module.

    Returns:
        A ``pandas.DataFrame`` with one row per input row and the feature
        columns in the order they are listed in the dict below.
    """

    def _share(part, whole):
        # Guarded division: 0.0 whenever the denominator is not positive.
        return part / whole if whole > 0 else 0.0

    records = []
    for _, record in df.iterrows():
        source = str(record["normalized_code"])
        lang = str(record["Language"]).lower()
        source_lines = source.split("\n")

        # Identifier statistics.
        names = extract_identifiers(source)
        n_names = len(names)
        mean_name_length = (
            np.mean([len(name) for name in names]) if names else 0.0
        )
        n_snake = sum(is_snake_case(name) for name in names)
        n_camel = sum(is_camel_case(name) for name in names)

        # Whole-line comment statistics.
        n_comments, n_lines = count_comments(source, lang)

        records.append({
            "identifier_count": n_names,
            "unique_identifier_ratio": _share(len(set(names)), n_names),
            "avg_identifier_length": mean_name_length,
            "comment_ratio": _share(n_comments, n_lines),
            "snake_case_ratio": _share(n_snake, n_names),
            "camel_case_ratio": _share(n_camel, n_names),
            "avg_line_length": avg_line_length(source_lines),
            "std_line_length": std_line_length(source_lines),
            "blank_line_ratio": blank_line_ratio(source_lines),
            "indentation_variance": indentation_variance(source_lines),
            "punctuation_density": punctuation_density(source)
        })

    return pd.DataFrame(records)
if __name__ == "__main__":
    # Make sure the output folder exists before any split is written.
    os.makedirs("featureextraction/step3_stylometry_extraction", exist_ok=True)

    for split in ("train", "val", "test"):
        df = pd.read_csv(f"dataset/processed/dataset_{split}.csv")

        # Shorten the verbose label header to "Label" when it is present.
        raw_label = "Label (0- HUMAN, 1-AI)"
        if raw_label in df.columns:
            df = df.rename(columns={raw_label: "Label"})

        # Compute the features and carry the label column alongside them.
        X_style = extract_stylometry_features(df)
        X_style["Label"] = df["Label"]

        X_style.to_csv(
            f"featureextraction/step3_stylometry_extraction/{split}_features.csv",
            index=False,
        )
        print(f"Stylometry features extracted for {split}")