File size: 3,756 Bytes
b144cb7
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
import pandas as pd
import re
import os
import numpy as np

# Whole-word tokens that begin with a letter or underscore and continue with
# letters, digits, or underscores.
IDENTIFIER_REGEX = re.compile(r"\b[a-zA-Z_][a-zA-Z0-9_]*\b")

def extract_identifiers(code):
    """Return every identifier-shaped token in *code*, in order of appearance.

    Matching is purely lexical: each non-overlapping match of
    ``IDENTIFIER_REGEX`` is returned, duplicates included.
    """
    return [match.group(0) for match in IDENTIFIER_REGEX.finditer(code)]

def is_snake_case(name):
    """Report whether *name* has an underscore and is unchanged by lowercasing."""
    if "_" not in name:
        return False
    return name == name.lower()

def is_camel_case(name):
    """Report whether *name* is changed by lowercasing and has no underscore."""
    changed_by_lowercasing = name.lower() != name
    if not changed_by_lowercasing:
        return False
    return "_" not in name

def count_comments(code, language):
    """Count whole-line comments in *code*.

    A line counts when, after stripping surrounding whitespace, it starts
    with ``#`` (language ``"python"``) or ``//`` (language ``"java"``).
    Any other language value yields a comment count of zero.

    Returns:
        A ``(comment_line_count, total_line_count)`` tuple, where the total
        is the number of pieces produced by splitting *code* on ``"\\n"``.
    """
    if language == "python":
        marker = "#"
    elif language == "java":
        marker = "//"
    else:
        marker = None

    all_lines = code.split("\n")
    if marker is None:
        return 0, len(all_lines)

    hits = sum(1 for raw in all_lines if raw.strip().startswith(marker))
    return hits, len(all_lines)

def avg_line_length(lines):
    """Return the mean character length of *lines*, or 0.0 when it is empty."""
    if not lines:
        return 0.0
    lengths = [len(text) for text in lines]
    return np.mean(lengths)

def std_line_length(lines):
    """Return the standard deviation of line lengths, or 0.0 when *lines* is empty.

    Uses ``np.std`` with its default settings.
    """
    if not lines:
        return 0.0
    lengths = [len(text) for text in lines]
    return np.std(lengths)

def blank_line_ratio(lines):
    """Return the fraction of *lines* that are empty or whitespace-only.

    An empty *lines* gives 0.0.
    """
    if not lines:
        return 0.0
    blank_total = len([text for text in lines if text.strip() == ""])
    return blank_total / len(lines)

def indentation_variance(lines):
    """Return the variance of leading-space counts across *lines*.

    Only space characters are stripped when measuring indentation, so a line
    indented with tabs contributes an indent of 0. Uses ``np.var`` with its
    default settings; an empty *lines* gives 0.0.
    """
    widths = []
    for text in lines:
        without_indent = text.lstrip(" ")
        widths.append(len(text) - len(without_indent))

    if not widths:
        return 0.0
    return np.var(widths)

def punctuation_density(code):
    """Return the share of characters in *code* that are ``; : { } ( ) [ ]``.

    The denominator is clamped to at least 1, so empty input yields 0.0.
    """
    hits = sum(1 for _ in re.finditer(r"[;:{}()\[\]]", code))
    denominator = max(len(code), 1)
    return hits / denominator

def _stylometry_row(code, language):
    """Build the stylometry feature dict for one sample.

    Args:
        code: Source text of the sample.
        language: Lower-cased language tag passed on to ``count_comments``.

    Returns:
        A dict mapping feature name to value, in a fixed key order.
    """
    lines = code.split("\n")

    identifiers = extract_identifiers(code)
    identifier_count = len(identifiers)

    if identifier_count > 0:
        unique_identifier_ratio = len(set(identifiers)) / identifier_count
        avg_identifier_length = np.mean([len(i) for i in identifiers])
        snake_case_ratio = (
            sum(is_snake_case(i) for i in identifiers) / identifier_count
        )
        camel_case_ratio = (
            sum(is_camel_case(i) for i in identifiers) / identifier_count
        )
    else:
        # No identifiers: every identifier-based ratio is defined as 0.0.
        unique_identifier_ratio = 0.0
        avg_identifier_length = 0.0
        snake_case_ratio = 0.0
        camel_case_ratio = 0.0

    comment_lines, total_lines = count_comments(code, language)
    comment_ratio = comment_lines / total_lines if total_lines > 0 else 0.0

    return {
        "identifier_count": identifier_count,
        "unique_identifier_ratio": unique_identifier_ratio,
        "avg_identifier_length": avg_identifier_length,
        "comment_ratio": comment_ratio,
        "snake_case_ratio": snake_case_ratio,
        "camel_case_ratio": camel_case_ratio,
        "avg_line_length": avg_line_length(lines),
        "std_line_length": std_line_length(lines),
        "blank_line_ratio": blank_line_ratio(lines),
        "indentation_variance": indentation_variance(lines),
        "punctuation_density": punctuation_density(code),
    }


def extract_stylometry_features(df):
    """Compute stylometry features for every row of *df*.

    Args:
        df: DataFrame with a ``"normalized_code"`` column (source text) and a
            ``"Language"`` column (language tag).

    Returns:
        A new DataFrame with one row per input row, in input order, and one
        column per feature. It has a fresh default index.

    Raises:
        KeyError: If a row lacks ``"normalized_code"`` or ``"Language"``.
    """
    features = []

    for _, row in df.iterrows():
        raw_code = row["normalized_code"]
        # A missing cell (e.g. an empty CSV field read back as NaN) must be
        # treated as empty code; str(NaN) would otherwise be analysed as the
        # three-character program "nan".
        if pd.api.types.is_scalar(raw_code) and pd.isna(raw_code):
            code = ""
        else:
            code = str(raw_code)

        # Strip stray whitespace so tags like "Python " still match the
        # exact-string comparison in count_comments.
        language = str(row["Language"]).strip().lower()

        features.append(_stylometry_row(code, language))

    return pd.DataFrame(features)

if __name__ == "__main__":
    # Create the output directory up front; no error if it already exists.
    os.makedirs("featureextraction/step3_stylometry_extraction", exist_ok=True)

    for split in ("train", "val", "test"):
        frame = pd.read_csv(f"dataset/processed/dataset_{split}.csv")

        # Shorten the verbose label header to plain "Label" when present.
        if "Label (0- HUMAN, 1-AI)" in frame.columns:
            frame = frame.rename(columns={"Label (0- HUMAN, 1-AI)": "Label"})

        style_features = extract_stylometry_features(frame)
        style_features["Label"] = frame["Label"]

        style_features.to_csv(
            f"featureextraction/step3_stylometry_extraction/{split}_features.csv",
            index=False,
        )

        print(f"Stylometry features extracted for {split}")