ai-code-detection / basemodel /token_level_feature_extraction.py
joshnavip's picture
Initial commit: AI code detection project (without binary files)
b144cb7
import pandas as pd
import numpy as np
import re
import os
from collections import Counter
from math import log2
def tokenize(code):
    """Split source text into a flat list of lexical tokens.

    At each position the alternatives are tried in this order: a run of
    ASCII letters/underscores, a run of digits, one of the two-character
    comparison operators ``==``, ``!=``, ``<=``, ``>=``, or a single
    arithmetic character ``+ - * / %``.  Anything else (whitespace,
    brackets, quotes, a lone ``=`` or ``<`` ...) is skipped.

    Digits are never part of a word token, so ``x1`` yields ``["x", "1"]``.
    """
    token_pattern = re.compile(r"[A-Za-z_]+|\d+|==|!=|<=|>=|[+\-*/%]")
    return [match.group(0) for match in token_pattern.finditer(code)]
def token_entropy(tokens):
    """Return the Shannon entropy, in bits, of the token frequencies.

    Args:
        tokens: Sequence of hashable tokens.

    Returns:
        ``-sum(p * log2(p))`` taken over the relative frequency ``p`` of
        every distinct token, or ``0.0`` for an empty sequence.  The result
        is never negative zero.
    """
    if not tokens:
        return 0.0
    total = len(tokens)
    entropy = -sum(
        (count / total) * log2(count / total)
        for count in Counter(tokens).values()
    )
    # When every token is identical the sum is 0.0 and the negation above
    # produces -0.0, which would be serialised as "-0.0".  Each term of the
    # sum is <= 0, so the negated sum can never be genuinely negative and
    # this only normalises the sign of zero.
    return entropy if entropy > 0.0 else 0.0
def burstiness(tokens):
    """Return the fraction of token occurrences that belong to a repeated token.

    A token counts as repeated when it occurs more than once in *tokens*;
    all of its occurrences are then included in the numerator.  An empty
    sequence gives ``0.0``.
    """
    if tokens:
        n_tokens = len(tokens)
        frequencies = Counter(tokens)
        # Every count is at least 1, so the occurrences of repeated tokens
        # are exactly those that are not singletons.
        singletons = sum(1 for freq in frequencies.values() if freq == 1)
        return (n_tokens - singletons) / n_tokens
    return 0.0
def repetition_ratio(tokens):
    """Return one minus the share of distinct tokens in *tokens*.

    The value is ``0.0`` when every token is unique (or the sequence is
    empty) and grows towards ``1`` as more occurrences are duplicates.
    """
    if tokens:
        distinct_share = len(set(tokens)) / len(tokens)
        return 1 - distinct_share
    return 0.0
def avg_token_length(tokens):
    """Return the mean character length of the tokens.

    The mean is computed with ``np.mean``; an empty sequence gives ``0.0``
    without calling NumPy.
    """
    if tokens:
        lengths = list(map(len, tokens))
        return np.mean(lengths)
    return 0.0
def vocab_richness(tokens):
    """Return the ratio of distinct tokens to total tokens.

    The value is ``1.0`` when no token repeats and ``0.0`` for an empty
    sequence.
    """
    if tokens:
        distinct = set(tokens)
        return len(distinct) / len(tokens)
    return 0.0
def extract_features(df):
    """Compute the statistical token features for every row of *df*.

    Args:
        df: DataFrame with a ``normalized_code`` column holding source text.
            A ``Language`` value is copied through when the row has one;
            otherwise ``"unknown"`` is used.

    Returns:
        DataFrame with one row per input row, carrying the same index as
        *df*, and the columns ``entropy``, ``burstiness``,
        ``repetition_ratio``, ``avg_token_length``, ``vocab_richness`` and
        ``language``.  The columns are present even when *df* is empty.
    """
    columns = [
        "entropy",
        "burstiness",
        "repetition_ratio",
        "avg_token_length",
        "vocab_richness",
        "language",
    ]
    features = []
    for _, row in df.iterrows():
        raw_code = row["normalized_code"]
        # A missing cell (NaN/None) must count as empty source text;
        # str() would turn it into "nan"/"None", which tokenizes as a
        # one-identifier program and pollutes the features.
        is_missing = pd.api.types.is_scalar(raw_code) and pd.isna(raw_code)
        code = "" if is_missing else str(raw_code)
        tokens = tokenize(code)
        features.append({
            "entropy": token_entropy(tokens),
            "burstiness": burstiness(tokens),
            "repetition_ratio": repetition_ratio(tokens),
            "avg_token_length": avg_token_length(tokens),
            "vocab_richness": vocab_richness(tokens),
            "language": row.get("Language", "unknown"),
        })
    # Reuse the input index so index-aligned operations on the result
    # (e.g. attaching a label column from df) pair up the right rows even
    # when df does not have a default 0..n-1 index.
    return pd.DataFrame(features, index=df.index, columns=columns)
if __name__ == "__main__":
    # Script entry point: for each dataset split, read the processed CSV,
    # extract the token-level features, attach the label column and write
    # the result under basemodel/.
    os.makedirs("basemodel", exist_ok=True)
    for split in ("train", "val", "test"):
        frame = pd.read_csv(f"dataset/processed/dataset_{split}.csv")
        # rename() ignores keys that are not present, so the verbose label
        # header is normalised to "Label" only when it actually occurs.
        frame = frame.rename(columns={"Label (0- HUMAN, 1-AI)": "Label"})
        feature_table = extract_features(frame).assign(Label=frame["Label"])
        feature_table.to_csv(f"basemodel/{split}_features.csv", index=False)
        print(f"Statistical baseline features extracted for {split}")