import pandas as pd
import numpy as np
import re
import os
from collections import Counter
from math import log2

# Token pattern: runs of letters/underscores, runs of digits, the two-character
# comparison operators, and single arithmetic operators.
# NOTE(review): identifiers containing digits (e.g. "x1") are split into "x"
# and "1", and a bare "=", "<" or ">" is not matched at all. Kept as-is because
# the emitted feature values depend on it -- confirm whether this is intended.
_TOKEN_PATTERN = re.compile(r"[A-Za-z_]+|\d+|==|!=|<=|>=|[+\-*/%]")

# Column order of the frame returned by extract_features(). Pinned so that an
# empty input still yields the same schema as a non-empty one.
_FEATURE_COLUMNS = [
    "entropy",
    "burstiness",
    "repetition_ratio",
    "avg_token_length",
    "vocab_richness",
    "language",
]


def tokenize(code):
    """Return the list of tokens found in *code*, in order of appearance.

    Characters that match none of the alternatives in the token pattern are
    skipped silently.
    """
    return _TOKEN_PATTERN.findall(code)


def token_entropy(tokens):
    """Return the Shannon entropy (base 2) of the token frequency distribution.

    Returns 0.0 for an empty token list.
    """
    if not tokens:
        return 0.0
    counts = Counter(tokens)
    total = len(tokens)
    probs = [c / total for c in counts.values()]
    return -sum(p * log2(p) for p in probs)


def burstiness(tokens):
    """Return the fraction of token occurrences whose token appears more than once.

    Returns 0.0 for an empty token list.
    """
    if not tokens:
        return 0.0
    counts = Counter(tokens)
    repeated = sum(c for c in counts.values() if c > 1)
    return repeated / len(tokens)


def repetition_ratio(tokens):
    """Return 1 minus the ratio of distinct tokens to total tokens.

    Returns 0.0 for an empty token list.
    """
    if not tokens:
        return 0.0
    return 1 - (len(set(tokens)) / len(tokens))


def avg_token_length(tokens):
    """Return the mean character length of the tokens (0.0 when empty)."""
    if not tokens:
        return 0.0
    return float(np.mean([len(t) for t in tokens]))


def vocab_richness(tokens):
    """Return the ratio of distinct tokens to total tokens (0.0 when empty)."""
    if not tokens:
        return 0.0
    return len(set(tokens)) / len(tokens)


def extract_features(df):
    """Compute per-row token statistics for the code in *df*.

    Args:
        df: DataFrame with a "normalized_code" column. A "Language" column is
            optional; rows without it are reported as "unknown".

    Returns:
        A new DataFrame with one row per input row (fresh 0..n-1 index) and
        the columns entropy, burstiness, repetition_ratio, avg_token_length,
        vocab_richness and language.

    Raises:
        KeyError: if *df* has rows but no "normalized_code" column.
    """
    features = []
    for _, row in df.iterrows():
        raw_code = row["normalized_code"]
        # A missing cell is read back as NaN; str(NaN) would be the text "nan"
        # and tokenize to one bogus identifier, so treat it as empty code.
        code = "" if pd.isna(raw_code) else str(raw_code)
        tokens = tokenize(code)
        features.append({
            "entropy": token_entropy(tokens),
            "burstiness": burstiness(tokens),
            "repetition_ratio": repetition_ratio(tokens),
            "avg_token_length": avg_token_length(tokens),
            "vocab_richness": vocab_richness(tokens),
            "language": row.get("Language", "unknown"),
        })
    # Explicit columns keep the schema stable even when there are no rows.
    return pd.DataFrame(features, columns=_FEATURE_COLUMNS)


def main():
    """Extract features for the train/val/test splits and write them as CSV.

    Reads dataset/processed/dataset_<split>.csv and writes
    basemodel/<split>_features.csv, creating the output directory if needed.
    """
    os.makedirs("basemodel", exist_ok=True)

    for split in ["train", "val", "test"]:
        input_path = f"dataset/processed/dataset_{split}.csv"
        df = pd.read_csv(input_path)

        # Normalize the verbose label header to a short column name.
        if "Label (0- HUMAN, 1-AI)" in df.columns:
            df = df.rename(columns={"Label (0- HUMAN, 1-AI)": "Label"})

        X = extract_features(df)
        # Feature rows are in input order, so attach labels by position rather
        # than relying on the two frames sharing an index.
        X["Label"] = df["Label"].to_numpy()

        output_path = f"basemodel/{split}_features.csv"
        X.to_csv(output_path, index=False)

        print(f"Statistical baseline features extracted for {split}")


if __name__ == "__main__":
    main()