Spaces:
Runtime error
Runtime error
File size: 2,100 Bytes
b144cb7 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 | import pandas as pd
import numpy as np
import re
import os
from collections import Counter
from math import log2
# Compiled once at import time. Alternatives are tried left to right:
#   identifiers (letter/underscore followed by letters, digits, underscores),
#   integer literals, two-character comparison operators, and single-character
#   arithmetic operators. Anything else (whitespace, brackets, quotes, a lone
#   "=", "<", ">", ...) is skipped.
_TOKEN_RE = re.compile(r"[A-Za-z_][A-Za-z0-9_]*|\d+|==|!=|<=|>=|[+\-*/%]")


def tokenize(code):
    """Split source text into a flat list of lexical tokens.

    Args:
        code: The source code to tokenize, as a string.

    Returns:
        A list of token strings in order of appearance: identifiers,
        runs of digits, the comparison operators ``==``, ``!=``, ``<=``,
        ``>=`` and the arithmetic operators ``+ - * / %``. Characters that
        match none of these are dropped.
    """
    # Identifiers may contain digits after the first character, so a name
    # such as "x1" is kept as one token instead of being split into "x", "1".
    return _TOKEN_RE.findall(code)
def token_entropy(tokens):
    """Return the Shannon entropy (in bits) of the token frequency distribution.

    Args:
        tokens: A sequence of hashable tokens.

    Returns:
        ``sum(-p * log2(p))`` over the relative frequency ``p`` of each
        distinct token, or ``0.0`` when ``tokens`` is empty.
    """
    if not tokens:
        return 0.0
    counts = Counter(tokens)
    total = len(tokens)
    # Negate each term instead of the final sum: when every token is the
    # same, the only term is -1.0 * 0.0 == -0.0 and the sum (which starts
    # from 0) comes out as +0.0, whereas negating the total would yield
    # -0.0 and leak a "-0.0" value into the exported features.
    return sum(-(c / total) * log2(c / total) for c in counts.values())
def burstiness(tokens):
    """Return the share of token occurrences that belong to repeated tokens.

    Every occurrence of a token that appears more than once is counted;
    the count is divided by the total number of tokens. An empty input
    yields ``0.0``.
    """
    if not tokens:
        return 0.0

    repeated_occurrences = 0
    for occurrences in Counter(tokens).values():
        # Tokens seen exactly once do not contribute.
        if occurrences > 1:
            repeated_occurrences += occurrences

    return repeated_occurrences / len(tokens)
def repetition_ratio(tokens):
    """Return one minus the fraction of tokens that are distinct.

    The result is ``0.0`` for an empty input and for an input in which
    every token is unique.
    """
    if tokens:
        distinct_share = len(set(tokens)) / len(tokens)
        return 1 - distinct_share
    return 0.0
def avg_token_length(tokens):
    """Return the mean character length of the tokens.

    An empty input yields ``0.0``; otherwise the mean is computed with
    ``np.mean`` over the per-token lengths.
    """
    if tokens:
        lengths = list(map(len, tokens))
        return np.mean(lengths)
    return 0.0
def vocab_richness(tokens):
    """Return the ratio of distinct tokens to total tokens.

    An empty input yields ``0.0``.
    """
    if tokens:
        distinct = set(tokens)
        return len(distinct) / len(tokens)
    return 0.0
def extract_features(df):
    """Compute per-row token statistics for a DataFrame of code samples.

    Args:
        df: A DataFrame with a ``normalized_code`` column. A ``Language``
            column is used when present.

    Returns:
        A new DataFrame with one row per input row (in the same order, with
        a fresh default index) and the columns ``entropy``, ``burstiness``,
        ``repetition_ratio``, ``avg_token_length``, ``vocab_richness`` and
        ``language``. ``language`` is ``"unknown"`` for every row when the
        input has no ``Language`` column.

    Raises:
        KeyError: If ``df`` has no ``normalized_code`` column.
    """
    columns = [
        "entropy",
        "burstiness",
        "repetition_ratio",
        "avg_token_length",
        "vocab_richness",
        "language",
    ]

    codes = df["normalized_code"]
    if "Language" in df.columns:
        languages = df["Language"]
    else:
        languages = ["unknown"] * len(df)

    features = []
    for code, is_missing, language in zip(codes, codes.isna(), languages):
        # A missing cell would otherwise be stringified to "nan" and counted
        # as a single identifier token; treat it as empty code instead so all
        # statistics come out as 0.0.
        tokens = tokenize("" if is_missing else str(code))
        features.append({
            "entropy": token_entropy(tokens),
            "burstiness": burstiness(tokens),
            "repetition_ratio": repetition_ratio(tokens),
            "avg_token_length": avg_token_length(tokens),
            "vocab_richness": vocab_richness(tokens),
            "language": language,
        })

    # Passing the column list keeps the schema stable even when df is empty.
    return pd.DataFrame(features, columns=columns)
if __name__ == "__main__":
    # Script entry point: for each dataset split, read the processed CSV,
    # compute the statistical features, attach the label column and write
    # the result under basemodel/.
    raw_label_column = "Label (0- HUMAN, 1-AI)"

    os.makedirs("basemodel", exist_ok=True)

    for split in ("train", "val", "test"):
        frame = pd.read_csv(f"dataset/processed/dataset_{split}.csv")

        # Normalise the verbose label header to plain "Label" when present.
        if raw_label_column in frame.columns:
            frame = frame.rename(columns={raw_label_column: "Label"})

        split_features = extract_features(frame)
        split_features["Label"] = frame["Label"]

        split_features.to_csv(f"basemodel/{split}_features.csv", index=False)
        print(f"Statistical baseline features extracted for {split}")
|