Spaces:
Runtime error
Runtime error
| import pandas as pd | |
| import numpy as np | |
| import re | |
| import os | |
| from collections import Counter | |
| from math import log2 | |
# Alternatives, in priority order: runs of letters/underscores, runs of
# digits, two-character comparison operators, single arithmetic operators.
_TOKEN_PATTERN = re.compile(r"[A-Za-z_]+|\d+|==|!=|<=|>=|[+\-*/%]")


def tokenize(code):
    """Split source text into a flat list of lexical tokens.

    Only letter/underscore runs, digit runs, the comparison operators
    ``==``, ``!=``, ``<=``, ``>=`` and the arithmetic operators
    ``+ - * / %`` are kept; every other character is dropped. Digits are
    not part of the letter run, so ``x1`` yields ``["x", "1"]``.

    Args:
        code: The source text to scan.

    Returns:
        The matched tokens as strings, in order of appearance.
    """
    return _TOKEN_PATTERN.findall(code)
def token_entropy(tokens):
    """Return the Shannon entropy, in bits, of the token frequency distribution.

    Args:
        tokens: Sequence of hashable tokens.

    Returns:
        ``-sum(p * log2(p))`` over the relative frequency ``p`` of each
        distinct token, as a non-negative float. An empty sequence gives
        ``0.0``; a sequence with a single distinct token also gives ``0.0``
        (never ``-0.0``).
    """
    if not tokens:
        return 0.0

    total = len(tokens)
    probabilities = [count / total for count in Counter(tokens).values()]
    weighted_logs = sum(p * log2(p) for p in probabilities)

    # Every term p * log2(p) is <= 0 because 0 < p <= 1, so the magnitude of
    # the sum is the entropy. abs() instead of negation avoids returning
    # -0.0 when all tokens are identical, which would be serialized as
    # "-0.0".
    return abs(weighted_logs)
def burstiness(tokens):
    """Return the share of token occurrences that belong to a repeated token.

    A token counts as repeated when it appears more than once; all of its
    occurrences are then included in the numerator.

    Args:
        tokens: Sequence of hashable tokens.

    Returns:
        ``repeated occurrences / total occurrences``, or ``0.0`` for an
        empty sequence.
    """
    if not tokens:
        return 0.0

    repeated_occurrences = 0
    for frequency in Counter(tokens).values():
        if frequency > 1:
            repeated_occurrences += frequency

    return repeated_occurrences / len(tokens)
def repetition_ratio(tokens):
    """Return one minus the fraction of distinct tokens.

    Args:
        tokens: Sequence of hashable tokens.

    Returns:
        ``1 - distinct / total``; ``0.0`` when every token is unique or the
        sequence is empty.
    """
    if tokens:
        distinct_share = len(set(tokens)) / len(tokens)
        return 1 - distinct_share
    return 0.0
def avg_token_length(tokens):
    """Return the mean character length of the tokens.

    Args:
        tokens: Sequence of sized items (strings in this file's usage).

    Returns:
        The arithmetic mean of ``len(token)`` as computed by ``np.mean``,
        or ``0.0`` for an empty sequence.
    """
    if not tokens:
        return 0.0

    lengths = list(map(len, tokens))
    return np.mean(lengths)
def vocab_richness(tokens):
    """Return the ratio of distinct tokens to total tokens.

    Args:
        tokens: Sequence of hashable tokens.

    Returns:
        ``distinct / total``, or ``0.0`` for an empty sequence.
    """
    return len(set(tokens)) / len(tokens) if tokens else 0.0
def extract_features(df):
    """Compute token-statistics features for every row of ``df``.

    Each row's ``normalized_code`` value is tokenized with ``tokenize`` and
    summarized by ``token_entropy``, ``burstiness``, ``repetition_ratio``,
    ``avg_token_length`` and ``vocab_richness``.

    Args:
        df: DataFrame with a ``normalized_code`` column. A ``Language``
            column is optional; when it is absent the ``language`` feature
            is ``"unknown"``.

    Returns:
        A DataFrame with the columns ``entropy``, ``burstiness``,
        ``repetition_ratio``, ``avg_token_length``, ``vocab_richness`` and
        ``language``. It has one row per input row and carries the same
        index as ``df``, so index-aligned assignments such as
        ``features["Label"] = df["Label"]`` pair the right rows even when
        ``df`` does not have a default 0..n-1 index. An empty ``df`` yields
        an empty frame that still has all six columns.
    """
    feature_columns = [
        "entropy",
        "burstiness",
        "repetition_ratio",
        "avg_token_length",
        "vocab_richness",
        "language",
    ]

    records = []
    for _, row in df.iterrows():
        raw_code = row["normalized_code"]
        # A missing cell (NaN/None) must not be stringified: str(NaN) is
        # "nan", which the tokenizer would report as a real identifier.
        # Treat it as empty source so all statistics are 0.0.
        code = "" if pd.isna(raw_code) else str(raw_code)
        tokens = tokenize(code)
        records.append({
            "entropy": token_entropy(tokens),
            "burstiness": burstiness(tokens),
            "repetition_ratio": repetition_ratio(tokens),
            "avg_token_length": avg_token_length(tokens),
            "vocab_richness": vocab_richness(tokens),
            "language": row.get("Language", "unknown"),
        })

    # Explicit columns keep the schema stable for empty input; reusing the
    # input index keeps the rows aligned with ``df``.
    return pd.DataFrame(records, columns=feature_columns, index=df.index)
if __name__ == "__main__":
    # Script entry point: for each dataset split, read the processed CSV,
    # compute the statistical features, attach the label column and write
    # the result under basemodel/.
    raw_label_column = "Label (0- HUMAN, 1-AI)"

    os.makedirs("basemodel", exist_ok=True)

    for split in ("train", "val", "test"):
        frame = pd.read_csv(f"dataset/processed/dataset_{split}.csv")

        # rename() ignores labels that are not present, so this is a no-op
        # for files whose label column is already called "Label".
        frame = frame.rename(columns={raw_label_column: "Label"})

        feature_table = extract_features(frame)
        feature_table["Label"] = frame["Label"]
        feature_table.to_csv(f"basemodel/{split}_features.csv", index=False)

        print(f"Statistical baseline features extracted for {split}")