Spaces:

joshnavip
/

ai-code-detection

Runtime error

App Files Files Community

ai-code-detection / basemodel /token_level_feature_extraction.py

joshnavip

Initial commit: AI code detection project (without binary files)

b144cb7 about 1 month ago

raw

history blame contribute delete

2.1 kB

	import pandas as pd
	import numpy as np
	import re
	import os
	from collections import Counter
	from math import log2


	def tokenize(code):
	    """Split source text into identifier, number, and operator tokens.

	    Alternatives are tried left to right at each position, so the
	    two-character comparison operators (``==``, ``!=``, ``<=``, ``>=``)
	    are matched before any single-character arithmetic operator.
	    Identifiers are runs of ASCII letters and underscores only, so digits
	    embedded in a name come out as a separate number token. Characters
	    that match no alternative (whitespace, brackets, quotes, a lone ``=``,
	    ``<`` or ``>``, ...) are skipped.

	    Args:
	        code: Source text to tokenize.

	    Returns:
	        list[str]: The matched tokens in order of appearance.
	    """
	    # The alternation bars must stay unescaped: in Python's ``re`` syntax an
	    # escaped bar is a literal pipe character, which would turn the whole
	    # pattern into one long literal sequence instead of separate token kinds.
	    return re.findall(r"[A-Za-z_]+|\d+|==|!=|<=|>=|[+\-*/%]", code)


	def token_entropy(tokens):
	    """Return the Shannon entropy (in bits) of the token distribution.

	    Args:
	        tokens: Sequence of hashable tokens.

	    Returns:
	        float: ``-sum(p * log2(p))`` over the relative frequency ``p`` of
	        each distinct token, or ``0.0`` when ``tokens`` is empty.
	    """
	    if not tokens:
	        return 0.0
	    size = len(tokens)
	    frequencies = Counter(tokens).values()
	    return -sum((n / size) * log2(n / size) for n in frequencies)

	def burstiness(tokens):
	    """Return the share of tokens whose value occurs more than once.

	    Args:
	        tokens: Sequence of hashable tokens.

	    Returns:
	        float: Number of token occurrences belonging to a repeated value,
	        divided by the total number of tokens; ``0.0`` for empty input.
	    """
	    if not tokens:
	        return 0.0
	    size = len(tokens)
	    occurrences = Counter(tokens)
	    # Every occurrence that is not a one-off belongs to a repeated value.
	    one_offs = sum(1 for n in occurrences.values() if n == 1)
	    return (size - one_offs) / size

	def repetition_ratio(tokens):
	    """Return one minus the fraction of distinct token values.

	    Args:
	        tokens: Sequence of hashable tokens.

	    Returns:
	        float: ``1 - distinct / total``; ``0.0`` for empty input.
	    """
	    if tokens:
	        distinct_share = len(set(tokens)) / len(tokens)
	        return 1 - distinct_share
	    return 0.0

	def avg_token_length(tokens):
	    """Return the mean character length of the tokens.

	    Args:
	        tokens: Sequence of sized tokens (strings).

	    Returns:
	        The NumPy mean of the per-token lengths, or the float ``0.0`` when
	        ``tokens`` is empty.
	    """
	    if tokens:
	        lengths = list(map(len, tokens))
	        return np.mean(lengths)
	    return 0.0

	def vocab_richness(tokens):
	    """Return the ratio of distinct token values to total tokens.

	    Args:
	        tokens: Sequence of hashable tokens.

	    Returns:
	        float: ``distinct / total``; ``0.0`` for empty input.
	    """
	    if tokens:
	        vocabulary = set(tokens)
	        return len(vocabulary) / len(tokens)
	    return 0.0


	def extract_features(df):
	    """Compute token-level statistical features for every row of ``df``.

	    Args:
	        df: DataFrame with a ``normalized_code`` column holding source text.
	            A ``Language`` column is optional; when it is absent every row
	            is labelled ``"unknown"``.

	    Returns:
	        pandas.DataFrame: One row per input row, in input order with a fresh
	        default index, and the columns ``entropy``, ``burstiness``,
	        ``repetition_ratio``, ``avg_token_length``, ``vocab_richness`` and
	        ``language``.

	    Raises:
	        KeyError: If ``df`` has no ``normalized_code`` column.
	    """
	    columns = [
	        "entropy",
	        "burstiness",
	        "repetition_ratio",
	        "avg_token_length",
	        "vocab_richness",
	        "language",
	    ]
	    if "Language" in df.columns:
	        languages = df["Language"]
	    else:
	        languages = ["unknown"] * len(df)
	    features = []
	    # Iterating the two columns directly avoids building a Series per row.
	    for code, language in zip(df["normalized_code"], languages):
	        # A missing cell (NaN/None) would otherwise be stringified and
	        # counted as a one-identifier snippet; treat it as empty source.
	        text = "" if pd.isna(code) else str(code)
	        tokens = tokenize(text)
	        features.append({
	            "entropy": token_entropy(tokens),
	            "burstiness": burstiness(tokens),
	            "repetition_ratio": repetition_ratio(tokens),
	            "avg_token_length": avg_token_length(tokens),
	            "vocab_richness": vocab_richness(tokens),
	            "language": language,
	        })
	    # Naming the columns keeps the schema intact when the input has no rows.
	    return pd.DataFrame(features, columns=columns)


	if __name__ == "__main__":
	    # Script entry point: for each dataset split, read the processed CSV,
	    # compute the statistical features, attach the label column and write
	    # the result under basemodel/.
	    os.makedirs("basemodel", exist_ok=True)
	    verbose_label = "Label (0- HUMAN, 1-AI)"
	    for split in ("train", "val", "test"):
	        frame = pd.read_csv(f"dataset/processed/dataset_{split}.csv")
	        # Some inputs carry the descriptive header; normalise it to "Label".
	        if verbose_label in frame.columns:
	            frame = frame.rename(columns={verbose_label: "Label"})
	        feature_table = extract_features(frame)
	        feature_table["Label"] = frame["Label"]
	        feature_table.to_csv(f"basemodel/{split}_features.csv", index=False)
	        print(f"Statistical baseline features extracted for {split}")