Spaces:

joshnavip
/

ai-code-detection

Runtime error

App Files Files Community

ai-code-detection / featureextraction /step3_stylometry_extraction /step3_stylometry_extraction.py

joshnavip

Initial commit: AI code detection project (without binary files)

b144cb7 about 1 month ago

raw

history blame contribute delete

3.76 kB

	import pandas as pd
	import re
	import os
	import numpy as np

	# Word-like tokens: a letter or underscore followed by letters, digits or underscores.
	IDENTIFIER_REGEX = re.compile(r"\b[a-zA-Z_][a-zA-Z0-9_]*\b")


	def extract_identifiers(code):
	    """Return every identifier-shaped token in ``code``, in order of appearance.

	    The match is purely lexical: language keywords and words inside comments
	    or string literals are returned as well, and duplicates are kept.
	    """
	    return [match.group(0) for match in IDENTIFIER_REGEX.finditer(code)]

	def is_snake_case(name):
	    """Return True when ``name`` contains an underscore and no uppercase letters."""
	    if "_" not in name:
	        return False
	    return name == name.lower()

	def is_camel_case(name):
	    """Return True when ``name`` has no underscore and at least one uppercase letter.

	    Note that this also accepts PascalCase and ALL-CAPS names without
	    underscores, since the check only looks for the presence of an
	    uppercase character.
	    """
	    if "_" in name:
	        return False
	    return name != name.lower()

	def count_comments(code, language):
	    """Count whole-line comments in ``code``.

	    A line counts as a comment when, after stripping surrounding whitespace,
	    it starts with the line-comment marker of ``language``. Trailing comments
	    after code and block comments (``/* ... */``, docstrings) are not counted.

	    Args:
	        code: Source text; lines are separated by ``"\\n"``.
	        language: Language name. Matching ignores case and surrounding
	            whitespace. Unknown languages yield a comment count of 0.

	    Returns:
	        A ``(comment_lines, total_lines)`` tuple. ``total_lines`` is at least
	        1 because splitting an empty string yields one empty line.
	    """
	    # Line-comment markers per language. "python" and "java" keep the markers
	    # they always had; the other entries stop those languages from silently
	    # reporting zero comments.
	    hash_marker = ("#",)
	    slash_marker = ("//",)
	    line_comment_markers = {
	        "python": hash_marker,
	        "ruby": hash_marker,
	        "r": hash_marker,
	        "shell": hash_marker,
	        "bash": hash_marker,
	        "java": slash_marker,
	        "c": slash_marker,
	        "c++": slash_marker,
	        "cpp": slash_marker,
	        "c#": slash_marker,
	        "csharp": slash_marker,
	        "javascript": slash_marker,
	        "typescript": slash_marker,
	        "go": slash_marker,
	        "rust": slash_marker,
	        "kotlin": slash_marker,
	        "swift": slash_marker,
	        "php": ("//", "#"),
	    }

	    lines = code.split("\n")
	    markers = line_comment_markers.get(str(language).strip().lower())
	    if markers is None:
	        return 0, len(lines)

	    comment_lines = sum(1 for line in lines if line.strip().startswith(markers))
	    return comment_lines, len(lines)

	def avg_line_length(lines):
	    """Return the mean character count per line, or 0.0 when ``lines`` is empty."""
	    if not lines:
	        return 0.0
	    lengths = [len(line) for line in lines]
	    return np.mean(lengths)

	def std_line_length(lines):
	    """Return the population standard deviation of line lengths (0.0 if empty)."""
	    if not lines:
	        return 0.0
	    lengths = list(map(len, lines))
	    return np.std(lengths)

	def blank_line_ratio(lines):
	    """Return the fraction of lines that are empty or whitespace-only.

	    An empty ``lines`` sequence yields 0.0.
	    """
	    if not lines:
	        return 0.0
	    blank_lines = [line for line in lines if line.strip() == ""]
	    return len(blank_lines) / len(lines)

	def indentation_variance(lines, tab_width=4):
	    """Return the population variance of the indentation width across ``lines``.

	    Indentation is the width of the leading run of spaces and tabs, with tabs
	    expanded to ``tab_width`` columns. Previously only spaces were measured,
	    so tab-indented code always reported an indentation of zero.

	    Args:
	        lines: Sequence of source lines (without line terminators).
	        tab_width: Column width of a tab stop used when expanding tabs.

	    Returns:
	        The variance of the per-line indentation widths, or 0.0 when
	        ``lines`` is empty.
	    """
	    if not lines:
	        return 0.0

	    indents = []
	    for line in lines:
	        leading = line[: len(line) - len(line.lstrip(" \t"))]
	        # Expand only the leading whitespace so tabs later in the line
	        # cannot influence the measured indentation.
	        indents.append(len(leading.expandtabs(tab_width)))
	    return np.var(indents)

	def punctuation_density(code):
	    """Return the share of characters in ``code`` that are structural punctuation.

	    The counted characters are ``; : { } ( ) [ ]``. Empty input yields 0.0
	    because the divisor is clamped to at least 1.
	    """
	    punct_total = sum(1 for _ in re.finditer(r"[;:{}()\[\]]", code))
	    char_total = len(code) or 1
	    return punct_total / char_total

	def _safe_ratio(numerator, denominator):
	    """Return ``numerator / denominator``, or 0.0 when the denominator is not positive."""
	    return numerator / denominator if denominator > 0 else 0.0


	def _cell_to_code(value):
	    """Convert a dataframe cell to source text, mapping missing values to ``""``."""
	    if isinstance(value, str):
	        return value
	    # str(NaN) would yield the literal text "nan", which would then be parsed
	    # as a three-letter identifier and pollute every feature of the row.
	    if pd.api.types.is_scalar(value) and pd.isna(value):
	        return ""
	    return str(value)


	def extract_stylometry_features(df):
	    """Compute per-row stylometric features for a dataframe of code samples.

	    Args:
	        df: DataFrame with a ``normalized_code`` column (source text) and a
	            ``Language`` column (language name; lower-cased before use).
	            Missing code cells are treated as empty source text.

	    Returns:
	        A new DataFrame with one row per input row, in input order and with a
	        fresh default index, holding identifier, comment, naming-style, line
	        shape and punctuation features.
	    """
	    features = []

	    for _, row in df.iterrows():
	        code = _cell_to_code(row["normalized_code"])
	        language = str(row["Language"]).lower()
	        lines = code.split("\n")

	        identifiers = extract_identifiers(code)
	        identifier_count = len(identifiers)

	        avg_identifier_length = (
	            np.mean([len(name) for name in identifiers]) if identifiers else 0.0
	        )

	        snake_case_count = sum(is_snake_case(name) for name in identifiers)
	        camel_case_count = sum(is_camel_case(name) for name in identifiers)

	        comment_lines, total_lines = count_comments(code, language)

	        features.append({
	            "identifier_count": identifier_count,
	            "unique_identifier_ratio": _safe_ratio(
	                len(set(identifiers)), identifier_count
	            ),
	            "avg_identifier_length": avg_identifier_length,
	            "comment_ratio": _safe_ratio(comment_lines, total_lines),
	            "snake_case_ratio": _safe_ratio(snake_case_count, identifier_count),
	            "camel_case_ratio": _safe_ratio(camel_case_count, identifier_count),
	            "avg_line_length": avg_line_length(lines),
	            "std_line_length": std_line_length(lines),
	            "blank_line_ratio": blank_line_ratio(lines),
	            "indentation_variance": indentation_variance(lines),
	            "punctuation_density": punctuation_density(code),
	        })

	    return pd.DataFrame(features)

	if __name__ == "__main__":
	    # Make sure the destination folder for the feature CSVs exists.
	    os.makedirs("featureextraction/step3_stylometry_extraction", exist_ok=True)

	    for split in ("train", "val", "test"):
	        frame = pd.read_csv(f"dataset/processed/dataset_{split}.csv")

	        # rename() ignores keys that are not present, so frames that already
	        # use the short "Label" header pass through unchanged.
	        frame = frame.rename(columns={"Label (0- HUMAN, 1-AI)": "Label"})

	        style_features = extract_stylometry_features(frame)
	        style_features["Label"] = frame["Label"]

	        style_features.to_csv(
	            f"featureextraction/step3_stylometry_extraction/{split}_features.csv",
	            index=False,
	        )

	        print(f"Stylometry features extracted for {split}")