# ai-code-detection/featureextraction/feature_concatenation.py
# Uploaded by joshnavip
# Initial commit: AI code detection project (without binary files)
# Commit b144cb7
import pandas as pd
import numpy as np
import os
# Repository root: the directory two levels above this file.
_THIS_FILE = os.path.abspath(__file__)
PROJECT_ROOT = os.path.dirname(os.path.dirname(_THIS_FILE))

# Every input and output location lives under <PROJECT_ROOT>/featureextraction.
_FEATURE_ROOT = os.path.join(PROJECT_ROOT, "featureextraction")

# Input directories, one per feature family.
STAT_PATH = os.path.join(_FEATURE_ROOT, "step1_statistical_extraction")
AST_PATH = os.path.join(_FEATURE_ROOT, "step2_ast_extraction")
STYLE_PATH = os.path.join(_FEATURE_ROOT, "step3_stylometry_extraction")
SEM_PATH = os.path.join(_FEATURE_ROOT, "semantic_features")

# Output directory for the concatenated matrices; created on import if absent.
OUTPUT_DIR = os.path.join(_FEATURE_ROOT, "final_features")
os.makedirs(OUTPUT_DIR, exist_ok=True)
def load_and_concatenate(split, languages=None):
    """Build the combined feature matrix and label vector for one split.

    Reads ``{split}_features.csv`` from ``STAT_PATH``, ``AST_PATH`` and
    ``STYLE_PATH`` and ``{split}_unixcoder.npy`` from ``SEM_PATH``, then
    stacks them column-wise in this order: statistical features, AST
    features, stylometry features, one-hot language indicators, semantic
    array.

    Labels come from the "Label" column of the statistical CSV; the "Label"
    columns of the other two CSVs are dropped without being compared.
    Rows are matched purely by position, so only the row counts are checked.

    Args:
        split: Split name used as the file-name prefix (e.g. ``"train"``).
        languages: Optional iterable of language names that fixes the one-hot
            columns (one column per entry, in the given order). When ``None``
            (the default) the columns are derived from the values present in
            this split only, so the feature width can differ between splits
            if one of them lacks a language. With an explicit list, rows whose
            language is not listed get all-zero indicator columns.

    Returns:
        A tuple ``(X, y)`` where ``X`` is the stacked feature matrix cast to
        ``float`` and ``y`` holds the values of the "Label" column.

    Raises:
        ValueError: If the statistical CSV has no "language" column, or if the
            sources do not all have the same number of rows.
    """
    csv_name = f"{split}_features.csv"
    stat_df = pd.read_csv(os.path.join(STAT_PATH, csv_name))
    ast_df = pd.read_csv(os.path.join(AST_PATH, csv_name))
    style_df = pd.read_csv(os.path.join(STYLE_PATH, csv_name))

    y = stat_df["Label"].values

    if "language" not in stat_df.columns:
        raise ValueError("Language column not found for one-hot encoding!")

    language_col = stat_df["language"]
    if languages is not None:
        # A fixed category list yields one column per listed language even
        # when that language never occurs in this split.
        language_col = language_col.astype(
            pd.CategoricalDtype(categories=list(languages))
        )
    lang_onehot = pd.get_dummies(language_col, prefix="lang")

    # Insertion order here defines the column order of the final matrix.
    parts = {
        "stat": stat_df.drop(columns=["Label", "language"]).values,
        "ast": ast_df.drop(columns=["Label"]).values,
        "style": style_df.drop(columns=["Label"]).values,
        "lang": lang_onehot.values,
        "sem": np.load(os.path.join(SEM_PATH, f"{split}_unixcoder.npy")),
    }

    # Explicit raise instead of `assert`: assertions are removed under
    # `python -O`, which would let misaligned inputs through silently.
    row_counts = {name: len(part) for name, part in parts.items()}
    if len(set(row_counts.values())) != 1:
        raise ValueError(
            f"Row mismatch in {split} split! Row counts: {row_counts}"
        )

    X_final = np.hstack(list(parts.values()))
    return X_final.astype(float), y
if __name__ == "__main__":
    # Build, save and report the combined feature matrix for each split.
    for split in ("train", "val", "test"):
        X, y = load_and_concatenate(split)

        features_file = os.path.join(OUTPUT_DIR, f"{split}_X.npy")
        labels_file = os.path.join(OUTPUT_DIR, f"{split}_y.npy")
        np.save(features_file, X)
        np.save(labels_file, y)

        print(f"{split.upper()} FEATURES SHAPE: {X.shape}")
        print(f"{split.upper()} FEATURE TYPE:", X.dtype)

    print("\nFeature concatenation with ONE-HOT language encoding completed")