"""Concatenate per-split feature sets into a single feature matrix.

For each split the script reads three ``{split}_features.csv`` files
(statistical, AST and stylometry directories) and one
``{split}_unixcoder.npy`` array, one-hot encodes the ``language`` column of
the statistical CSV, stacks everything column-wise and saves
``{split}_X.npy`` / ``{split}_y.npy`` into ``OUTPUT_DIR``.
"""
import os

import numpy as np
import pandas as pd

# All paths are resolved relative to the directory two levels above this file.
PROJECT_ROOT = os.path.dirname(os.path.dirname(os.path.abspath(__file__)))

STAT_PATH = os.path.join(
    PROJECT_ROOT, "featureextraction", "step1_statistical_extraction"
)
AST_PATH = os.path.join(PROJECT_ROOT, "featureextraction", "step2_ast_extraction")
STYLE_PATH = os.path.join(
    PROJECT_ROOT, "featureextraction", "step3_stylometry_extraction"
)
SEM_PATH = os.path.join(PROJECT_ROOT, "featureextraction", "semantic_features")
OUTPUT_DIR = os.path.join(PROJECT_ROOT, "featureextraction", "final_features")

# Kept at import time so the output directory exists for any importer,
# exactly as before.
os.makedirs(OUTPUT_DIR, exist_ok=True)

SPLITS = ("train", "val", "test")


def _one_hot_languages(column, languages):
    """Return the one-hot matrix for *column*, optionally with fixed columns.

    With ``languages=None`` the columns are derived from the values present
    in *column* only. Otherwise one column is produced per entry of
    *languages*, in that order, even when a language does not occur.
    """
    if languages is None:
        return pd.get_dummies(column, prefix="lang").values

    # Categories must be unique; keep the caller's order.
    categories = list(dict.fromkeys(languages))
    unknown = set(column.dropna().unique()) - set(categories)
    if unknown:
        # Casting to the categorical dtype would silently turn these into
        # NaN (an all-zero row), so fail loudly instead.
        raise ValueError(
            f"Languages not listed in 'languages': {sorted(unknown, key=str)}"
        )
    fixed = column.astype(pd.CategoricalDtype(categories=categories))
    # get_dummies on a categorical emits a column for every category,
    # including ones that never occur in this split.
    return pd.get_dummies(fixed, prefix="lang").values


def _collect_languages(splits):
    """Return the sorted union of ``language`` values over all *splits*."""
    seen = set()
    for split in splits:
        path = os.path.join(STAT_PATH, f"{split}_features.csv")
        # Read only the header first so a missing column gives the same
        # error message as load_and_concatenate.
        if "language" not in pd.read_csv(path, nrows=0).columns:
            raise ValueError("Language column not found for one-hot encoding!")
        values = pd.read_csv(path, usecols=["language"])["language"]
        seen.update(values.dropna().unique())
    return sorted(seen)


def load_and_concatenate(split, languages=None):
    """Load every feature set of *split* and stack them column-wise.

    Column order of the result: statistical features (without ``Label`` and
    ``language``), AST features, stylometry features, one-hot language
    columns, then the array loaded from ``{split}_unixcoder.npy``.

    Args:
        split: Split name used to build the file names, e.g. ``"train"``.
        languages: Optional sequence of language values that fixes the
            one-hot columns and their order. Pass the same sequence for
            every split so all splits get the same number of columns. When
            ``None`` (default) the columns come from the values present in
            this split only, which may differ between splits.

    Returns:
        A tuple ``(X, y)`` where ``X`` is the stacked matrix cast to float
        and ``y`` holds the ``Label`` column of the statistical CSV.

    Raises:
        ValueError: If the statistical CSV has no ``language`` column, if
            *languages* is given and the split contains a value not listed
            in it, or if the feature sets differ in their number of rows.
    """
    stat_df = pd.read_csv(os.path.join(STAT_PATH, f"{split}_features.csv"))
    ast_df = pd.read_csv(os.path.join(AST_PATH, f"{split}_features.csv"))
    style_df = pd.read_csv(os.path.join(STYLE_PATH, f"{split}_features.csv"))

    y = stat_df["Label"].values

    if "language" not in stat_df.columns:
        raise ValueError("Language column not found for one-hot encoding!")

    X_stat = stat_df.drop(columns=["Label", "language"]).values
    X_ast = ast_df.drop(columns=["Label"]).values
    X_style = style_df.drop(columns=["Label"]).values
    X_lang = _one_hot_languages(stat_df["language"], languages)
    X_sem = np.load(os.path.join(SEM_PATH, f"{split}_unixcoder.npy"))

    # Explicit check instead of `assert`, which is removed under `python -O`.
    row_counts = {len(X_stat), len(X_ast), len(X_style), len(X_lang), len(X_sem)}
    if len(row_counts) != 1:
        raise ValueError(f"Row mismatch in {split} split!")

    X_final = np.hstack([X_stat, X_ast, X_style, X_lang, X_sem])
    return X_final.astype(float), y


def main():
    """Build and save the concatenated features for every split."""
    # Use one language vocabulary for all splits so the saved matrices have
    # identical column layouts even if a split lacks some language.
    languages = _collect_languages(SPLITS)

    for split in SPLITS:
        X, y = load_and_concatenate(split, languages=languages)

        np.save(os.path.join(OUTPUT_DIR, f"{split}_X.npy"), X)
        np.save(os.path.join(OUTPUT_DIR, f"{split}_y.npy"), y)

        print(f"{split.upper()} FEATURES SHAPE: {X.shape}")
        print(f"{split.upper()} FEATURE TYPE:", X.dtype)

    print("\nFeature concatenation with ONE-HOT language encoding completed")


if __name__ == "__main__":
    main()