Spaces:
Runtime error
Runtime error
| import pandas as pd | |
| import numpy as np | |
| import os | |
# Repository root: two directory levels above this script.
_THIS_FILE = os.path.abspath(__file__)
PROJECT_ROOT = os.path.dirname(os.path.dirname(_THIS_FILE))

# Every feature family lives under <PROJECT_ROOT>/featureextraction/.
_FEATURE_ROOT = os.path.join(PROJECT_ROOT, "featureextraction")

# Inputs: one directory per feature-extraction step.
STAT_PATH = os.path.join(_FEATURE_ROOT, "step1_statistical_extraction")
AST_PATH = os.path.join(_FEATURE_ROOT, "step2_ast_extraction")
STYLE_PATH = os.path.join(_FEATURE_ROOT, "step3_stylometry_extraction")
SEM_PATH = os.path.join(_FEATURE_ROOT, "semantic_features")

# Output: the concatenated matrices are written here; create it up front.
OUTPUT_DIR = os.path.join(_FEATURE_ROOT, "final_features")
os.makedirs(OUTPUT_DIR, exist_ok=True)
def _language_vocabulary(split="train"):
    """Return the sorted, de-duplicated language names of one split.

    Args:
        split: Split whose statistical CSV (``<split>_features.csv`` under
            ``STAT_PATH``) supplies the vocabulary.

    Returns:
        A sorted list of the distinct non-null values of the ``language``
        column.

    Raises:
        ValueError: If the CSV has no ``language`` column.
    """
    stat_df = pd.read_csv(os.path.join(STAT_PATH, f"{split}_features.csv"))
    if "language" not in stat_df.columns:
        raise ValueError("Language column not found for one-hot encoding!")
    # Sorted so the column order matches what pd.get_dummies would produce
    # on its own for this split.
    return sorted(stat_df["language"].dropna().unique())


def load_and_concatenate(split, languages=None):
    """Load every feature family of one split and stack them column-wise.

    Reads the statistical, AST and stylometry CSVs plus the
    ``<split>_unixcoder.npy`` array, one-hot encodes the ``language`` column
    of the statistical CSV, and concatenates everything in the order
    ``[statistical, AST, stylometry, language one-hot, semantic]``.

    Args:
        split: Split name used to build the file names (e.g. ``"train"``).
        languages: Optional ordered collection of language names that fixes
            the one-hot columns. Pass the same collection for every split so
            all splits get the same number and order of ``lang_*`` columns;
            a row whose language is not listed is encoded as all zeros.
            When ``None`` the columns are derived from this split alone, so
            the width may differ between splits.

    Returns:
        A ``(X, y)`` tuple: ``X`` is the concatenated feature matrix cast to
        ``float`` and ``y`` holds the ``Label`` column of the statistical CSV.

    Raises:
        ValueError: If the statistical CSV has no ``language`` column, or if
            the feature families do not all have the same number of rows.
    """
    csv_name = f"{split}_features.csv"
    stat_df = pd.read_csv(os.path.join(STAT_PATH, csv_name))
    ast_df = pd.read_csv(os.path.join(AST_PATH, csv_name))
    style_df = pd.read_csv(os.path.join(STYLE_PATH, csv_name))

    y = stat_df["Label"].values

    if "language" not in stat_df.columns:
        raise ValueError("Language column not found for one-hot encoding!")

    lang_col = stat_df["language"]
    if languages is not None:
        # A categorical dtype makes get_dummies emit one column per listed
        # language, including languages that never occur in this split.
        lang_col = lang_col.astype(
            pd.CategoricalDtype(categories=list(languages))
        )
    lang_onehot = pd.get_dummies(lang_col, prefix="lang")

    X_stat = stat_df.drop(columns=["Label", "language"]).values
    X_ast = ast_df.drop(columns=["Label"]).values
    X_style = style_df.drop(columns=["Label"]).values
    X_lang = lang_onehot.values
    X_sem = np.load(os.path.join(SEM_PATH, f"{split}_unixcoder.npy"))

    # Explicit raise instead of `assert`: assertions are removed under
    # `python -O`, which would let misaligned inputs through.
    blocks = (X_stat, X_ast, X_style, X_lang, X_sem)
    if len({len(block) for block in blocks}) != 1:
        raise ValueError(f"Row mismatch in {split} split!")

    X_final = np.hstack(blocks)
    return X_final.astype(float), y


if __name__ == "__main__":
    # Take the language vocabulary from the training split and reuse it for
    # every split, so the saved matrices all share one column layout.
    languages = _language_vocabulary("train")
    for split in ("train", "val", "test"):
        X, y = load_and_concatenate(split, languages=languages)
        np.save(os.path.join(OUTPUT_DIR, f"{split}_X.npy"), X)
        np.save(os.path.join(OUTPUT_DIR, f"{split}_y.npy"), y)
        print(f"{split.upper()} FEATURES SHAPE: {X.shape}")
        print(f"{split.upper()} FEATURE TYPE:", X.dtype)
    print("\nFeature concatenation with ONE-HOT language encoding completed")