# Residue from the file viewer this script was copied from:
# Spaces status "Runtime error", file size 3,746 bytes, revision b144cb7.
import sys
import os
# Make the directory one level above this file's folder importable so the
# project-local imports further down resolve when this file is run directly.
_THIS_DIR = os.path.dirname(os.path.abspath(__file__))
PROJECT_ROOT = os.path.dirname(_THIS_DIR)
sys.path.append(PROJECT_ROOT)
import numpy as np
import pandas as pd
import xgboost as xgb
# -------------------------------
# IMPORT FEATURE EXTRACTORS
# -------------------------------
from featureextraction.step1_statistical_extraction.step1_statistical_extraction import extract_features as extract_stat
from featureextraction.step2_ast_extraction.step2_ast_extraction import extract_ast_features
from featureextraction.step3_stylometry_extraction.step3_stylometry_extraction import extract_stylometry_features
from featureextraction.semantic_features.unixcoder_embedding import get_unixcoder_embedding
# XAI modules
from xai.shaplayer import shap_explain
from xai.grouping import group_shap_explanations
from xai.text_explainer import generate_text_explanation
# -------------------------------
# LOAD MODEL
# -------------------------------
# Resolve the saved model file. The working-directory-relative path is tried
# first (the historical behaviour); if it is missing, fall back to the same
# relative location under PROJECT_ROOT so the script also works when it is
# launched from a different directory.
_model_path = "classifier/xgboost_final_model.json"
if not os.path.exists(_model_path):
    _model_path = os.path.join(
        PROJECT_ROOT, "classifier", "xgboost_final_model.json"
    )

# Module-level classifier shared by the prediction and explanation steps.
model = xgb.XGBClassifier()
model.load_model(_model_path)
# -------------------------------
# LANGUAGE ONE-HOT
# -------------------------------
def encode_language(language):
    """Return the one-hot vector for a supported source language.

    Args:
        language: Language name. Matching is case-insensitive and ignores
            surrounding whitespace. Supported values are "python" and "java".

    Returns:
        A length-2 NumPy array: ``[1, 0]`` for Python, ``[0, 1]`` for Java.

    Raises:
        ValueError: If ``language`` is not a string or does not name a
            supported language.
    """
    # Reject non-strings with the same error as an unknown language instead
    # of letting an AttributeError escape from the normalisation below.
    if not isinstance(language, str):
        raise ValueError("Language must be python or java")

    normalized = language.strip().lower()
    if normalized == "python":
        return np.array([1, 0])
    if normalized == "java":
        return np.array([0, 1])
    raise ValueError("Language must be python or java")
# -------------------------------
# BUILD FEATURES FROM CODE
# -------------------------------
def build_features_from_code(code, language):
    """Assemble the single-row feature matrix for one code snippet.

    The snippet is wrapped in a one-row DataFrame and passed to the
    statistical, AST and stylometry extractors. Their flattened outputs are
    joined with the language one-hot vector and the UniXcoder embedding, in
    that order.

    Args:
        code: Source code text to featurise.
        language: Language name, forwarded to the extractors via the
            "Language" column and to ``encode_language``.

    Returns:
        A 2-D NumPy array with exactly one row holding all features.

    Raises:
        ValueError: Propagated from ``encode_language`` for an unsupported
            language.
    """
    frame = pd.DataFrame({"normalized_code": [code], "Language": [language]})

    # Run the three tabular extractors on the same one-row frame.
    stat_frame = extract_stat(frame)
    ast_frame = extract_ast_features(frame)
    style_frame = extract_stylometry_features(frame)

    # The statistical output carries a "language" column that is removed
    # here; the language is added back below as a one-hot vector.
    # NOTE(review): segment order presumably has to match the layout the
    # model was trained on - confirm before reordering.
    segments = [
        stat_frame.drop(columns=["language"]).values.flatten(),
        ast_frame.values.flatten(),
        style_frame.values.flatten(),
    ]
    segments.append(encode_language(language))
    segments.append(get_unixcoder_embedding(code))

    return np.hstack(segments).reshape(1, -1)
# -------------------------------
# BASIC PREDICT FUNCTION
# -------------------------------
def predict_from_features(X_final):
    """Classify one feature row with the module-level ``model``.

    Args:
        X_final: Feature matrix passed straight to ``model.predict`` and
            ``model.predict_proba``; only the first row's result is used.

    Returns:
        A ``(label_name, probability)`` tuple. ``label_name`` is "AI" when
        the predicted class equals 1 and "Human" otherwise. ``probability``
        is the value at index 1 of the first row of ``predict_proba``,
        whichever label was predicted.
    """
    predicted_class = model.predict(X_final)[0]
    class_one_probability = model.predict_proba(X_final)[0][1]

    if predicted_class == 1:
        return "AI", class_one_probability
    return "Human", class_one_probability
# -------------------------------
# INTERACTIVE CLI
# -------------------------------
if __name__ == "__main__":
    # Interactive entry point: read a language and a pasted code snippet,
    # classify it, then print the prediction with its SHAP-based explanation.
    print("\n======================================")
    print(" AI vs Human Code Classification")
    print("======================================")
    language = input("Choose language (python/java): ").strip().lower()

    # Validate up front so an unsupported language is reported before the
    # user goes to the trouble of pasting code (raises ValueError).
    encode_language(language)

    print("\nPaste your code below.")
    print("Type 'END' on a new line when finished.\n")

    lines = []
    while True:
        try:
            line = input()
        except EOFError:
            # stdin was closed without the END sentinel (e.g. piped input);
            # treat that as the end of the snippet instead of crashing.
            break
        if line.strip() == "END":
            break
        lines.append(line)
    code_input = "\n".join(lines)

    # build features
    X_final = build_features_from_code(code_input, language)

    # predict
    label, prob = predict_from_features(X_final)

    # shap
    shap_result = shap_explain(model, X_final)

    # grouping
    grouped = group_shap_explanations(shap_result)

    # text explanation
    text_reason = generate_text_explanation(grouped, label, prob)

    print("\n========== RESULT ==========")
    print("Prediction :", label)
    print("Confidence :", prob)
    print("\nTop SHAP features:")
    # Each entry is read via the keys 'feature_index', 'impact' and
    # 'pushes_toward'.
    for e in shap_result:
        print(f"Feature {e['feature_index']} → {e['impact']} ({e['pushes_toward']})")
    print("\nGrouped SHAP importance:", grouped)
    print("\nExplanation:\n")
    print(text_reason)