Spaces:
Sleeping
Sleeping
"""
classifier.py
=============
Trains a Random Forest classifier on the dataset of extracted features,
evaluates it and saves it to disk for use in the web application.

Running (training):
    python classifier.py

Usage from another file (prediction):
    from classifier import predict
    result = predict(code="def foo(x): return x", language="python")
    print(result["ai_probability"])  # e.g. 0.73
    print(result["verdict"])         # "Vjerojatno AI"
    print(result["top_features"])    # which features were decisive
"""
| import os | |
| import csv | |
| import pickle | |
| import warnings | |
| warnings.filterwarnings("ignore") | |
| import numpy as np | |
| from sklearn.ensemble import RandomForestClassifier | |
| from sklearn.model_selection import train_test_split, cross_val_score | |
| from sklearn.metrics import ( | |
| classification_report, | |
| confusion_matrix, | |
| roc_auc_score, | |
| precision_recall_curve, | |
| ) | |
| from sklearn.preprocessing import StandardScaler | |
| from sklearn.calibration import CalibratedClassifierCV | |
| from feature_extraction import extract_all_features | |
# ---------------------------------------------------------------------------
# CONFIGURATION
# ---------------------------------------------------------------------------
DATASET_PATH = os.path.join("data", "dataset.csv")
MODEL_DIR = "model"
MODEL_PATH = os.path.join(MODEL_DIR, "classifier.pkl")
SCALER_PATH = os.path.join(MODEL_DIR, "scaler.pkl")
FEATURES_PATH = os.path.join(MODEL_DIR, "feature_names.pkl")

# CSV columns that are NOT used as training features.
IGNORED_COLUMNS = {"label", "source", "detected_language", "model_available"}

# The "perplexity" feature is -1.0 when the language model was not loaded;
# that sentinel is treated as a missing value.
PERPLEXITY_MISSING = -1.0

# Random Forest hyper-parameters (passed verbatim to RandomForestClassifier).
RF_PARAMS = {
    "n_estimators": 300,
    "max_depth": None,
    "min_samples_leaf": 2,
    # "balanced" weights each class inversely to its frequency, so the
    # minority class receives the larger weight automatically.
    # A manual alternative considered by the author was {0: 1.0, 1: 0.8},
    # which makes a mistake on the human class 1.25x more costly than one on
    # the AI class (to penalise flagging an innocent student as AI).
    # NOTE(review): the original note says human is the minority class
    # (29% vs 71%) - verify against the dataset actually used.
    "class_weight": "balanced",
    # Consider only sqrt(n_features) candidates per split for better
    # generalisation.
    "max_features": "sqrt",
    "random_state": 42,
    "n_jobs": -1,
}

# Submissions with fewer non-blank lines than this are considered too small
# for a reliable analysis.
MINIMUM_LINES = 5

# Probability bands - more conservative cut-offs reduce false positives.
# Each value is the lower bound of its band; anything below
# "possible_human" corresponds to "Vjerojatno čovječji".
# NOTE(review): no function visible in this file reads THRESHOLDS.
THRESHOLDS = {
    "likely_ai": 0.80,       # and above -> "Vjerojatno AI"
    "possible_ai": 0.65,     # and above -> "Moguće AI"
    "unclear": 0.45,         # and above -> "Nejasno"
    "possible_human": 0.25,  # and above -> "Moguće čovječji"
}
# ---------------------------------------------------------------------------
# DATASET LOADING
# ---------------------------------------------------------------------------
def ucitaj_dataset(path: str):
    """
    Load dataset.csv and return the feature matrix, labels and column names.

    Every column not listed in IGNORED_COLUMNS is treated as a numeric
    feature. A "perplexity" value equal to PERPLEXITY_MISSING (-1.0) is
    replaced by 0.0 so the classifier does not see the sentinel as a signal.
    Rows whose label or any feature cannot be parsed are skipped entirely.

    Parameters:
        path (str): Path to the CSV file.

    Returns:
        X (np.ndarray): float32 matrix with one row per valid CSV row and
            one column per feature.
        y (np.ndarray): int32 label vector (0=human, 1=ai per the original
            documentation), aligned row-for-row with X.
        feature_names (list): Column names matching the columns of X.

    Raises:
        FileNotFoundError: If ``path`` does not exist.
        ValueError: If the file has no rows or no row could be parsed.
    """
    if not os.path.exists(path):
        raise FileNotFoundError(
            f"Dataset nije pronađen na '{path}'.\n"
            f"Pokreni prvo: python download_dataset.py"
        )

    # newline="" is what the csv module expects so that embedded newlines in
    # quoted fields are handled correctly.
    with open(path, "r", encoding="utf-8", newline="") as f:
        redovi = list(csv.DictReader(f))

    if not redovi:
        raise ValueError("Dataset je prazan.")

    # Feature columns = every named column except the ignored ones.
    # DictReader stores surplus cells under the key None; skip that key.
    feature_names = [
        c for c in redovi[0]
        if c is not None and c not in IGNORED_COLUMNS
    ]

    X_rows = []
    y_list = []
    for row in redovi:
        try:
            label = int(row["label"])
            vrijednosti = []
            for feat in feature_names:
                val = float(row[feat])
                if feat == "perplexity" and val == PERPLEXITY_MISSING:
                    val = 0.0  # perplexity unavailable - must not be a signal
                vrijednosti.append(val)
        except (ValueError, KeyError, TypeError):
            # TypeError covers short rows, where DictReader fills None.
            continue  # skip malformed rows
        # Append label and features together, only once the whole row has
        # parsed, so that X and y can never get out of step.
        X_rows.append(vrijednosti)
        y_list.append(label)

    if not X_rows:
        raise ValueError("Dataset ne sadrži nijedan ispravan redak.")

    X = np.array(X_rows, dtype=np.float32)
    y = np.array(y_list, dtype=np.int32)
    print(f" Učitano {len(y)} primjera, {len(feature_names)} značajki")
    print(f" Human (0): {int(np.sum(y == 0))} | AI (1): {int(np.sum(y == 1))}")
    return X, y, feature_names
# ---------------------------------------------------------------------------
# TRAINING
# ---------------------------------------------------------------------------
def treniraj(X, y, feature_names):
    """
    Train a calibrated Random Forest and evaluate it.

    Pipeline:
        1. Stratified 80% / 20% train/test split.
        2. Standardise features (StandardScaler fitted on the train split).
        3. Fit RandomForestClassifier(**RF_PARAMS) wrapped in
           CalibratedClassifierCV (isotonic, cv=3).
        4. Evaluate on the test split (report, confusion matrix, AUC-ROC).
        5. Pick a decision cut-off that keeps precision on the human class
           at 0.85 or more with recall of 0.30 or more, and pickle it to
           MODEL_DIR/threshold.pkl.
        6. Run a shuffled, stratified 5-fold cross-validation (F1).
        7. Print the ten most important features.

    Parameters:
        X (np.ndarray): Feature matrix.
        y (np.ndarray): Label vector (0=human, 1=ai).
        feature_names (list): Names of the columns of X.

    Returns:
        model: The fitted CalibratedClassifierCV wrapping the Random Forest.
        scaler: The fitted StandardScaler.
        metrics (dict): "auc_roc", "cv_f1_mean", "cv_f1_std" and the four
            confusion-matrix counts.

    Side effects:
        Prints the evaluation to stdout and writes MODEL_DIR/threshold.pkl.
    """
    # 1. Train/test split; stratify=y keeps the class ratio in both parts.
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=0.2, random_state=42, stratify=y
    )
    print(f"\n Trening: {len(y_train)} primjera")
    print(f" Test: {len(y_test)} primjera")

    # 2. Standardisation. The scaler is fitted on the training split ONLY and
    #    then applied to both splits, so no test information leaks in.
    scaler = StandardScaler()
    X_train_scaled = scaler.fit_transform(X_train)
    X_test_scaled = scaler.transform(X_test)

    # 3. Random Forest + probability calibration. Isotonic calibration is the
    #    more flexible method but needs more data (the author assumes > 1000
    #    samples).
    print("\n Treniram Random Forest + kalibriram vjerojatnosti...")
    base_model = RandomForestClassifier(**RF_PARAMS)
    model = CalibratedClassifierCV(base_model, method="isotonic", cv=3)
    model.fit(X_train_scaled, y_train)

    # 4. Evaluation on the held-out split.
    y_pred = model.predict(X_test_scaled)
    y_pred_prob = model.predict_proba(X_test_scaled)[:, 1]  # P(AI)
    print("\n" + "─" * 50)
    print(" REZULTATI EVALUACIJE")
    print("─" * 50)
    print(classification_report(
        y_test, y_pred,
        target_names=["Human (0)", "AI (1)"],
        digits=3
    ))

    # labels=[0, 1] guarantees a 2x2 matrix even if one class is never
    # predicted, so the four-way unpacking below cannot fail.
    cm = confusion_matrix(y_test, y_pred, labels=[0, 1])
    tn, fp, fn, tp = cm.ravel()
    print(f" Matrica zabune:")
    print(f" Ispravno human: {tn} (true negative)")
    print(f" Lažno označen AI: {fp} (false positive)")
    print(f" Propušten AI: {fn} (false negative)")
    print(f" Ispravno AI: {tp} (true positive)\n")

    # AUC-ROC: ranking quality (0.5 = random, 1.0 = perfect).
    auc = roc_auc_score(y_test, y_pred_prob)
    print(f" AUC-ROC: {auc:.4f}")

    # 5. Decision cut-off favouring few false positives.
    #    With pos_label=0 the score passed in must be the score of class 0,
    #    i.e. P(human) = 1 - P(AI). The returned thresholds are therefore on
    #    P(human), and the equivalent AI cut-off is 1 - threshold.
    #    NOTE(review): the cut-off is tuned on the same split used for the
    #    evaluation above, so it is optimistic.
    human_prob = 1.0 - y_pred_prob
    precisions, recalls, thresholds = precision_recall_curve(
        y_test, human_prob, pos_label=0
    )
    optimal_threshold = 0.5  # fallback when no threshold qualifies
    # Thresholds are increasing, so the first hit is the one with the
    # highest human recall that still meets the precision requirement.
    for prec, rec, thr in zip(precisions, recalls, thresholds):
        if prec >= 0.85 and rec >= 0.30:
            optimal_threshold = thr
            break
    print(f"\n Optimalni prag odluke za AI klasu: {1 - optimal_threshold:.2f}")
    print(f" (Prag ispod kojeg klasificiramo kao Human)")

    # Persist the AI cut-off next to the model files.
    threshold_path = os.path.join(MODEL_DIR, "threshold.pkl")
    os.makedirs(MODEL_DIR, exist_ok=True)
    with open(threshold_path, "wb") as f_thr:
        pickle.dump(float(1 - optimal_threshold), f_thr)

    # 6. Shuffled, stratified cross-validation so that no fold consists of a
    #    single data source only.
    from sklearn.model_selection import StratifiedKFold
    print("\n 5-fold cross-validation (može potrajati minutu)...")
    X_scaled_full = scaler.transform(X)
    skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
    cv_scores = cross_val_score(
        model, X_scaled_full, y,
        cv=skf, scoring="f1", n_jobs=-1
    )
    print(f" CV F1 scores: {[f'{s:.3f}' for s in cv_scores]}")
    print(f" CV F1 prosjek: {cv_scores.mean():.3f} "
          f"(±{cv_scores.std():.3f})")

    # 7. Top-10 feature importances. CalibratedClassifierCV wraps the forest,
    #    so the importances are read from the first per-fold fitted forest.
    try:
        calibrated = model.calibrated_classifiers_[0]
        base_rf = getattr(calibrated, "estimator", None)
        if base_rf is None:
            # Older scikit-learn releases named the attribute base_estimator.
            base_rf = calibrated.base_estimator
        importances = base_rf.feature_importances_
    except (AttributeError, IndexError):
        # Unexpected wrapper layout: report zeros rather than fail training.
        importances = np.zeros(len(feature_names))
    top_idx = np.argsort(importances)[::-1][:10]
    print("\n Top 10 najvažnijih značajki:")
    for rank, idx in enumerate(top_idx, 1):
        print(f" {rank:2}. {feature_names[idx]:<38} {importances[idx]:.4f}")

    metrics = {
        "auc_roc": auc,
        "cv_f1_mean": cv_scores.mean(),
        "cv_f1_std": cv_scores.std(),
        "true_negative": int(tn),
        "false_positive": int(fp),
        "false_negative": int(fn),
        "true_positive": int(tp),
    }
    return model, scaler, metrics
# ---------------------------------------------------------------------------
# SAVING THE MODEL
# ---------------------------------------------------------------------------
def spremi_model(model, scaler, feature_names):
    """
    Pickle the trained model, the scaler and the feature-name list to disk.

    All three files are needed for prediction:
        - model         : makes the decision
        - scaler        : normalises input the same way as during training
        - feature_names : keeps the features in the training-time order

    If MODEL_DIR/threshold.pkl already exists, its value is printed as well;
    this function does not write that file.

    Parameters:
        model: The trained classifier.
        scaler: The fitted StandardScaler.
        feature_names: List of feature names.
    """
    os.makedirs(MODEL_DIR, exist_ok=True)
    threshold_path = os.path.join(MODEL_DIR, "threshold.pkl")

    artefacts = (
        (MODEL_PATH, model),
        (SCALER_PATH, scaler),
        (FEATURES_PATH, feature_names),
    )
    for target, obj in artefacts:
        with open(target, "wb") as f:
            pickle.dump(obj, f)

    print(f"\n Model spremljen u: {MODEL_PATH}")
    print(f" Scaler spremljen u: {SCALER_PATH}")
    print(f" Nazivi značajki spremljeni: {FEATURES_PATH}")

    if os.path.exists(threshold_path):
        with open(threshold_path, "rb") as f:
            thr = pickle.load(f)
        # Only format the value as a number when it actually is one, so a
        # stale or foreign file cannot abort the save step.
        if isinstance(thr, (int, float)):
            print(f" Optimalni prag: {thr:.2f}")
# ---------------------------------------------------------------------------
# PREDICTION - used by the web application
# ---------------------------------------------------------------------------
def ucitaj_model():
    """
    Load the model, scaler, feature names and decision threshold from disk.

    Intended to be called once when the web server starts.

    Returns:
        (model, scaler, feature_names, threshold), or
        (None, None, None, 0.65) if any of the three model files is missing.
        ``threshold`` falls back to 0.65 when threshold.pkl is absent or
        unreadable.
    """
    default_threshold = 0.65  # conservative default
    threshold_path = os.path.join(MODEL_DIR, "threshold.pkl")

    required = [MODEL_PATH, SCALER_PATH, FEATURES_PATH]
    if not all(os.path.exists(p) for p in required):
        return None, None, None, default_threshold

    # SECURITY NOTE: pickle executes arbitrary code while loading. These
    # files must only ever be ones produced by this project's own training.
    with open(MODEL_PATH, "rb") as f:
        model = pickle.load(f)
    with open(SCALER_PATH, "rb") as f:
        scaler = pickle.load(f)
    with open(FEATURES_PATH, "rb") as f:
        feature_names = pickle.load(f)

    threshold = default_threshold
    if os.path.exists(threshold_path):
        try:
            with open(threshold_path, "rb") as f:
                threshold = float(pickle.load(f))
        except (pickle.UnpicklingError, EOFError, TypeError, ValueError):
            # The threshold is optional: a damaged file must not prevent the
            # model itself from being used.
            threshold = default_threshold
    return model, scaler, feature_names, threshold
# ---------------------------------------------------------------------------
# EXPLANATION GENERATION
# ---------------------------------------------------------------------------
def generate_explanations(features: dict, ai_prob: float) -> list:
    """
    Build English explanations of why code looks AI-generated or human.

    Each explanation is a dict with:
        "text"     - the explanation sentence(s)
        "severity" - "high" | "medium" | "low" | "positive"
                     (no rule in this function currently emits "low")
        "feature"  - name of the feature the explanation refers to

    Per the original author, the cut-offs were calibrated on typical values
    in the AI-Detector and HMCorp datasets.

    Parameters:
        features (dict): Feature dict as produced by extract_all_features().
            Missing keys fall back to neutral defaults.
        ai_prob (float): AI probability (0.0 - 1.0); only used to choose the
            neutral fallback message when no rule fires.

    Returns:
        list: Explanation dicts ordered high -> medium -> low -> positive;
        within one severity the rule order below is preserved.
    """
    objasnjenja = []

    def dodaj(text, severity, feature):
        objasnjenja.append({"text": text, "severity": severity, "feature": feature})

    # Estimated number of functions. The structural features are densities
    # rather than absolute counts, so the count is approximated from
    # function_density * total_lines (at least 1).
    fn_density = features.get("function_density", 0)
    total_lines = features.get("total_lines", 1)
    num_fns_est = max(1, round(fn_density * total_lines))
    # Prefer an explicit count when the extractor supplies one; otherwise use
    # the estimate so the single-character-name rule below is not dead code.
    num_functions = features.get("num_functions", num_fns_est)

    # -- NAMING -------------------------------------------------------------
    avg_id_len = features.get("avg_identifier_length", 0)
    if avg_id_len > 7.5:
        dodaj(
            f"Identifier names are unusually long and descriptive "
            f"(average {avg_id_len:.1f} characters). "
            f"AI-generated code consistently favours verbose, self-documenting names "
            f"such as 'calculate_average_value' over typical student shorthand like 'avg'.",
            "high", "avg_identifier_length"
        )
    elif avg_id_len > 5.5:
        dodaj(
            f"Identifier names are moderately long (average {avg_id_len:.1f} characters), "
            f"which is slightly above the typical range for human-written student code.",
            "medium", "avg_identifier_length"
        )
    elif 0 < avg_id_len < 2.5:
        dodaj(
            f"Identifier names are very short (average {avg_id_len:.1f} characters), "
            f"consistent with a human programmer's preference for concise variable names.",
            "positive", "avg_identifier_length"
        )

    naming_cons = features.get("naming_consistency", 0)
    if naming_cons > 0.85:
        dodaj(
            f"Naming convention is highly consistent throughout the submission "
            f"({naming_cons * 100:.0f}% of identifiers follow the same pattern). "
            f"Human programmers typically mix conventions, especially in longer submissions.",
            "high", "naming_consistency"
        )
    elif 0 < naming_cons < 0.3:
        dodaj(
            f"Naming convention varies across the submission, which is characteristic "
            f"of code written incrementally by a human programmer.",
            "positive", "naming_consistency"
        )

    single_char = features.get("single_char_name_ratio", 0)
    if single_char < 0.03 and num_functions > 1:
        dodaj(
            f"No single-character variable names were detected. "
            f"Human programmers routinely use short names such as 'i', 'x', or 'n' "
            f"in loops and helper functions; their absence is atypical.",
            "medium", "single_char_name_ratio"
        )
    elif single_char > 0.25:
        dodaj(
            f"A notable proportion of variables use single-character names "
            f"({single_char * 100:.0f}%), which is common in human-written code.",
            "positive", "single_char_name_ratio"
        )

    # -- COMMENTS AND DOCSTRINGS -------------------------------------------
    comment_ratio = features.get("comment_ratio", 0)
    if comment_ratio > 0.30:
        dodaj(
            f"Comment density is substantially above average — "
            f"{comment_ratio * 100:.0f}% of lines contain inline comments. "
            f"AI models tend to annotate nearly every logical step, "
            f"whereas students typically comment only non-obvious sections.",
            "high", "comment_ratio"
        )
    elif comment_ratio > 0.18:
        dodaj(
            f"Comment density ({comment_ratio * 100:.0f}% of lines) is higher than "
            f"typically observed in student submissions at this level.",
            "medium", "comment_ratio"
        )
    elif comment_ratio < 0.03:
        # Also fires when the key is absent (default 0).
        dodaj(
            f"Very few or no inline comments are present, which is more consistent "
            f"with human-written code at this stage of the course.",
            "positive", "comment_ratio"
        )

    num_docs = features.get("num_docstrings", 0)
    if num_docs > 0:
        doc_coverage = num_docs / num_fns_est
        if num_docs >= 3 and doc_coverage >= 0.8:
            dodaj(
                f"Every function in the submission includes a formal docstring "
                f"({num_docs} docstrings detected). "
                f"Complete docstring coverage is a strong marker of AI-generated code; "
                f"students rarely document all functions unless explicitly required.",
                "high", "num_docstrings"
            )
        elif num_docs >= 2:
            dodaj(
                f"Multiple functions include docstrings ({num_docs} detected), "
                f"which is above the typical student average.",
                "medium", "num_docstrings"
            )

    # -- STRUCTURAL FEATURES -----------------------------------------------
    avg_fn_len = features.get("avg_function_length", 0)
    if avg_fn_len > 20:
        dodaj(
            f"Functions are notably long on average ({avg_fn_len:.0f} lines per function). "
            f"AI models tend to produce complete, self-contained implementations; "
            f"students more often break logic across multiple smaller functions "
            f"or leave parts incomplete.",
            "medium", "avg_function_length"
        )
    elif 0 < avg_fn_len < 6:
        dodaj(
            f"Functions are concise on average ({avg_fn_len:.1f} lines), "
            f"which is consistent with a human programmer's incremental coding style.",
            "positive", "avg_function_length"
        )

    try_density = features.get("try_density", 0)
    if try_density > 0.06:
        dodaj(
            f"The submission contains a relatively high density of try/except blocks. "
            f"Comprehensive error handling across all edge cases is a pattern "
            f"commonly exhibited by AI generators, which anticipate and handle "
            f"exceptions that students typically overlook.",
            "medium", "try_density"
        )

    nesting = features.get("max_nesting_depth", 0)
    if nesting > 5:
        dodaj(
            f"Code nesting reaches a depth of {int(nesting)} levels. "
            f"While not conclusive, deeply nested logic can reflect an AI model's "
            f"tendency to handle all conditional branches explicitly.",
            "medium", "max_nesting_depth"
        )

    # -- STATISTICAL ANALYSIS ----------------------------------------------
    token_entropy = features.get("token_entropy", 0)
    if 0 < token_entropy < 3.8:
        dodaj(
            f"Token entropy is low ({token_entropy:.2f}), indicating that the vocabulary "
            f"of the submission is repetitive and predictable. "
            f"This is consistent with language model output, which tends to reuse "
            f"the same phrasing and structural patterns.",
            "high", "token_entropy"
        )
    elif token_entropy > 5.5:
        dodaj(
            f"Token entropy is relatively high ({token_entropy:.2f}), suggesting "
            f"a diverse and varied vocabulary more typical of human authorship.",
            "positive", "token_entropy"
        )

    # -1 marks "perplexity unavailable"; only positive values are meaningful.
    perplexity = features.get("perplexity", -1)
    if perplexity > 0:
        if perplexity < 8:
            dodaj(
                f"The code's perplexity score is very low ({perplexity:.1f}), meaning "
                f"a language model finds the token sequence highly predictable. "
                f"This strongly suggests the code was generated by a similar model.",
                "high", "perplexity"
            )
        elif perplexity < 20:
            dodaj(
                f"Perplexity ({perplexity:.1f}) falls within a range that is "
                f"moderately consistent with AI-generated code.",
                "medium", "perplexity"
            )
        elif perplexity > 50:
            dodaj(
                f"Perplexity is high ({perplexity:.1f}), indicating the code "
                f"contains patterns that a language model would consider unexpected — "
                f"a characteristic of human authorship.",
                "positive", "perplexity"
            )

    # -- FORMATTING --------------------------------------------------------
    trailing = features.get("trailing_whitespace_ratio", 0)
    if trailing > 0.15:
        dodaj(
            f"A notable proportion of lines contain trailing whitespace "
            f"({trailing * 100:.0f}%), which is typical of code edited by hand "
            f"and inconsistent with AI-generated output.",
            "positive", "trailing_whitespace_ratio"
        )

    op_cons = features.get("operator_spacing_consistency", 0)
    if op_cons > 0.95:
        dodaj(
            f"Spacing around operators is perfectly consistent throughout the submission. "
            f"AI models apply style conventions uniformly; human programmers "
            f"occasionally deviate, particularly under time pressure.",
            "medium", "operator_spacing_consistency"
        )

    # No rule fired: add one neutral message that matches the verdict side.
    if not objasnjenja:
        if ai_prob > 0.5:
            dodaj(
                "No single dominant signal was identified; the classification is based "
                "on a combination of subtle stylistic and structural patterns.",
                "medium", "combined"
            )
        else:
            dodaj(
                "No strong AI-generation markers were detected. "
                "The submission's style and structure are consistent with human authorship.",
                "positive", "combined"
            )

    # Order: high -> medium -> low -> positive (stable within a severity).
    priority = {"high": 0, "medium": 1, "low": 2, "positive": 3}
    objasnjenja.sort(key=lambda x: priority.get(x["severity"], 2))
    return objasnjenja
def predict(code: str, language: str = None, filename: str = None,
            model=None, scaler=None, feature_names=None, threshold: float = None) -> dict:
    """
    Analyse a code snippet and estimate the probability that it is AI-written.

    If ``model`` is not supplied, the model, scaler, feature names and
    threshold are loaded from disk via ucitaj_model().

    Parameters:
        code (str): Source code to analyse.
        language (str): Programming language (optional; forwarded to
            extract_all_features).
        filename (str): File name (optional; forwarded to
            extract_all_features).
        model: Already-loaded model (optional, for repeated use).
        scaler: Already-loaded scaler (required when ``model`` is given).
        feature_names (list): Feature names in training order (required
            when ``model`` is given).
        threshold (float): AI decision cut-off. When None, the value loaded
            from disk is used, or 0.65 if ``model`` was passed directly.

    Returns:
        dict with keys:
            "ai_probability"    - float 0.0-1.0, or None on error
            "verdict"           - human-readable interpretation
            "detected_language" - language reported by the feature extractor
            "top_features"      - list of dicts (name, value, importance)
                                  for the 5 most important features
            "all_features"      - dict of all extracted features
            "explanations"      - list from generate_explanations()
                                  (empty on error)
            "error"             - error message, or None if everything is OK
    """
    # Load the model when it was not supplied. An explicitly passed
    # threshold always wins over the stored or default one.
    if model is None:
        model, scaler, feature_names, loaded_threshold = ucitaj_model()
        if threshold is None:
            threshold = loaded_threshold
    elif threshold is None:
        threshold = 0.65  # conservative default when the model is passed in

    if model is None:
        return {
            "ai_probability": None,
            "verdict": "Model nije dostupan",
            "detected_language": None,
            "top_features": [],
            "all_features": {},
            "explanations": [],
            "error": "Model nije treniran. Pokreni: python classifier.py"
        }

    # Minimum-length check: short snippets carry too little signal and are
    # prone to false positives. None is treated like empty input.
    code = code or ""
    meaningful_lines = sum(1 for line in code.splitlines() if line.strip())
    if meaningful_lines < MINIMUM_LINES:
        # Croatian plural of "linija" after a number:
        # 1, 21, ... -> liniju; 2-4, 22-24, ... -> linije; otherwise linija.
        n10, n100 = meaningful_lines % 10, meaningful_lines % 100
        if n10 == 1 and n100 != 11:
            oblik = "liniju"
        elif 2 <= n10 <= 4 and not 12 <= n100 <= 14:
            oblik = "linije"
        else:
            oblik = "linija"
        return {
            "ai_probability": None,
            "verdict": "Premalo koda za analizu",
            "detected_language": None,
            "top_features": [],
            "all_features": {},
            "explanations": [],
            "error": (
                f"Analiza zahtijeva najmanje {MINIMUM_LINES} nepraznih linija koda. "
                f"Predani isječak ima {meaningful_lines} "
                f"({oblik})."
            )
        }

    # Extract the features.
    sve_znacajke = extract_all_features(
        code=code, language=language, filename=filename
    )

    # Assemble the vector in EXACTLY the training-time column order; missing
    # features default to 0.0 and the perplexity sentinel is neutralised the
    # same way as when the dataset is loaded.
    feature_vector = []
    for feat in feature_names:
        val = sve_znacajke.get(feat, 0.0)
        if feat == "perplexity" and val == PERPLEXITY_MISSING:
            val = 0.0
        feature_vector.append(float(val))
    X = np.array([feature_vector], dtype=np.float32)
    X_scaled = scaler.transform(X)

    # Probability of the AI class (column 1).
    ai_prob = float(model.predict_proba(X_scaled)[0][1])

    # Verdict bands are laid out relative to the AI cut-off.
    ai_cutoff = threshold
    if ai_prob >= min(ai_cutoff + 0.15, 0.90):
        verdict = "Vjerojatno AI"
    elif ai_prob >= ai_cutoff:
        verdict = "Moguće AI"
    elif ai_prob >= ai_cutoff - 0.20:
        verdict = "Nejasno"
    elif ai_prob >= ai_cutoff - 0.40:
        verdict = "Moguće čovječji"
    else:
        verdict = "Vjerojatno čovječji"

    # Top-5 features by global importance. The importances come from the
    # first per-fold forest inside CalibratedClassifierCV; they are NOT
    # per-sample contributions.
    try:
        calibrated = model.calibrated_classifiers_[0]
        base_rf = getattr(calibrated, "estimator", None)
        if base_rf is None:
            # Older scikit-learn releases named the attribute base_estimator.
            base_rf = calibrated.base_estimator
        importances = base_rf.feature_importances_
    except (AttributeError, IndexError):
        # Model without that layout: fall back to uniform importances.
        importances = np.ones(len(feature_names)) / len(feature_names)
    top_idx = np.argsort(importances)[::-1][:5]
    top_features = [
        {
            "name": feature_names[i],
            "value": round(feature_vector[i], 4),
            "importance": round(float(importances[i]), 4),
        }
        for i in top_idx
    ]

    # Human-readable reasons for the classification.
    objasnjenja = generate_explanations(sve_znacajke, ai_prob)

    return {
        "ai_probability": round(ai_prob, 4),
        "verdict": verdict,
        "detected_language": sve_znacajke.get("detected_language", "nepoznat"),
        "top_features": top_features,
        "all_features": sve_znacajke,
        "explanations": objasnjenja,
        "error": None,
    }
# ---------------------------------------------------------------------------
# MAIN PROGRAM
# ---------------------------------------------------------------------------
def main():
    """
    Train, save and smoke-test the classifier.

    Steps: load DATASET_PATH, undersample the human class to at most three
    times the number of AI examples, train and evaluate (treniraj), save the
    artefacts (spremi_model) and run predict() on two built-in sample
    snippets, printing the results.

    Raises:
        ValueError: If the dataset lacks examples of either class.
    """
    print("=" * 50)
    print(" Treniranje klasifikatora")
    print("=" * 50)

    # Load the dataset.
    print(f"\n Učitavam dataset iz '{DATASET_PATH}'...")
    X, y, feature_names = ucitaj_dataset(DATASET_PATH)

    # -- UNDERSAMPLING -----------------------------------------------------
    # Keep at most 3x as many human examples as AI examples. Per the original
    # note, with a heavily skewed ratio (63:1) the model almost always
    # predicts human; the target is human ~ 2-3x AI.
    n_ai = int(np.sum(y == 1))
    n_human = int(np.sum(y == 0))
    if n_ai == 0 or n_human == 0:
        # Without both classes the ratio below divides by zero and the
        # stratified training cannot work anyway.
        raise ValueError(
            f"Dataset mora sadržavati obje klase "
            f"(Human={n_human}, AI={n_ai})."
        )
    target_human = min(n_human, n_ai * 3)  # at most 3x more human than AI
    if n_human > target_human:
        print(f"\n Undersampling: {n_human} → {target_human} human primjera")
        print(f" (zadržavamo svih {n_ai} AI + {target_human} human = "
              f"{n_ai + target_human} ukupno, omjer {target_human//n_ai}:1)")
        rng = np.random.default_rng(42)
        human_idx = np.where(y == 0)[0]
        ai_idx = np.where(y == 1)[0]
        # Randomly pick target_human examples from the human class.
        chosen_human = rng.choice(human_idx, size=target_human, replace=False)
        # Combine with all AI examples and shuffle.
        all_idx = np.concatenate([chosen_human, ai_idx])
        rng.shuffle(all_idx)
        X = X[all_idx]
        y = y[all_idx]
        print(f" Nakon undersamplinga: Human={int(np.sum(y==0))}, "
              f"AI={int(np.sum(y==1))}, Ukupno={len(y)}")
    # ----------------------------------------------------------------------

    # Train.
    model, scaler, metrics = treniraj(X, y, feature_names)

    # Save.
    spremi_model(model, scaler, feature_names)

    # Quick prediction smoke test.
    print("\n" + "─" * 50)
    print(" BRZI TEST PREDIKCIJE")
    print("─" * 50)
    test_kodovi = {
        "AI Python": '''
def calculate_fibonacci(n: int) -> list:
    """
    Generate a Fibonacci sequence up to n terms.
    Args:
        n: The number of terms to generate.
    Returns:
        A list containing the Fibonacci sequence.
    """
    if n <= 0:
        raise ValueError("Number of terms must be positive.")
    fibonacci_sequence = [0, 1]
    for i in range(2, n):
        next_value = fibonacci_sequence[i - 1] + fibonacci_sequence[i - 2]
        fibonacci_sequence.append(next_value)
    return fibonacci_sequence[:n]
''',
        "Human Python": '''
def fib(n):
    # quick fib
    a, b = 0, 1
    res = []
    for _ in range(n):
        res.append(a)
        a, b = b, a+b
    return res
''',
    }
    for naziv, kod in test_kodovi.items():
        rezultat = predict(kod, model=model, scaler=scaler,
                           feature_names=feature_names)
        print(f"\n [{naziv}]")
        if rezultat["error"] is not None:
            # predict() reports problems through "error" with a None
            # probability, which cannot be formatted as a percentage.
            print(f" Greška: {rezultat['error']}")
            continue
        prob = rezultat["ai_probability"]
        verdict = rezultat["verdict"]
        lang = rezultat["detected_language"]
        print(f" Jezik: {lang}")
        print(f" AI vjerojatnost: {prob:.1%}")
        print(f" Zaključak: {verdict}")
        print(f" Ključne značajke:")
        for feat in rezultat["top_features"]:
            print(f" {feat['name']:<35} vrijednost={feat['value']:.4f}")

    print("\n" + "=" * 50)
    print(" Treniranje završeno.")
    print(" Sljedeći korak: python app.py")
    print("=" * 50)


if __name__ == "__main__":
    main()