import numpy as np import pandas as pd import torch import matplotlib.pyplot as plt import seaborn as sns from transformers import AutoTokenizer, AutoModelForSequenceClassification from sklearn.metrics import ( accuracy_score, f1_score, classification_report, confusion_matrix, ) # ══════════════════════════════════════════════════════════════ # CONFIG # ══════════════════════════════════════════════════════════════ MODEL_DIR = "models/finbert-finetuned" TEST_CSV = "data/test_set.csv" ID2LABEL = {0: "negative", 1: "neutral", 2: "positive"} LABEL2ID = {"negative": 0, "neutral": 1, "positive": 2} MAX_LENGTH = 128 BATCH_SIZE = 32 COLORS = {"negative": "#e74c3c", "neutral": "#95a5a6", "positive": "#2ecc71"} DEVICE = ( "cuda" if torch.cuda.is_available() else "mps" if torch.backends.mps.is_available() else "cpu" ) print(f"Cihaz: {DEVICE}") # ══════════════════════════════════════════════════════════════ # 1. MODEL & TOKENİZER YÜKLE # ══════════════════════════════════════════════════════════════ print("Model yükleniyor...") tokenizer = AutoTokenizer.from_pretrained(MODEL_DIR) model = AutoModelForSequenceClassification.from_pretrained(MODEL_DIR).to(DEVICE) model.eval() # ══════════════════════════════════════════════════════════════ # 2. TAHMİN FONKSİYONU # ══════════════════════════════════════════════════════════════ def predict(texts: list[str]) -> tuple[np.ndarray, np.ndarray]: """Batch tahmin — logits ve predicted label id'leri döner.""" all_preds, all_probs = [], [] for i in range(0, len(texts), BATCH_SIZE): batch = texts[i : i + BATCH_SIZE] enc = tokenizer( batch, padding=True, truncation=True, max_length=MAX_LENGTH, return_tensors="pt", ).to(DEVICE) with torch.no_grad(): logits = model(**enc).logits probs = torch.softmax(logits, dim=-1).cpu().numpy() preds = np.argmax(probs, axis=-1) all_preds.append(preds) all_probs.append(probs) return np.concatenate(all_preds), np.vstack(all_probs) # ══════════════════════════════════════════════════════════════ # 3. TEST SETİ TAHMİNLERİ # ══════════════════════════════════════════════════════════════ df = pd.read_csv(TEST_CSV) df["label"] = df["label_str"].map(LABEL2ID) print(f"Test seti: {len(df)} örnek") preds, probs = predict(df["sentence"].tolist()) df["pred"] = preds df["pred_str"] = df["pred"].map(ID2LABEL) df["confidence"] = probs.max(axis=1) df["correct"] = df["label"] == df["pred"] # ══════════════════════════════════════════════════════════════ # 4. METRİKLER # ══════════════════════════════════════════════════════════════ acc = accuracy_score(df["label"], df["pred"]) f1_macro = f1_score(df["label"], df["pred"], average="macro") f1_weighted= f1_score(df["label"], df["pred"], average="weighted") print("\n" + "="*55) print(" TEST METRİKLERİ") print("="*55) print(f" Accuracy : {acc:.4f}") print(f" F1 Macro : {f1_macro:.4f}") print(f" F1 Weighted : {f1_weighted:.4f}") print("\n--- Classification Report ---") print(classification_report( df["label"], df["pred"], target_names=["negative", "neutral", "positive"] )) # ══════════════════════════════════════════════════════════════ # 5. GÖRSELLEŞTİRME # ══════════════════════════════════════════════════════════════ fig, axes = plt.subplots(1, 3, figsize=(16, 5)) fig.suptitle("Model Değerlendirme — Test Seti", fontweight="bold") # — 5a. Confusion Matrix — cm = confusion_matrix(df["label"], df["pred"]) labels = ["negative", "neutral", "positive"] sns.heatmap( cm, annot=True, fmt="d", cmap="Blues", xticklabels=labels, yticklabels=labels, ax=axes[0], cbar=False, annot_kws={"size": 13, "weight": "bold"}, ) axes[0].set_title("Confusion Matrix") axes[0].set_ylabel("Gerçek") axes[0].set_xlabel("Tahmin") # — 5b. Confidence dağılımı (doğru vs. yanlış) — ax = axes[1] for correct, label, color in [(True, "Doğru", "#2ecc71"), (False, "Yanlış", "#e74c3c")]: subset = df[df["correct"] == correct]["confidence"] ax.hist(subset, bins=20, alpha=0.7, color=color, label=f"{label} ({len(subset)})") ax.set_title("Tahmin Güven Skoru") ax.set_xlabel("Confidence (softmax max)") ax.set_ylabel("Frekans") ax.legend() ax.spines[["top", "right"]].set_visible(False) # — 5c. Sınıf bazında F1 — ax = axes[2] report = classification_report( df["label"], df["pred"], target_names=labels, output_dict=True ) f1s = [report[l]["f1-score"] for l in labels] bars = ax.bar(labels, f1s, color=[COLORS[l] for l in labels], edgecolor="white") for bar, val in zip(bars, f1s): ax.text(bar.get_x() + bar.get_width()/2, bar.get_height() + 0.01, f"{val:.3f}", ha="center", fontweight="bold") ax.set_title("Sınıf Bazında F1 Skoru") ax.set_ylim(0, 1.15) ax.set_ylabel("F1 Score") ax.axhline(y=f1_macro, color="gray", linestyle="--", alpha=0.7, label=f"Macro avg: {f1_macro:.3f}") ax.legend() ax.spines[["top", "right"]].set_visible(False) plt.tight_layout() plt.savefig("data/evaluation_plots.png", bbox_inches="tight") print("Grafik kaydedildi: data/evaluation_plots.png") # ══════════════════════════════════════════════════════════════ # 6. HATA ANALİZİ — modelin yanıldığı örnekler # ══════════════════════════════════════════════════════════════ errors = df[~df["correct"]].sort_values("confidence", ascending=False) print(f"\n{'='*55}") print(f" HATA ANALİZİ — {len(errors)} yanlış tahmin") print(f"{'='*55}") if len(errors) > 0: print(f"\nEn güvenli yanlış tahminler (yüksek confidence ama yanlış):") for _, row in errors.head(5).iterrows(): print(f"\n Cümle : {row['sentence'][:100]}...") print(f" Gerçek : {row['label_str']:<10} Tahmin: {row['pred_str']:<10} Conf: {row['confidence']:.3f}") else: print("Hata yok — mükemmel test performansı!") # Hataları kaydet errors[["sentence","label_str","pred_str","confidence"]].to_csv( "data/errors.csv", index=False ) print(f"\nHatalar kaydedildi: data/errors.csv")