Spaces:
Sleeping
Sleeping
| import pandas as pd | |
| import numpy as np | |
| import matplotlib.pyplot as plt | |
| import matplotlib.patches as mpatches | |
| import seaborn as sns | |
# -- General settings ----------------------------------------------
# Apply both matplotlib defaults in a single call.
plt.rcParams.update({"figure.dpi": 120, "font.size": 11})

# One fixed color per sentiment label; every plot below looks colors up here.
COLORS = dict(negative="#e74c3c", neutral="#95a5a6", positive="#2ecc71")

# Load the dataset; later sections read its "sentence" and "label_str" columns.
df = pd.read_csv("data/financial_phrasebank.csv")
# ------------------------------------------------------------------
# 1. BASIC STATISTICS
# ------------------------------------------------------------------
# Prints the row count, column names, total number of missing cells and a
# text histogram of the label distribution. `counts` is reused by later
# sections (bar chart and class-imbalance report).
n_rows = len(df)

print("=" * 55)
print(" DATASET GENEL BAKIŞ")
print("=" * 55)
print(f"Toplam örnek : {n_rows}")
print(f"Sütunlar : {list(df.columns)}")
print(f"Eksik değer : {df.isnull().sum().sum()}")
print()

counts = df["label_str"].value_counts()
print("Label dağılımı:")
for label, count in counts.items():
    pct = count / n_rows * 100
    # One block character per two percentage points.
    bar = "█" * int(pct / 2)
    print(f" {label:<10} {count:>5} ({pct:5.1f}%) {bar}")
# ------------------------------------------------------------------
# 2. TEXT LENGTH ANALYSIS
# ------------------------------------------------------------------
# Adds three length columns to `df` (characters, whitespace-separated words,
# and a rough token estimate), then prints per-label word-count statistics.

# Treat missing sentences as empty text: otherwise str.len() yields NaN and
# the integer token estimate below cannot be computed.
sentences = df["sentence"].fillna("").astype(str)
df["char_count"] = sentences.str.len()
df["word_count"] = sentences.str.split().str.len()
df["token_approx"] = df["char_count"] // 4  # rough token estimate (~4 chars per token)

print()
print("=" * 55)
print(" METİN UZUNLUĞU (kelime sayısı)")
print("=" * 55)
stats = df.groupby("label_str")["word_count"].describe()[["mean", "min", "50%", "max"]]
print(stats.round(1).to_string())
print()

# Number of sentences whose estimated token count exceeds the 512 limit.
bert_token_limit = 512
over_limit = (df["token_approx"] > bert_token_limit).sum()
print(f"{bert_token_limit} token'ı aşan cümle (BERT limiti): {over_limit} adet")
# ------------------------------------------------------------------
# 3. VISUALIZATION
# ------------------------------------------------------------------
# Draws three side-by-side panels (label bar chart, word-count histogram,
# word-count boxplot) and saves them to data/eda_plots.png.

# Fixed plotting order so the histogram legend and boxplot columns agree.
label_order = ["negative", "neutral", "positive"]
# Grey used for any label in the data that has no entry in COLORS.
fallback_color = "#7f8c8d"

fig, axes = plt.subplots(1, 3, figsize=(15, 4))
fig.suptitle("Financial Phrasebank — EDA", fontweight="bold", fontsize=13)

# -- 3a. Label distribution (bar chart) --
ax = axes[0]
bars = ax.bar(
    counts.index,
    counts.values,
    color=[COLORS.get(name, fallback_color) for name in counts.index],
    edgecolor="white",
    linewidth=1.5,
)
ax.set_title("Label Dağılımı")
ax.set_ylabel("Örnek Sayısı")
# bar_label pads in points rather than data units, so the value labels stay
# just above the bars regardless of how large or small the counts are.
ax.bar_label(
    bars,
    labels=[str(value) for value in counts.values],
    padding=3,
    fontweight="bold",
)
ax.set_ylim(0, counts.max() * 1.15)
ax.spines[["top", "right"]].set_visible(False)

# -- 3b. Word-count distribution (histogram, colored by label) --
ax = axes[1]
for name in label_order:
    word_counts = df.loc[df["label_str"] == name, "word_count"]
    ax.hist(word_counts, bins=30, alpha=0.6, color=COLORS[name],
            label=name, edgecolor="none")
ax.set_title("Kelime Sayısı Dağılımı")
ax.set_xlabel("Kelime Sayısı")
ax.set_ylabel("Frekans")
ax.legend()
ax.spines[["top", "right"]].set_visible(False)

# -- 3c. Boxplot: sentence length per label --
ax = axes[2]
data_to_plot = [
    df.loc[df["label_str"] == name, "word_count"].values
    for name in label_order
]
bp = ax.boxplot(data_to_plot, patch_artist=True, notch=False,
                medianprops=dict(color="white", linewidth=2))
# patch_artist=True makes the boxes fillable, so each one gets its label color.
for patch, name in zip(bp["boxes"], label_order):
    patch.set_facecolor(COLORS[name])
ax.set_xticklabels(label_order)
ax.set_title("Kelime Sayısı — Boxplot")
ax.set_ylabel("Kelime Sayısı")
ax.spines[["top", "right"]].set_visible(False)

plt.tight_layout()
plt.savefig("data/eda_plots.png", bbox_inches="tight")
print()
print("Grafik kaydedildi: data/eda_plots.png")
# ------------------------------------------------------------------
# 4. SAMPLE SENTENCES - two from each class
# ------------------------------------------------------------------
# Prints up to two reproducibly sampled sentences per label, truncated to
# 120 characters.
print()
print("=" * 55)
print(" ÖRNEK CÜMLELER")
print("=" * 55)
for label in ("negative", "neutral", "positive"):
    print(f"\n[ {label.upper()} ]")
    # Drop missing sentences so the slicing below always sees text.
    pool = df.loc[df["label_str"] == label, "sentence"].dropna()
    # Cap the sample size: asking for 2 rows from a smaller pool raises ValueError.
    samples = pool.sample(min(2, len(pool)), random_state=42)
    for i, s in enumerate(samples, 1):
        s = str(s)
        suffix = "..." if len(s) > 120 else ""
        print(f" {i}. {s[:120]}{suffix}")
# ------------------------------------------------------------------
# 5. CLASS IMBALANCE - what should we do?
# ------------------------------------------------------------------
# Reports, for each label, how many times larger the most frequent class is
# (majority count divided by that label's count).
majority = counts.max()
print()
print("=" * 55)
print(" CLASS IMBALANCE ANALİZİ")
print("=" * 55)
for label, count in counts.items():
    ratio = majority / count
    print(f" {label:<10} imbalance ratio: {ratio:.2f}x")
print("""
Strateji: Fine-tuning sırasında class_weight='balanced'
veya WeightedRandomSampler kullanacağız. (Adım 3'te ele alacağız)
""")