# File-viewer residue: Spaces: Sleeping | File size: 6,022 Bytes | 7701077
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import matplotlib.patches as mpatches
import seaborn as sns
# -- General settings ----------------------------------------------
# Rendering defaults applied to every figure drawn below.
plt.rcParams.update({"figure.dpi": 120, "font.size": 11})

# One fixed colour per sentiment label, reused by all plots.
COLORS = dict(negative="#e74c3c", neutral="#95a5a6", positive="#2ecc71")

# Load the dataset; later sections read its `sentence` and `label_str` columns.
df = pd.read_csv("data/financial_phrasebank.csv")
# ------------------------------------------------------------------
# 1. BASIC STATISTICS
# ------------------------------------------------------------------
# Console overview: row count, column names and total number of missing cells.
total_rows = len(df)
print("=" * 55)
print(" DATASET GENEL BAKIŞ")
print("=" * 55)
print(f"Toplam örnek : {total_rows}")
print(f"Sütunlar : {list(df.columns)}")
print(f"Eksik değer : {df.isnull().sum().sum()}")
print()

# Rows per label; `counts` is reused by the plotting and imbalance sections.
counts = df["label_str"].value_counts()
print("Label dağılımı:")
for label, count in counts.items():
    pct = count / total_rows * 100
    bar = "█" * int(pct / 2)  # text bar: one block per 2 percentage points
    print(f" {label:<10} {count:>5} ({pct:5.1f}%) {bar}")
# ------------------------------------------------------------------
# 2. TEXT LENGTH ANALYSIS
# ------------------------------------------------------------------
# Per-sentence length features; a missing sentence yields NaN in both counts.
df["char_count"] = df["sentence"].str.len()
df["word_count"] = df["sentence"].str.split().str.len()
# Rough token estimate (characters / 4). NaN lengths are mapped to 0 first,
# because casting NaN to int raises.
df["token_approx"] = (df["char_count"] / 4).fillna(0).astype(int)

print()
print("=" * 55)
print(" METİN UZUNLUĞU (kelime sayısı)")
print("=" * 55)
# Word-count summary per label: mean, minimum, median ("50%") and maximum.
stats = df.groupby("label_str")["word_count"].describe()[["mean", "min", "50%", "max"]]
print(stats.round(1).to_string())
print()
# Number of sentences whose estimated token count exceeds 512 (the BERT limit
# named in the message below).
over_limit = (df["token_approx"] > 512).sum()
print(f"512 token'ı aşan cümle (BERT limiti): {over_limit} adet")
# ------------------------------------------------------------------
# 3. VISUALISATION
# ------------------------------------------------------------------
# One row of three panels; each panel is filled in by its own section below.
fig, axes = plt.subplots(1, 3, figsize=(15, 4))
fig.suptitle("Financial Phrasebank — EDA", fontweight="bold", fontsize=13)

# -- 3a. Label distribution (bar chart) --
ax = axes[0]
bars = ax.bar(
    counts.index,
    counts.values,
    color=[COLORS[name] for name in counts.index],
    edgecolor="white",
    linewidth=1.5,
)
ax.set_title("Label Dağılımı")
ax.set_ylabel("Örnek Sayısı")
# Annotate each bar with its count. The padding is expressed in points, so the
# text sits just above the bar no matter how large or small the counts are
# (a fixed offset in data units would not scale with the dataset).
ax.bar_label(
    bars,
    labels=[str(value) for value in counts.values],
    padding=3,
    fontweight="bold",
)
ax.set_ylim(0, counts.max() * 1.15)  # leave headroom for the count labels
ax.spines[["top", "right"]].set_visible(False)
# -- 3b. Word-count distribution (histogram, coloured by label) --
ax = axes[1]
# Compute the 30 bin edges once over the whole column so the three overlaid
# histograms share identical bins and their bar heights are directly comparable.
bin_edges = np.histogram_bin_edges(df["word_count"].dropna(), bins=30)
for label in ["negative", "neutral", "positive"]:
    subset = df.loc[df["label_str"] == label, "word_count"].dropna()
    ax.hist(
        subset,
        bins=bin_edges,
        alpha=0.6,  # translucent so overlapping classes stay visible
        color=COLORS[label],
        label=label,
        edgecolor="none",
    )
ax.set_title("Kelime Sayısı Dağılımı")
ax.set_xlabel("Kelime Sayısı")
ax.set_ylabel("Frekans")
ax.legend()
ax.spines[["top", "right"]].set_visible(False)
# -- 3c. Boxplot: word count per label --
ax = axes[2]
# Single source of truth for the box order, the box colours and the tick labels.
class_order = ["negative", "neutral", "positive"]
# NaN word counts (from missing sentences) are dropped so they cannot distort
# the box statistics.
data_to_plot = [
    df.loc[df["label_str"] == name, "word_count"].dropna().values
    for name in class_order
]
bp = ax.boxplot(
    data_to_plot,
    patch_artist=True,  # boxes become patches so they can be filled below
    notch=False,
    medianprops=dict(color="white", linewidth=2),
)
for patch, name in zip(bp["boxes"], class_order):
    patch.set_facecolor(COLORS[name])
ax.set_xticklabels(class_order)
ax.set_title("Kelime Sayısı — Boxplot")
ax.set_ylabel("Kelime Sayısı")
ax.spines[["top", "right"]].set_visible(False)
# Tidy the panel spacing, write the current figure to disk, then report the path.
plot_path = "data/eda_plots.png"
plt.tight_layout()
plt.savefig(plot_path, bbox_inches="tight")
print()
print("Grafik kaydedildi:", plot_path)
# ------------------------------------------------------------------
# 4. SAMPLE SENTENCES - two from each class
# ------------------------------------------------------------------
print()
print("=" * 55)
print(" ÖRNEK CÜMLELER")
print("=" * 55)
for label in ["negative", "neutral", "positive"]:
    print(f"\n[ {label.upper()} ]")
    # Missing sentences are skipped: the slicing below needs real strings.
    pool = df.loc[df["label_str"] == label, "sentence"].dropna()
    # Cap the request at the pool size; sample() raises when asked for more
    # rows than exist. The fixed seed keeps the printed examples reproducible.
    samples = pool.sample(min(2, len(pool)), random_state=42)
    for i, s in enumerate(samples, 1):
        # Show at most 120 characters and mark truncation with an ellipsis.
        print(f" {i}. {s[:120]}{'...' if len(s) > 120 else ''}")
# ------------------------------------------------------------------
# 5. CLASS IMBALANCE - what should we do?
# ------------------------------------------------------------------
# Size of the largest class; every class is compared against it.
majority = counts.max()
print()
print("=" * 55)
print(" CLASS IMBALANCE ANALİZİ")
print("=" * 55)
for label, count in counts.items():
    ratio = majority / count  # 1.00x for the largest class, higher for rarer ones
    print(f" {label:<10} imbalance ratio: {ratio:.2f}x")
# Planned mitigation, printed as a note for the reader.
print("""
Strateji: Fine-tuning sırasında class_weight='balanced'
veya WeightedRandomSampler kullanacağız. (Adım 3'te ele alacağız)
""")
# (end of script)