File size: 6,022 Bytes
7701077
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import matplotlib.patches as mpatches
import seaborn as sns

# ── General settings ───────────────────────────────────────────
# Figure resolution and base font size for every plot drawn below.
plt.rcParams.update({"figure.dpi": 120, "font.size": 11})

# One fixed colour per sentiment label, reused by all three charts.
COLORS = dict(
    negative="#e74c3c",
    neutral="#95a5a6",
    positive="#2ecc71",
)

# Load the dataset; the path is relative to the current working directory.
df = pd.read_csv("data/financial_phrasebank.csv")

# ══════════════════════════════════════════════════════════════
# 1. BASIC STATISTICS
# ══════════════════════════════════════════════════════════════
# Overview banner: row count, column names, and the total number of
# missing cells across the whole frame. The labels are space-padded so
# that the colons line up in the console output.
print("=" * 55)
print("  DATASET GENEL BAKIŞ")
print("=" * 55)
print(f"Toplam örnek      : {len(df)}")
print(f"Sütunlar          : {list(df.columns)}")
print(f"Eksik değer       : {df.isnull().sum().sum()}")
print()

# Per-label frequency table with a text bar chart; `counts` is reused by
# the plotting and class-imbalance sections further down.
counts = df["label_str"].value_counts()
print("Label dağılımı:")
for label, count in counts.items():
    pct = count / len(df) * 100
    bar = "█" * int(pct / 2)  # one block character per 2 percentage points
    print(f"  {label:<10} {count:>5}  ({pct:5.1f}%)  {bar}")

# ══════════════════════════════════════════════════════════════
# 2. TEXT LENGTH ANALYSIS
# ══════════════════════════════════════════════════════════════
# Add three per-sentence length columns: characters, whitespace-separated
# words, and a rough token estimate (characters / 4, cast to int).
sentences = df["sentence"]
df["char_count"] = sentences.str.len()
df["word_count"] = sentences.str.split().str.len()
df["token_approx"] = df["char_count"].div(4).astype(int)  # rough token estimate

print()
header_rule = "=" * 55
print(header_rule)
print("  METİN UZUNLUĞU (kelime sayısı)")
print(header_rule)

# Word-count summary per label, restricted to mean / min / median / max.
summary_columns = ["mean", "min", "50%", "max"]
stats = df.groupby("label_str")["word_count"].describe()[summary_columns]
print(stats.round(1).to_string())

print()
# Count sentences whose estimated token length exceeds 512, the limit the
# message below attributes to BERT.
over_limit = (df["token_approx"] > 512).sum()
print(f"512 token'ı aşan cümle (BERT limiti): {over_limit} adet")

# ══════════════════════════════════════════════════════════════
# 3. VISUALISATION
# ══════════════════════════════════════════════════════════════
# One figure with three side-by-side panels: label counts, word-count
# histograms per label, and word-count boxplots per label. The figure is
# written to data/eda_plots.png.
LABEL_ORDER = ["negative", "neutral", "positive"]
FALLBACK_COLOR = "#7f8c8d"  # used for any label that has no entry in COLORS

fig, axes = plt.subplots(1, 3, figsize=(15, 4))
fig.suptitle("Financial Phrasebank — EDA", fontweight="bold", fontsize=13)

# — 3a. Label distribution (bar chart) —
ax = axes[0]
bars = ax.bar(
    counts.index,
    counts.values,
    color=[COLORS.get(label, FALLBACK_COLOR) for label in counts.index],
    edgecolor="white",
    linewidth=1.5,
)
ax.set_title("Label Dağılımı")
ax.set_ylabel("Örnek Sayısı")
# bar_label pads in points rather than data units, so the count labels sit
# just above the bars regardless of how large or small the counts are.
ax.bar_label(
    bars,
    labels=[str(val) for val in counts.values],
    padding=3,
    fontweight="bold",
)
ax.set_ylim(0, counts.max() * 1.15)  # headroom for the count labels
ax.spines[["top", "right"]].set_visible(False)

# — 3b. Word-count distribution (histogram, coloured by label) —
ax = axes[1]
for label in LABEL_ORDER:
    subset = df.loc[df["label_str"] == label, "word_count"]
    ax.hist(subset, bins=30, alpha=0.6, color=COLORS[label], label=label, edgecolor="none")
ax.set_title("Kelime Sayısı Dağılımı")
ax.set_xlabel("Kelime Sayısı")
ax.set_ylabel("Frekans")
ax.legend()
ax.spines[["top", "right"]].set_visible(False)

# — 3c. Boxplot — sentence length per label —
ax = axes[2]
data_to_plot = [
    df.loc[df["label_str"] == label, "word_count"].values
    for label in LABEL_ORDER
]
bp = ax.boxplot(
    data_to_plot,
    patch_artist=True,  # required so the boxes can be filled with a colour
    notch=False,
    medianprops=dict(color="white", linewidth=2),
)
for patch, label in zip(bp["boxes"], LABEL_ORDER):
    patch.set_facecolor(COLORS[label])
ax.set_xticklabels(LABEL_ORDER)
ax.set_title("Kelime Sayısı — Boxplot")
ax.set_ylabel("Kelime Sayısı")
ax.spines[["top", "right"]].set_visible(False)

plt.tight_layout()
plt.savefig("data/eda_plots.png", bbox_inches="tight")
print()
print("Grafik kaydedildi: data/eda_plots.png")

# ══════════════════════════════════════════════════════════════
# 4. SAMPLE SENTENCES — two from each class
# ══════════════════════════════════════════════════════════════
# Print up to two randomly chosen sentences per label (fixed seed, so the
# selection is reproducible), truncated to 120 characters for display.
print()
print("=" * 55)
print("  ÖRNEK CÜMLELER")
print("=" * 55)
for label in ["negative", "neutral", "positive"]:
    print(f"\n[ {label.upper()} ]")
    pool = df.loc[df["label_str"] == label, "sentence"]
    # Cap the request at the pool size: Series.sample raises ValueError when
    # asked for more rows than exist (sampling without replacement).
    samples = pool.sample(min(2, len(pool)), random_state=42)
    for i, s in enumerate(samples, 1):
        print(f"  {i}. {s[:120]}{'...' if len(s)>120 else ''}")

# ══════════════════════════════════════════════════════════════
# 5. CLASS IMBALANCE — what should we do about it?
# ══════════════════════════════════════════════════════════════
# For each label, report how many times larger the most frequent class is
# (the majority class itself therefore prints 1.00x).
majority = counts.max()
print()
print("=" * 55)
print("  CLASS IMBALANCE ANALİZİ")
print("=" * 55)
for label, count in counts.items():
    ratio = majority / count
    print(f"  {label:<10}  imbalance ratio: {ratio:.2f}x")

# Closing note (printed verbatim) describing the planned mitigation.
print("""
Strateji: Fine-tuning sırasında class_weight='balanced' 
veya WeightedRandomSampler kullanacağız. (Adım 3'te ele alacağız)
""")