Spaces:

Liva21
/

financial-sentiment-api

Sleeping

App Files Files Community

financial-sentiment-api / src /eda.py

Liva21

feat: Financial Sentiment API — FinBERT fine-tuned, FastAPI, Docker, TR/EN multilingual

7701077 5 days ago

raw

history blame contribute delete

6.02 kB

	import pandas as pd
	import numpy as np
	import matplotlib.pyplot as plt
	import matplotlib.patches as mpatches
	import seaborn as sns

	# ── General settings: figure resolution, base font size, per-label palette ──
	plt.rcParams.update({"figure.dpi": 120, "font.size": 11})
	COLORS = {
	    "negative": "#e74c3c",
	    "neutral": "#95a5a6",
	    "positive": "#2ecc71",
	}

	# Load the dataset; the rest of the script reads its "sentence" and
	# "label_str" columns.
	df = pd.read_csv("data/financial_phrasebank.csv")

	# ══════════════════════════════════════════════════════════════
	# 1. BASIC STATISTICS
	# ══════════════════════════════════════════════════════════════
	# Banner, then row count, column names and the total number of missing cells.
	print("=" * 55, " DATASET GENEL BAKIŞ", "=" * 55, sep="\n")
	n_rows = len(df)
	print(f"Toplam örnek : {n_rows}")
	print(f"Sütunlar : {list(df.columns)}")
	print(f"Eksik değer : {df.isna().sum().sum()}")
	print()

	# Per-label frequencies; `counts` is reused further down for the bar chart
	# and the imbalance ratios.
	counts = df["label_str"].value_counts()
	print("Label dağılımı:")
	for name, freq in counts.items():
	    share = freq / n_rows * 100
	    # One block character per two percentage points: a quick text histogram.
	    gauge = "█" * int(share / 2)
	    print(f" {name:<10} {freq:>5} ({share:5.1f}%) {gauge}")

	# ══════════════════════════════════════════════════════════════
	# 2. TEXT LENGTH ANALYSIS
	# ══════════════════════════════════════════════════════════════
	# Character and whitespace-separated word counts per sentence. A missing
	# sentence produces NaN in both columns (pandas .str accessors propagate NaN).
	df["char_count"] = df["sentence"].str.len()
	df["word_count"] = df["sentence"].str.split().str.len()
	# Rough token estimate (~4 characters per token). NaN lengths are mapped to 0
	# before the cast, because converting NaN to int raises; for data without
	# missing sentences the result is unchanged.
	df["token_approx"] = (df["char_count"] / 4).fillna(0).astype(int)

	print()
	print("=" * 55)
	print(" METİN UZUNLUĞU (kelime sayısı)")
	print("=" * 55)
	# Mean / min / median / max word count for each label.
	summary_cols = ["mean", "min", "50%", "max"]
	stats = df.groupby("label_str")["word_count"].describe()[summary_cols]
	print(stats.round(1).to_string())

	print()
	# Number of sentences whose estimated token length exceeds 512 (BERT limit).
	over_limit = (df["token_approx"] > 512).sum()
	print(f"512 token'ı aşan cümle (BERT limiti): {over_limit} adet")

	# ══════════════════════════════════════════════════════════════
	# 3. VISUALIZATION
	# ══════════════════════════════════════════════════════════════
	# One row of three panels: label counts, word-count histogram, word-count boxplot.
	fig, axes = plt.subplots(1, 3, figsize=(15, 4))
	fig.suptitle("Financial Phrasebank — EDA", fontweight="bold", fontsize=13)
	label_order = ["negative", "neutral", "positive"]

	# — 3a. Label distribution (bar chart) —
	ax = axes[0]
	bar_colors = [COLORS[name] for name in counts.index]
	bars = ax.bar(
	    counts.index,
	    counts.values,
	    color=bar_colors,
	    edgecolor="white",
	    linewidth=1.5,
	)
	ax.set_title("Label Dağılımı")
	ax.set_ylabel("Örnek Sayısı")
	# Write each count centred horizontally, 15 units above the top of its bar.
	for rect, total in zip(bars, counts.values):
	    x_mid = rect.get_x() + rect.get_width() / 2
	    ax.text(x_mid, rect.get_height() + 15, str(total), ha="center", fontweight="bold")
	# 15% headroom above the tallest bar.
	ax.set_ylim(0, counts.max() * 1.15)
	for side in ("top", "right"):
	    ax.spines[side].set_visible(False)

	# — 3b. Word-count distribution (histogram, coloured by label) —
	ax = axes[1]
	for name in label_order:
	    lengths = df.loc[df["label_str"] == name, "word_count"]
	    ax.hist(lengths, bins=30, alpha=0.6, color=COLORS[name], label=name, edgecolor="none")
	ax.set_title("Kelime Sayısı Dağılımı")
	ax.set_xlabel("Kelime Sayısı")
	ax.set_ylabel("Frekans")
	ax.legend()
	for side in ("top", "right"):
	    ax.spines[side].set_visible(False)

	# — 3c. Boxplot — sentence length per label —
	ax = axes[2]
	groups = [
	    df.loc[df["label_str"] == name, "word_count"].to_numpy()
	    for name in label_order
	]
	bp = ax.boxplot(
	    groups,
	    patch_artist=True,
	    notch=False,
	    medianprops={"color": "white", "linewidth": 2},
	)
	# Fill each box with the colour of the label it represents.
	for box, name in zip(bp["boxes"], label_order):
	    box.set_facecolor(COLORS[name])
	ax.set_xticklabels(label_order)
	ax.set_title("Kelime Sayısı — Boxplot")
	ax.set_ylabel("Kelime Sayısı")
	for side in ("top", "right"):
	    ax.spines[side].set_visible(False)

	# Save all three panels into a single image.
	plt.tight_layout()
	plt.savefig("data/eda_plots.png", bbox_inches="tight")
	print()
	print("Grafik kaydedildi: data/eda_plots.png")

	# ══════════════════════════════════════════════════════════════
	# 4. SAMPLE SENTENCES — up to two from each class
	# ══════════════════════════════════════════════════════════════
	print()
	print("=" * 55)
	print(" ÖRNEK CÜMLELER")
	print("=" * 55)
	for label in ["negative", "neutral", "positive"]:
	    print(f"\n[ {label.upper()} ]")
	    # Missing sentences are dropped because they cannot be sliced below.
	    pool = df.loc[df["label_str"] == label, "sentence"].dropna()
	    # Never request more rows than the class holds: sample() raises when n
	    # exceeds the population. With two or more rows the draw is the same
	    # fixed-seed sample of two as before.
	    n_show = min(2, len(pool))
	    samples = pool.sample(n_show, random_state=42) if n_show else pool
	    for i, s in enumerate(samples, 1):
	        # Show at most 120 characters, marking truncation with an ellipsis.
	        print(f" {i}. {s[:120]}{'...' if len(s)>120 else ''}")

	# ══════════════════════════════════════════════════════════════
	# 5. CLASS IMBALANCE — what should we do about it?
	# ══════════════════════════════════════════════════════════════
	# Size of the largest class; every ratio below is relative to it.
	majority = counts.max()
	print()
	print("=" * 55, " CLASS IMBALANCE ANALİZİ", "=" * 55, sep="\n")
	# Ratio of the largest class to each class (1.00x for the largest itself).
	for name, ratio in (majority / counts).items():
	    print(f" {name:<10} imbalance ratio: {ratio:.2f}x")

	# Closing note on the planned mitigation (emitted verbatim).
	print("""
	Strateji: Fine-tuning sırasında class_weight='balanced'
	veya WeightedRandomSampler kullanacağız. (Adım 3'te ele alacağız)
	""")