import os
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.utils.class_weight import compute_class_weight
from datasets import Dataset
import torch
from transformers import (
    AutoTokenizer,
    AutoModelForSequenceClassification,
    TrainingArguments,
    Trainer,
    EarlyStoppingCallback,
)
from transformers.trainer_utils import set_seed
# ──────────────────────────────────────────────────────────────
# CONFIG - manage everything from one place
# ──────────────────────────────────────────────────────────────
CONFIG = {
    "model_name" : "ProsusAI/finbert",   # BERT pre-trained on the financial domain
    "max_length" : 128,                  # our sentences run ~22 words, so 128 is plenty
    "batch_size" : 16,
    "epochs"     : 5,
    "lr"         : 2e-5,                 # standard range for BERT fine-tuning
    "seed"       : 42,
    "subset_size": None,                 # None = full dataset, int = random sample of N rows
    "output_dir" : "models/finbert-finetuned",
    "label2id"   : {"negative": 0, "neutral": 1, "positive": 2},
    "id2label"   : {0: "negative", 1: "neutral", 2: "positive"},
}
set_seed(CONFIG["seed"])
DEVICE = "cuda" if torch.cuda.is_available() else "mps" if torch.backends.mps.is_available() else "cpu"
print(f"KullanΔ±lan cihaz: {DEVICE}")
# ──────────────────────────────────────────────────────────────
# 1. LOAD & SPLIT DATA
# ──────────────────────────────────────────────────────────────
df = pd.read_csv("data/financial_phrasebank.csv")
df["label"] = df["label_str"].map(CONFIG["label2id"])
if CONFIG["subset_size"]:
    df = df.sample(CONFIG["subset_size"], random_state=CONFIG["seed"])
# Stratified split - preserves each class's proportions
train_df, temp_df = train_test_split(
    df, test_size=0.2, stratify=df["label"], random_state=CONFIG["seed"]
)
val_df, test_df = train_test_split(
    temp_df, test_size=0.5, stratify=temp_df["label"], random_state=CONFIG["seed"]
)
print(f"\nVeri boyutlarΔ±:")
print(f" Train : {len(train_df)}")
print(f" Val : {len(val_df)}")
print(f" Test : {len(test_df)}")
# ──────────────────────────────────────────────────────────────
# 2. TOKENIZATION
# ──────────────────────────────────────────────────────────────
tokenizer = AutoTokenizer.from_pretrained(CONFIG["model_name"])
def tokenize(batch):
    return tokenizer(
        batch["sentence"],
        padding="max_length",
        truncation=True,
        max_length=CONFIG["max_length"],
    )
def df_to_dataset(df):
    ds = Dataset.from_pandas(df[["sentence", "label"]].reset_index(drop=True))
    return ds.map(tokenize, batched=True)
print("\nTokenizer yΓΌkleniyor ve veri tokenize ediliyor...")
train_ds = df_to_dataset(train_df)
val_ds = df_to_dataset(val_df)
test_ds = df_to_dataset(test_df)
# Save the test set for later evaluation
test_df.to_csv("data/test_set.csv", index=False)
print("Test seti kaydedildi: data/test_set.csv")
# ──────────────────────────────────────────────────────────────
# 3. CLASS WEIGHTS - counter the imbalance
# ──────────────────────────────────────────────────────────────
class_weights = compute_class_weight(
    class_weight="balanced",
    classes=np.array([0, 1, 2]),
    y=train_df["label"].values,
)
class_weights_tensor = torch.tensor(class_weights, dtype=torch.float).to(DEVICE)
print(f"\nClass weights: {dict(zip(['neg','neu','pos'], class_weights.round(2)))}")
# ──────────────────────────────────────────────────────────────
# 4. MODEL
# ──────────────────────────────────────────────────────────────
model = AutoModelForSequenceClassification.from_pretrained(
    CONFIG["model_name"],
    num_labels=3,
    id2label=CONFIG["id2label"],
    label2id=CONFIG["label2id"],
    ignore_mismatched_sizes=True,  # FinBERT's original head already has 3 labels; kept as a safeguard
)
model = model.to(DEVICE)
# ──────────────────────────────────────────────────────────────
# 5. CUSTOM TRAINER - weighted loss
# ──────────────────────────────────────────────────────────────
class WeightedTrainer(Trainer):
    """Weighted CrossEntropy to compensate for class imbalance."""
    def compute_loss(self, model, inputs, return_outputs=False, **kwargs):
        # **kwargs absorbs extra arguments (e.g. num_items_in_batch) that
        # newer transformers versions pass into compute_loss.
        labels = inputs.pop("labels")
        outputs = model(**inputs)
        logits = outputs.logits
        loss_fn = torch.nn.CrossEntropyLoss(weight=class_weights_tensor)
        loss = loss_fn(logits, labels)
        return (loss, outputs) if return_outputs else loss
# ──────────────────────────────────────────────────────────────
# 6. METRICS
# ──────────────────────────────────────────────────────────────
from sklearn.metrics import accuracy_score, f1_score
def compute_metrics(eval_pred):
    logits, labels = eval_pred
    preds = np.argmax(logits, axis=-1)
    return {
        "accuracy"   : accuracy_score(labels, preds),
        "f1_macro"   : f1_score(labels, preds, average="macro"),
        "f1_weighted": f1_score(labels, preds, average="weighted"),
    }
# ──────────────────────────────────────────────────────────────
# 7. TRAINING ARGUMENTS
# ──────────────────────────────────────────────────────────────
args = TrainingArguments(
    output_dir                  = CONFIG["output_dir"],
    num_train_epochs            = CONFIG["epochs"],
    per_device_train_batch_size = CONFIG["batch_size"],
    per_device_eval_batch_size  = CONFIG["batch_size"],
    learning_rate               = CONFIG["lr"],
    weight_decay                = 0.01,
    evaluation_strategy         = "epoch",  # renamed to eval_strategy in recent transformers releases
    save_strategy               = "epoch",
    load_best_model_at_end      = True,
    metric_for_best_model       = "f1_macro",
    greater_is_better           = True,
    logging_steps               = 20,
    seed                        = CONFIG["seed"],
    report_to                   = "none",   # disable W&B and other trackers
    fp16                        = DEVICE == "cuda",  # mixed precision only on GPU
)
# ──────────────────────────────────────────────────────────────
# 8. TRAIN!
# ──────────────────────────────────────────────────────────────
trainer = WeightedTrainer(
    model           = model,
    args            = args,
    train_dataset   = train_ds,
    eval_dataset    = val_ds,
    compute_metrics = compute_metrics,
    callbacks       = [EarlyStoppingCallback(early_stopping_patience=2)],
)
print("\n" + "="*55)
print(" EΔΔ°TΔ°M BAΕLIYOR")
print("="*55)
trainer.train()
# ──────────────────────────────────────────────────────────────
# 9. SAVE
# ──────────────────────────────────────────────────────────────
os.makedirs(CONFIG["output_dir"], exist_ok=True)
trainer.save_model(CONFIG["output_dir"])
tokenizer.save_pretrained(CONFIG["output_dir"])
print(f"\nModel kaydedildi: {CONFIG['output_dir']}")
# Show the final validation metrics
final = trainer.evaluate()
print("\nFinal Validation Metrikleri:")
for k, v in final.items():
    if isinstance(v, float):
        print(f"  {k:<25} {v:.4f}")