---
library_name: transformers
tags:
- ABSA
- Sentiment
- Transformer
- BERT
- Turkish
license: mit
datasets:
- ebrukilic/tubitak_clothing_absa_v3
language:
- tr
metrics:
- accuracy
base_model:
- dbmdz/bert-base-turkish-cased
pipeline_tag: text-classification
---

# Clothing ABSA Model (v3)

Bu model, Türkçe ürün yorumları üzerinde Aspect-Based Sentiment Analysis (ABSA) yapmak için fine-tune edilmiştir.

- **Developed by:** Ebru Kılıç & Rumeysa Nur Yasav
- **Funded by:** TÜBİTAK 2219/A Projesi
- **Shared by:** ebrukilic (Hugging Face)
- **Model type:** BERT-based Transformer (Fine-tuned for sequence classification)
- **Language(s) (NLP):** Turkish
- **License:** CC BY-NC 4.0
- **Dataset:** ebrukilic/tubitak_clothing_absa_v3
- **Finetuned from model:** dbmdz/bert-base-turkish-cased
- **Weighted cross-entropy loss uygulanmıştır (sınıf dengesizliğini azaltmak için)**
- **Trainer:** WeightedTrainer (transformers.Trainer subclass)

## Model Bilgileri

- Model tipi: Transformer (BERT tabanlı)
- Eğitim veri kümesi: ebrukilic/tubitak_clothing_absa_v3 (8396 eğitim verisi + 4011 test verisi)
- Etiketler: ["negatif", "nötr", "pozitif"]
- Eğitim epochs: 20
- Learning rate: 2e-5
- Batch size: 16

## Yüklenmesi Gerekenler

Modeli çalıştırmak için aşağıdaki kütüphaneler gereklidir:

```bash
pip install torch transformers datasets scikit-learn
```

### Modelin ABSA Kapsamında Kullanımı

Aspect Wise ve Genel Anlamda (aspecte bağlı olmadan) modeli test etmek adına bir pipeline oluşturulmuştur.
```python import torch from torch.utils.data import Dataset, DataLoader from transformers import AutoTokenizer, AutoModelForSequenceClassification from datasets import load_dataset from sklearn.metrics import accuracy_score, f1_score, classification_report import pandas as pd from collections import defaultdict # ------------------- Sabitler ------------------- MODEL_ID = "ebrukilic/bert-absa-tr-v5" DATASET_ID = "ebrukilic/tubitak_clothing_absa_v3" SPLIT = "test" TEXT_COL = "normalized_yorum" LABEL_COL = "polarity" ASPECT_COL = "aspects" BATCH_SIZE = 16 MAX_LEN = 128 device = torch.device("cuda" if torch.cuda.is_available() else "cpu") print("Device:", device) # ---------- Dataset yükleme işlemi ---------------- ds_raw = load_dataset(DATASET_ID, split=SPLIT) df = ds_raw.to_pandas() # Aspect listelerini temizle def to_list(x): if x is None: return [] if isinstance(x, list): return x return [x] def clean_list(lst): return [str(a) for a in lst if str(a).lower() not in {"unknown", "unk", ""}] df["_aspect_list"] = df[ASPECT_COL].apply(to_list).apply(clean_list) df = df[df["_aspect_list"].map(len) > 0].copy() # Label encode label_space = sorted(list(set(map(str, df[LABEL_COL])))) label2id = {label:i for i,label in enumerate(label_space)} id2label = {i:label for label,i in label2id.items()} print("Label mapping:", label2id, "\n") # ------------------- Model yükle ------------------- tokenizer = AutoTokenizer.from_pretrained(MODEL_ID, use_fast=True) model = AutoModelForSequenceClassification.from_pretrained( MODEL_ID, num_labels=len(label_space), id2label=id2label, label2id=label2id ).to(device) model.eval() # ------------------- Dataset sınıfları ------------------- # Aspect-aware dataset class AspectAwareDataset(Dataset): def __init__(self, dataframe, tokenizer, label2id, max_length=128): self.data = dataframe.explode("_aspect_list").rename(columns={"_aspect_list":"aspect"}).reset_index(drop=True) self.tokenizer = tokenizer self.label2id = label2id self.max_length = 
max_length def __len__(self): return len(self.data) def __getitem__(self, idx): row = self.data.iloc[idx] text, aspect = row[TEXT_COL], row["aspect"] #hem text hem aspect verildi label = self.label2id[str(row[LABEL_COL])] inputs = self.tokenizer( aspect, text, truncation=True, padding='max_length', max_length=self.max_length, return_tensors="pt" ) return { 'input_ids': inputs['input_ids'].squeeze(0), 'attention_mask': inputs['attention_mask'].squeeze(0), 'labels': torch.tensor(label) } # Text-only dataset class TextOnlyDataset(Dataset): def __init__(self, dataframe, tokenizer, label2id, max_length=128): self.data = dataframe self.tokenizer = tokenizer self.label2id = label2id self.max_length = max_length def __len__(self): return len(self.data) def __getitem__(self, idx): row = self.data.iloc[idx] text = row[TEXT_COL] #sadece text verildi label = self.label2id[str(row[LABEL_COL])] inputs = self.tokenizer( text, truncation=True, padding='max_length', max_length=self.max_length, return_tensors="pt" ) return { 'input_ids': inputs['input_ids'].squeeze(0), 'attention_mask': inputs['attention_mask'].squeeze(0), 'labels': torch.tensor(label) } # ------------------- DataLoader ------------------- dataloader_aspect = DataLoader(AspectAwareDataset(df, tokenizer, label2id, MAX_LEN), batch_size=BATCH_SIZE, shuffle=False) dataloader_textonly = DataLoader(TextOnlyDataset(df, tokenizer, label2id, MAX_LEN), batch_size=BATCH_SIZE, shuffle=False) # ------------------- Değerlendirme Fonksiyonu ------------------- def evaluate_model(dataloader, device, df=None, aspect_aware=False): y_true, y_pred = [], [] aspect_perf = defaultdict(list) if aspect_aware else None with torch.no_grad(): for idx, batch in enumerate(dataloader): input_ids = batch['input_ids'].to(device) attention_mask = batch['attention_mask'].to(device) labels = batch['labels'].to(device) outputs = model(input_ids=input_ids, attention_mask=attention_mask) predictions = torch.argmax(outputs.logits, dim=1) 
y_true.extend(labels.cpu().tolist()) y_pred.extend(predictions.cpu().tolist()) if aspect_aware: batch_start = idx * BATCH_SIZE for i in range(len(labels)): data_idx = batch_start + i if data_idx < len(df): true_label = labels[i].cpu().item() pred_label = predictions[i].cpu().item() aspects = df.iloc[data_idx]["_aspect_list"] for aspect in aspects: aspect_perf[aspect].append((true_label, pred_label)) return y_true, y_pred, aspect_perf # ------------------- Modeli Değerlendir ------------------- print("=== Aspect-aware Evaluation ===") y_true1, y_pred1, aspect_perf = evaluate_model(dataloader_aspect, device, df, aspect_aware=True) print(f"Accuracy: {accuracy_score(y_true1, y_pred1):.4f} Macro-F1: {f1_score(y_true1, y_pred1, average='macro'):.4f}") print(classification_report(y_true1, y_pred1, target_names=label_space)) print("\n--- Aspect-wise Performance ---") for aspect, preds in aspect_perf.items(): if len(preds) >= 10: t, p = zip(*preds) print(f"{aspect}: {accuracy_score(t,p):.3f} ({len(preds)} samples)") print("\n" + "="*60) print("=== Text-only Evaluation ===") y_true2, y_pred2, _ = evaluate_model(dataloader_textonly, device) print(f"Accuracy: {accuracy_score(y_true2, y_pred2):.4f} Macro-F1: {f1_score(y_true2, y_pred2, average='macro'):.4f}") print(classification_report(y_true2, y_pred2, target_names=label_space)) # ------------------- Karşılaştırma ------------------- print("\n=== Summary ===") acc1 = accuracy_score(y_true1, y_pred1) acc2 = accuracy_score(y_true2, y_pred2) f1_1 = f1_score(y_true1, y_pred1, average='macro') f1_2 = f1_score(y_true2, y_pred2, average='macro') print(f"Aspect-aware: Acc={acc1:.3f}, F1={f1_1:.3f}") print(f"Text-only: Acc={acc2:.3f}, F1={f1_2:.3f}") print(f"Aspect effect: {((acc1-acc2)/acc2*100):+.1f}%") ``` ### Eğitim Detayları ve Sınıf Ağırlıkları Model eğitimi sırasında, veri kümesindeki dengesiz etiket dağılımını dengelemek için sınıf ağırlıklı loss (class-weighted CrossEntropyLoss) kullanılmıştır. Bu sayede nadir etiketler (örn. 
nötr) eğitim sırasında daha fazla önemsenmiş ve modelin tüm sınıflar için performansı artırılmıştır.

- **Trainer:** WeightedTrainer (transformers.Trainer sınıfının özel versiyonu)
- **Loss fonksiyonu:** CrossEntropyLoss + sınıf ağırlıkları
- **Class weights:** [0.8415, 1.2915, 0.9641]
- **Etkisi:** Accuracy ve F1-score'da belirgin iyileşme sağlanmıştır (örn. Accuracy 0.86'ya yükselmiştir).

### Kullanım Amacı

Bu model, ürün yorumlarını otomatik olarak aspect (beden, kalite, fiyat, renk, kargo, kumaş) ve duygu (pozitif, negatif, nötr) olarak sınıflandırır.