File size: 9,269 Bytes

f8dd4fe

#!/usr/bin/env python
# -*- coding: utf-8 -*-

"""
Fine-tune UBC-NLP/MARBERTv2 for Arabic telecom customer comment classification.

Dataset (CSV):
  /home/houssam-nojoom/.cache/huggingface/hub/datasets--houssamboukhalfa--labelds/snapshots/48f016fd5987875b0e9f79d0689cef2ec3b2ce0b/train.csv

Columns:
  Commentaire client: str (text)
  Class: int (label - values 1 through 9)

Model:
  - MARBERTv2 encoder
  - Classification head for multi-class prediction (9 classes)
"""

import os
import numpy as np
import torch

from inspect import signature
from datasets import load_dataset
from sklearn.metrics import accuracy_score, f1_score, precision_recall_fscore_support
from transformers import (
    AutoTokenizer,
    AutoModelForSequenceClassification,
    TrainingArguments,
    Trainer,
)

# Slight speed boost on Ampere GPUs. The setter is looked up defensively
# because not every torch build exposes it.
_set_matmul_precision = getattr(torch, "set_float32_matmul_precision", None)
if _set_matmul_precision is not None:
    _set_matmul_precision("high")

# -------------------------------------------------------------------
# 1. Paths & config
# -------------------------------------------------------------------
# Training CSV, addressed by its absolute location in the local HF cache.
DATA_FILE = "/home/houssam-nojoom/.cache/huggingface/hub/datasets--houssamboukhalfa--labelds/snapshots/48f016fd5987875b0e9f79d0689cef2ec3b2ce0b/train.csv"

MODEL_NAME = "UBC-NLP/MARBERTv2"
OUTPUT_DIR = "./telecom_marbertv2_final"

# Length every tokenized comment is truncated/padded to.
MAX_LENGTH = 256

# Dataset classes are numbered 1..NUM_LABELS, while the model works with
# 0-based indices, so both directions of the mapping are kept.
NUM_LABELS = 9
LABEL2ID = {cls: cls - 1 for cls in range(1, NUM_LABELS + 1)}
ID2LABEL = {idx: cls for cls, idx in LABEL2ID.items()}

# -------------------------------------------------------------------
# 2. Dataset loading
# -------------------------------------------------------------------
print("Loading telecom dataset from CSV...")
# Load the whole CSV as one "train" split; the validation split is carved
# out of it later in the script.
dataset = load_dataset("csv", data_files=DATA_FILE, split="train")

# Quick look at what was loaded.
print("Sample example:", dataset[0])
print(f"Total examples: {len(dataset)}")

# Show how dataset classes map onto model indices (and back).
print(f"Number of classes: {NUM_LABELS}")
for caption, mapping in (
    ("Label mapping (class -> model index):", LABEL2ID),
    ("Inverse mapping (model index -> class):", ID2LABEL),
):
    print(caption, mapping)


def encode_labels(example):
    """Attach the 0-based model label to one dataset row.

    Reads ``example["Class"]`` (an int, or a string holding an int), looks it
    up in ``LABEL2ID`` and stores the result under ``example["labels"]``.

    Returns:
        The same ``example`` mapping, with the ``"labels"`` key added.

    Raises:
        ValueError: if the class is not a key of ``LABEL2ID`` (or a string
            value cannot be parsed as an int).
    """
    raw_class = example["Class"]

    # CSV parsing may hand the class back as text; normalise to int first.
    label = int(raw_class) if isinstance(raw_class, str) else raw_class

    if label in LABEL2ID:
        example["labels"] = LABEL2ID[label]
        return example

    raise ValueError(f"Unknown class: {label}. Expected 1-9.")


# Add the "labels" column produced by encode_labels to every row.
dataset = dataset.map(encode_labels)

# Hold out 10% of the rows for validation; the seed is fixed at 42.
dataset = dataset.train_test_split(test_size=0.1, seed=42)
train_dataset, eval_dataset = dataset["train"], dataset["test"]

print("Train size:", len(train_dataset))
print("Eval size:", len(eval_dataset))

# -------------------------------------------------------------------
# 3. Tokenization
# -------------------------------------------------------------------
# Tokenizer loaded from the same checkpoint name (MODEL_NAME) as the model.
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)


def preprocess_function(examples):
    """Tokenize a batch of customer comments.

    The ``"Commentaire client"`` column of ``examples`` is passed to the
    module-level ``tokenizer`` with truncation enabled and padding to
    ``MAX_LENGTH``.

    Returns:
        Whatever the tokenizer call returns for the batch.
    """
    comments = examples["Commentaire client"]
    tokenizer_options = {
        "padding": "max_length",
        "truncation": True,
        "max_length": MAX_LENGTH,
    }
    return tokenizer(comments, **tokenizer_options)


# Tokenize both splits in batches, using 4 worker processes each.
_map_options = {"batched": True, "num_proc": 4}
train_dataset = train_dataset.map(preprocess_function, **_map_options)
eval_dataset = eval_dataset.map(preprocess_function, **_map_options)

# Expose only these three columns, formatted as torch tensors.
for _split in (train_dataset, eval_dataset):
    _split.set_format(type="torch", columns=["input_ids", "attention_mask", "labels"])

# -------------------------------------------------------------------
# 4. Model - Using AutoModelForSequenceClassification
# -------------------------------------------------------------------
# Load the checkpoint with a NUM_LABELS-way classification head and pass both
# label maps along with it.
_model_options = {
    "num_labels": NUM_LABELS,
    "id2label": ID2LABEL,
    "label2id": LABEL2ID,
}
model = AutoModelForSequenceClassification.from_pretrained(MODEL_NAME, **_model_options)

print("Model initialized with classification head")
print(f"Number of labels: {NUM_LABELS}")
print(f"Classes: {list(ID2LABEL.values())}")

# -------------------------------------------------------------------
# 5. Metrics
# -------------------------------------------------------------------
def compute_metrics(eval_pred):
    """Compute classification metrics for one evaluation pass.

    Args:
        eval_pred: pair ``(logits, labels)``. The predicted index of each row
            is the argmax of its logits over the last axis.

    Returns:
        dict with ``accuracy``, weighted and macro precision/recall/F1, and
        one ``f1_class_<class>`` entry for every class in ``ID2LABEL``.
        A class that appears in neither the labels nor the predictions is
        reported with an F1 of 0.
    """
    logits, labels = eval_pred
    predictions = np.argmax(logits, axis=-1)

    # Overall metrics
    accuracy = accuracy_score(labels, predictions)

    # Weighted average (accounts for class imbalance)
    precision_w, recall_w, f1_w, _ = precision_recall_fscore_support(
        labels, predictions, average='weighted', zero_division=0
    )

    # Macro average (treats all classes equally)
    precision_m, recall_m, f1_m, _ = precision_recall_fscore_support(
        labels, predictions, average='macro', zero_division=0
    )

    metrics = {
        'accuracy': accuracy,
        'f1_weighted': f1_w,
        'f1_macro': f1_m,
        'precision_weighted': precision_w,
        'recall_weighted': recall_w,
        'precision_macro': precision_m,
        'recall_macro': recall_m,
    }

    # Per-class F1 scores.
    # Without an explicit `labels=` list, f1_score(average=None) only returns
    # scores for the labels that occur in the data, so position i of the
    # result is NOT necessarily model index i when a class is missing from the
    # eval split. Pinning the label list keeps every score aligned with its
    # class.
    label_ids = list(range(NUM_LABELS))
    per_class_f1 = f1_score(
        labels, predictions, labels=label_ids, average=None, zero_division=0
    )
    for idx, score in zip(label_ids, per_class_f1):
        metrics[f'f1_class_{ID2LABEL[idx]}'] = score

    return metrics


# -------------------------------------------------------------------
# 6. TrainingArguments (old/new transformers compatible)
# -------------------------------------------------------------------
# Inspect which keyword arguments the installed TrainingArguments accepts so
# the same script runs on both older and newer transformers releases.
ta_sig = signature(TrainingArguments.__init__)
ta_params = set(ta_sig.parameters.keys())

has_cuda = torch.cuda.is_available()
is_bf16_supported = (
    has_cuda
    and hasattr(torch.cuda, "is_bf16_supported")
    and torch.cuda.is_bf16_supported()
)
use_bf16 = bool(is_bf16_supported)
# fp16 is only a fallback for CUDA devices without bf16. On a machine without
# CUDA both flags must stay off, otherwise TrainingArguments rejects fp16.
use_fp16 = has_cuda and not use_bf16

print(f"bf16 supported: {is_bf16_supported} -> using bf16={use_bf16}, fp16={use_fp16}")

base_kwargs = {
    "output_dir": OUTPUT_DIR,
    "num_train_epochs": 10,
    "per_device_train_batch_size": 32,
    "per_device_eval_batch_size": 64,
    "learning_rate": 1e-4,
    "weight_decay": 0.02,
    "warmup_ratio": 0.1,
    "logging_steps": 50,
    "save_total_limit": 2,
    "dataloader_num_workers": 4,
}

# Mixed precision flags if supported
if "bf16" in ta_params:
    base_kwargs["bf16"] = use_bf16
if "fp16" in ta_params:
    base_kwargs["fp16"] = use_fp16

# Disable external experiment trackers whenever the option exists.
if "report_to" in ta_params:
    base_kwargs["report_to"] = "none"

# The evaluation schedule argument is called `eval_strategy` in newer
# transformers releases and `evaluation_strategy` in older ones; prefer the
# new name and fall back to the old one.
eval_strategy_key = next(
    (name for name in ("eval_strategy", "evaluation_strategy") if name in ta_params),
    None,
)

if eval_strategy_key is not None:
    base_kwargs[eval_strategy_key] = "epoch"
    # Evaluate and checkpoint once per epoch, then restore the checkpoint with
    # the best weighted F1 when training finishes.
    epoch_driven_options = {
        "save_strategy": "epoch",
        "logging_strategy": "steps",
        "load_best_model_at_end": True,
        "metric_for_best_model": "f1_weighted",
        "greater_is_better": True,
    }
    for option, value in epoch_driven_options.items():
        if option in ta_params:
            base_kwargs[option] = value
else:
    print(
        "[TrainingArguments] Old transformers version: "
        "no eval_strategy/evaluation_strategy argument. Using simple setup."
    )

# Drop anything the installed version does not understand, and say so.
filtered_kwargs = {}
for k, v in base_kwargs.items():
    if k in ta_params:
        filtered_kwargs[k] = v
    else:
        print(f"[TrainingArguments] Skipping unsupported arg: {k}={v}")

training_args = TrainingArguments(**filtered_kwargs)

# -------------------------------------------------------------------
# 7. Trainer
# -------------------------------------------------------------------
# Newer transformers releases take the tokenizer as `processing_class=`, while
# older ones only know `tokenizer=`. Pick whichever keyword the installed
# Trainer accepts.
trainer_params = signature(Trainer.__init__).parameters
tokenizer_kwarg = "processing_class" if "processing_class" in trainer_params else "tokenizer"

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=eval_dataset,
    compute_metrics=compute_metrics,
    **{tokenizer_kwarg: tokenizer},
)

# -------------------------------------------------------------------
# 8. Train & eval
# -------------------------------------------------------------------
if __name__ == "__main__":
    # Script entry point: train, evaluate on the held-out split, save the
    # model and tokenizer to OUTPUT_DIR, then run a small inference smoke test.
    print("Starting telecom classification training...")
    trainer.train()

    print("Evaluating on validation split...")
    metrics = trainer.evaluate()
    print("Validation metrics:", metrics)

    print("Saving final model & tokenizer...")
    trainer.save_model(OUTPUT_DIR)
    tokenizer.save_pretrained(OUTPUT_DIR)

    print("Label mappings saved in config:")
    print(f"  ID to Label: {ID2LABEL}")
    print(f"  Label to ID: {LABEL2ID}")

    # Quick sanity-check inference
    example_texts = [
        "الخدمة ممتازة جدا وسريعة",
        "سيء للغاية ولا يستجيبون",
        "متوسط الجودة"
    ]

    # Put the inputs on the same device as the model.
    inputs = tokenizer(
        example_texts,
        return_tensors="pt",
        padding=True,
        truncation=True,
        max_length=MAX_LENGTH
    ).to(model.device)

    # Make inference mode explicit instead of relying on whatever mode the
    # preceding Trainer calls left the model in.
    model.eval()
    with torch.no_grad():
        outputs = model(**inputs)

    logits = outputs.logits.cpu().numpy()
    predictions = np.argmax(logits, axis=-1)

    print("\nSanity-check predictions:")
    for text, pred_idx in zip(example_texts, predictions):
        # argmax yields numpy integers; use a plain int for the dict lookup.
        pred_class = ID2LABEL[int(pred_idx)]
        print(f"Text: {text}")
        print(f"  -> Predicted Class: {pred_class}")
        print()

    print("Training complete and model saved to:", OUTPUT_DIR)