| |
| |
|
|
| """ |
| Fine-tune UBC-NLP/MARBERTv2 for Arabic telecom customer comment classification. |
| |
Dataset (CSV):
    Loaded from the local Hugging Face cache path defined by the DATA_FILE
    constant in this script.
| |
| Columns: |
| Commentaire client: str (text) |
| Class: int (label - values 1 through 9) |
| |
| Model: |
| - MARBERTv2 encoder |
| - Classification head for multi-class prediction (9 classes) |
| """ |
|
|
| import os |
| import numpy as np |
| import torch |
|
|
| from inspect import signature |
| from datasets import load_dataset |
| from sklearn.metrics import accuracy_score, f1_score, precision_recall_fscore_support |
| from transformers import ( |
| AutoTokenizer, |
| AutoModelForSequenceClassification, |
| TrainingArguments, |
| Trainer, |
| ) |
|
|
| |
# Opt into TF32-accelerated float32 matmuls on newer PyTorch builds;
# silently skip on versions that predate the API.
_set_matmul_precision = getattr(torch, "set_float32_matmul_precision", None)
if _set_matmul_precision is not None:
    _set_matmul_precision("high")
|
|
| |
| |
| |
# Local CSV with the labelled telecom comments (Hugging Face cache snapshot).
DATA_FILE = "/home/houssam-nojoom/.cache/huggingface/hub/datasets--houssamboukhalfa--labelds/snapshots/48f016fd5987875b0e9f79d0689cef2ec3b2ce0b/train.csv"

MODEL_NAME = "UBC-NLP/MARBERTv2"
OUTPUT_DIR = "./telecom_marbertv2_final"

# Maximum token length per comment fed to the encoder.
MAX_LENGTH = 256

# The CSV labels classes 1..9; the classification head expects 0..8.
NUM_LABELS = 9
LABEL2ID = {cls: cls - 1 for cls in range(1, NUM_LABELS + 1)}
ID2LABEL = {idx: cls for cls, idx in LABEL2ID.items()}
|
|
| |
| |
| |
# Load the raw telecom-comments CSV as a single Hugging Face "train" split.
print("Loading telecom dataset from CSV...")
dataset = load_dataset(
    "csv",
    data_files=DATA_FILE,
    split="train",
)

# Sanity prints so a wrong path or unexpected schema is caught immediately.
print("Sample example:", dataset[0])
print(f"Total examples: {len(dataset)}")

print(f"Number of classes: {NUM_LABELS}")
print("Label mapping (class -> model index):", LABEL2ID)
print("Inverse mapping (model index -> class):", ID2LABEL)
|
|
|
|
def encode_labels(example):
    """Map the CSV ``Class`` value (1-9) onto the model label index (0-8).

    Accepts the class as either an int or a numeric string. Raises
    ValueError for any value outside 1-9.
    """
    raw = example["Class"]
    # CSV parsing can surface the class as a string; normalize to int.
    class_val = int(raw) if isinstance(raw, str) else raw

    try:
        example["labels"] = LABEL2ID[class_val]
    except KeyError:
        raise ValueError(f"Unknown class: {class_val}. Expected 1-9.") from None

    return example
|
|
|
|
# Attach the 0-based "labels" column the Trainer expects.
dataset = dataset.map(encode_labels)

# Hold out 10% for evaluation; a fixed seed keeps the split reproducible.
dataset = dataset.train_test_split(test_size=0.1, seed=42)
train_dataset = dataset["train"]
eval_dataset = dataset["test"]

print("Train size:", len(train_dataset))
print("Eval size:", len(eval_dataset))
|
|
| |
| |
| |
| tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME) |
|
|
|
|
def preprocess_function(examples):
    """Tokenize a batch of comments, padded and truncated to MAX_LENGTH."""
    texts = examples["Commentaire client"]
    encoded = tokenizer(
        texts,
        truncation=True,
        padding="max_length",
        max_length=MAX_LENGTH,
    )
    return encoded
|
|
|
|
# Tokenize both splits in parallel batches.
train_dataset = train_dataset.map(preprocess_function, batched=True, num_proc=4)
eval_dataset = eval_dataset.map(preprocess_function, batched=True, num_proc=4)

# Expose only the tensor columns the model consumes.
train_dataset.set_format(type="torch", columns=["input_ids", "attention_mask", "labels"])
eval_dataset.set_format(type="torch", columns=["input_ids", "attention_mask", "labels"])
|
|
| |
| |
| |
# MARBERTv2 encoder with a freshly initialized 9-way classification head.
# The label maps are stored in the config so saved checkpoints decode
# predictions back to the original class ids (1-9).
model = AutoModelForSequenceClassification.from_pretrained(
    MODEL_NAME,
    num_labels=NUM_LABELS,
    id2label=ID2LABEL,
    label2id=LABEL2ID,
)

print("Model initialized with classification head")
print(f"Number of labels: {NUM_LABELS}")
print(f"Classes: {list(ID2LABEL.values())}")
|
|
| |
| |
| |
def compute_metrics(eval_pred):
    """Compute accuracy, weighted/macro PRF, and per-class F1 for the Trainer.

    Args:
        eval_pred: ``(logits, labels)`` pair supplied by ``Trainer.evaluate``.

    Returns:
        Dict of float metrics; ``f1_weighted`` is used for best-model
        selection, and ``f1_class_<c>`` reports F1 for original class ``c``.
    """
    logits, labels = eval_pred
    predictions = np.argmax(logits, axis=-1)

    accuracy = accuracy_score(labels, predictions)

    precision_w, recall_w, f1_w, _ = precision_recall_fscore_support(
        labels, predictions, average='weighted', zero_division=0
    )

    precision_m, recall_m, f1_m, _ = precision_recall_fscore_support(
        labels, predictions, average='macro', zero_division=0
    )

    metrics = {
        'accuracy': accuracy,
        'f1_weighted': f1_w,
        'f1_macro': f1_m,
        'precision_weighted': precision_w,
        'recall_weighted': recall_w,
        'precision_macro': precision_m,
        'recall_macro': recall_m,
    }

    # FIX: pin `labels` so the array always has NUM_LABELS entries in index
    # order. Without it, sklearn only returns scores for classes present in
    # this eval set, so per_class_f1[idx] could describe the wrong class
    # whenever a class is absent from the split.
    per_class_f1 = f1_score(
        labels,
        predictions,
        labels=list(range(NUM_LABELS)),
        average=None,
        zero_division=0,
    )
    for idx, score in enumerate(per_class_f1):
        metrics[f'f1_class_{ID2LABEL[idx]}'] = score

    return metrics
|
|
|
|
| |
| |
| |
# Introspect the installed TrainingArguments signature so the script can
# adapt to whichever transformers version is installed.
ta_sig = signature(TrainingArguments.__init__)
ta_params = set(ta_sig.parameters)
|
|
# Mixed-precision selection: prefer bf16 on GPUs that support it, fall back
# to fp16 on other CUDA devices, and disable both on CPU.
# FIX: the original set use_fp16 = not use_bf16, which turned fp16 on for
# CPU-only machines and makes TrainingArguments raise — fp16 requires CUDA.
cuda_available = torch.cuda.is_available()
is_bf16_supported = (
    cuda_available
    and hasattr(torch.cuda, "is_bf16_supported")
    and torch.cuda.is_bf16_supported()
)
use_bf16 = bool(is_bf16_supported)
use_fp16 = cuda_available and not use_bf16

print(f"bf16 supported: {is_bf16_supported} -> using bf16={use_bf16}, fp16={use_fp16}")
|
|
# Hyperparameters accepted by every transformers version we target.
base_kwargs = {
    "output_dir": OUTPUT_DIR,
    "num_train_epochs": 10,
    "per_device_train_batch_size": 32,
    "per_device_eval_batch_size": 64,
    "learning_rate": 1e-4,
    "weight_decay": 0.02,
    "warmup_ratio": 0.1,
    "logging_steps": 50,
    "save_total_limit": 2,
    "dataloader_num_workers": 4,
}

if "bf16" in ta_params:
    base_kwargs["bf16"] = use_bf16
if "fp16" in ta_params:
    base_kwargs["fp16"] = use_fp16

# FIX: transformers >= 4.46 renamed `evaluation_strategy` to `eval_strategy`.
# The original only checked the old name, so on new versions per-epoch
# evaluation and best-model tracking were silently dropped. Accept either.
eval_strategy_key = next(
    (key for key in ("evaluation_strategy", "eval_strategy") if key in ta_params),
    None,
)
if eval_strategy_key is not None:
    base_kwargs[eval_strategy_key] = "epoch"
    if "save_strategy" in ta_params:
        base_kwargs["save_strategy"] = "epoch"
    if "logging_strategy" in ta_params:
        base_kwargs["logging_strategy"] = "steps"
    if "load_best_model_at_end" in ta_params:
        base_kwargs["load_best_model_at_end"] = True
    if "metric_for_best_model" in ta_params:
        base_kwargs["metric_for_best_model"] = "f1_weighted"
    if "greater_is_better" in ta_params:
        base_kwargs["greater_is_better"] = True
else:
    print("[TrainingArguments] Old transformers version: no evaluation_strategy argument. Using simple setup.")

# Disable external experiment trackers in every code path.
if "report_to" in ta_params:
    base_kwargs["report_to"] = "none"

# Final safety net: drop anything this transformers version does not accept.
filtered_kwargs = {}
for k, v in base_kwargs.items():
    if k in ta_params:
        filtered_kwargs[k] = v
    else:
        print(f"[TrainingArguments] Skipping unsupported arg: {k}={v}")

training_args = TrainingArguments(**filtered_kwargs)
|
|
| |
| |
| |
# Wire model, data, tokenizer and metrics into the Trainer.
# NOTE(review): `tokenizer=` is deprecated in favour of `processing_class=`
# in newer transformers but is still accepted — confirm before upgrading.
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=eval_dataset,
    tokenizer=tokenizer,
    compute_metrics=compute_metrics,
)
|
|
| |
| |
| |
if __name__ == "__main__":
    # Fine-tune; with load_best_model_at_end enabled, the best checkpoint
    # (by f1_weighted) is restored after training.
    print("Starting telecom classification training...")
    trainer.train()

    # Final evaluation on the held-out 10% split.
    print("Evaluating on validation split...")
    metrics = trainer.evaluate()
    print("Validation metrics:", metrics)

    # Persist model weights, config (with label maps) and tokenizer files.
    print("Saving final model & tokenizer...")
    trainer.save_model(OUTPUT_DIR)
    tokenizer.save_pretrained(OUTPUT_DIR)

    print(f"Label mappings saved in config:")
    print(f"  ID to Label: {ID2LABEL}")
    print(f"  Label to ID: {LABEL2ID}")

    # Smoke-test inference on a few Arabic comments to confirm the
    # end-to-end tokenize -> predict -> decode path works.
    example_texts = [
        "الخدمة ممتازة جدا وسريعة",
        "سيء للغاية ولا يستجيبون",
        "متوسط الجودة"
    ]

    inputs = tokenizer(
        example_texts,
        return_tensors="pt",
        padding=True,
        truncation=True,
        max_length=MAX_LENGTH
    ).to(model.device)

    # Inference only — no gradients needed.
    with torch.no_grad():
        outputs = model(**inputs)

    logits = outputs.logits.cpu().numpy()
    predictions = np.argmax(logits, axis=-1)

    print("\nSanity-check predictions:")
    for text, pred_idx in zip(example_texts, predictions):
        # Map the model index (0-8) back to the original class id (1-9).
        pred_class = ID2LABEL[pred_idx]
        print(f"Text: {text}")
        print(f"  -> Predicted Class: {pred_class}")
        print()

    print("Training complete and model saved to:", OUTPUT_DIR)