File size: 9,440 Bytes

f8dd4fe

#!/usr/bin/env python
# -*- coding: utf-8 -*-

"""
Inference script for CPT+finetuned MARBERTv2 telecom classification model.

Loads the model from ./telecom_marbertv2_cpt_ft and runs predictions on test.csv
"""

import os
import numpy as np
import pandas as pd
import torch

from transformers import (
    AutoTokenizer,
    AutoModelForSequenceClassification,
    AutoConfig,
)
from sklearn.metrics import (
    accuracy_score,
    f1_score,
    precision_recall_fscore_support,
    classification_report,
    confusion_matrix,
)

# -------------------------------------------------------------------
# 1. Paths & config
# -------------------------------------------------------------------
TEST_FILE = "/home/houssam-nojoom/.cache/huggingface/hub/datasets--houssamboukhalfa--telecom-ch1/snapshots/be06acac69aa411636dbe0e3bef5f0072e670765/test_file.csv"
MODEL_DIR = "./telecom_camelbert_cpt_ft"
OUTPUT_FILE = "./test_predictions_camelbert_cpt_ft.csv"

MAX_LENGTH = 256
BATCH_SIZE = 64

device = "cuda" if torch.cuda.is_available() else "cpu"
print(f"Using device: {device}")

# -------------------------------------------------------------------
# 2. Load test data
# -------------------------------------------------------------------
print(f"Loading test data from: {TEST_FILE}")
test_df = pd.read_csv(TEST_FILE)
print(f"Test samples: {len(test_df)}")
print(f"Columns: {test_df.columns.tolist()}")

# Check if test data has labels
has_labels = "Class" in test_df.columns
if has_labels:
    print("Test data contains labels - will compute metrics")
else:
    print("Test data has no labels - will only generate predictions")

# -------------------------------------------------------------------
# 3. Load model and tokenizer
# -------------------------------------------------------------------
print(f"\nLoading model from: {MODEL_DIR}")
tokenizer = AutoTokenizer.from_pretrained(MODEL_DIR)

# Load config to get label mappings
config_path = os.path.join(MODEL_DIR, "config.json")
if os.path.exists(config_path):
    import json
    with open(config_path, 'r') as f:
        config_data = json.load(f)
    
    if 'id2label' in config_data:
        id2label = {int(k): int(v) for k, v in config_data['id2label'].items()}
        # Create label2id with both string and int keys for robustness
        label2id = {}
        for k, v in id2label.items():
            label2id[v] = k  # int key -> int value
            label2id[str(v)] = k  # string key -> int value
        num_labels = len(id2label)
    else:
        # Fallback: infer from test data if available
        if has_labels:
            unique_classes = sorted(test_df["Class"].unique())
            label2id = {label: idx for idx, label in enumerate(unique_classes)}
            id2label = {idx: label for label, idx in label2id.items()}
            num_labels = len(unique_classes)
        else:
            raise ValueError("Cannot determine number of labels without config or test labels")
else:
    # Fallback: infer from test data if available
    if has_labels:
        unique_classes = sorted(test_df["Class"].unique())
        label2id = {label: idx for idx, label in enumerate(unique_classes)}
        id2label = {idx: label for label, idx in label2id.items()}
        num_labels = len(unique_classes)
    else:
        raise ValueError("Cannot find config.json and test data has no labels")

print(f"Number of classes: {num_labels}")
print(f"Label mapping: {id2label}")

# Load model
model = AutoModelForSequenceClassification.from_pretrained(MODEL_DIR)
model = model.to(device)
model.eval()
print("Model loaded successfully!")

# -------------------------------------------------------------------
# 4. Run inference
# -------------------------------------------------------------------
print("\nRunning inference...")

all_predictions = []
all_probabilities = []

# Process in batches for efficiency
for i in range(0, len(test_df), BATCH_SIZE):
    batch_texts = test_df["Commentaire client"].iloc[i:i+BATCH_SIZE].tolist()
    
    # Tokenize
    inputs = tokenizer(
        batch_texts,
        padding=True,
        truncation=True,
        max_length=MAX_LENGTH,
        return_tensors="pt",
    ).to(device)
    
    # Predict
    with torch.no_grad():
        outputs = model(**inputs)
        logits = outputs.logits
        probs = torch.softmax(logits, dim=-1)
        predictions = torch.argmax(logits, dim=-1)
    
    all_predictions.extend(predictions.cpu().numpy())
    all_probabilities.extend(probs.cpu().numpy())
    
    if (i // BATCH_SIZE + 1) % 10 == 0:
        print(f"Processed {i + len(batch_texts)}/{len(test_df)} samples...")

print(f"Inference complete! Processed {len(all_predictions)} samples")

# -------------------------------------------------------------------
# 5. Save predictions
# -------------------------------------------------------------------
# Convert predictions to class names (1-9)
predicted_classes = [id2label[pred] for pred in all_predictions]

# Add predictions to dataframe
test_df["Predicted_Class"] = predicted_classes
test_df["Predicted_Label_ID"] = all_predictions

# Add probability for each class
for idx, class_name in id2label.items():
    test_df[f"Prob_{class_name}"] = [probs[idx] for probs in all_probabilities]

# Add confidence (max probability)
test_df["Confidence"] = [max(probs) for probs in all_probabilities]

# Save results
test_df.to_csv(OUTPUT_FILE, index=False)
print(f"\nPredictions saved to: {OUTPUT_FILE}")

# -------------------------------------------------------------------
# 6. Compute metrics (if labels available)
# -------------------------------------------------------------------
if has_labels:
    print("\n" + "="*80)
    print("EVALUATION METRICS")
    print("="*80)
    
    # Convert true labels to indices
    true_labels = test_df["Class"].map(label2id).values
    pred_labels = np.array(all_predictions)
    
    # Overall metrics
    accuracy = accuracy_score(true_labels, pred_labels)
    print(f"\nAccuracy: {accuracy:.4f}")
    
    # Weighted metrics (accounts for class imbalance)
    precision_w, recall_w, f1_w, _ = precision_recall_fscore_support(
        true_labels, pred_labels, average='weighted', zero_division=0
    )
    print(f"\nWeighted Metrics:")
    print(f"  Precision: {precision_w:.4f}")
    print(f"  Recall:    {recall_w:.4f}")
    print(f"  F1 Score:  {f1_w:.4f}")
    
    # Macro metrics (treats all classes equally)
    precision_m, recall_m, f1_m, _ = precision_recall_fscore_support(
        true_labels, pred_labels, average='macro', zero_division=0
    )
    print(f"\nMacro Metrics:")
    print(f"  Precision: {precision_m:.4f}")
    print(f"  Recall:    {recall_m:.4f}")
    print(f"  F1 Score:  {f1_m:.4f}")
    
    # Per-class metrics
    print(f"\nPer-Class Metrics:")
    per_class_f1 = f1_score(true_labels, pred_labels, average=None, zero_division=0)
    per_class_precision, per_class_recall, _, support = precision_recall_fscore_support(
        true_labels, pred_labels, average=None, zero_division=0
    )
    
    for idx in range(num_labels):
        class_name = id2label[idx]
        print(f"\n  Class {class_name}:")
        print(f"    Precision: {per_class_precision[idx]:.4f}")
        print(f"    Recall:    {per_class_recall[idx]:.4f}")
        print(f"    F1 Score:  {per_class_f1[idx]:.4f}")
        print(f"    Support:   {int(support[idx])}")
    
    # Classification report
    print("\n" + "="*80)
    print("DETAILED CLASSIFICATION REPORT")
    print("="*80)
    target_names = [str(id2label[i]) for i in range(num_labels)]
    print(classification_report(true_labels, pred_labels, target_names=target_names, zero_division=0))
    
    # Confusion matrix
    print("\n" + "="*80)
    print("CONFUSION MATRIX")
    print("="*80)
    cm = confusion_matrix(true_labels, pred_labels)
    
    # Print confusion matrix with labels
    print("\nTrue \\ Predicted", end="")
    for i in range(num_labels):
        print(f"\t{id2label[i]}", end="")
    print()
    
    for i in range(num_labels):
        print(f"{id2label[i]:<15}", end="")
        for j in range(num_labels):
            print(f"\t{cm[i][j]}", end="")
        print()
    
    # Save confusion matrix to CSV
    cm_df = pd.DataFrame(
        cm,
        index=[str(id2label[i]) for i in range(num_labels)],
        columns=[str(id2label[i]) for i in range(num_labels)]
    )
    cm_df.to_csv("./confusion_matrix_marbertv2_cpt_ft.csv")
    print("\nConfusion matrix saved to: ./confusion_matrix_marbertv2_cpt_ft.csv")

# -------------------------------------------------------------------
# 7. Show sample predictions
# -------------------------------------------------------------------
print("\n" + "="*80)
print("SAMPLE PREDICTIONS (CPT+Finetuned MARBERTv2)")
print("="*80)

# Show first 5 predictions
num_samples = min(5, len(test_df))
for i in range(num_samples):
    print(f"\nSample {i+1}:")
    print(f"Text: {test_df['Commentaire client'].iloc[i]}")
    if has_labels:
        print(f"True Class: {test_df['Class'].iloc[i]}")
    print(f"Predicted Class: {predicted_classes[i]}")
    print(f"Confidence: {test_df['Confidence'].iloc[i]:.4f}")
    print(f"Probabilities:")
    for idx, class_name in id2label.items():
        print(f"  Class {class_name}: {all_probabilities[i][idx]:.4f}")

print("\n" + "="*80)
print("Inference completed successfully!")
print(f"Model used: CPT+Finetuned MARBERTv2 from {MODEL_DIR}")
print("="*80)