"""Evaluate a fine-tuned RoBERTa aspect-based sentiment (ABSA) model on a test CSV.

Expects a CSV with `text`, `aspect`, and `sentiment` columns, and a model
directory produced by the training notebook. Prints overall accuracy,
macro-F1, a per-class report, and per-aspect accuracy.
"""
import argparse
import os

import numpy as np
import pandas as pd
import torch
from sklearn.metrics import accuracy_score, classification_report, f1_score
from torch.utils.data import DataLoader, Dataset
from tqdm import tqdm
from transformers import RobertaForSequenceClassification, RobertaTokenizer


class SentimentDataset(Dataset):
    """Torch Dataset yielding tokenized (text, aspect) sentence pairs with labels.

    The text and aspect are encoded together as a sentence pair so the model
    can condition its sentiment prediction on the aspect term.
    """

    def __init__(self, texts, aspects, labels, tokenizer, max_len=128):
        self.texts = texts
        self.aspects = aspects
        self.labels = labels
        self.tokenizer = tokenizer
        self.max_len = max_len

    def __len__(self):
        return len(self.texts)

    def __getitem__(self, item):
        # str() guards against non-string values (e.g. NaN read from CSV).
        text = str(self.texts[item])
        aspect = str(self.aspects[item])
        label = self.labels[item]

        encoding = self.tokenizer.encode_plus(
            text,
            aspect,  # encoded as the second segment of the sentence pair
            add_special_tokens=True,
            max_length=self.max_len,
            return_token_type_ids=False,  # RoBERTa does not use token type ids
            padding='max_length',
            truncation=True,
            return_attention_mask=True,
            return_tensors='pt',
        )
        return {
            # encode_plus returns shape (1, max_len); flatten to (max_len,)
            # so the DataLoader can batch correctly.
            'input_ids': encoding['input_ids'].flatten(),
            'attention_mask': encoding['attention_mask'].flatten(),
            'labels': torch.tensor(label, dtype=torch.long),
        }


def main():
    parser = argparse.ArgumentParser()
    parser.add_argument('--model_path', type=str, default='models/absa-roberta-final',
                        help='Path to your fine-tuned model')
    parser.add_argument('--test_file', type=str, default='data/processed/test.csv',
                        help='Path to test CSV')
    args = parser.parse_args()

    if not os.path.exists(args.test_file):
        # FIX: this message was a broken (unterminated) string literal that
        # spanned a physical line break; rejoined into a valid print call.
        print(f"Error: {args.test_file} not found. "
              "Ensure you ran data_processing.py and downloaded the data.")
        return
    if not os.path.exists(args.model_path):
        print(f"Error: Model not found at {args.model_path}.")
        print("Please train the model on Colab using train_absa.ipynb, "
              "download the 'absa-roberta-final' folder, and place it in 'models/'.")
        return

    print(f"Loading test data from {args.test_file}...")
    df = pd.read_csv(args.test_file)
    label_mapping = {'positive': 0, 'negative': 1, 'neutral': 2, 'conflict': 3}
    df['label'] = df['sentiment'].map(label_mapping)

    # Rows whose sentiment is outside the 4-class mapping become NaN and would
    # crash torch.tensor(..., dtype=torch.long); drop them with a warning.
    n_unmapped = int(df['label'].isna().sum())
    if n_unmapped:
        print(f"Warning: dropping {n_unmapped} rows with unrecognized sentiment labels.")
        df = df.dropna(subset=['label']).reset_index(drop=True)
    df['label'] = df['label'].astype(int)

    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
    print(f"Using device: {device}")

    print(f"Loading model and tokenizer from {args.model_path}...")
    tokenizer = RobertaTokenizer.from_pretrained(args.model_path)
    model = RobertaForSequenceClassification.from_pretrained(args.model_path)
    model = model.to(device)
    model.eval()

    test_dataset = SentimentDataset(
        texts=df['text'].to_numpy(),
        aspects=df['aspect'].to_numpy(),
        labels=df['label'].to_numpy(),
        tokenizer=tokenizer,
    )
    # shuffle=False keeps predictions aligned with df row order for the
    # per-aspect breakdown below.
    test_loader = DataLoader(test_dataset, batch_size=32, shuffle=False)

    all_preds = []
    all_labels = []

    print("Evaluating...")
    with torch.no_grad():
        for batch in tqdm(test_loader):
            input_ids = batch['input_ids'].to(device)
            attention_mask = batch['attention_mask'].to(device)
            labels = batch['labels'].to(device)

            outputs = model(input_ids=input_ids, attention_mask=attention_mask)
            # argmax over class logits -> predicted class index per example
            _, preds = torch.max(outputs.logits, dim=1)

            all_preds.extend(preds.cpu().tolist())
            all_labels.extend(labels.cpu().tolist())

    target_names = ['positive', 'negative', 'neutral', 'conflict']
    print("\n--- Model Evaluation Results ---")
    print(f"Accuracy: {accuracy_score(all_labels, all_preds):.4f}")
    print(f"Macro F1: {f1_score(all_labels, all_preds, average='macro'):.4f}\n")
    print("Classification Report:")
    print(classification_report(all_labels, all_preds,
                                target_names=target_names, zero_division=0))

    print("--- Per-Aspect Accuracy ---")
    df['prediction'] = all_preds
    for aspect in df['aspect'].unique():
        # FIX: this assignment was split across a physical line break in the
        # original source (syntax error); rejoined.
        mask = df['aspect'] == aspect
        aspect_acc = accuracy_score(df.loc[mask, 'label'], df.loc[mask, 'prediction'])
        print(f"{aspect.ljust(15)}: {aspect_acc:.2%}")


if __name__ == "__main__":
    main()