File size: 4,341 Bytes
1d70196
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
import pandas as pd
import torch
import numpy as np
import argparse
from transformers import RobertaTokenizer, RobertaForSequenceClassification
from sklearn.metrics import classification_report, accuracy_score, f1_score
from torch.utils.data import DataLoader, Dataset
from tqdm import tqdm
import os

class SentimentDataset(Dataset):
    """Torch dataset pairing a review text with one aspect term.

    Each item is tokenized as a (text, aspect) sentence pair so the model
    sees the aspect as the second segment. Yields dicts with 'input_ids',
    'attention_mask', and 'labels' tensors sized for padded max_len.
    """

    def __init__(self, texts, aspects, labels, tokenizer, max_len=128):
        self.texts = texts
        self.aspects = aspects
        self.labels = labels
        self.tokenizer = tokenizer
        self.max_len = max_len

    def __len__(self):
        # One example per (text, aspect, label) triple.
        return len(self.texts)

    def __getitem__(self, idx):
        # str() guards against non-string cells (e.g. NaN read from CSV).
        encoded = self.tokenizer.encode_plus(
            str(self.texts[idx]),
            str(self.aspects[idx]),
            add_special_tokens=True,
            max_length=self.max_len,
            return_token_type_ids=False,
            padding='max_length',
            truncation=True,
            return_attention_mask=True,
            return_tensors='pt',
        )

        # encode_plus with return_tensors='pt' yields (1, max_len) tensors;
        # drop the batch dimension so DataLoader can stack items itself.
        sample = {
            'input_ids': encoded['input_ids'].squeeze(0),
            'attention_mask': encoded['attention_mask'].squeeze(0),
        }
        sample['labels'] = torch.tensor(self.labels[idx], dtype=torch.long)
        return sample


def main():
    """Evaluate a fine-tuned RoBERTa ABSA classifier on a held-out test CSV.

    Loads the model/tokenizer from --model_path, runs batched inference over
    --test_file, and prints overall accuracy, macro-F1, a per-class report,
    and per-aspect accuracy. Exits early with a message if either path is
    missing. Expects the CSV to have 'text', 'aspect', and 'sentiment'
    columns (sentiment in {positive, negative, neutral, conflict}).
    """
    parser = argparse.ArgumentParser()
    parser.add_argument('--model_path', type=str, default='models/absa-roberta-final', help='Path to your fine-tuned model')
    parser.add_argument('--test_file', type=str, default='data/processed/test.csv', help='Path to test CSV')
    # New, backward-compatible knobs (defaults match the old hard-coded values).
    parser.add_argument('--batch_size', type=int, default=32, help='Inference batch size')
    parser.add_argument('--max_len', type=int, default=128, help='Max tokenized sequence length')
    args = parser.parse_args()

    if not os.path.exists(args.test_file):
        print(f"Error: {args.test_file} not found. Ensure you ran data_processing.py and downloaded the data.")
        return

    if not os.path.exists(args.model_path):
        print(f"Error: Model not found at {args.model_path}.")
        print("Please train the model on Colab using train_absa.ipynb, download the 'absa-roberta-final' folder, and place it in 'models/'.")
        return

    print(f"Loading test data from {args.test_file}...")
    df = pd.read_csv(args.test_file)

    label_mapping = {'positive': 0, 'negative': 1, 'neutral': 2, 'conflict': 3}
    df['label'] = df['sentiment'].map(label_mapping)
    # Fix: an unrecognized sentiment string used to map to NaN and crash much
    # later inside torch.tensor(..., dtype=torch.long). Drop such rows up
    # front with an explicit warning instead.
    bad_rows = df['label'].isna()
    if bad_rows.any():
        print(f"Warning: dropping {int(bad_rows.sum())} rows with unknown sentiment values.")
        df = df.loc[~bad_rows].reset_index(drop=True)
    df['label'] = df['label'].astype(int)

    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
    print(f"Using device: {device}")

    print(f"Loading model and tokenizer from {args.model_path}...")
    tokenizer = RobertaTokenizer.from_pretrained(args.model_path)
    model = RobertaForSequenceClassification.from_pretrained(args.model_path)
    model = model.to(device)
    model.eval()  # disable dropout for deterministic inference

    test_dataset = SentimentDataset(
        texts=df['text'].to_numpy(),
        aspects=df['aspect'].to_numpy(),
        labels=df['label'].to_numpy(),
        tokenizer=tokenizer,
        max_len=args.max_len,
    )

    # shuffle=False keeps predictions aligned with df row order, which the
    # per-aspect breakdown below relies on.
    test_loader = DataLoader(test_dataset, batch_size=args.batch_size, shuffle=False)

    all_preds = []
    all_labels = []

    print("Evaluating...")
    with torch.no_grad():
        for batch in tqdm(test_loader):
            input_ids = batch['input_ids'].to(device)
            attention_mask = batch['attention_mask'].to(device)
            labels = batch['labels'].to(device)

            outputs = model(input_ids=input_ids, attention_mask=attention_mask)
            _, preds = torch.max(outputs.logits, dim=1)

            all_preds.extend(preds.cpu().tolist())
            all_labels.extend(labels.cpu().tolist())

    target_names = ['positive', 'negative', 'neutral', 'conflict']

    print("\n--- Model Evaluation Results ---")
    print(f"Accuracy:  {accuracy_score(all_labels, all_preds):.4f}")
    print(f"Macro F1:  {f1_score(all_labels, all_preds, average='macro'):.4f}\n")

    print("Classification Report:")
    # Fix: pass labels= explicitly so the report stays aligned with
    # target_names even when a class is absent from the test set (otherwise
    # classification_report raises a length-mismatch error).
    print(classification_report(
        all_labels, all_preds,
        labels=list(label_mapping.values()),
        target_names=target_names,
        zero_division=0,
    ))

    print("--- Per-Aspect Accuracy ---")
    df['prediction'] = all_preds
    for aspect in df['aspect'].unique():
        mask = df['aspect'] == aspect
        # .loc avoids chained indexing and reads cleaner than df[mask][col].
        aspect_acc = accuracy_score(df.loc[mask, 'label'], df.loc[mask, 'prediction'])
        print(f"{str(aspect).ljust(15)}: {aspect_acc:.2%}")

# Script entry point: run the evaluation only when executed directly,
# not when this module is imported.
if __name__ == "__main__":
    main()