"""Evaluate a fine-tuned RoBERTa aspect-based sentiment (ABSA) model on a test CSV.

Expects a CSV with `text`, `aspect`, and `sentiment` columns, and a model
directory produced by the training notebook. Prints overall accuracy,
macro-F1, a per-class report, and per-aspect accuracy.
"""
import argparse
import os

import numpy as np
import pandas as pd
import torch
from sklearn.metrics import accuracy_score, classification_report, f1_score
from torch.utils.data import DataLoader, Dataset
from tqdm import tqdm
from transformers import RobertaForSequenceClassification, RobertaTokenizer


class SentimentDataset(Dataset):
    """Torch Dataset yielding tokenized (text, aspect) sentence pairs with labels.

    The text and aspect are encoded together as a sentence pair so the model
    can condition its sentiment prediction on the aspect term.
    """

    def __init__(self, texts, aspects, labels, tokenizer, max_len=128):
        self.texts = texts
        self.aspects = aspects
        self.labels = labels
        self.tokenizer = tokenizer
        self.max_len = max_len

    def __len__(self):
        return len(self.texts)

    def __getitem__(self, item):
        # str() guards against non-string values (e.g. NaN read from CSV).
        text = str(self.texts[item])
        aspect = str(self.aspects[item])
        label = self.labels[item]

        encoding = self.tokenizer.encode_plus(
            text,
            aspect,  # encoded as the second segment of the sentence pair
            add_special_tokens=True,
            max_length=self.max_len,
            return_token_type_ids=False,  # RoBERTa does not use token type ids
            padding='max_length',
            truncation=True,
            return_attention_mask=True,
            return_tensors='pt',
        )
        return {
            # encode_plus returns shape (1, max_len); flatten to (max_len,)
            # so the DataLoader can batch correctly.
            'input_ids': encoding['input_ids'].flatten(),
            'attention_mask': encoding['attention_mask'].flatten(),
            'labels': torch.tensor(label, dtype=torch.long),
        }


def main():
    parser = argparse.ArgumentParser()
    parser.add_argument('--model_path', type=str, default='models/absa-roberta-final',
                        help='Path to your fine-tuned model')
    parser.add_argument('--test_file', type=str, default='data/processed/test.csv',
                        help='Path to test CSV')
    args = parser.parse_args()

    if not os.path.exists(args.test_file):
        # FIX: this message was a broken (unterminated) string literal that
        # spanned a physical line break; rejoined into a valid print call.
        print(f"Error: {args.test_file} not found. "
              "Ensure you ran data_processing.py and downloaded the data.")
        return
    if not os.path.exists(args.model_path):
        print(f"Error: Model not found at {args.model_path}.")
        print("Please train the model on Colab using train_absa.ipynb, "
              "download the 'absa-roberta-final' folder, and place it in 'models/'.")
        return

    print(f"Loading test data from {args.test_file}...")
    df = pd.read_csv(args.test_file)
    label_mapping = {'positive': 0, 'negative': 1, 'neutral': 2, 'conflict': 3}
    df['label'] = df['sentiment'].map(label_mapping)

    # Rows whose sentiment is outside the 4-class mapping become NaN and would
    # crash torch.tensor(..., dtype=torch.long); drop them with a warning.
    n_unmapped = int(df['label'].isna().sum())
    if n_unmapped:
        print(f"Warning: dropping {n_unmapped} rows with unrecognized sentiment labels.")
        df = df.dropna(subset=['label']).reset_index(drop=True)
    df['label'] = df['label'].astype(int)

    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
    print(f"Using device: {device}")

    print(f"Loading model and tokenizer from {args.model_path}...")
    tokenizer = RobertaTokenizer.from_pretrained(args.model_path)
    model = RobertaForSequenceClassification.from_pretrained(args.model_path)
    model = model.to(device)
    model.eval()

    test_dataset = SentimentDataset(
        texts=df['text'].to_numpy(),
        aspects=df['aspect'].to_numpy(),
        labels=df['label'].to_numpy(),
        tokenizer=tokenizer,
    )
    # shuffle=False keeps predictions aligned with df row order for the
    # per-aspect breakdown below.
    test_loader = DataLoader(test_dataset, batch_size=32, shuffle=False)

    all_preds = []
    all_labels = []

    print("Evaluating...")
    with torch.no_grad():
        for batch in tqdm(test_loader):
            input_ids = batch['input_ids'].to(device)
            attention_mask = batch['attention_mask'].to(device)
            labels = batch['labels'].to(device)

            outputs = model(input_ids=input_ids, attention_mask=attention_mask)
            # argmax over class logits -> predicted class index per example
            _, preds = torch.max(outputs.logits, dim=1)

            all_preds.extend(preds.cpu().tolist())
            all_labels.extend(labels.cpu().tolist())

    target_names = ['positive', 'negative', 'neutral', 'conflict']
    print("\n--- Model Evaluation Results ---")
    print(f"Accuracy: {accuracy_score(all_labels, all_preds):.4f}")
    print(f"Macro F1: {f1_score(all_labels, all_preds, average='macro'):.4f}\n")
    print("Classification Report:")
    print(classification_report(all_labels, all_preds,
                                target_names=target_names, zero_division=0))

    print("--- Per-Aspect Accuracy ---")
    df['prediction'] = all_preds
    for aspect in df['aspect'].unique():
        # FIX: this assignment was split across a physical line break in the
        # original source (syntax error); rejoined.
        mask = df['aspect'] == aspect
        aspect_acc = accuracy_score(df.loc[mask, 'label'], df.loc[mask, 'prediction'])
        print(f"{aspect.ljust(15)}: {aspect_acc:.2%}")


if __name__ == "__main__":
    main()