"""

Advanced Evaluation and Benchmarking Utilities

For comprehensive model assessment and comparison

"""

import os
import json
import time
import math
import torch
import torch.nn.functional as F
from typing import Dict, List, Optional, Tuple, Any
from dataclasses import dataclass
import numpy as np

try:
    from rouge_score import rouge_scorer
    ROUGE_AVAILABLE = True
except ImportError:
    ROUGE_AVAILABLE = False

try:
    import sacrebleu
    BLEU_AVAILABLE = True
except ImportError:
    BLEU_AVAILABLE = False


@dataclass
class EvaluationConfig:
    """Configuration for evaluation"""
    max_eval_samples: int = 1000
    batch_size: int = 8
    max_new_tokens: int = 512
    temperature: float = 0.7
    top_k: int = 50
    top_p: float = 0.9
    do_sample: bool = True
    num_beams: int = 1
    repetition_penalty: float = 1.0
    length_penalty: float = 1.0


class PerplexityEvaluator:
    """Evaluate model perplexity on various datasets"""
    
    def __init__(self, model, tokenizer, device):
        self.model = model
        self.tokenizer = tokenizer
        self.device = device
    
    @torch.no_grad()
    def evaluate_perplexity(self, texts: List[str], max_length: int = 2048) -> Dict[str, float]:
        """Calculate perplexity on a list of texts"""
        self.model.eval()
        
        total_loss = 0.0
        total_tokens = 0
        
        for text in texts:
            tokens = self.tokenizer.encode(text)
            if len(tokens) < 2:
                continue
                
            # Split into chunks if too long
            chunks = [tokens[i:i+max_length] for i in range(0, len(tokens), max_length)]
            
            for chunk in chunks:
                if len(chunk) < 2:
                    continue
                    
                input_ids = torch.tensor([chunk[:-1]], device=self.device)
                labels = torch.tensor([chunk[1:]], device=self.device)
                
                outputs = self.model(input_ids=input_ids, labels=labels)
                loss = outputs['loss']
                
                total_loss += loss.item() * len(chunk[1:])
                total_tokens += len(chunk[1:])
        
        if total_tokens == 0:
            return {'perplexity': float('inf'), 'loss': float('inf')}
        
        avg_loss = total_loss / total_tokens
        # Perplexity = exp(mean token loss); cap the exponent to avoid overflow
        perplexity = math.exp(min(avg_loss, 20))
        
        return {
            'perplexity': perplexity,
            'loss': avg_loss,
            'total_tokens': total_tokens
        }
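
# Example usage (hypothetical model/tokenizer: any model returning {'loss': ...}
# and a tokenizer exposing .encode/.decode will do):
#   evaluator = PerplexityEvaluator(model, tokenizer, device='cuda')
#   stats = evaluator.evaluate_perplexity(['Some held-out text ...'])
#   print(stats['perplexity'])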


class GenerationEvaluator:
    """Evaluate text generation quality"""
    
    def __init__(self, model, tokenizer, device):
        self.model = model
        self.tokenizer = tokenizer
        self.device = device
    
    @torch.no_grad()
    def generate_text(self, prompt: str, config: EvaluationConfig) -> str:
        """Generate text from a prompt"""
        self.model.eval()
        
        # Encode prompt
        input_ids = torch.tensor([self.tokenizer.encode(prompt)], device=self.device)
        
        # Generate
        generated = input_ids.clone()
        
        for _ in range(config.max_new_tokens):
            # Full forward pass each step (no KV cache: simple but quadratic)
            outputs = self.model(input_ids=generated, use_cache=False)
            logits = outputs['logits']
            
            # Get next-token logits; apply temperature only when it is positive
            # (greedy decoding may pass temperature=0.0, which must not divide)
            next_token_logits = logits[0, -1, :]
            if config.temperature > 0:
                next_token_logits = next_token_logits / config.temperature
            
            # Apply top-k filtering (clamp k to the vocabulary size)
            if config.top_k > 0:
                top_k = min(config.top_k, next_token_logits.size(-1))
                indices_to_remove = next_token_logits < torch.topk(next_token_logits, top_k)[0][..., -1, None]
                next_token_logits[indices_to_remove] = float('-inf')
            
            # Apply top-p filtering
            if config.top_p < 1.0:
                sorted_logits, sorted_indices = torch.sort(next_token_logits, descending=True)
                cumulative_probs = torch.cumsum(F.softmax(sorted_logits, dim=-1), dim=-1)
                
                sorted_indices_to_remove = cumulative_probs > config.top_p
                sorted_indices_to_remove[1:] = sorted_indices_to_remove[:-1].clone()
                sorted_indices_to_remove[0] = 0
                
                indices_to_remove = sorted_indices[sorted_indices_to_remove]
                next_token_logits[indices_to_remove] = float('-inf')
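                # Worked example (hypothetical numbers): with top_p=0.9 and sorted
                # probabilities [0.5, 0.3, 0.15, 0.05], the cumulative sums are
                # [0.5, 0.8, 0.95, 1.0]; after the right-shift only the 0.05 token
                # is dropped, keeping the smallest set with cumulative mass >= 0.9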
            
            # Sample next token
            if config.do_sample:
                probs = F.softmax(next_token_logits, dim=-1)
                next_token = torch.multinomial(probs, num_samples=1)
            else:
                next_token = torch.argmax(next_token_logits, dim=-1, keepdim=True)
            
            # Append to sequence
            generated = torch.cat([generated, next_token.unsqueeze(0)], dim=-1)
            
            # Stop at EOS if the tokenizer defines one; the loop bound already
            # enforces max_new_tokens (eos_token_id is assumed to be optional)
            eos_id = getattr(self.tokenizer, 'eos_token_id', None)
            if eos_id is not None and next_token.item() == eos_id:
                break
        
        # Decode only the newly generated tokens; slicing the decoded string by
        # len(prompt) breaks when encode/decode is not an exact round trip
        new_tokens = generated[0, input_ids.size(1):]
        return self.tokenizer.decode(new_tokens.cpu().tolist())
    
    def evaluate_generation_quality(
        self,
        prompts: List[str],
        references: Optional[List[str]] = None,
        config: Optional[EvaluationConfig] = None,
    ) -> Dict[str, Any]:
        """Evaluate generation quality with various metrics"""
        if config is None:
            config = EvaluationConfig()
        
        results = {
            'generations': [],
            'metrics': {}
        }
        
        # Generate responses
        for prompt in prompts:
            generation = self.generate_text(prompt, config)
            results['generations'].append({
                'prompt': prompt,
                'generation': generation
            })
        
        # Calculate metrics if references provided
        if references and len(references) == len(prompts):
            generations = [r['generation'] for r in results['generations']]
            
            # BLEU score
            if BLEU_AVAILABLE:
                bleu_scores = []
                for gen, ref in zip(generations, references):
                    bleu = sacrebleu.sentence_bleu(gen, [ref])
                    bleu_scores.append(bleu.score)
                results['metrics']['bleu'] = np.mean(bleu_scores)
            
            # ROUGE scores
            if ROUGE_AVAILABLE:
                scorer = rouge_scorer.RougeScorer(['rouge1', 'rouge2', 'rougeL'], use_stemmer=True)
                rouge_scores = {'rouge1': [], 'rouge2': [], 'rougeL': []}
                
                for gen, ref in zip(generations, references):
                    scores = scorer.score(ref, gen)
                    for key in rouge_scores:
                        rouge_scores[key].append(scores[key].fmeasure)
                
                for key in rouge_scores:
                    results['metrics'][key] = np.mean(rouge_scores[key])
            
            # Length statistics
            gen_lengths = [len(gen.split()) for gen in generations]
            ref_lengths = [len(ref.split()) for ref in references]
            
            results['metrics']['avg_gen_length'] = np.mean(gen_lengths)
            results['metrics']['avg_ref_length'] = np.mean(ref_lengths)
            # Guard against empty references producing a zero division
            mean_ref = np.mean(ref_lengths)
            results['metrics']['length_ratio'] = (
                np.mean(gen_lengths) / mean_ref if mean_ref > 0 else float('inf')
            )
        
        return results
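
    # Example (hypothetical data): with references supplied and the optional
    # sacrebleu / rouge_score packages installed, BLEU and ROUGE are reported:
    #   out = evaluator.evaluate_generation_quality(
    #       prompts=['Summarize: ...'],
    #       references=['A short summary.'],
    #   )
    #   print(out['metrics'])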


class BenchmarkEvaluator:
    """Run standardized benchmarks"""
    
    def __init__(self, model, tokenizer, device):
        self.model = model
        self.tokenizer = tokenizer
        self.device = device
        self.perplexity_evaluator = PerplexityEvaluator(model, tokenizer, device)
        self.generation_evaluator = GenerationEvaluator(model, tokenizer, device)
    
    def run_hellaswag_eval(self, dataset_path: Optional[str] = None) -> Dict[str, float]:
        """Evaluate on HellaSwag dataset (common sense reasoning)"""
        # Simplified HellaSwag evaluation with a built-in toy example;
        # in practice, load the real dataset from dataset_path
        
        examples = [
            {
                "context": "A woman is outside with a bucket and a dog. The dog is running around trying to avoid a bath. She",
                "choices": [
                    "rinses the bucket off with a hose and fills it with soap.",
                    "uses a hose to keep filling the bucket with water.",
                    "gets the dog wet, then it runs away again.",
                    "gets into the bucket."
                ],
                "correct": 2
            },
            # Add more examples...
        ]
        
        correct = 0
        total = 0
        
        for example in examples:
            context = example["context"]
            choices = example["choices"]
            correct_idx = example["correct"]
            
            # Score each choice by the mean log-likelihood of context + choice.
            # (A faithful HellaSwag setup scores only the continuation tokens,
            # typically with length normalization; this is a simplification.)
            choice_scores = []
            for choice in choices:
                full_text = context + " " + choice
                tokens = self.tokenizer.encode(full_text)
                
                if len(tokens) < 2:
                    choice_scores.append(float('-inf'))
                    continue
                
                input_ids = torch.tensor([tokens[:-1]], device=self.device)
                labels = torch.tensor([tokens[1:]], device=self.device)
                
                with torch.no_grad():
                    outputs = self.model(input_ids=input_ids, labels=labels)
                    loss = outputs['loss']
                    choice_scores.append(-loss.item())
            
            # Check if highest scoring choice is correct
            predicted_idx = choice_scores.index(max(choice_scores))
            if predicted_idx == correct_idx:
                correct += 1
            total += 1
        
        accuracy = correct / total if total > 0 else 0.0
        return {"hellaswag_accuracy": accuracy}
    
    def run_lambada_eval(self, dataset_path: Optional[str] = None) -> Dict[str, float]:
        """Evaluate on LAMBADA dataset (predicting the final word of a passage)"""
        # Simplified LAMBADA evaluation with a built-in toy example;
        # in practice, load the real dataset from dataset_path
        examples = [
            {
                "text": "George Washington was the first President of the United States. He served from 1789 to 1797. Washington was born in",
                "target": "Virginia"
            },
            # Add more examples...
        ]
        
        correct = 0
        total = 0
        
        for example in examples:
            text = example["text"]
            target = example["target"]
            
            # Generate continuation
            config = EvaluationConfig(max_new_tokens=10, temperature=0.0, do_sample=False)
            generation = self.generation_evaluator.generate_text(text, config)
            
            # Loose proxy: the official LAMBADA metric requires exactly
            # predicting the final word; here a substring match counts
            if target.lower() in generation.lower():
                correct += 1
            total += 1
        
        accuracy = correct / total if total > 0 else 0.0
        return {"lambada_accuracy": accuracy}
    
    def run_comprehensive_eval(self) -> Dict[str, Any]:
        """Run comprehensive evaluation suite"""
        results = {}
        
        # Perplexity on sample texts
        sample_texts = [
            "The quick brown fox jumps over the lazy dog.",
            "Artificial intelligence is transforming the world in unprecedented ways.",
            "Climate change represents one of the most significant challenges of our time."
        ]
        
        perplexity_results = self.perplexity_evaluator.evaluate_perplexity(sample_texts)
        results.update(perplexity_results)
        
        # Common sense reasoning
        hellaswag_results = self.run_hellaswag_eval()
        results.update(hellaswag_results)
        
        # Reading comprehension
        lambada_results = self.run_lambada_eval()
        results.update(lambada_results)
        
        # Generation quality
        prompts = [
            "Explain the concept of machine learning in simple terms:",
            "Write a short story about a robot discovering emotions:",
            "Describe the benefits of renewable energy:"
        ]
        
        generation_results = self.generation_evaluator.evaluate_generation_quality(prompts)
        results['generation_examples'] = generation_results['generations']
        results.update(generation_results['metrics'])
        
        return results


def run_evaluation(model, tokenizer, device, output_dir: str = "eval_results"):
    """Run complete evaluation suite"""
    os.makedirs(output_dir, exist_ok=True)
    
    evaluator = BenchmarkEvaluator(model, tokenizer, device)
    
    print("Running comprehensive evaluation...")
    start_time = time.time()
    
    results = evaluator.run_comprehensive_eval()
    
    end_time = time.time()
    results['evaluation_time'] = end_time - start_time
    
    # Save results
    results_path = os.path.join(output_dir, "evaluation_results.json")
    with open(results_path, 'w') as f:
        json.dump(results, f, indent=2)
    
    print(f"Evaluation completed in {results['evaluation_time']:.2f} seconds")
    print(f"Results saved to {results_path}")
    
    # Print summary
    print("\n=== Evaluation Summary ===")
    if 'perplexity' in results:
        print(f"Perplexity: {results['perplexity']:.2f}")
    if 'hellaswag_accuracy' in results:
        print(f"HellaSwag Accuracy: {results['hellaswag_accuracy']:.3f}")
    if 'lambada_accuracy' in results:
        print(f"LAMBADA Accuracy: {results['lambada_accuracy']:.3f}")
    if 'bleu' in results:
        print(f"BLEU Score: {results['bleu']:.2f}")
    
    return results
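

if __name__ == "__main__":
    # Minimal smoke-test sketch (hypothetical wiring): build_model and
    # build_tokenizer stand in for this project's own constructors.
    #   device = 'cuda' if torch.cuda.is_available() else 'cpu'
    #   model = build_model().to(device)
    #   tokenizer = build_tokenizer()
    #   run_evaluation(model, tokenizer, device)
    pass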