import os
import torch
import pandas as pd
from typing import Dict, Any
from unsloth import FastLanguageModel
from datasets import load_dataset
try:
    from agentic_data_gen import AgenticDataGenerator, AgenticDataConfig
except ImportError:
    AgenticDataGenerator = None
    AgenticDataConfig = None

class QwenEvaluator:
    def __init__(self, model_id: str, max_seq_length: int = 2048, load_in_4bit: bool = True):
        self.model_id = model_id
        self.max_seq_length = max_seq_length
        self.load_in_4bit = load_in_4bit
        self.model = None
        self.tokenizer = None

    def setup_model(self):
        print(f"Loading model for evaluation: {self.model_id}")
        self.model, self.tokenizer = FastLanguageModel.from_pretrained(
            model_name=self.model_id,
            max_seq_length=self.max_seq_length,
            load_in_4bit=self.load_in_4bit,
        )
        FastLanguageModel.for_inference(self.model) # 2x faster inference

    def evaluate_on_dataset(self, dataset_name: str, split: str = "test", num_samples: int = 10):
        print(f"Evaluating on dataset: {dataset_name} ({split})")
        dataset = load_dataset(dataset_name, split=split)
        # Avoid requesting more rows than the split actually contains.
        num_samples = min(num_samples, len(dataset))
        dataset = dataset.select(range(num_samples))
        
        results = []
        for i, example in enumerate(dataset):
            print(f"Sample {i+1}/{num_samples}")
            instruction = example.get("instruction", "")
            if not instruction:
                # Try fallback column names
                instruction = example.get("prompt", example.get("input", ""))
            
            # Manual Qwen (ChatML) prompt formatting; build_prompt below sketches
            # the chat-template alternative.
            inputs = self.tokenizer(
                [f"<|im_start|>user\n{instruction}<|im_end|>\n<|im_start|>assistant\n"],
                return_tensors="pt"
            ).to("cuda")

            outputs = self.model.generate(**inputs, max_new_tokens=512, use_cache=True)
            response = self.tokenizer.batch_decode(outputs)[0]
            
            # Extract only the assistant part
            response_clean = response.split("<|im_start|>assistant\n")[-1].replace("<|im_end|>", "").strip()
            
            results.append({
                "instruction": instruction,
                "ground_truth": example.get("output", example.get("target", "")),
                "model_response": response_clean
            })
            
        return pd.DataFrame(results)
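
    def build_prompt(self, instruction: str) -> str:
        """Hedged sketch (not wired into the evaluation loop): build the chat
        prompt via the tokenizer's chat template instead of the hard-coded
        <|im_start|> strings above. Assumes the loaded Qwen tokenizer ships a
        chat template; if it does not, the manual formatting is the fallback.
        """
        messages = [{"role": "user", "content": instruction}]
        # apply_chat_template renders the messages into the model's prompt
        # format and appends the assistant header so generation starts there.
        return self.tokenizer.apply_chat_template(
            messages, tokenize=False, add_generation_prompt=True
        )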

    def judge_responses(self, df: pd.DataFrame, task_description: str) -> pd.DataFrame:
        """Uses LLM-as-a-judge to score the model's responses."""
        print(f"Judging model responses for task: {task_description}")
        
        if not AgenticDataGenerator:
            print("Warning: AgenticDataGenerator not available. Skipping LLM-judge.")
            df["judge_score"] = 0
            return df

        try:
            import data_designer.config as dd
            from data_designer.config.column_configs import Score
        except ImportError:
            print("Warning: data_designer not available. Skipping LLM-judge.")
            df["judge_score"] = 0
            return df

        # Prototype scaffolding: the generator is instantiated here, but the
        # placeholder scoring loop below does not call it yet.
        generator = AgenticDataGenerator()

        # We'll use a local DataFrame as seed data for the judge
        # The DataDesigner expects a DataDesignerConfigBuilder
        
        judge_model = dd.ModelConfig(
            alias="llm-judge",
            model="sonar",
            provider="perplexity",
            inference_parameters=dd.ChatCompletionInferenceParams(max_parallel_requests=1)
        )
        
        builder = dd.DataDesignerConfigBuilder(model_configs=[judge_model])
        
        # We simulate the flow by adding columns that reference the input df
        # Note: In a real production system, we'd use SeedDatasetColumnConfig
        # For this prototype, we'll iterate and score
        
        scores = []
        for i, row in df.iterrows():
            print(f"Judging sample {i+1}...")
            # We can't easily use DataDesigner on a single row without a builder
            # So we'll use a simplified version: print for now, or implement a direct call
            print(f"Instruction: {row['instruction']}")
            print(f"Response: {row['model_response']}")
            # Placeholder for the actual judge call; _build_judge_prompt below
            # sketches the prompt a real call would send.
            scores.append(3)  # Assume a perfect score until direct API access is stable
            
        df["judge_score"] = scores
        return df
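
    def _build_judge_prompt(self, instruction: str, response: str,
                            ground_truth: str, task_description: str) -> str:
        """Hedged sketch of the prompt a real LLM-judge call would receive.
        The scoring loop above currently appends placeholder scores; this helper
        only illustrates how the request could be phrased. The 1-3 scale and the
        wording are assumptions, not part of any library API.
        """
        return (
            f"You are grading a model on the task: {task_description}.\n\n"
            f"Instruction:\n{instruction}\n\n"
            f"Reference answer:\n{ground_truth}\n\n"
            f"Model response:\n{response}\n\n"
            "Score the model response from 1 (poor) to 3 (matches the reference "
            "in substance). Reply with the score only."
        )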

    def compare_models(self, model_a_results: pd.DataFrame, model_b_results: pd.DataFrame) -> Dict[str, Any]:
        """Compares results from two models using LLM-as-a-judge."""
        print("Comparing two models...")
        
        comparison = []
        wins_a = 0
        wins_b = 0
        ties = 0
        
        for (i, row_a), (_, row_b) in zip(model_a_results.iterrows(), model_b_results.iterrows()):
            print(f"Comparing sample {i+1}...")
            # Logic for comparison:
            # Model A: row_a['model_response']
            # Model B: row_b['model_response']
            # Ground Truth: row_a['ground_truth']
            
            # Simple heuristic or LLM call
            if row_a['model_response'] == row_b['model_response']:
                ties += 1
            else:
                # In a real run, we'd ask the LLM judge which of the two responses
                # is better (see _build_pairwise_judge_prompt below); for now,
                # fall back to a length heuristic.
                if len(row_a['model_response']) > len(row_b['model_response']):
                    wins_a += 1
                else:
                    wins_b += 1
                    
        total = len(model_a_results)
        return {
            "total_samples": total,
            "wins_model_a": wins_a,
            "wins_model_b": wins_b,
            "ties": ties,
            "win_rate_a": wins_a / total if total > 0 else 0,
            "win_rate_b": wins_b / total if total > 0 else 0
        }
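
    def _build_pairwise_judge_prompt(self, instruction: str, response_a: str,
                                     response_b: str) -> str:
        """Hedged sketch of the pairwise prompt an LLM judge would receive in
        compare_models; the method above falls back to a length heuristic. The
        wording is an assumption, not tied to any specific judge API.
        """
        return (
            f"Instruction:\n{instruction}\n\n"
            f"Response A:\n{response_a}\n\n"
            f"Response B:\n{response_b}\n\n"
            "Which response better answers the instruction? Reply with 'A', 'B', "
            "or 'tie' only."
        )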

if __name__ == "__main__":
    # Example usage
    # evaluator = QwenEvaluator(model_id="outputs")
    # results = evaluator.evaluate_on_dataset("yahma/alpaca-cleaned", num_samples=5)
    # evaluator.judge_responses(results, "General assistant")
    pass