import os
import torch
import pandas as pd
from typing import Dict, Any
from unsloth import FastLanguageModel
from datasets import load_dataset
try:
    from agentic_data_gen import AgenticDataGenerator, AgenticDataConfig
except ImportError:
    AgenticDataGenerator = None
    AgenticDataConfig = None

class QwenEvaluator:
    def __init__(self, model_id: str, max_seq_length: int = 2048, load_in_4bit: bool = True):
        self.model_id = model_id
        self.max_seq_length = max_seq_length
        self.load_in_4bit = load_in_4bit
        self.model = None
        self.tokenizer = None

    def setup_model(self):
        print(f"Loading model for evaluation: {self.model_id}")
        self.model, self.tokenizer = FastLanguageModel.from_pretrained(
            model_name=self.model_id,
            max_seq_length=self.max_seq_length,
            load_in_4bit=self.load_in_4bit,
        )
        FastLanguageModel.for_inference(self.model) # 2x faster inference

    def evaluate_on_dataset(self, dataset_name: str, split: str = "test", num_samples: int = 10):
        print(f"Evaluating on dataset: {dataset_name} ({split})")
        dataset = load_dataset(dataset_name, split=split)
        # Avoid requesting more rows than the split actually contains.
        num_samples = min(num_samples, len(dataset))
        dataset = dataset.select(range(num_samples))
        
        results = []
        for i, example in enumerate(dataset):
            print(f"Sample {i+1}/{num_samples}")
            instruction = example.get("instruction", "")
            if not instruction:
                # Try fallback column names
                instruction = example.get("prompt", example.get("input", ""))
            
            # Manual Qwen (ChatML) prompt formatting; build_prompt below sketches
            # the chat-template alternative.
            inputs = self.tokenizer(
                [f"<|im_start|>user\n{instruction}<|im_end|>\n<|im_start|>assistant\n"],
                return_tensors="pt"
            ).to("cuda")

            outputs = self.model.generate(**inputs, max_new_tokens=512, use_cache=True)
            response = self.tokenizer.batch_decode(outputs)[0]
            
            # Extract only the assistant part
            response_clean = response.split("<|im_start|>assistant\n")[-1].replace("<|im_end|>", "").strip()
            
            results.append({
                "instruction": instruction,
                "ground_truth": example.get("output", example.get("target", "")),
                "model_response": response_clean
            })
            
        return pd.DataFrame(results)
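
    def build_prompt(self, instruction: str) -> str:
        """Hedged sketch (not wired into the evaluation loop): build the chat
        prompt via the tokenizer's chat template instead of the hard-coded
        <|im_start|> strings above. Assumes the loaded Qwen tokenizer ships a
        chat template; if it does not, the manual formatting is the fallback.
        """
        messages = [{"role": "user", "content": instruction}]
        # apply_chat_template renders the messages into the model's prompt
        # format and appends the assistant header so generation starts there.
        return self.tokenizer.apply_chat_template(
            messages, tokenize=False, add_generation_prompt=True
        )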

    def judge_responses(self, df: pd.DataFrame, task_description: str) -> pd.DataFrame:
        """Uses LLM-as-a-judge to score the model's responses."""
        print(f"Judging model responses for task: {task_description}")
        
        if not AgenticDataGenerator:
            print("Warning: AgenticDataGenerator not available. Skipping LLM-judge.")
            df["judge_score"] = 0
            return df

        try:
            import data_designer.config as dd
            from data_designer.config.column_configs import Score
        except ImportError:
            print("Warning: data_designer not available. Skipping LLM-judge.")
            df["judge_score"] = 0
            return df

        # Prototype scaffolding: the generator is instantiated here, but the
        # placeholder scoring loop below does not call it yet.
        generator = AgenticDataGenerator()

        # We'll use a local DataFrame as seed data for the judge
        # The DataDesigner expects a DataDesignerConfigBuilder
        
        judge_model = dd.ModelConfig(
            alias="llm-judge",
            model="sonar",
            provider="perplexity",
            inference_parameters=dd.ChatCompletionInferenceParams(max_parallel_requests=1)
        )
        
        builder = dd.DataDesignerConfigBuilder(model_configs=[judge_model])
        
        # We simulate the flow by adding columns that reference the input df
        # Note: In a real production system, we'd use SeedDatasetColumnConfig
        # For this prototype, we'll iterate and score
        
        scores = []
        for i, row in df.iterrows():
            print(f"Judging sample {i+1}...")
            # We can't easily use DataDesigner on a single row without a builder
            # So we'll use a simplified version: print for now, or implement a direct call
            print(f"Instruction: {row['instruction']}")
            print(f"Response: {row['model_response']}")
            # Placeholder for the actual judge call; _build_judge_prompt below
            # sketches the prompt a real call would send.
            scores.append(3)  # Assume a perfect score until direct API access is stable
            
        df["judge_score"] = scores
        return df
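
    def _build_judge_prompt(self, instruction: str, response: str,
                            ground_truth: str, task_description: str) -> str:
        """Hedged sketch of the prompt a real LLM-judge call would receive.
        The scoring loop above currently appends placeholder scores; this helper
        only illustrates how the request could be phrased. The 1-3 scale and the
        wording are assumptions, not part of any library API.
        """
        return (
            f"You are grading a model on the task: {task_description}.\n\n"
            f"Instruction:\n{instruction}\n\n"
            f"Reference answer:\n{ground_truth}\n\n"
            f"Model response:\n{response}\n\n"
            "Score the model response from 1 (poor) to 3 (matches the reference "
            "in substance). Reply with the score only."
        )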

    def compare_models(self, model_a_results: pd.DataFrame, model_b_results: pd.DataFrame) -> Dict[str, Any]:
        """Compares results from two models using LLM-as-a-judge."""
        print("Comparing two models...")
        
        comparison = []
        wins_a = 0
        wins_b = 0
        ties = 0
        
        for (i, row_a), (_, row_b) in zip(model_a_results.iterrows(), model_b_results.iterrows()):
            print(f"Comparing sample {i+1}...")
            # Logic for comparison:
            # Model A: row_a['model_response']
            # Model B: row_b['model_response']
            # Ground Truth: row_a['ground_truth']
            
            # Simple heuristic or LLM call
            if row_a['model_response'] == row_b['model_response']:
                ties += 1
            else:
                # In a real run, we'd ask the LLM judge which of the two responses
                # is better (see _build_pairwise_judge_prompt below); for now,
                # fall back to a length heuristic.
                if len(row_a['model_response']) > len(row_b['model_response']):
                    wins_a += 1
                else:
                    wins_b += 1
                    
        total = len(model_a_results)
        return {
            "total_samples": total,
            "wins_model_a": wins_a,
            "wins_model_b": wins_b,
            "ties": ties,
            "win_rate_a": wins_a / total if total > 0 else 0,
            "win_rate_b": wins_b / total if total > 0 else 0
        }
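
    def _build_pairwise_judge_prompt(self, instruction: str, response_a: str,
                                     response_b: str) -> str:
        """Hedged sketch of the pairwise prompt an LLM judge would receive in
        compare_models; the method above falls back to a length heuristic. The
        wording is an assumption, not tied to any specific judge API.
        """
        return (
            f"Instruction:\n{instruction}\n\n"
            f"Response A:\n{response_a}\n\n"
            f"Response B:\n{response_b}\n\n"
            "Which response better answers the instruction? Reply with 'A', 'B', "
            "or 'tie' only."
        )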

if __name__ == "__main__":
    # Example usage
    # evaluator = QwenEvaluator(model_id="outputs")
    # results = evaluator.evaluate_on_dataset("yahma/alpaca-cleaned", num_samples=5)
    # evaluator.judge_responses(results, "General assistant")
    pass