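"""Evaluation utilities for Qwen models loaded with Unsloth.

Generates responses over a Hugging Face dataset and provides placeholder hooks
for LLM-as-a-judge scoring and pairwise model comparison.
"""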
import os
import torch
import pandas as pd
from typing import Optional, List, Literal, Dict, Any
from unsloth import FastLanguageModel
from datasets import load_dataset, Dataset

try:
    from agentic_data_gen import AgenticDataGenerator, AgenticDataConfig
except ImportError:
    AgenticDataGenerator = None
    AgenticDataConfig = None


class QwenEvaluator:
    def __init__(self, model_id: str, max_seq_length: int = 2048, load_in_4bit: bool = True):
        self.model_id = model_id
        self.max_seq_length = max_seq_length
        self.load_in_4bit = load_in_4bit
        self.model = None
        self.tokenizer = None

    def setup_model(self):
        print(f"Loading model for evaluation: {self.model_id}")
        self.model, self.tokenizer = FastLanguageModel.from_pretrained(
            model_name=self.model_id,
            max_seq_length=self.max_seq_length,
            load_in_4bit=self.load_in_4bit,
        )
        FastLanguageModel.for_inference(self.model)  # 2x faster inference

    def evaluate_on_dataset(self, dataset_name: str, split: str = "test", num_samples: int = 10):
        print(f"Evaluating on dataset: {dataset_name} ({split})")
        dataset = load_dataset(dataset_name, split=split).select(range(num_samples))
        results = []
        for i, example in enumerate(dataset):
            print(f"Sample {i+1}/{num_samples}")
            instruction = example.get("instruction", "")
            if not instruction:
                # Try fallback column names
                instruction = example.get("prompt", example.get("input", ""))
            inputs = self.tokenizer(
                [f"<|im_start|>user\n{instruction}<|im_end|>\n<|im_start|>assistant\n"],
                return_tensors="pt"
            ).to("cuda")
            outputs = self.model.generate(**inputs, max_new_tokens=512, use_cache=True)
            response = self.tokenizer.batch_decode(outputs)[0]
            # Extract only the assistant part
            response_clean = response.split("<|im_start|>assistant\n")[-1].replace("<|im_end|>", "").strip()
            results.append({
                "instruction": instruction,
                "ground_truth": example.get("output", example.get("target", "")),
                "model_response": response_clean
            })
        return pd.DataFrame(results)

    def judge_responses(self, df: pd.DataFrame, task_description: str) -> pd.DataFrame:
        """Uses LLM-as-a-judge to score the model's responses."""
        print(f"Judging model responses for task: {task_description}")
        if not AgenticDataGenerator:
            print("Warning: AgenticDataGenerator not available. Skipping LLM-judge.")
            df["judge_score"] = 0
            return df
        generator = AgenticDataGenerator()
        try:
            import data_designer.config as dd
            from data_designer.config.column_configs import Score
        except ImportError:
            print("Warning: data_designer not available. Skipping LLM-judge.")
            df["judge_score"] = 0
            return df
        # We'll use a local DataFrame as seed data for the judge
        # The DataDesigner expects a DataDesignerConfigBuilder
        judge_model = dd.ModelConfig(
            alias="llm-judge",
            model="sonar",
            provider="perplexity",
            inference_parameters=dd.ChatCompletionInferenceParams(max_parallel_requests=1)
        )
        builder = dd.DataDesignerConfigBuilder(model_configs=[judge_model])
        # We simulate the flow by adding columns that reference the input df
        # Note: In a real production system, we'd use SeedDatasetColumnConfig
        # For this prototype, we'll iterate and score
        scores = []
        for i, row in df.iterrows():
            print(f"Judging sample {i+1}...")
            # We can't easily use DataDesigner on a single row without a builder
            # So we'll use a simplified version: print for now, or implement a direct call
            print(f"Instruction: {row['instruction']}")
            print(f"Response: {row['model_response']}")
            # Placeholder for actual judge call
            scores.append(3)  # Assume perfect for now until direct API access is stable
        df["judge_score"] = scores
        return df

    def compare_models(self, model_a_results: pd.DataFrame, model_b_results: pd.DataFrame) -> Dict[str, Any]:
        """Compares results from two models (placeholder heuristic standing in for an LLM judge)."""
        print("Comparing two models...")
        wins_a = 0
        wins_b = 0
        ties = 0
        for (i, row_a), (_, row_b) in zip(model_a_results.iterrows(), model_b_results.iterrows()):
            print(f"Comparing sample {i+1}...")
            # Inputs for the comparison:
            #   Model A: row_a['model_response']
            #   Model B: row_b['model_response']
            #   Ground truth: row_a['ground_truth']
            if row_a['model_response'] == row_b['model_response']:
                ties += 1
            else:
                # In a real run, we'd ask the LLM judge:
                # "Which of these two responses is better for the given instruction?"
                # For now, we use a length heuristic as a placeholder.
                if len(row_a['model_response']) > len(row_b['model_response']):
                    wins_a += 1
                else:
                    wins_b += 1
        total = len(model_a_results)
        return {
            "total_samples": total,
            "wins_model_a": wins_a,
            "wins_model_b": wins_b,
            "ties": ties,
            "win_rate_a": wins_a / total if total > 0 else 0,
            "win_rate_b": wins_b / total if total > 0 else 0
        }
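
    # Hedged sketch: the comments in compare_models describe asking an LLM judge
    # which of two responses is better. The helper below only illustrates how such
    # a pairwise prompt could be formatted; it is a hypothetical addition and is
    # not called anywhere else in this file.
    def build_pairwise_judge_prompt(self, instruction: str, response_a: str,
                                    response_b: str, ground_truth: str = "") -> str:
        """Format a pairwise-comparison prompt for an LLM judge (illustrative only)."""
        reference = f"\nReference answer:\n{ground_truth}\n" if ground_truth else ""
        return (
            "You are an impartial judge. Given the instruction and two candidate "
            "responses, answer with exactly 'A', 'B', or 'TIE'.\n\n"
            f"Instruction:\n{instruction}\n{reference}\n"
            f"Response A:\n{response_a}\n\n"
            f"Response B:\n{response_b}\n\n"
            "Which response is better?"
        )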


if __name__ == "__main__":
    # Example usage
    # evaluator = QwenEvaluator(model_id="outputs")
    # evaluator.setup_model()
    # results = evaluator.evaluate_on_dataset("yahma/alpaca-cleaned", split="train", num_samples=5)
    # evaluator.judge_responses(results, "General assistant")
    pass
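    # Hedged sketch of a two-model comparison run. "unsloth/Qwen2.5-7B-Instruct" is
    # a placeholder base-model id and "outputs" a placeholder fine-tuned checkpoint;
    # neither is guaranteed to match this repo's actual paths.
    # base = QwenEvaluator(model_id="unsloth/Qwen2.5-7B-Instruct")
    # base.setup_model()
    # base_results = base.evaluate_on_dataset("yahma/alpaca-cleaned", split="train", num_samples=5)
    # tuned = QwenEvaluator(model_id="outputs")
    # tuned.setup_model()
    # tuned_results = tuned.evaluate_on_dataset("yahma/alpaca-cleaned", split="train", num_samples=5)
    # print(tuned.compare_models(base_results, tuned_results))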