# Source: out/kista_benchmarking/benchmarking.py
# Uploaded by BayesTensor via huggingface_hub (commit 9d5b280, verified).
import json
from model_run import VLLMClient
from typing import Dict, List
import logging
from tqdm import tqdm
# Set up logging
# Configure the root logger once at import time; INFO level surfaces
# progress and error messages from the benchmark run.
logging.basicConfig(level=logging.INFO)
# Module-level logger, named after this module per logging convention.
logger = logging.getLogger(__name__)
class BenchmarkEvaluator:
    """Run an ``is_met`` classification benchmark against a vLLM-served model.

    Loads (system_prompt, input, output) entries from a JSON file, queries the
    model for each entry, logs every API call to per-call JSON files, and
    reports accuracy of the model's ``is_met`` verdicts against ground truth.
    """

    def __init__(self, model_path: str):
        """Create the vLLM client and derive a name for log namespacing.

        Args:
            model_path: Model identifier or path (e.g. ``"org/model-name"``).
        """
        self.client = VLLMClient(model_path)
        # Last path component; robust to trailing slashes and to paths
        # without any '/' (the old split('/')[1] raised IndexError there).
        self.nest_name = model_path.rstrip('/').split('/')[-1]

    def load_data(self, file_path: str) -> List[Dict]:
        """Load benchmark entries from a JSON file.

        Raises:
            Exception: re-raised after logging if the file cannot be read/parsed.
        """
        try:
            with open(file_path, 'r') as f:
                return json.load(f)
        except Exception as e:
            logger.error(f"Error loading data: {e}")
            raise

    def log_api_call(self, input_data: Dict, api_response: Dict, ground_truth: str, error: str = None) -> None:
        """Log API call details to a uniquely-named JSON file.

        Files go under ``benchmark_logs/<model name>/`` with a timestamp and a
        short UUID so concurrent calls in the same second cannot collide.
        """
        import os
        from datetime import datetime
        import uuid
        # Create benchmark_logs directory if it doesn't exist
        log_dir = f"benchmark_logs/{self.nest_name}"
        os.makedirs(log_dir, exist_ok=True)
        # Create unique filename using timestamp and UUID
        timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
        unique_id = str(uuid.uuid4())[:8]
        filename = f"{log_dir}/api_call_{timestamp}_{unique_id}.json"
        # str() everything so non-JSON-serializable payloads cannot break logging.
        log_data = {
            "timestamp": datetime.now().isoformat(),
            "input": str(input_data),
            "ground_truth": str(ground_truth),
            "api_response": str(api_response),
            "error": error
        }
        with open(filename, 'w', encoding='utf-8') as f:
            json.dump(log_data, f, indent=2, ensure_ascii=False)

    def get_model_response(self, system_prompt: str, input_text: str, ground_truth: str) -> Dict:
        """Query the model and parse its reply; return None on any failure.

        Every call — success or failure — is logged via :meth:`log_api_call`.
        """
        import ast
        input_data = {
            "system_prompt": system_prompt,
            "input_text": input_text
        }
        try:
            response = self.client.send_message(system_prompt, input_text)
            # The model returns a Python-literal string; parse it with
            # ast.literal_eval instead of eval(), which would execute
            # arbitrary code embedded in untrusted model output.
            parsed_response = ast.literal_eval(response['result'])
            # Log successful API call
            self.log_api_call(input_data, parsed_response, ground_truth)
            return parsed_response
        except Exception as e:
            # Bug fix: the error string was previously passed positionally as
            # ground_truth, leaving the 'error' field empty in failure logs.
            self.log_api_call(input_data, None, ground_truth, str(e))
            logger.error(f"Error getting model response: {e}")
            return None

    def normalize_is_met(self, value: str) -> str:
        """Normalize an is_met value (str, bool, ...) to a lowercase string."""
        if not isinstance(value, str):
            return str(value).lower()
        return value.lower()

    def calculate_accuracy(self, ground_truth: List[Dict], model_outputs: List[Dict]) -> float:
        """Return the fraction of entries whose is_met verdicts agree.

        Ground-truth verdicts come from ``entry['output']['is_met']``; model
        verdicts from the first element of the response's ``assessments`` list.

        Raises:
            ValueError: if the two lists differ in length.
        """
        if len(ground_truth) != len(model_outputs):
            raise ValueError("Ground truth and model outputs must have the same length")
        correct = 0
        total = len(ground_truth)
        for gt, mo in zip(ground_truth, model_outputs):
            gt_is_met = self.normalize_is_met(gt['output']['is_met'])
            mo_is_met = self.normalize_is_met(mo['assessments'][0]['is_met'])
            if gt_is_met == mo_is_met:
                correct += 1
        # Guard against division by zero on an empty benchmark.
        return correct / total if total > 0 else 0

    def run_benchmark(self, file_path: str) -> Dict:
        """Run the full benchmark and return accuracy plus sample counts.

        Entries whose model call fails are excluded from the accuracy
        denominator but still counted in ``total_samples``.
        """
        data = self.load_data(file_path)
        model_outputs = []
        ground_truth = []
        for entry in tqdm(data, desc="Processing entries"):
            model_response = self.get_model_response(
                entry['system_prompt'],
                entry['input'],
                entry['output']
            )
            # Only score entries that produced a parseable response.
            if model_response:
                model_outputs.append(model_response)
                ground_truth.append(entry)
        accuracy = self.calculate_accuracy(ground_truth, model_outputs)
        return {
            'accuracy': accuracy,
            'total_samples': len(data),
            'processed_samples': len(model_outputs)
        }