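"""Evaluate the fine-tuned SEC model on its dataset.

Loads the model and dataset from the Hugging Face Hub, generates predictions
for up to 100 examples, scores them with exact match, token-level F1, and
ROUGE-L, writes detailed and summary results to eval_results/, and uploads
them to the model repository while logging summary metrics with trackio.
"""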
import os
import json
import re
from datetime import datetime

import pandas as pd
import torch
import trackio
from datasets import load_dataset
from huggingface_hub import HfApi
from rouge_score import rouge_scorer
from transformers import AutoTokenizer, AutoModelForCausalLM, pipeline


def normalize_text(text):
    """Normalize text for comparison"""
    if not isinstance(text, str):
        return ""

    # Collapse whitespace and lowercase so comparisons ignore formatting.
    text = re.sub(r'\s+', ' ', text.strip())
    return text.lower()


def compute_exact_match(pred, true):
    """Compute exact match score"""
    return float(normalize_text(pred) == normalize_text(true))


def compute_f1_score(pred, true):
    """Compute token-level F1 score (over unique tokens)"""
    pred_tokens = normalize_text(pred).split()
    true_tokens = normalize_text(true).split()

    if len(pred_tokens) == 0 and len(true_tokens) == 0:
        return 1.0
    if len(pred_tokens) == 0 or len(true_tokens) == 0:
        return 0.0

    # Set-based overlap: repeated tokens count once.
    pred_set = set(pred_tokens)
    true_set = set(true_tokens)

    intersection = pred_set.intersection(true_set)
    precision = len(intersection) / len(pred_set)
    recall = len(intersection) / len(true_set)

    if precision + recall == 0:
        return 0.0

    return 2 * (precision * recall) / (precision + recall)


def compute_rouge_l(pred, true):
    """Compute ROUGE-L score"""
    scorer = rouge_scorer.RougeScorer(['rougeL'], use_stemmer=True)
    scores = scorer.score(normalize_text(true), normalize_text(pred))
    return scores['rougeL'].fmeasure


def evaluate_model():
    trackio.init()

    print("Starting model evaluation...")

    model_name = "ligaments-enterprise/llama3.2-1b-instruct-sec-finetuned"
    dataset_name = "ligaments-enterprise/sec-data"

    print(f"Loading dataset: {dataset_name}")
    try:
        dataset = load_dataset(dataset_name, split="train")
        print(f"Dataset loaded successfully. Size: {len(dataset)}")
    except Exception as e:
        print(f"Error loading dataset: {e}")
        # Fall back to loading all splits and taking the first one available.
        try:
            dataset = load_dataset(dataset_name)
            if isinstance(dataset, dict):
                split_name = list(dataset.keys())[0]
                dataset = dataset[split_name]
                print(f"Using split '{split_name}'. Size: {len(dataset)}")
        except Exception as e2:
            print(f"Failed to load dataset: {e2}")
            return

    print(f"Dataset columns: {dataset.column_names}")
    print(f"First example: {dataset[0]}")
    # Identify the input/output columns from a list of common names.
    possible_input_cols = ['prompt', 'input', 'question', 'instruction', 'text']
    possible_output_cols = ['response', 'output', 'answer', 'completion', 'target']

    input_col = None
    output_col = None

    for col in possible_input_cols:
        if col in dataset.column_names:
            input_col = col
            break

    for col in possible_output_cols:
        if col in dataset.column_names:
            output_col = col
            break

    # Chat-style datasets keep conversations in a 'messages' column.
    if 'messages' in dataset.column_names:
        print("Detected messages format, extracting prompts and responses...")

        def extract_from_messages(example):
            messages = example['messages']
            if isinstance(messages, list) and len(messages) >= 2:
                user_msg = None
                assistant_msg = None
                for msg in messages:
                    if msg.get('role') == 'user':
                        user_msg = msg.get('content', '')
                    elif msg.get('role') == 'assistant':
                        assistant_msg = msg.get('content', '')

                return {
                    'input_text': user_msg or '',
                    'target_text': assistant_msg or ''
                }
            return {'input_text': '', 'target_text': ''}

        dataset = dataset.map(extract_from_messages)
        input_col = 'input_text'
        output_col = 'target_text'

    if not input_col or not output_col:
        print(f"Could not identify input/output columns. Available: {dataset.column_names}")
        return

    print(f"Using input column: {input_col}, output column: {output_col}")
print(f"π€ Loading model: {model_name}") |
|
|
try: |
|
|
tokenizer = AutoTokenizer.from_pretrained(model_name) |
|
|
model = AutoModelForCausalLM.from_pretrained( |
|
|
model_name, |
|
|
torch_dtype=torch.float16, |
|
|
device_map="auto", |
|
|
trust_remote_code=True |
|
|
) |
|
|
|
|
|
|
|
|
if tokenizer.pad_token is None: |
|
|
tokenizer.pad_token = tokenizer.eos_token |
|
|
|
|
|
print("β
Model loaded successfully") |
|
|
except Exception as e: |
|
|
print(f"β Error loading model: {e}") |
|
|
return |
|
|
|
|
|
|
|
|
generator = pipeline( |
|
|
"text-generation", |
|
|
model=model, |
|
|
tokenizer=tokenizer, |
|
|
torch_dtype=torch.float16, |
|
|
device_map="auto" |
|
|
) |
|
|
|
|
|
|
|
|
    eval_size = min(100, len(dataset))
    eval_dataset = dataset.select(range(eval_size))
    print(f"Evaluating on {eval_size} samples...")

    results = []

    for i, example in enumerate(eval_dataset):
        if i % 10 == 0:
            print(f"Processing sample {i+1}/{eval_size}")

        input_text = example[input_col]
        target_text = example[output_col]

        if not input_text or not target_text:
            continue

        try:
            # Make sure the prompt ends with punctuation so generation starts cleanly.
            if not input_text.strip().endswith(('?', '.', '!', ':')):
                formatted_prompt = f"{input_text.strip()}:"
            else:
                formatted_prompt = input_text.strip()

            # Greedy decoding (do_sample=False), so no sampling temperature is needed.
            generated = generator(
                formatted_prompt,
                max_new_tokens=256,
                do_sample=False,
                pad_token_id=tokenizer.eos_token_id,
                return_full_text=False
            )

            prediction = generated[0]['generated_text'].strip()

            exact_match = compute_exact_match(prediction, target_text)
            f1 = compute_f1_score(prediction, target_text)
            rouge_l = compute_rouge_l(prediction, target_text)

            # Rough error taxonomy based on the metrics and length ratios.
            error_type = "correct" if exact_match == 1.0 else "incorrect"
            if exact_match == 0 and f1 > 0.5:
                error_type = "partial_match"
            elif exact_match == 0 and rouge_l > 0.3:
                error_type = "semantic_similarity"
            elif len(prediction.split()) > len(target_text.split()) * 2:
                error_type = "too_verbose"
            elif len(prediction.split()) < len(target_text.split()) * 0.5:
                error_type = "too_brief"

            result = {
                'sample_id': i,
                'input': input_text,
                'target': target_text,
                'prediction': prediction,
                'exact_match': exact_match,
                'f1_score': f1,
                'rouge_l': rouge_l,
                'error_type': error_type,
                'input_length': len(input_text.split()),
                'target_length': len(target_text.split()),
                'prediction_length': len(prediction.split())
            }

            results.append(result)

        except Exception as e:
            print(f"Error processing sample {i}: {e}")
            continue

    if not results:
        print("No results generated")
        return
    df_results = pd.DataFrame(results)

    summary_metrics = {
        'evaluation_timestamp': datetime.now().isoformat(),
        'model_name': model_name,
        'dataset_name': dataset_name,
        'total_samples': len(results),
        'exact_match_avg': df_results['exact_match'].mean(),
        'f1_score_avg': df_results['f1_score'].mean(),
        'rouge_l_avg': df_results['rouge_l'].mean(),
        'exact_match_std': df_results['exact_match'].std(),
        'f1_score_std': df_results['f1_score'].std(),
        'rouge_l_std': df_results['rouge_l'].std(),
        'perfect_matches': int(df_results['exact_match'].sum()),
        'perfect_match_rate': df_results['exact_match'].mean()
    }

    error_analysis = df_results['error_type'].value_counts().to_dict()
    summary_metrics['error_breakdown'] = error_analysis

    df_results['target_length_bucket'] = pd.cut(
        df_results['target_length'],
        bins=[0, 10, 25, 50, 100, float('inf')],
        labels=['very_short', 'short', 'medium', 'long', 'very_long']
    )

    length_performance = df_results.groupby('target_length_bucket')[['exact_match', 'f1_score', 'rouge_l']].mean().to_dict()
    summary_metrics['performance_by_length'] = length_performance

    print("\nEVALUATION RESULTS:")
    print(f"Total Samples: {summary_metrics['total_samples']}")
    print(f"Exact Match: {summary_metrics['exact_match_avg']:.4f} ± {summary_metrics['exact_match_std']:.4f}")
    print(f"F1 Score: {summary_metrics['f1_score_avg']:.4f} ± {summary_metrics['f1_score_std']:.4f}")
    print(f"ROUGE-L: {summary_metrics['rouge_l_avg']:.4f} ± {summary_metrics['rouge_l_std']:.4f}")
    print(f"Perfect Matches: {summary_metrics['perfect_matches']}/{summary_metrics['total_samples']} ({summary_metrics['perfect_match_rate']:.2%})")

    print("\nError Breakdown:")
    for error_type, count in error_analysis.items():
        print(f"  {error_type}: {count} ({count/len(results):.2%})")
    os.makedirs('eval_results', exist_ok=True)

    df_results.to_csv('eval_results/detailed_results.csv', index=False)

    with open('eval_results/summary_metrics.json', 'w') as f:
        json.dump(summary_metrics, f, indent=2, default=str)

    worst_samples = df_results.nsmallest(10, 'f1_score')[['sample_id', 'input', 'target', 'prediction', 'f1_score', 'error_type']]
    worst_samples.to_csv('eval_results/worst_predictions.csv', index=False)

    best_samples = df_results.nlargest(10, 'f1_score')[['sample_id', 'input', 'target', 'prediction', 'f1_score', 'error_type']]
    best_samples.to_csv('eval_results/best_predictions.csv', index=False)

    print("\nResults saved locally to eval_results/")
    try:
        print("Uploading results to model repository...")
        api = HfApi()

        files_to_upload = [
            ('eval_results/summary_metrics.json', 'eval_results/summary_metrics.json'),
            ('eval_results/detailed_results.csv', 'eval_results/detailed_results.csv'),
            ('eval_results/worst_predictions.csv', 'eval_results/worst_predictions.csv'),
            ('eval_results/best_predictions.csv', 'eval_results/best_predictions.csv')
        ]

        for local_path, repo_path in files_to_upload:
            api.upload_file(
                path_or_fileobj=local_path,
                path_in_repo=repo_path,
                repo_id=model_name,
                commit_message=f"Add evaluation results - {datetime.now().strftime('%Y-%m-%d %H:%M:%S')}"
            )
            print(f"Uploaded {repo_path}")

        print(f"All evaluation results uploaded to {model_name}")

        trackio.log({
            "exact_match": summary_metrics['exact_match_avg'],
            "f1_score": summary_metrics['f1_score_avg'],
            "rouge_l": summary_metrics['rouge_l_avg'],
            "perfect_match_rate": summary_metrics['perfect_match_rate'],
            "total_samples": summary_metrics['total_samples']
        })

    except Exception as e:
        print(f"Warning: Could not upload to repository: {e}")
        print("Results are saved locally in eval_results/ directory")

    print("\nEvaluation completed successfully!")
    return summary_metrics


if __name__ == "__main__":
    evaluate_model()