# /// script
# dependencies = [
# "transformers>=4.40.0",
# "datasets>=2.18.0",
# "torch>=2.0.0",
# "rouge-score>=0.1.2",
# "evaluate>=0.4.0",
# "numpy>=1.24.0",
# "pandas>=2.0.0",
# "scikit-learn>=1.3.0",
# "huggingface-hub>=0.20.0",
# "accelerate>=0.27.0",
# "trackio"
# ]
# ///
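
# Run with a PEP 723-aware runner, e.g. `uv run model_evaluation.py`, and set HF_TOKEN in the
# environment so the result-upload step can authenticate with the Hugging Face Hub.

"""Evaluate a fine-tuned causal language model against a Hugging Face dataset.

The script loads the model and dataset configured in evaluate_model(), generates
predictions for a sample of the data, scores them with exact match, token-overlap F1,
and ROUGE-L, runs a simple error analysis, saves the results locally, uploads them to
the model repository, and logs summary metrics to Trackio.
"""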
import os
import json
import re
from datetime import datetime

import pandas as pd
import torch
import trackio
from datasets import load_dataset
from huggingface_hub import HfApi
from rouge_score import rouge_scorer
from transformers import AutoTokenizer, AutoModelForCausalLM, pipeline
def normalize_text(text):
"""Normalize text for comparison"""
if not isinstance(text, str):
return ""
# Remove extra whitespace and normalize
text = re.sub(r'\s+', ' ', text.strip())
return text.lower()
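
# Example (illustrative): normalize_text("  Hello   World ") -> "hello world"
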
def compute_exact_match(pred, true):
"""Compute exact match score"""
return float(normalize_text(pred) == normalize_text(true))
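
# Example (illustrative): compute_exact_match("Paris.", " paris. ") -> 1.0
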
def compute_f1_score(pred, true):
    """Compute a set-based token-overlap F1 score (duplicate tokens are counted once)."""
    pred_tokens = normalize_text(pred).split()
    true_tokens = normalize_text(true).split()
    if len(pred_tokens) == 0 and len(true_tokens) == 0:
        return 1.0
    if len(pred_tokens) == 0 or len(true_tokens) == 0:
        return 0.0
    # Both token sets are non-empty here, so the precision/recall denominators are safe
    pred_set = set(pred_tokens)
    true_set = set(true_tokens)
    intersection = pred_set.intersection(true_set)
    precision = len(intersection) / len(pred_set)
    recall = len(intersection) / len(true_set)
    if precision + recall == 0:
        return 0.0
    return 2 * (precision * recall) / (precision + recall)
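
# Example (illustrative): compute_f1_score("the quick brown fox", "the brown fox")
#   precision = 3/4, recall = 3/3, F1 = 2 * (0.75 * 1.0) / 1.75 ≈ 0.857
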
def compute_rouge_l(pred, true):
"""Compute ROUGE-L score"""
scorer = rouge_scorer.RougeScorer(['rougeL'], use_stemmer=True)
scores = scorer.score(normalize_text(true), normalize_text(pred))
return scores['rougeL'].fmeasure
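
# ROUGE-L compares the longest common subsequence of prediction and reference, so it
# rewards in-order word overlap without requiring the overlapping words to be contiguous.
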
def evaluate_model():
    """Load the model and dataset, generate predictions, score them, and report and upload the results."""
    # Initialize Trackio experiment tracking (the project name is an arbitrary label; adjust as needed)
    trackio.init(project="model-evaluation")
print("πŸš€ Starting model evaluation...")
# Configuration
model_name = "ligaments-enterprise/llama3.2-1b-instruct-sec-finetuned"
dataset_name = "ligaments-enterprise/sec-data"
print(f"πŸ“Š Loading dataset: {dataset_name}")
try:
# Try to load the dataset
dataset = load_dataset(dataset_name, split="train")
print(f"βœ… Dataset loaded successfully. Size: {len(dataset)}")
except Exception as e:
print(f"❌ Error loading dataset: {e}")
# Try different splits
try:
dataset = load_dataset(dataset_name)
if isinstance(dataset, dict):
# Use the first available split
split_name = list(dataset.keys())[0]
dataset = dataset[split_name]
print(f"βœ… Using split '{split_name}'. Size: {len(dataset)}")
except Exception as e2:
print(f"❌ Failed to load dataset: {e2}")
return
# Inspect dataset structure
print(f"πŸ“‹ Dataset columns: {dataset.column_names}")
print(f"πŸ“‹ First example: {dataset[0]}")
# Determine input/output columns
possible_input_cols = ['prompt', 'input', 'question', 'instruction', 'text']
possible_output_cols = ['response', 'output', 'answer', 'completion', 'target']
input_col = None
output_col = None
for col in possible_input_cols:
if col in dataset.column_names:
input_col = col
break
for col in possible_output_cols:
if col in dataset.column_names:
output_col = col
break
# Handle messages format
if 'messages' in dataset.column_names:
print("πŸ“‹ Detected messages format, extracting prompts and responses...")
def extract_from_messages(example):
messages = example['messages']
if isinstance(messages, list) and len(messages) >= 2:
# Find the last user message and assistant response
user_msg = None
assistant_msg = None
for msg in messages:
if msg.get('role') == 'user':
user_msg = msg.get('content', '')
elif msg.get('role') == 'assistant':
assistant_msg = msg.get('content', '')
return {
'input_text': user_msg or '',
'target_text': assistant_msg or ''
}
return {'input_text': '', 'target_text': ''}
dataset = dataset.map(extract_from_messages)
input_col = 'input_text'
output_col = 'target_text'
if not input_col or not output_col:
print(f"❌ Could not identify input/output columns. Available: {dataset.column_names}")
return
print(f"βœ… Using input column: {input_col}, output column: {output_col}")
print(f"πŸ€– Loading model: {model_name}")
try:
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForCausalLM.from_pretrained(
model_name,
torch_dtype=torch.float16,
device_map="auto",
trust_remote_code=True
)
# Set pad token if not set
if tokenizer.pad_token is None:
tokenizer.pad_token = tokenizer.eos_token
print("βœ… Model loaded successfully")
except Exception as e:
print(f"❌ Error loading model: {e}")
return
# Create text generation pipeline
generator = pipeline(
"text-generation",
model=model,
tokenizer=tokenizer,
        # dtype and device placement are inherited from the already-loaded model
)
# Limit evaluation to reasonable size for demonstration
eval_size = min(100, len(dataset))
eval_dataset = dataset.select(range(eval_size))
print(f"πŸ“Š Evaluating on {eval_size} samples...")
results = []
for i, example in enumerate(eval_dataset):
if i % 10 == 0:
print(f"πŸ“ˆ Processing sample {i+1}/{eval_size}")
input_text = example[input_col]
target_text = example[output_col]
if not input_text or not target_text:
continue
# Generate prediction
try:
# Format prompt appropriately
if not input_text.strip().endswith(('?', '.', '!', ':')):
formatted_prompt = f"{input_text.strip()}:"
else:
formatted_prompt = input_text.strip()
generated = generator(
formatted_prompt,
max_new_tokens=256,
                do_sample=False,  # greedy decoding for deterministic evaluation
pad_token_id=tokenizer.eos_token_id,
return_full_text=False
)
prediction = generated[0]['generated_text'].strip()
# Compute metrics
exact_match = compute_exact_match(prediction, target_text)
f1 = compute_f1_score(prediction, target_text)
rouge_l = compute_rouge_l(prediction, target_text)
# Error analysis
error_type = "correct" if exact_match == 1.0 else "incorrect"
if exact_match == 0 and f1 > 0.5:
error_type = "partial_match"
elif exact_match == 0 and rouge_l > 0.3:
error_type = "semantic_similarity"
elif len(prediction.split()) > len(target_text.split()) * 2:
error_type = "too_verbose"
elif len(prediction.split()) < len(target_text.split()) * 0.5:
error_type = "too_brief"
result = {
'sample_id': i,
'input': input_text,
'target': target_text,
'prediction': prediction,
'exact_match': exact_match,
'f1_score': f1,
'rouge_l': rouge_l,
'error_type': error_type,
'input_length': len(input_text.split()),
'target_length': len(target_text.split()),
'prediction_length': len(prediction.split())
}
results.append(result)
except Exception as e:
print(f"⚠️ Error processing sample {i}: {e}")
continue
if not results:
print("❌ No results generated")
return
# Compute summary statistics
df_results = pd.DataFrame(results)
summary_metrics = {
'evaluation_timestamp': datetime.now().isoformat(),
'model_name': model_name,
'dataset_name': dataset_name,
'total_samples': len(results),
'exact_match_avg': df_results['exact_match'].mean(),
'f1_score_avg': df_results['f1_score'].mean(),
'rouge_l_avg': df_results['rouge_l'].mean(),
'exact_match_std': df_results['exact_match'].std(),
'f1_score_std': df_results['f1_score'].std(),
'rouge_l_std': df_results['rouge_l'].std(),
'perfect_matches': int(df_results['exact_match'].sum()),
'perfect_match_rate': df_results['exact_match'].mean()
}
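    # Note: perfect_match_rate duplicates exact_match_avg, since exact_match is 0 or 1 per sample.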
# Error analysis summary
error_analysis = df_results['error_type'].value_counts().to_dict()
summary_metrics['error_breakdown'] = error_analysis
# Performance by length buckets
df_results['target_length_bucket'] = pd.cut(
df_results['target_length'],
bins=[0, 10, 25, 50, 100, float('inf')],
labels=['very_short', 'short', 'medium', 'long', 'very_long']
)
    length_performance = df_results.groupby('target_length_bucket', observed=False)[['exact_match', 'f1_score', 'rouge_l']].mean().to_dict()
summary_metrics['performance_by_length'] = length_performance
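    # length_performance is a nested dict of the form {metric: {length_bucket: mean score}}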
print("\nπŸ“Š EVALUATION RESULTS:")
print(f"Total Samples: {summary_metrics['total_samples']}")
print(f"Exact Match: {summary_metrics['exact_match_avg']:.4f} Β± {summary_metrics['exact_match_std']:.4f}")
print(f"F1 Score: {summary_metrics['f1_score_avg']:.4f} Β± {summary_metrics['f1_score_std']:.4f}")
print(f"ROUGE-L: {summary_metrics['rouge_l_avg']:.4f} Β± {summary_metrics['rouge_l_std']:.4f}")
print(f"Perfect Matches: {summary_metrics['perfect_matches']}/{summary_metrics['total_samples']} ({summary_metrics['perfect_match_rate']:.2%})")
print("\nπŸ” Error Breakdown:")
for error_type, count in error_analysis.items():
print(f" {error_type}: {count} ({count/len(results):.2%})")
# Save results locally first
os.makedirs('eval_results', exist_ok=True)
# Save detailed results
df_results.to_csv('eval_results/detailed_results.csv', index=False)
# Save summary metrics
with open('eval_results/summary_metrics.json', 'w') as f:
json.dump(summary_metrics, f, indent=2, default=str)
# Save top errors for analysis
worst_samples = df_results.nsmallest(10, 'f1_score')[['sample_id', 'input', 'target', 'prediction', 'f1_score', 'error_type']]
worst_samples.to_csv('eval_results/worst_predictions.csv', index=False)
# Save best samples
best_samples = df_results.nlargest(10, 'f1_score')[['sample_id', 'input', 'target', 'prediction', 'f1_score', 'error_type']]
best_samples.to_csv('eval_results/best_predictions.csv', index=False)
print("\nπŸ’Ύ Results saved locally to eval_results/")
# Upload results to model repository
try:
print("πŸš€ Uploading results to model repository...")
api = HfApi()
# Upload all result files
files_to_upload = [
('eval_results/summary_metrics.json', 'eval_results/summary_metrics.json'),
('eval_results/detailed_results.csv', 'eval_results/detailed_results.csv'),
('eval_results/worst_predictions.csv', 'eval_results/worst_predictions.csv'),
('eval_results/best_predictions.csv', 'eval_results/best_predictions.csv')
]
for local_path, repo_path in files_to_upload:
api.upload_file(
path_or_fileobj=local_path,
path_in_repo=repo_path,
repo_id=model_name,
commit_message=f"Add evaluation results - {datetime.now().strftime('%Y-%m-%d %H:%M:%S')}",
token=os.getenv('HF_TOKEN')
)
print(f"βœ… Uploaded {repo_path}")
print(f"βœ… All evaluation results uploaded to {model_name}")
# Log to Trackio
trackio.log({
"exact_match": summary_metrics['exact_match_avg'],
"f1_score": summary_metrics['f1_score_avg'],
"rouge_l": summary_metrics['rouge_l_avg'],
"perfect_match_rate": summary_metrics['perfect_match_rate'],
"total_samples": summary_metrics['total_samples']
})
except Exception as e:
print(f"⚠️ Warning: Could not upload to repository: {e}")
print("πŸ’Ύ Results are saved locally in eval_results/ directory")
print("\nπŸŽ‰ Evaluation completed successfully!")
return summary_metrics
if __name__ == "__main__":
evaluate_model()