# /// script
# dependencies = [
#     "transformers>=4.40.0",
#     "datasets>=2.18.0",
#     "torch>=2.0.0",
#     "rouge-score>=0.1.2",
#     "evaluate>=0.4.0",
#     "numpy>=1.24.0",
#     "pandas>=2.0.0",
#     "scikit-learn>=1.3.0",
#     "huggingface-hub>=0.20.0",
#     "accelerate>=0.27.0",
#     "trackio"
# ]
# ///

import os
import json
import pandas as pd
import numpy as np
from datetime import datetime
from datasets import load_dataset
from transformers import AutoTokenizer, AutoModelForCausalLM, pipeline
from rouge_score import rouge_scorer
from sklearn.metrics import f1_score
import re
import trackio
from huggingface_hub import HfApi, upload_file
import torch


def normalize_text(text):
    """Normalize text for comparison."""
    if not isinstance(text, str):
        return ""
    # Collapse whitespace and lowercase so comparisons ignore formatting
    text = re.sub(r'\s+', ' ', text.strip())
    return text.lower()


def compute_exact_match(pred, true):
    """Compute exact match score."""
    return float(normalize_text(pred) == normalize_text(true))


def compute_f1_score(pred, true):
    """Compute token-level F1 score."""
    pred_tokens = normalize_text(pred).split()
    true_tokens = normalize_text(true).split()

    if len(pred_tokens) == 0 and len(true_tokens) == 0:
        return 1.0
    if len(pred_tokens) == 0 or len(true_tokens) == 0:
        return 0.0

    # Convert to sets for the overlap computation
    pred_set = set(pred_tokens)
    true_set = set(true_tokens)

    intersection = pred_set.intersection(true_set)
    precision = len(intersection) / len(pred_set) if pred_set else 0
    recall = len(intersection) / len(true_set) if true_set else 0

    if precision + recall == 0:
        return 0.0
    return 2 * (precision * recall) / (precision + recall)


def compute_rouge_l(pred, true):
    """Compute ROUGE-L F-measure."""
    scorer = rouge_scorer.RougeScorer(['rougeL'], use_stemmer=True)
    scores = scorer.score(normalize_text(true), normalize_text(pred))
    return scores['rougeL'].fmeasure


def evaluate_model():
    # Initialize Trackio experiment tracking (the project name is arbitrary)
    trackio.init(project="sec-model-evaluation")

    print("🚀 Starting model evaluation...")

    # Configuration
    model_name = "ligaments-enterprise/llama3.2-1b-instruct-sec-finetuned"
    dataset_name = "ligaments-enterprise/sec-data"

    print(f"📊 Loading dataset: {dataset_name}")
    try:
        # Try to load the dataset
        dataset = load_dataset(dataset_name, split="train")
        print(f"✅ Dataset loaded successfully. Size: {len(dataset)}")
    except Exception as e:
        print(f"❌ Error loading dataset: {e}")
        # Fall back to whatever split is available
        try:
            dataset = load_dataset(dataset_name)
            if isinstance(dataset, dict):
                # Use the first available split
                split_name = list(dataset.keys())[0]
                dataset = dataset[split_name]
                print(f"✅ Using split '{split_name}'. Size: {len(dataset)}")
        except Exception as e2:
            print(f"❌ Failed to load dataset: {e2}")
            return

    # Inspect dataset structure
    print(f"📋 Dataset columns: {dataset.column_names}")
    print(f"📋 First example: {dataset[0]}")

    # Determine input/output columns
    possible_input_cols = ['prompt', 'input', 'question', 'instruction', 'text']
    possible_output_cols = ['response', 'output', 'answer', 'completion', 'target']

    input_col = None
    output_col = None
    for col in possible_input_cols:
        if col in dataset.column_names:
            input_col = col
            break
    for col in possible_output_cols:
        if col in dataset.column_names:
            output_col = col
            break

    # Handle chat-style "messages" format
    if 'messages' in dataset.column_names:
        print("📋 Detected messages format, extracting prompts and responses...")

        def extract_from_messages(example):
            messages = example['messages']
            if isinstance(messages, list) and len(messages) >= 2:
                # Keep the last user message and the last assistant response
                user_msg = None
                assistant_msg = None
                for msg in messages:
                    if msg.get('role') == 'user':
                        user_msg = msg.get('content', '')
                    elif msg.get('role') == 'assistant':
                        assistant_msg = msg.get('content', '')
                return {
                    'input_text': user_msg or '',
                    'target_text': assistant_msg or ''
                }
            return {'input_text': '', 'target_text': ''}

        dataset = dataset.map(extract_from_messages)
        input_col = 'input_text'
        output_col = 'target_text'

    if not input_col or not output_col:
        print(f"❌ Could not identify input/output columns. Available: {dataset.column_names}")
        return

    print(f"✅ Using input column: {input_col}, output column: {output_col}")

    print(f"🤖 Loading model: {model_name}")
    try:
        tokenizer = AutoTokenizer.from_pretrained(model_name)
        model = AutoModelForCausalLM.from_pretrained(
            model_name,
            torch_dtype=torch.float16,
            device_map="auto",
            trust_remote_code=True
        )
        # Set pad token if not set
        if tokenizer.pad_token is None:
            tokenizer.pad_token = tokenizer.eos_token
        print("✅ Model loaded successfully")
    except Exception as e:
        print(f"❌ Error loading model: {e}")
        return

    # Create text generation pipeline; the model is already dispatched across
    # devices by accelerate, so no device_map/dtype is passed here
    generator = pipeline(
        "text-generation",
        model=model,
        tokenizer=tokenizer
    )

    # Limit evaluation to a reasonable size for demonstration
    eval_size = min(100, len(dataset))
    eval_dataset = dataset.select(range(eval_size))

    print(f"📊 Evaluating on {eval_size} samples...")

    results = []
    for i, example in enumerate(eval_dataset):
        if i % 10 == 0:
            print(f"📈 Processing sample {i+1}/{eval_size}")

        input_text = example[input_col]
        target_text = example[output_col]

        if not input_text or not target_text:
            continue

        # Generate prediction
        try:
            # Format prompt appropriately
            if not input_text.strip().endswith(('?', '.', '!', ':')):
                formatted_prompt = f"{input_text.strip()}:"
            else:
                formatted_prompt = input_text.strip()

            generated = generator(
                formatted_prompt,
                max_new_tokens=256,
                do_sample=False,  # Greedy decoding keeps evaluation deterministic
                pad_token_id=tokenizer.eos_token_id,
                return_full_text=False
            )

            prediction = generated[0]['generated_text'].strip()

            # Compute metrics
            exact_match = compute_exact_match(prediction, target_text)
            f1 = compute_f1_score(prediction, target_text)
            rouge_l = compute_rouge_l(prediction, target_text)

            # Error analysis
            error_type = "correct" if exact_match == 1.0 else "incorrect"
            if exact_match == 0 and f1 > 0.5:
                error_type = "partial_match"
            elif exact_match == 0 and rouge_l > 0.3:
                error_type = "semantic_similarity"
            elif len(prediction.split()) > len(target_text.split()) * 2:
                error_type = "too_verbose"
            elif len(prediction.split()) < len(target_text.split()) * 0.5:
                error_type = "too_brief"

            result = {
                'sample_id': i,
                'input': input_text,
                'target': target_text,
                'prediction': prediction,
                'exact_match': exact_match,
                'f1_score': f1,
                'rouge_l': rouge_l,
                'error_type': error_type,
                'input_length': len(input_text.split()),
                'target_length': len(target_text.split()),
                'prediction_length': len(prediction.split())
            }
            results.append(result)

        except Exception as e:
            print(f"⚠️ Error processing sample {i}: {e}")
            continue

    if not results:
        print("❌ No results generated")
        return

    # Compute summary statistics
    df_results = pd.DataFrame(results)

    summary_metrics = {
        'evaluation_timestamp': datetime.now().isoformat(),
        'model_name': model_name,
        'dataset_name': dataset_name,
        'total_samples': len(results),
        'exact_match_avg': df_results['exact_match'].mean(),
        'f1_score_avg': df_results['f1_score'].mean(),
        'rouge_l_avg': df_results['rouge_l'].mean(),
        'exact_match_std': df_results['exact_match'].std(),
        'f1_score_std': df_results['f1_score'].std(),
        'rouge_l_std': df_results['rouge_l'].std(),
        'perfect_matches': int(df_results['exact_match'].sum()),
        'perfect_match_rate': df_results['exact_match'].mean()
    }

    # Error analysis summary
    error_analysis = df_results['error_type'].value_counts().to_dict()
    summary_metrics['error_breakdown'] = error_analysis

    # Performance by target-length bucket
    df_results['target_length_bucket'] = pd.cut(
        df_results['target_length'],
        bins=[0, 10, 25, 50, 100, float('inf')],
        labels=['very_short', 'short', 'medium', 'long', 'very_long']
    )
    length_performance = (
        df_results.groupby('target_length_bucket')[['exact_match', 'f1_score', 'rouge_l']]
        .mean()
        .to_dict()
    )
    summary_metrics['performance_by_length'] = length_performance

    print("\n📊 EVALUATION RESULTS:")
    print(f"Total Samples: {summary_metrics['total_samples']}")
    print(f"Exact Match: {summary_metrics['exact_match_avg']:.4f} ± {summary_metrics['exact_match_std']:.4f}")
    print(f"F1 Score: {summary_metrics['f1_score_avg']:.4f} ± {summary_metrics['f1_score_std']:.4f}")
    print(f"ROUGE-L: {summary_metrics['rouge_l_avg']:.4f} ± {summary_metrics['rouge_l_std']:.4f}")
    print(f"Perfect Matches: {summary_metrics['perfect_matches']}/{summary_metrics['total_samples']} ({summary_metrics['perfect_match_rate']:.2%})")

    print("\n🔍 Error Breakdown:")
    for error_type, count in error_analysis.items():
        print(f"  {error_type}: {count} ({count/len(results):.2%})")

    # Save results locally first
    os.makedirs('eval_results', exist_ok=True)

    # Save detailed results
    df_results.to_csv('eval_results/detailed_results.csv', index=False)

    # Save summary metrics
    with open('eval_results/summary_metrics.json', 'w') as f:
        json.dump(summary_metrics, f, indent=2, default=str)

    # Save worst predictions for error analysis
    worst_samples = df_results.nsmallest(10, 'f1_score')[
        ['sample_id', 'input', 'target', 'prediction', 'f1_score', 'error_type']
    ]
    worst_samples.to_csv('eval_results/worst_predictions.csv', index=False)

    # Save best predictions
    best_samples = df_results.nlargest(10, 'f1_score')[
        ['sample_id', 'input', 'target', 'prediction', 'f1_score', 'error_type']
    ]
    best_samples.to_csv('eval_results/best_predictions.csv', index=False)

    print("\n💾 Results saved locally to eval_results/")

    # Upload results to the model repository
    try:
        print("🚀 Uploading results to model repository...")
        api = HfApi()

        # Upload all result files
        files_to_upload = [
            ('eval_results/summary_metrics.json', 'eval_results/summary_metrics.json'),
            ('eval_results/detailed_results.csv', 'eval_results/detailed_results.csv'),
            ('eval_results/worst_predictions.csv', 'eval_results/worst_predictions.csv'),
            ('eval_results/best_predictions.csv', 'eval_results/best_predictions.csv')
        ]

        for local_path, repo_path in files_to_upload:
            api.upload_file(
                path_or_fileobj=local_path,
                path_in_repo=repo_path,
                repo_id=model_name,
                commit_message=f"Add evaluation results - {datetime.now().strftime('%Y-%m-%d %H:%M:%S')}"
            )
            print(f"✅ Uploaded {repo_path}")

        print(f"✅ All evaluation results uploaded to {model_name}")

        # Log to Trackio
        trackio.log({
            "exact_match": summary_metrics['exact_match_avg'],
            "f1_score": summary_metrics['f1_score_avg'],
            "rouge_l": summary_metrics['rouge_l_avg'],
            "perfect_match_rate": summary_metrics['perfect_match_rate'],
            "total_samples": summary_metrics['total_samples']
        })

    except Exception as e:
        print(f"⚠️ Warning: Could not upload to repository: {e}")
        print("💾 Results are saved locally in eval_results/ directory")

    # Close the Trackio run
    trackio.finish()

    print("\n🎉 Evaluation completed successfully!")
    return summary_metrics


if __name__ == "__main__":
    evaluate_model()
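# ---------------------------------------------------------------------------
# Usage sketch (assumptions: the `uv` CLI is installed and this file is saved
# as `evaluate_sec_model.py`; substitute the actual filename). The PEP 723
# metadata block at the top lets uv resolve the dependencies on the fly:
#
#   uv run evaluate_sec_model.py
#
# Uploading results to the model repository additionally requires Hugging Face
# credentials with write access to the target repo, e.g. via
# `huggingface-cli login` or the HF_TOKEN environment variable.
# ---------------------------------------------------------------------------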