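"""Evaluate the fine-tuned SEC model on its dataset.

Loads the model and dataset from the Hugging Face Hub, generates predictions
for up to 100 examples, scores them with exact match, token-level F1, and
ROUGE-L, writes detailed and summary results to eval_results/, and uploads
them to the model repository while logging summary metrics with trackio.
"""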
import os
import json
import re
from datetime import datetime

import pandas as pd
import torch
import trackio
from datasets import load_dataset
from huggingface_hub import HfApi
from rouge_score import rouge_scorer
from transformers import AutoTokenizer, AutoModelForCausalLM, pipeline


def normalize_text(text):
    """Normalize text for comparison"""
    if not isinstance(text, str):
        return ""

    # Collapse whitespace and lowercase so comparisons ignore formatting.
    text = re.sub(r'\s+', ' ', text.strip())
    return text.lower()


def compute_exact_match(pred, true):
    """Compute exact match score"""
    return float(normalize_text(pred) == normalize_text(true))


def compute_f1_score(pred, true):
    """Compute token-level F1 score (over unique tokens)"""
    pred_tokens = normalize_text(pred).split()
    true_tokens = normalize_text(true).split()

    if len(pred_tokens) == 0 and len(true_tokens) == 0:
        return 1.0
    if len(pred_tokens) == 0 or len(true_tokens) == 0:
        return 0.0

    # Set-based overlap: repeated tokens count once.
    pred_set = set(pred_tokens)
    true_set = set(true_tokens)

    intersection = pred_set.intersection(true_set)
    precision = len(intersection) / len(pred_set)
    recall = len(intersection) / len(true_set)

    if precision + recall == 0:
        return 0.0

    return 2 * (precision * recall) / (precision + recall)


def compute_rouge_l(pred, true):
    """Compute ROUGE-L score"""
    scorer = rouge_scorer.RougeScorer(['rougeL'], use_stemmer=True)
    scores = scorer.score(normalize_text(true), normalize_text(pred))
    return scores['rougeL'].fmeasure


def evaluate_model():
    trackio.init()

    print("Starting model evaluation...")

    model_name = "ligaments-enterprise/llama3.2-1b-instruct-sec-finetuned"
    dataset_name = "ligaments-enterprise/sec-data"

    print(f"Loading dataset: {dataset_name}")
    try:
        dataset = load_dataset(dataset_name, split="train")
        print(f"Dataset loaded successfully. Size: {len(dataset)}")
    except Exception as e:
        print(f"Error loading dataset: {e}")
        # Fall back to loading all splits and taking the first one available.
        try:
            dataset = load_dataset(dataset_name)
            if isinstance(dataset, dict):
                split_name = list(dataset.keys())[0]
                dataset = dataset[split_name]
                print(f"Using split '{split_name}'. Size: {len(dataset)}")
        except Exception as e2:
            print(f"Failed to load dataset: {e2}")
            return

    print(f"Dataset columns: {dataset.column_names}")
    print(f"First example: {dataset[0]}")
    # Identify the input/output columns from a list of common names.
    possible_input_cols = ['prompt', 'input', 'question', 'instruction', 'text']
    possible_output_cols = ['response', 'output', 'answer', 'completion', 'target']

    input_col = None
    output_col = None

    for col in possible_input_cols:
        if col in dataset.column_names:
            input_col = col
            break

    for col in possible_output_cols:
        if col in dataset.column_names:
            output_col = col
            break

    # Chat-style datasets keep conversations in a 'messages' column.
    if 'messages' in dataset.column_names:
        print("Detected messages format, extracting prompts and responses...")

        def extract_from_messages(example):
            messages = example['messages']
            if isinstance(messages, list) and len(messages) >= 2:
                user_msg = None
                assistant_msg = None
                for msg in messages:
                    if msg.get('role') == 'user':
                        user_msg = msg.get('content', '')
                    elif msg.get('role') == 'assistant':
                        assistant_msg = msg.get('content', '')

                return {
                    'input_text': user_msg or '',
                    'target_text': assistant_msg or ''
                }
            return {'input_text': '', 'target_text': ''}

        dataset = dataset.map(extract_from_messages)
        input_col = 'input_text'
        output_col = 'target_text'

    if not input_col or not output_col:
        print(f"Could not identify input/output columns. Available: {dataset.column_names}")
        return

    print(f"Using input column: {input_col}, output column: {output_col}")
print(f"π€ Loading model: {model_name}") |
|
|
try: |
|
|
tokenizer = AutoTokenizer.from_pretrained(model_name) |
|
|
model = AutoModelForCausalLM.from_pretrained( |
|
|
model_name, |
|
|
torch_dtype=torch.float16, |
|
|
device_map="auto", |
|
|
trust_remote_code=True |
|
|
) |
|
|
|
|
|
|
|
|
if tokenizer.pad_token is None: |
|
|
tokenizer.pad_token = tokenizer.eos_token |
|
|
|
|
|
print("β
Model loaded successfully") |
|
|
except Exception as e: |
|
|
print(f"β Error loading model: {e}") |
|
|
return |
|
|
|
|
|
|
|
|
generator = pipeline( |
|
|
"text-generation", |
|
|
model=model, |
|
|
tokenizer=tokenizer, |
|
|
torch_dtype=torch.float16, |
|
|
device_map="auto" |
|
|
) |
|
|
|
|
|
|
|
|
    eval_size = min(100, len(dataset))
    eval_dataset = dataset.select(range(eval_size))
    print(f"Evaluating on {eval_size} samples...")

    results = []

    for i, example in enumerate(eval_dataset):
        if i % 10 == 0:
            print(f"Processing sample {i+1}/{eval_size}")

        input_text = example[input_col]
        target_text = example[output_col]

        if not input_text or not target_text:
            continue

        try:
            # Make sure the prompt ends with punctuation so generation starts cleanly.
            if not input_text.strip().endswith(('?', '.', '!', ':')):
                formatted_prompt = f"{input_text.strip()}:"
            else:
                formatted_prompt = input_text.strip()

            # Greedy decoding (do_sample=False), so no sampling temperature is needed.
            generated = generator(
                formatted_prompt,
                max_new_tokens=256,
                do_sample=False,
                pad_token_id=tokenizer.eos_token_id,
                return_full_text=False
            )

            prediction = generated[0]['generated_text'].strip()

            exact_match = compute_exact_match(prediction, target_text)
            f1 = compute_f1_score(prediction, target_text)
            rouge_l = compute_rouge_l(prediction, target_text)

            # Rough error taxonomy based on the metrics and length ratios.
            error_type = "correct" if exact_match == 1.0 else "incorrect"
            if exact_match == 0 and f1 > 0.5:
                error_type = "partial_match"
            elif exact_match == 0 and rouge_l > 0.3:
                error_type = "semantic_similarity"
            elif len(prediction.split()) > len(target_text.split()) * 2:
                error_type = "too_verbose"
            elif len(prediction.split()) < len(target_text.split()) * 0.5:
                error_type = "too_brief"

            result = {
                'sample_id': i,
                'input': input_text,
                'target': target_text,
                'prediction': prediction,
                'exact_match': exact_match,
                'f1_score': f1,
                'rouge_l': rouge_l,
                'error_type': error_type,
                'input_length': len(input_text.split()),
                'target_length': len(target_text.split()),
                'prediction_length': len(prediction.split())
            }

            results.append(result)

        except Exception as e:
            print(f"Error processing sample {i}: {e}")
            continue

    if not results:
        print("No results generated")
        return
    df_results = pd.DataFrame(results)

    summary_metrics = {
        'evaluation_timestamp': datetime.now().isoformat(),
        'model_name': model_name,
        'dataset_name': dataset_name,
        'total_samples': len(results),
        'exact_match_avg': df_results['exact_match'].mean(),
        'f1_score_avg': df_results['f1_score'].mean(),
        'rouge_l_avg': df_results['rouge_l'].mean(),
        'exact_match_std': df_results['exact_match'].std(),
        'f1_score_std': df_results['f1_score'].std(),
        'rouge_l_std': df_results['rouge_l'].std(),
        'perfect_matches': int(df_results['exact_match'].sum()),
        'perfect_match_rate': df_results['exact_match'].mean()
    }

    error_analysis = df_results['error_type'].value_counts().to_dict()
    summary_metrics['error_breakdown'] = error_analysis

    df_results['target_length_bucket'] = pd.cut(
        df_results['target_length'],
        bins=[0, 10, 25, 50, 100, float('inf')],
        labels=['very_short', 'short', 'medium', 'long', 'very_long']
    )

    length_performance = df_results.groupby('target_length_bucket')[['exact_match', 'f1_score', 'rouge_l']].mean().to_dict()
    summary_metrics['performance_by_length'] = length_performance

    print("\nEVALUATION RESULTS:")
    print(f"Total Samples: {summary_metrics['total_samples']}")
    print(f"Exact Match: {summary_metrics['exact_match_avg']:.4f} ± {summary_metrics['exact_match_std']:.4f}")
    print(f"F1 Score: {summary_metrics['f1_score_avg']:.4f} ± {summary_metrics['f1_score_std']:.4f}")
    print(f"ROUGE-L: {summary_metrics['rouge_l_avg']:.4f} ± {summary_metrics['rouge_l_std']:.4f}")
    print(f"Perfect Matches: {summary_metrics['perfect_matches']}/{summary_metrics['total_samples']} ({summary_metrics['perfect_match_rate']:.2%})")

    print("\nError Breakdown:")
    for error_type, count in error_analysis.items():
        print(f"  {error_type}: {count} ({count/len(results):.2%})")
    os.makedirs('eval_results', exist_ok=True)

    df_results.to_csv('eval_results/detailed_results.csv', index=False)

    with open('eval_results/summary_metrics.json', 'w') as f:
        json.dump(summary_metrics, f, indent=2, default=str)

    worst_samples = df_results.nsmallest(10, 'f1_score')[['sample_id', 'input', 'target', 'prediction', 'f1_score', 'error_type']]
    worst_samples.to_csv('eval_results/worst_predictions.csv', index=False)

    best_samples = df_results.nlargest(10, 'f1_score')[['sample_id', 'input', 'target', 'prediction', 'f1_score', 'error_type']]
    best_samples.to_csv('eval_results/best_predictions.csv', index=False)

    print("\nResults saved locally to eval_results/")
    try:
        print("Uploading results to model repository...")
        api = HfApi()

        files_to_upload = [
            ('eval_results/summary_metrics.json', 'eval_results/summary_metrics.json'),
            ('eval_results/detailed_results.csv', 'eval_results/detailed_results.csv'),
            ('eval_results/worst_predictions.csv', 'eval_results/worst_predictions.csv'),
            ('eval_results/best_predictions.csv', 'eval_results/best_predictions.csv')
        ]

        for local_path, repo_path in files_to_upload:
            api.upload_file(
                path_or_fileobj=local_path,
                path_in_repo=repo_path,
                repo_id=model_name,
                commit_message=f"Add evaluation results - {datetime.now().strftime('%Y-%m-%d %H:%M:%S')}"
            )
            print(f"Uploaded {repo_path}")

        print(f"All evaluation results uploaded to {model_name}")

        trackio.log({
            "exact_match": summary_metrics['exact_match_avg'],
            "f1_score": summary_metrics['f1_score_avg'],
            "rouge_l": summary_metrics['rouge_l_avg'],
            "perfect_match_rate": summary_metrics['perfect_match_rate'],
            "total_samples": summary_metrics['total_samples']
        })

    except Exception as e:
        print(f"Warning: Could not upload to repository: {e}")
        print("Results are saved locally in eval_results/ directory")

    print("\nEvaluation completed successfully!")
    return summary_metrics


if __name__ == "__main__":
    evaluate_model()