ligaments-dev committed
Commit 18c6ce2 · verified · 1 Parent(s): ce462eb

Update evaluation script with new token

Files changed (1)
  1. model_evaluation.py +366 -0
model_evaluation.py ADDED
@@ -0,0 +1,366 @@
+ # /// script
+ # dependencies = [
+ #     "transformers>=4.40.0",
+ #     "datasets>=2.18.0",
+ #     "torch>=2.0.0",
+ #     "rouge-score>=0.1.2",
+ #     "evaluate>=0.4.0",
+ #     "numpy>=1.24.0",
+ #     "pandas>=2.0.0",
+ #     "scikit-learn>=1.3.0",
+ #     "huggingface-hub>=0.20.0",
+ #     "accelerate>=0.27.0",
+ #     "trackio"
+ # ]
+ # ///
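+
+ # NOTE: the block above is PEP 723 inline script metadata, so a PEP 723-aware
+ # runner (for example `uv run model_evaluation.py`) can resolve the dependencies
+ # automatically; with plain `python` they must already be installed. The upload
+ # step near the end of the script also expects an HF_TOKEN environment variable
+ # (or a cached `huggingface-cli login`).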
+
+ import os
+ import json
+ import pandas as pd
+ import numpy as np
+ from datetime import datetime
+ from datasets import load_dataset
+ from transformers import AutoTokenizer, AutoModelForCausalLM, pipeline
+ from rouge_score import rouge_scorer
+ from sklearn.metrics import f1_score
+ import re
+ import trackio
+ from huggingface_hub import HfApi, upload_file
+ import torch
+
+ def normalize_text(text):
+     """Normalize text for comparison"""
+     if not isinstance(text, str):
+         return ""
+     # Remove extra whitespace and normalize
+     text = re.sub(r'\s+', ' ', text.strip())
+     return text.lower()
+
+ def compute_exact_match(pred, true):
+     """Compute exact match score"""
+     return float(normalize_text(pred) == normalize_text(true))
+
+ def compute_f1_score(pred, true):
+     """Compute token-level F1 score"""
+     pred_tokens = normalize_text(pred).split()
+     true_tokens = normalize_text(true).split()
+
+     if len(pred_tokens) == 0 and len(true_tokens) == 0:
+         return 1.0
+     if len(pred_tokens) == 0 or len(true_tokens) == 0:
+         return 0.0
+
+     # Convert to sets for intersection
+     pred_set = set(pred_tokens)
+     true_set = set(true_tokens)
+
+     if len(pred_set) == 0 and len(true_set) == 0:
+         return 1.0
+
+     intersection = pred_set.intersection(true_set)
+     precision = len(intersection) / len(pred_set) if pred_set else 0
+     recall = len(intersection) / len(true_set) if true_set else 0
+
+     if precision + recall == 0:
+         return 0.0
+
+     f1 = 2 * (precision * recall) / (precision + recall)
+     return f1
+
+ def compute_rouge_l(pred, true):
+     """Compute ROUGE-L score"""
+     scorer = rouge_scorer.RougeScorer(['rougeL'], use_stemmer=True)
+     scores = scorer.score(normalize_text(true), normalize_text(pred))
+     return scores['rougeL'].fmeasure
+
+ def evaluate_model():
+     # Initialize Trackio
+     trackio.init()
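+     # NOTE: depending on the installed trackio version, init() may expect a
+     # project name, e.g. trackio.init(project="sec-eval"); the name here is
+     # illustrative only.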
+
+     print("🚀 Starting model evaluation...")
+
+     # Configuration
+     model_name = "ligaments-enterprise/llama3.2-1b-instruct-sec-finetuned"
+     dataset_name = "ligaments-enterprise/sec-data"
+
+     print(f"📊 Loading dataset: {dataset_name}")
+     try:
+         # Try to load the dataset
+         dataset = load_dataset(dataset_name, split="train")
+         print(f"✅ Dataset loaded successfully. Size: {len(dataset)}")
+     except Exception as e:
+         print(f"❌ Error loading dataset: {e}")
+         # Try different splits
+         try:
+             dataset = load_dataset(dataset_name)
+             if isinstance(dataset, dict):
+                 # Use the first available split
+                 split_name = list(dataset.keys())[0]
+                 dataset = dataset[split_name]
+                 print(f"✅ Using split '{split_name}'. Size: {len(dataset)}")
+         except Exception as e2:
+             print(f"❌ Failed to load dataset: {e2}")
+             return
+
+     # Inspect dataset structure
+     print(f"📋 Dataset columns: {dataset.column_names}")
+     print(f"📋 First example: {dataset[0]}")
+
+     # Determine input/output columns
+     possible_input_cols = ['prompt', 'input', 'question', 'instruction', 'text']
+     possible_output_cols = ['response', 'output', 'answer', 'completion', 'target']
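+     # The column names above are heuristics for common SFT dataset schemas; if
+     # none match, the 'messages' branch below or the error check further down
+     # handles it.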
+
+     input_col = None
+     output_col = None
+
+     for col in possible_input_cols:
+         if col in dataset.column_names:
+             input_col = col
+             break
+
+     for col in possible_output_cols:
+         if col in dataset.column_names:
+             output_col = col
+             break
+
+     # Handle messages format
+     if 'messages' in dataset.column_names:
+         print("📋 Detected messages format, extracting prompts and responses...")
+         def extract_from_messages(example):
+             messages = example['messages']
+             if isinstance(messages, list) and len(messages) >= 2:
+                 # Find the last user message and assistant response
+                 user_msg = None
+                 assistant_msg = None
+                 for msg in messages:
+                     if msg.get('role') == 'user':
+                         user_msg = msg.get('content', '')
+                     elif msg.get('role') == 'assistant':
+                         assistant_msg = msg.get('content', '')
+
+                 return {
+                     'input_text': user_msg or '',
+                     'target_text': assistant_msg or ''
+                 }
+             return {'input_text': '', 'target_text': ''}
+
+         dataset = dataset.map(extract_from_messages)
+         input_col = 'input_text'
+         output_col = 'target_text'
+
+     if not input_col or not output_col:
+         print(f"❌ Could not identify input/output columns. Available: {dataset.column_names}")
+         return
+
+     print(f"✅ Using input column: {input_col}, output column: {output_col}")
+
+     print(f"🤖 Loading model: {model_name}")
+     try:
+         tokenizer = AutoTokenizer.from_pretrained(model_name)
+         model = AutoModelForCausalLM.from_pretrained(
+             model_name,
+             torch_dtype=torch.float16,
+             device_map="auto",
+             trust_remote_code=True
+         )
+
+         # Set pad token if not set
+         if tokenizer.pad_token is None:
+             tokenizer.pad_token = tokenizer.eos_token
+
+         print("✅ Model loaded successfully")
+     except Exception as e:
+         print(f"❌ Error loading model: {e}")
+         return
+
+     # Create text generation pipeline
+     generator = pipeline(
+         "text-generation",
+         model=model,
+         tokenizer=tokenizer,
+         torch_dtype=torch.float16,
+         device_map="auto"
+     )
+
+     # Limit evaluation to reasonable size for demonstration
+     eval_size = min(100, len(dataset))
+     eval_dataset = dataset.select(range(eval_size))
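+     # NOTE: this evaluates the first `eval_size` rows of the loaded split (train
+     # by default), so the scores may reflect data the model was fine-tuned on
+     # unless a separate held-out split exists.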
188
+ print(f"๐Ÿ“Š Evaluating on {eval_size} samples...")
189
+
190
+ results = []
191
+
192
+ for i, example in enumerate(eval_dataset):
193
+ if i % 10 == 0:
194
+ print(f"๐Ÿ“ˆ Processing sample {i+1}/{eval_size}")
195
+
196
+ input_text = example[input_col]
197
+ target_text = example[output_col]
198
+
199
+ if not input_text or not target_text:
200
+ continue
201
+
202
+ # Generate prediction
203
+ try:
204
+ # Format prompt appropriately
205
+ if not input_text.strip().endswith(('?', '.', '!', ':')):
206
+ formatted_prompt = f"{input_text.strip()}:"
207
+ else:
208
+ formatted_prompt = input_text.strip()
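+             # NOTE: the prompt is sent as raw text; for an instruct-tuned model it
+             # may be closer to the fine-tuning format to build the prompt with
+             # tokenizer.apply_chat_template([{"role": "user", "content": ...}],
+             # add_generation_prompt=True, tokenize=False) instead.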
+
+             generated = generator(
+                 formatted_prompt,
+                 max_new_tokens=256,
+                 do_sample=False,  # Deterministic (greedy) decoding for evaluation; temperature dropped since it is ignored when do_sample=False
+                 pad_token_id=tokenizer.eos_token_id,
+                 return_full_text=False
+             )
+
+             prediction = generated[0]['generated_text'].strip()
+
+             # Compute metrics
+             exact_match = compute_exact_match(prediction, target_text)
+             f1 = compute_f1_score(prediction, target_text)
+             rouge_l = compute_rouge_l(prediction, target_text)
+
+             # Error analysis
+             error_type = "correct" if exact_match == 1.0 else "incorrect"
+             if exact_match == 0 and f1 > 0.5:
+                 error_type = "partial_match"
+             elif exact_match == 0 and rouge_l > 0.3:
+                 error_type = "semantic_similarity"
+             elif len(prediction.split()) > len(target_text.split()) * 2:
+                 error_type = "too_verbose"
+             elif len(prediction.split()) < len(target_text.split()) * 0.5:
+                 error_type = "too_brief"
+
+             result = {
+                 'sample_id': i,
+                 'input': input_text,
+                 'target': target_text,
+                 'prediction': prediction,
+                 'exact_match': exact_match,
+                 'f1_score': f1,
+                 'rouge_l': rouge_l,
+                 'error_type': error_type,
+                 'input_length': len(input_text.split()),
+                 'target_length': len(target_text.split()),
+                 'prediction_length': len(prediction.split())
+             }
+
+             results.append(result)
+
+         except Exception as e:
+             print(f"⚠️ Error processing sample {i}: {e}")
+             continue
+
+     if not results:
+         print("❌ No results generated")
+         return
+
+     # Compute summary statistics
+     df_results = pd.DataFrame(results)
+
+     summary_metrics = {
+         'evaluation_timestamp': datetime.now().isoformat(),
+         'model_name': model_name,
+         'dataset_name': dataset_name,
+         'total_samples': len(results),
+         'exact_match_avg': df_results['exact_match'].mean(),
+         'f1_score_avg': df_results['f1_score'].mean(),
+         'rouge_l_avg': df_results['rouge_l'].mean(),
+         'exact_match_std': df_results['exact_match'].std(),
+         'f1_score_std': df_results['f1_score'].std(),
+         'rouge_l_std': df_results['rouge_l'].std(),
+         'perfect_matches': int(df_results['exact_match'].sum()),
+         'perfect_match_rate': df_results['exact_match'].mean()
+     }
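+     # perfect_match_rate duplicates exact_match_avg (both are the mean of the 0/1
+     # exact-match column); it is kept under its own key in the saved summary.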
+
+     # Error analysis summary
+     error_analysis = df_results['error_type'].value_counts().to_dict()
+     summary_metrics['error_breakdown'] = error_analysis
+
+     # Performance by length buckets
+     df_results['target_length_bucket'] = pd.cut(
+         df_results['target_length'],
+         bins=[0, 10, 25, 50, 100, float('inf')],
+         labels=['very_short', 'short', 'medium', 'long', 'very_long']
+     )
+
+     length_performance = df_results.groupby('target_length_bucket')[['exact_match', 'f1_score', 'rouge_l']].mean().to_dict()
+     summary_metrics['performance_by_length'] = length_performance
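+     # With the current pandas default for categorical groupby (observed=False),
+     # empty length buckets show up as NaN means in performance_by_length.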
+
+     print("\n📊 EVALUATION RESULTS:")
+     print(f"Total Samples: {summary_metrics['total_samples']}")
+     print(f"Exact Match: {summary_metrics['exact_match_avg']:.4f} ± {summary_metrics['exact_match_std']:.4f}")
+     print(f"F1 Score: {summary_metrics['f1_score_avg']:.4f} ± {summary_metrics['f1_score_std']:.4f}")
+     print(f"ROUGE-L: {summary_metrics['rouge_l_avg']:.4f} ± {summary_metrics['rouge_l_std']:.4f}")
+     print(f"Perfect Matches: {summary_metrics['perfect_matches']}/{summary_metrics['total_samples']} ({summary_metrics['perfect_match_rate']:.2%})")
+
+     print("\n🔍 Error Breakdown:")
+     for error_type, count in error_analysis.items():
+         print(f"  {error_type}: {count} ({count/len(results):.2%})")
+
+     # Save results locally first
+     os.makedirs('eval_results', exist_ok=True)
+
+     # Save detailed results
+     df_results.to_csv('eval_results/detailed_results.csv', index=False)
+
+     # Save summary metrics
+     with open('eval_results/summary_metrics.json', 'w') as f:
+         json.dump(summary_metrics, f, indent=2, default=str)
+
+     # Save top errors for analysis
+     worst_samples = df_results.nsmallest(10, 'f1_score')[['sample_id', 'input', 'target', 'prediction', 'f1_score', 'error_type']]
+     worst_samples.to_csv('eval_results/worst_predictions.csv', index=False)
+
+     # Save best samples
+     best_samples = df_results.nlargest(10, 'f1_score')[['sample_id', 'input', 'target', 'prediction', 'f1_score', 'error_type']]
+     best_samples.to_csv('eval_results/best_predictions.csv', index=False)
+
+     print("\n💾 Results saved locally to eval_results/")
+
+     # Upload results to model repository
+     try:
+         print("🚀 Uploading results to model repository...")
+         api = HfApi()
+
+         # Upload all result files
+         files_to_upload = [
+             ('eval_results/summary_metrics.json', 'eval_results/summary_metrics.json'),
+             ('eval_results/detailed_results.csv', 'eval_results/detailed_results.csv'),
+             ('eval_results/worst_predictions.csv', 'eval_results/worst_predictions.csv'),
+             ('eval_results/best_predictions.csv', 'eval_results/best_predictions.csv')
+         ]
+
+         for local_path, repo_path in files_to_upload:
+             api.upload_file(
+                 path_or_fileobj=local_path,
+                 path_in_repo=repo_path,
+                 repo_id=model_name,
+                 commit_message=f"Add evaluation results - {datetime.now().strftime('%Y-%m-%d %H:%M:%S')}",
+                 token=os.getenv('HF_TOKEN')
+             )
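+             # If HF_TOKEN is unset, token=None is passed above and huggingface_hub
+             # falls back to any cached login credentials; without either, the
+             # upload raises an authentication error caught by the except below.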
+             print(f"✅ Uploaded {repo_path}")
+
+         print(f"✅ All evaluation results uploaded to {model_name}")
+
+         # Log to Trackio
+         trackio.log({
+             "exact_match": summary_metrics['exact_match_avg'],
+             "f1_score": summary_metrics['f1_score_avg'],
+             "rouge_l": summary_metrics['rouge_l_avg'],
+             "perfect_match_rate": summary_metrics['perfect_match_rate'],
+             "total_samples": summary_metrics['total_samples']
+         })
+
+     except Exception as e:
+         print(f"⚠️ Warning: Could not upload to repository: {e}")
+         print("💾 Results are saved locally in eval_results/ directory")
+
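+     # Close the Trackio run so buffered metrics are flushed; trackio mirrors the
+     # wandb-style init()/log()/finish() API (finish() assumed available in the
+     # installed version).
+     trackio.finish()
+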
+     print("\n🎉 Evaluation completed successfully!")
+     return summary_metrics
+
+ if __name__ == "__main__":
+     evaluate_model()