import os
import json
import re
import string
from collections import Counter
from datetime import datetime

import numpy as np
import pandas as pd
import torch
import gradio as gr
from datasets import load_dataset
from transformers import AutoTokenizer, AutoModelForQuestionAnswering, pipeline
from sklearn.metrics import f1_score
from huggingface_hub import login


# Normalization functions (identical to extractor)
def normalize_answer(s):
    """Lowercase, strip punctuation/articles, and collapse whitespace (SQuAD-style)."""
    def remove_articles(text):
        return re.sub(r'\b(a|an|the)\b', ' ', text)

    def white_space_fix(text):
        return ' '.join(text.split())

    def remove_punc(text):
        exclude = set(string.punctuation)
        return ''.join(ch for ch in text if ch not in exclude)

    def lower(text):
        return text.lower()

    return white_space_fix(remove_articles(remove_punc(lower(s))))


def f1_score_qa(prediction, ground_truth):
    """Token-level F1 between predicted and ground-truth answers."""
    prediction_tokens = normalize_answer(prediction).split()
    ground_truth_tokens = normalize_answer(ground_truth).split()
    common = Counter(prediction_tokens) & Counter(ground_truth_tokens)
    num_same = sum(common.values())
    if num_same == 0:
        return 0
    precision = 1.0 * num_same / len(prediction_tokens)
    recall = 1.0 * num_same / len(ground_truth_tokens)
    return (2 * precision * recall) / (precision + recall)


def exact_match_score(prediction, ground_truth):
    return normalize_answer(prediction) == normalize_answer(ground_truth)


# Identical confidence calculation to extractor
def get_qa_confidence(model, tokenizer, question, context):
    inputs = tokenizer(
        question,
        context,
        return_tensors="pt",
        truncation=True,
        max_length=512,
        stride=128,
        padding=True,
    )

    if torch.cuda.is_available():
        inputs = {k: v.cuda() for k, v in inputs.items()}
        model = model.cuda()

    with torch.no_grad():
        outputs = model(**inputs)

    start_probs = torch.softmax(outputs.start_logits, dim=1)
    end_probs = torch.softmax(outputs.end_logits, dim=1)

    answer_start = torch.argmax(outputs.start_logits)
    answer_end = torch.argmax(outputs.end_logits) + 1

    # Geometric mean of start/end probabilities as the confidence score
    confidence = np.sqrt(
        start_probs[0, answer_start].item() * end_probs[0, answer_end - 1].item()
    )

    answer_tokens = inputs["input_ids"][0][answer_start:answer_end]
    answer = tokenizer.decode(answer_tokens, skip_special_tokens=True)

    return answer.strip(), float(confidence)


def run_evaluation(num_samples, progress=gr.Progress()):
    # Authentication
    hf_token = os.getenv("EVAL_TOKEN")
    if hf_token:
        login(token=hf_token)

    # Load model same as extractor
    model_name = "AvocadoMuffin/roberta-cuad-qa-v2"
    tokenizer = AutoTokenizer.from_pretrained(model_name, token=hf_token)
    model = AutoModelForQuestionAnswering.from_pretrained(model_name, token=hf_token)

    progress(0.1, desc="Loading CUAD dataset...")
    try:
        dataset = load_dataset(
            "theatticusproject/cuad-qa",
            trust_remote_code=True,
            token=hf_token,
        )
        test_data = dataset["test"].select(range(min(num_samples, len(dataset["test"]))))
        print(f"✓ Loaded {len(test_data)} samples")
    except Exception as e:
        return f"❌ Dataset load failed: {str(e)}", pd.DataFrame(
            columns=["Question", "Prediction", "Truth", "Confidence", "Exact Match", "F1"]
        ), None

    results = []
    for i, example in enumerate(test_data):
        progress(0.2 + 0.7 * i / len(test_data), desc=f"Evaluating {i + 1}/{len(test_data)}")

        context = example["context"]
        question = example["question"]
        gt_answer = example["answers"]["text"][0] if example["answers"]["text"] else ""

        pred_answer, confidence = get_qa_confidence(model, tokenizer, question, context)

        results.append({
            "Question": question[:100] + "..." if len(question) > 100 else question,
            "Prediction": pred_answer,
            "Truth": gt_answer,
            "Confidence": confidence,
            "Exact Match": exact_match_score(pred_answer, gt_answer),
            "F1": f1_score_qa(pred_answer, gt_answer),
        })

    # Generate report
    df = pd.DataFrame(results)
    report = f"""
Evaluation Results (n={len(df)})
=================
- Exact Match: {df['Exact Match'].mean():.1%}
- F1 Score: {df['F1'].mean():.1%}
- Avg Confidence: {df['Confidence'].mean():.1%}
- High-Confidence (>80%) Accuracy: {df[df['Confidence'] > 0.8]['Exact Match'].mean():.1%}
"""

    # Save results
    timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
    results_file = f"eval_results_{timestamp}.json"
    with open(results_file, 'w') as f:
        json.dump({
            "model": model_name,
            "metrics": {
                "exact_match": float(df['Exact Match'].mean()),
                "f1": float(df['F1'].mean()),
                "avg_confidence": float(df['Confidence'].mean()),
            },
            "samples": results,
        }, f, indent=2)

    return report, df, results_file


def create_gradio_interface():
    with gr.Blocks(title="CUAD Evaluator") as demo:
        gr.Markdown("## 🏛️ CUAD QA Model Evaluation")

        with gr.Row():
            num_samples = gr.Slider(10, 500, value=100, step=10, label="Number of Samples")
            eval_btn = gr.Button("🚀 Run Evaluation", variant="primary")

        with gr.Row():
            report = gr.Markdown("Results will appear here...")
            results_table = gr.Dataframe(headers=["Question", "Prediction", "Confidence", "Exact Match"])

        download = gr.File(label="Download Results", visible=False)

        def run_and_display(num_samples):
            report_text, df, file = run_evaluation(num_samples)
            return (
                report_text,
                df[["Question", "Prediction", "Confidence", "Exact Match"]],
                gr.File(visible=True, value=file),
            )

        eval_btn.click(
            fn=run_and_display,
            inputs=num_samples,
            outputs=[report, results_table, download],
        )

    return demo


if __name__ == "__main__":
    # Verify CUDA
    if torch.cuda.is_available():
        print(f"✓ CUDA available: {torch.cuda.get_device_name(0)}")
    else:
        print("! Using CPU")

    # Launch Gradio
    demo = create_gradio_interface()
    demo.launch(
        server_name="0.0.0.0",
        server_port=7860,
        share=True,
    )