import os
import json
import re
import string
from collections import Counter
from datetime import datetime

import numpy as np
import pandas as pd
import torch
from datasets import load_dataset
from huggingface_hub import login
from transformers import AutoTokenizer, AutoModelForQuestionAnswering
# SQuAD-style answer normalization (kept identical to the extractor)
def normalize_answer(s):
    def remove_articles(text):
        return re.sub(r'\b(a|an|the)\b', ' ', text)

    def white_space_fix(text):
        return ' '.join(text.split())

    def remove_punc(text):
        exclude = set(string.punctuation)
        return ''.join(ch for ch in text if ch not in exclude)

    def lower(text):
        return text.lower()

    return white_space_fix(remove_articles(remove_punc(lower(s))))


def f1_score_qa(prediction, ground_truth):
    prediction_tokens = normalize_answer(prediction).split()
    ground_truth_tokens = normalize_answer(ground_truth).split()
    common = Counter(prediction_tokens) & Counter(ground_truth_tokens)
    num_same = sum(common.values())
    if num_same == 0:
        return 0.0
    precision = 1.0 * num_same / len(prediction_tokens)
    recall = 1.0 * num_same / len(ground_truth_tokens)
    return (2 * precision * recall) / (precision + recall)


def exact_match_score(prediction, ground_truth):
    return normalize_answer(prediction) == normalize_answer(ground_truth)
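
# Quick sanity check of the metrics above (illustrative values, verified by hand):
#
#   exact_match_score("The Agreement", "agreement")
#   # -> True: both normalize to "agreement"
#   f1_score_qa("governing law of new york", "new york law")
#   # -> precision 3/5, recall 3/3, F1 = 0.75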
# Confidence calculation, kept identical to the extractor: confidence is the
# geometric mean of the softmax probabilities of the chosen start and end positions.
def calculate_confidence(model, tokenizer, question, context):
    # Long contexts are truncated to the model's 512-token window
    inputs = tokenizer(
        question,
        context,
        return_tensors="pt",
        truncation=True,
        max_length=512,
        padding=True
    )
    if torch.cuda.is_available():
        inputs = {k: v.cuda() for k, v in inputs.items()}
        model = model.cuda()  # no-op after the first call; parameters stay on the GPU

    with torch.no_grad():
        outputs = model(**inputs)

    start_probs = torch.softmax(outputs.start_logits, dim=1)
    end_probs = torch.softmax(outputs.end_logits, dim=1)
    # Note: start and end are picked independently; an inverted span
    # (end <= start) decodes to an empty answer
    answer_start = torch.argmax(outputs.start_logits)
    answer_end = torch.argmax(outputs.end_logits) + 1

    start_prob = start_probs[0, answer_start].item()
    end_prob = end_probs[0, answer_end - 1].item()
    confidence = np.sqrt(start_prob * end_prob)  # geometric mean

    answer_tokens = inputs["input_ids"][0][answer_start:answer_end]
    answer = tokenizer.decode(answer_tokens, skip_special_tokens=True).strip()
    return answer, float(confidence)
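
# Example of a standalone call (hypothetical contract snippet; the model name
# matches the one loaded in run_evaluation below):
#
#   tokenizer = AutoTokenizer.from_pretrained("AvocadoMuffin/roberta-cuad-qa-v2")
#   model = AutoModelForQuestionAnswering.from_pretrained("AvocadoMuffin/roberta-cuad-qa-v2")
#   answer, confidence = calculate_confidence(
#       model, tokenizer,
#       "What law governs this agreement?",
#       "This Agreement shall be governed by the laws of the State of Delaware.",
#   )
#   # `answer` is the decoded span; `confidence` lies in [0, 1].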
def run_evaluation(num_samples=100):
    # Authenticate with the Hugging Face Hub if a token is available
    token = os.getenv("HF_TOKEN")
    if token:
        login(token=token)

    # Load the same model the extractor uses
    model_name = "AvocadoMuffin/roberta-cuad-qa-v2"
    tokenizer = AutoTokenizer.from_pretrained(model_name)
    model = AutoModelForQuestionAnswering.from_pretrained(model_name)

    # Load the CUAD QA dataset and take the first num_samples test examples
    dataset = load_dataset("theatticusproject/cuad-qa", token=token)
    test_data = dataset["test"].select(range(min(num_samples, len(dataset["test"]))))
    results = []
    for example in test_data:
        context = example["context"]
        question = example["question"]
        # CUAD questions can have no annotated answer; score those against ""
        gt_answer = example["answers"]["text"][0] if example["answers"]["text"] else ""

        pred_answer, confidence = calculate_confidence(model, tokenizer, question, context)
        results.append({
            "question": question,
            "prediction": pred_answer,
            "ground_truth": gt_answer,
            "confidence": confidence,
            "exact_match": int(exact_match_score(pred_answer, gt_answer)),
            "f1": f1_score_qa(pred_answer, gt_answer)
        })
    # Aggregate metrics
    df = pd.DataFrame(results)
    avg_metrics = {
        "exact_match": df["exact_match"].mean() * 100,
        "f1": df["f1"].mean() * 100,
        "confidence": df["confidence"].mean() * 100
    }

    # Confidence calibration: how often are high-confidence predictions correct?
    high_conf_correct = df[(df["confidence"] > 0.8) & (df["exact_match"] == 1)].shape[0]
    high_conf_total = df[df["confidence"] > 0.8].shape[0]

    report = f"""
CUAD Evaluation Report (n={len(df)})
====================================
Accuracy:
- Exact Match: {avg_metrics['exact_match']:.2f}%
- F1 Score: {avg_metrics['f1']:.2f}%

Confidence Analysis:
- Avg Confidence: {avg_metrics['confidence']:.2f}%
- High-Confidence (>80%) Accuracy: {high_conf_correct}/{high_conf_total} ({high_conf_correct / max(1, high_conf_total) * 100:.1f}%)

Confidence vs Accuracy:
{df[['confidence', 'exact_match']].corr().iloc[0, 1]:.3f} correlation
"""
    # Save results alongside the configuration that produced them
    timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
    results_file = f"cuad_eval_{timestamp}.json"
    with open(results_file, "w") as f:
        json.dump({
            "metrics": avg_metrics,
            "samples": results,
            "config": {
                "model": model_name,
                "confidence_method": "geometric_mean_start_end_probs"
            }
        }, f, indent=2)

    return report, df, results_file
if __name__ == "__main__":
    report, df, _ = run_evaluation()
    print(report)
    print("\nSample predictions:")
    print(df.head())
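
# Optional: a minimal calibration-plot sketch (not part of the original
# pipeline; assumes matplotlib is installed). It bins predictions by
# confidence and compares each bin's exact-match rate against its mean
# confidence; points on the diagonal indicate well-calibrated scores.
#
#   import matplotlib.pyplot as plt
#   bins = pd.cut(df["confidence"], bins=10)
#   calib = df.groupby(bins, observed=True)[["confidence", "exact_match"]].mean()
#   plt.plot(calib["confidence"], calib["exact_match"], marker="o")
#   plt.plot([0, 1], [0, 1], linestyle="--", label="perfect calibration")
#   plt.xlabel("Mean predicted confidence")
#   plt.ylabel("Exact-match rate")
#   plt.legend()
#   plt.savefig("calibration.png")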