Spaces:

AvocadoMuffin
/

eval_model

Running

App Files Community

AvocadoMuffin committed on Jul 11, 2025

Commit

98d17bf

verified ·

1 Parent(s): cb1cf5c

Update app.py

Browse files

Files changed (1) hide show

app.py +97 -72

app.py CHANGED Viewed

@@ -2,7 +2,7 @@ import os
 import json
 import numpy as np
 from datasets import load_dataset
-from transformers import AutoTokenizer, AutoModelForQuestionAnswering, pipeline
 import torch
 from sklearn.metrics import f1_score
 import re
@@ -13,17 +13,17 @@ import gradio as gr
 import pandas as pd
 from datetime import datetime
-# Normalization functions (identical to extractor)
 def normalize_answer(s):
     def remove_articles(text): return re.sub(r'\b(a|an|the)\b', ' ', text)
     def white_space_fix(text): return ' '.join(text.split())
     def remove_punc(text):
-        exclude = set(string.punctuation)
-        return ''.join(ch for ch in text if ch not in exclude)
     def lower(text): return text.lower()
-    return white_space_fix(remove_articles(remove_punc(lower(s)))
 def f1_score_qa(prediction, ground_truth):
     prediction_tokens = normalize_answer(prediction).split()
     ground_truth_tokens = normalize_answer(ground_truth).split()
     common = Counter(prediction_tokens) & Counter(ground_truth_tokens)
@@ -34,10 +34,11 @@ def f1_score_qa(prediction, ground_truth):
     return (2 * precision * recall) / (precision + recall)
 def exact_match_score(prediction, ground_truth):
     return normalize_answer(prediction) == normalize_answer(ground_truth)
-# Identical confidence calculation to extractor
 def get_qa_confidence(model, tokenizer, question, context):
     inputs = tokenizer(
         question, context,
         return_tensors="pt",
@@ -48,7 +49,6 @@ def get_qa_confidence(model, tokenizer, question, context):
     )
     if torch.cuda.is_available():
         inputs = {k:v.cuda() for k,v in inputs.items()}
-        model = model.cuda()
     with torch.no_grad():
         outputs = model(**inputs)
@@ -64,20 +64,30 @@ def get_qa_confidence(model, tokenizer, question, context):
     )
     answer_tokens = inputs["input_ids"][0][answer_start:answer_end]
-    answer = tokenizer.decode(answer_tokens, skip_special_tokens=True)
-    return answer.strip(), float(confidence)
 def run_evaluation(num_samples, progress=gr.Progress()):
     # Authentication
     hf_token = os.getenv("EVAL_TOKEN")
     if hf_token:
-        login(token=hf_token)
-    # Load model same as extractor
     model_name = "AvocadoMuffin/roberta-cuad-qa-v2"
-    tokenizer = AutoTokenizer.from_pretrained(model_name, token=hf_token)
-    model = AutoModelForQuestionAnswering.from_pretrained(model_name, token=hf_token)
     progress(0.1, desc="Loading CUAD dataset...")
     try:
         dataset = load_dataset(
@@ -86,96 +96,111 @@ def run_evaluation(num_samples, progress=gr.Progress()):
             token=hf_token
         )
         test_data = dataset["test"].select(range(min(num_samples, len(dataset["test"]))))
-        print(f"✓ Loaded {len(test_data)} samples")
     except Exception as e:
-        return f"❌ Dataset load failed: {str(e)}", pd.DataFrame(), None
-    results = []
     for i, example in enumerate(test_data):
-        progress(0.2 + 0.7*i/num_samples, desc=f"Evaluating {i+1}/{num_samples}")
-        context = example["context"]
-        question = example["question"]
-        gt_answer = example["answers"]["text"][0] if example["answers"]["text"] else ""
-        pred_answer, confidence = get_qa_confidence(model, tokenizer, question, context)
-        results.append({
-            "Question": question[:100] + "..." if len(question) > 100 else question,
-            "Prediction": pred_answer,
-            "Truth": gt_answer,
-            "Confidence": confidence,
-            "Exact Match": exact_match_score(pred_answer, gt_answer),
-            "F1": f1_score_qa(pred_answer, gt_answer)
-        })
-    # Generate report
-    df = pd.DataFrame(results)
-    report = f"""
-    Evaluation Results (n={len(df)})
-    =================
-    - Exact Match: {df['Exact Match'].mean():.1%}
-    - F1 Score: {df['F1'].mean():.1%}
-    - Avg Confidence: {df['Confidence'].mean():.1%}
     - High-Confidence (>80%) Accuracy: {
-        df[df['Confidence'] > 0.8]['Exact Match'].mean():.1%}
     """
-    # Save results
     timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
-    results_file = f"eval_results_{timestamp}.json"
-    with open(results_file, 'w') as f:
         json.dump({
             "model": model_name,
             "metrics": {
-                "exact_match": float(df['Exact Match'].mean()),
-                "f1": float(df['F1'].mean()),
                 "avg_confidence": float(df['Confidence'].mean())
             },
-            "samples": results
         }, f, indent=2)
-    return report, df, results_file
 def create_gradio_interface():
-    with gr.Blocks(title="CUAD Evaluator") as demo:
-        gr.Markdown("## 🏛️ CUAD QA Model Evaluation")
-        with gr.Row():
-            num_samples = gr.Slider(10, 500, value=100, step=10,
-                                   label="Number of Samples")
-            eval_btn = gr.Button("🚀 Run Evaluation", variant="primary")
         with gr.Row():
-            report = gr.Markdown("Results will appear here...")
-            results_table = gr.Dataframe(headers=["Question", "Prediction", "Confidence", "Exact Match"])
-        download = gr.File(label="Download Results", visible=False)
-        def run_and_display(num_samples):
-            report_text, df, file = run_evaluation(num_samples)
             return (
-                report_text,
-                df[["Question", "Prediction", "Confidence", "Exact Match"]],
-                gr.File(visible=True, value=file)
             )
-        eval_btn.click(
-            fn=run_and_display,
             inputs=num_samples,
-            outputs=[report, results_table, download]
         )
     return demo
 if __name__ == "__main__":
-    # Verify CUDA
-    if torch.cuda.is_available():
-        print(f"✓ CUDA available: {torch.cuda.get_device_name(0)}")
-    else:
-        print("! Using CPU")
-    # Launch Gradio
     demo = create_gradio_interface()
     demo.launch(
         server_name="0.0.0.0",

 import json
 import numpy as np
 from datasets import load_dataset
+from transformers import AutoTokenizer, AutoModelForQuestionAnswering
 import torch
 from sklearn.metrics import f1_score
 import re
 import pandas as pd
 from datetime import datetime
 def normalize_answer(s):
+    """Identical to extractor's normalization"""
     def remove_articles(text): return re.sub(r'\b(a|an|the)\b', ' ', text)
     def white_space_fix(text): return ' '.join(text.split())
     def remove_punc(text):
+        return ''.join(ch for ch in text if ch not in set(string.punctuation))
     def lower(text): return text.lower()
+    return white_space_fix(remove_articles(remove_punc(lower(s))))
 def f1_score_qa(prediction, ground_truth):
+    """Identical to original"""
     prediction_tokens = normalize_answer(prediction).split()
     ground_truth_tokens = normalize_answer(ground_truth).split()
     common = Counter(prediction_tokens) & Counter(ground_truth_tokens)
     return (2 * precision * recall) / (precision + recall)
 def exact_match_score(prediction, ground_truth):
+    """Identical to original"""
     return normalize_answer(prediction) == normalize_answer(ground_truth)
 def get_qa_confidence(model, tokenizer, question, context):
+    """Identical to extractor's confidence calculation"""
     inputs = tokenizer(
         question, context,
         return_tensors="pt",
     )
     if torch.cuda.is_available():
         inputs = {k:v.cuda() for k,v in inputs.items()}
     with torch.no_grad():
         outputs = model(**inputs)
     )
     answer_tokens = inputs["input_ids"][0][answer_start:answer_end]
+    answer = tokenizer.decode(answer_tokens, skip_special_tokens=True).strip()
+    return answer, float(confidence)
 def run_evaluation(num_samples, progress=gr.Progress()):
+    """Modified to use extractor's confidence calculation"""
     # Authentication
     hf_token = os.getenv("EVAL_TOKEN")
     if hf_token:
+        try:
+            login(token=hf_token)
+        except Exception as e:
+            print(f"Auth error: {e}")
+    # Load model (raw instead of pipeline)
     model_name = "AvocadoMuffin/roberta-cuad-qa-v2"
+    try:
+        tokenizer = AutoTokenizer.from_pretrained(model_name, token=hf_token)
+        model = AutoModelForQuestionAnswering.from_pretrained(model_name, token=hf_token)
+        if torch.cuda.is_available():
+            model = model.cuda()
+    except Exception as e:
+        return f"❌ Model load failed: {e}", pd.DataFrame(), None
+    # Load dataset
     progress(0.1, desc="Loading CUAD dataset...")
     try:
         dataset = load_dataset(
             token=hf_token
         )
         test_data = dataset["test"].select(range(min(num_samples, len(dataset["test"]))))
     except Exception as e:
+        return f"❌ Dataset load failed: {e}", pd.DataFrame(), None
+    predictions = []
     for i, example in enumerate(test_data):
+        progress((0.2 + 0.7 * i / num_samples), desc=f"Processing {i+1}/{num_samples}")
+        try:
+            context = example["context"]
+            question = example["question"]
+            gt_answer = example["answers"]["text"][0] if example["answers"]["text"] else ""
+            # Use extractor-style confidence
+            pred_answer, confidence = get_qa_confidence(model, tokenizer, question, context)
+            predictions.append({
+                "Sample_ID": i+1,
+                "Question": question[:100] + "..." if len(question) > 100 else question,
+                "Predicted_Answer": pred_answer,
+                "Ground_Truth": gt_answer,
+                "Exact_Match": exact_match_score(pred_answer, gt_answer),
+                "F1_Score": round(f1_score_qa(pred_answer, gt_answer), 3),
+                "Confidence": round(confidence, 3)  # Now matches extractor
+            })
+        except Exception as e:
+            print(f"Error sample {i}: {e}")
+            continue
+    # Generate report (identical to original)
+    if not predictions:
+        return "❌ No valid predictions", pd.DataFrame(), None
+    df = pd.DataFrame(predictions)
+    avg_em = df["Exact_Match"].mean() * 100
+    avg_f1 = df["F1_Score"].mean() * 100
+    results_summary = f"""
+    # 📊 Evaluation Results (n={len(df)})
+    ## 🎯 Metrics
+    - Exact Match: {avg_em:.2f}%
+    - F1 Score: {avg_f1:.2f}%
+    - Avg Confidence: {df['Confidence'].mean():.2%}
+    ## 🔍 Confidence Analysis
     - High-Confidence (>80%) Accuracy: {
+        df[df['Confidence'] > 0.8]['Exact_Match'].mean():.1%}
     """
+    # Save results (identical to original)
     timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
+    results_file = f"cuad_eval_{timestamp}.json"
+    with open(results_file, "w") as f:
         json.dump({
             "model": model_name,
             "metrics": {
+                "exact_match": float(avg_em),
+                "f1_score": float(avg_f1),
                 "avg_confidence": float(df['Confidence'].mean())
             },
+            "samples": predictions
         }, f, indent=2)
+    return results_summary, df, results_file
+# YOUR ORIGINAL GRADIO INTERFACE (COMPLETELY UNCHANGED)
 def create_gradio_interface():
+    with gr.Blocks(title="CUAD Model Evaluator", theme=gr.themes.Soft()) as demo:
+        gr.HTML("""
+        <div style="text-align: center; padding: 20px;">
+            <h1>🏛️ CUAD Model Evaluation Dashboard</h1>
+            <p>Evaluate your CUAD (Contract Understanding Atticus Dataset) Question Answering model</p>
+            <p><strong>Model:</strong> AvocadoMuffin/roberta-cuad-qa-v2</p>
+        </div>
+        """)
         with gr.Row():
+            with gr.Column(scale=1):
+                gr.HTML("<h3>⚙️ Evaluation Settings</h3>")
+                num_samples = gr.Slider(10, 500, value=100, step=10, label="Number of samples")
+                evaluate_btn = gr.Button("🚀 Start Evaluation", variant="primary")
+            with gr.Column(scale=2):
+                results_summary = gr.Markdown("Click '🚀 Start Evaluation' to begin...")
+        gr.HTML("<hr>")
+        detailed_results = gr.Dataframe(interactive=False, wrap=True)
+        download_file = gr.File(visible=False)
+        def handle_eval(num_samples):
+            summary, df, file = run_evaluation(num_samples)
             return (
+                summary,
+                df[["Sample_ID", "Question", "Predicted_Answer", "Confidence", "Exact_Match"]],
+                gr.File(visible=True, value=file) if file else gr.File(visible=False)
             )
+        evaluate_btn.click(
+            fn=handle_eval,
             inputs=num_samples,
+            outputs=[results_summary, detailed_results, download_file],
+            show_progress=True
         )
     return demo
 if __name__ == "__main__":
     demo = create_gradio_interface()
     demo.launch(
         server_name="0.0.0.0",