Update app.py
Browse files
app.py
CHANGED
|
@@ -641,8 +641,40 @@ with gr.Blocks(css=css_tech_theme) as demo:
|
|
| 641 |
eval_status = gr.Textbox(label="🛠️ Evaluation Status", interactive=False,scale=1,min_width=1200)
|
| 642 |
|
| 643 |
# Define the functions outside the `with` block
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 644 |
def handle_evaluation(file, model_name):
|
| 645 |
-
# Check if required inputs are provided
|
| 646 |
if not file:
|
| 647 |
return "Error: Please upload a prediction file.", 0, gr.update(visible=False)
|
| 648 |
if not model_name or model_name.strip() == "":
|
|
@@ -652,27 +684,39 @@ with gr.Blocks(css=css_tech_theme) as demo:
|
|
| 652 |
# Load predictions file
|
| 653 |
predictions_df = pd.read_csv(file.name)
|
| 654 |
|
| 655 |
-
# Validate required columns
|
| 656 |
required_columns = ['question_id', 'predicted_answer']
|
| 657 |
missing_columns = [col for col in required_columns if col not in predictions_df.columns]
|
| 658 |
if missing_columns:
|
| 659 |
return (f"Error: Missing required columns in prediction file: {', '.join(missing_columns)}.",
|
| 660 |
0, gr.update(visible=False))
|
| 661 |
|
| 662 |
-
#
|
| 663 |
-
|
| 664 |
-
|
| 665 |
-
|
| 666 |
-
|
| 667 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 668 |
|
| 669 |
-
|
| 670 |
-
|
| 671 |
-
|
| 672 |
-
|
| 673 |
-
|
| 674 |
-
|
| 675 |
-
|
|
|
|
|
|
|
|
|
|
| 676 |
|
| 677 |
|
| 678 |
|
|
|
|
| 641 |
eval_status = gr.Textbox(label="🛠️ Evaluation Status", interactive=False,scale=1,min_width=1200)
|
| 642 |
|
| 643 |
# Define the functions outside the `with` block
|
| 644 |
+
# def handle_evaluation(file, model_name):
|
| 645 |
+
# # Check if required inputs are provided
|
| 646 |
+
# if not file:
|
| 647 |
+
# return "Error: Please upload a prediction file.", 0, gr.update(visible=False)
|
| 648 |
+
# if not model_name or model_name.strip() == "":
|
| 649 |
+
# return "Error: Please enter a model name.", 0, gr.update(visible=False)
|
| 650 |
+
|
| 651 |
+
# try:
|
| 652 |
+
# # Load predictions file
|
| 653 |
+
# predictions_df = pd.read_csv(file.name)
|
| 654 |
+
|
| 655 |
+
# # Validate required columns in the prediction file
|
| 656 |
+
# required_columns = ['question_id', 'predicted_answer']
|
| 657 |
+
# missing_columns = [col for col in required_columns if col not in predictions_df.columns]
|
| 658 |
+
# if missing_columns:
|
| 659 |
+
# return (f"Error: Missing required columns in prediction file: {', '.join(missing_columns)}.",
|
| 660 |
+
# 0, gr.update(visible=False))
|
| 661 |
+
|
| 662 |
+
# # Perform evaluation
|
| 663 |
+
# status, leaderboard = evaluate_predictions(file, model_name, add_to_leaderboard=False)
|
| 664 |
+
# if leaderboard.empty:
|
| 665 |
+
# overall_accuracy = 0
|
| 666 |
+
# else:
|
| 667 |
+
# overall_accuracy = leaderboard.iloc[-1]["Overall Accuracy"]
|
| 668 |
+
|
| 669 |
+
|
| 670 |
+
# # Show the submit button after successful evaluation
|
| 671 |
+
# return status, overall_accuracy, gr.update(visible=True)
|
| 672 |
+
|
| 673 |
+
# except Exception as e:
|
| 674 |
+
# # Handle unexpected errors
|
| 675 |
+
# return f"Error during evaluation: {str(e)}", 0, gr.update(visible=False)
|
| 676 |
+
|
| 677 |
def handle_evaluation(file, model_name):
    """Evaluate an uploaded predictions CSV against the hosted ground-truth set.

    Args:
        file: Gradio file object for the uploaded predictions CSV; must contain
            'question_id' and 'predicted_answer' columns.
        model_name: Display name of the model being evaluated; must be non-empty.

    Returns:
        A 3-tuple consumed by the Gradio UI:
        (status message: str, overall accuracy in percent: float,
         gr.update controlling visibility of the submit button).
        On any error the accuracy is 0 and the submit button stays hidden.
    """
    # Guard clauses: validate inputs before doing any work.
    if not file:
        return "Error: Please upload a prediction file.", 0, gr.update(visible=False)
    if not model_name or model_name.strip() == "":
        return "Error: Please enter a model name.", 0, gr.update(visible=False)

    try:
        # Load predictions file.
        predictions_df = pd.read_csv(file.name)

        # Validate required columns in the prediction file.
        required_columns = ['question_id', 'predicted_answer']
        missing_columns = [col for col in required_columns if col not in predictions_df.columns]
        if missing_columns:
            return (f"Error: Missing required columns in prediction file: {', '.join(missing_columns)}.",
                    0, gr.update(visible=False))

        # Load ground truth from the private HF dataset repo. Isolated in its
        # own try/except so auth/network failures produce a specific message.
        try:
            ground_truth_path = hf_hub_download(
                repo_id="SondosMB/ground-truth-dataset",
                filename="ground_truth.csv",
                repo_type="dataset",
                # `use_auth_token` is deprecated in huggingface_hub; `token`
                # is the supported spelling with identical semantics.
                token=True,
            )
            ground_truth_df = pd.read_csv(ground_truth_path)
        except Exception as e:
            return f"Error loading ground truth: {e}", 0, gr.update(visible=False)

        # Score: inner-join on question_id, normalize predictions, then count
        # exact matches. Rows whose prediction fails to parse (clean_answer ->
        # NaN) are dropped from the numerator but deliberately kept in the
        # denominator, so unparseable answers count as wrong.
        merged_df = pd.merge(predictions_df, ground_truth_df, on='question_id', how='inner')
        merged_df['pred_answer'] = merged_df['predicted_answer'].apply(clean_answer)

        valid_predictions = merged_df.dropna(subset=['pred_answer'])
        correct_predictions = (valid_predictions['pred_answer'] == valid_predictions['Answer']).sum()
        total_predictions = len(merged_df)

        # Guard against an empty join (no overlapping question_ids).
        overall_accuracy = (correct_predictions / total_predictions * 100) if total_predictions > 0 else 0

        # Reveal the leaderboard-submit button only after a successful run.
        return "Evaluation completed successfully.", overall_accuracy, gr.update(visible=True)

    except Exception as e:
        # Catch-all boundary for the UI: surface the error instead of crashing
        # the Gradio callback.
        return f"Error during evaluation: {str(e)}", 0, gr.update(visible=False)
| 720 |
|
| 721 |
|
| 722 |
|