update submit.py

src/submission/submit.py (+21 -27)
@@ -1,3 +1,5 @@
+# src/submission/submit.py
+
 import json
 import os
 from datetime import datetime, timezone
@@ -10,7 +12,7 @@ from transformers import AutoTokenizer, AutoModelForCausalLM
 from langchain.prompts import PromptTemplate
 
 from src.display.formatting import styled_error, styled_message, styled_warning
-from src.envs import API, EVAL_REQUESTS_PATH, TOKEN, QUEUE_REPO, EVAL_RESULTS_PATH, RESULTS_REPO
+from src.envs import API, EVAL_REQUESTS_PATH, TOKEN, QUEUE_REPO, EVAL_RESULTS_PATH, RESULTS_REPO, FIXED_QUESTIONS_FILE
 from src.submission.check_validity import (
     already_submitted_models,
     check_model_card,
@@ -69,7 +71,7 @@ def get_top_prediction(text, tokenizer, model):
     return top_option
 
 @spaces.GPU(duration=120)
-def evaluate_model_accuracy_by_subject(model_name, num_examples):
+def evaluate_model_accuracy_by_subject(model_name):
     try:
         # Load the model and tokenizer
        tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True)
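For context, get_top_prediction (named in the hunk header above but untouched by this commit) typically scores the four option letters by the logit each receives as the next token after the prompt. A sketch of one common implementation, not the repo's actual code:

import torch

def get_top_prediction(text, tokenizer, model):
    # Score each answer letter by the logit the model assigns to it
    # as the immediate next token after the formatted prompt.
    inputs = tokenizer(text, return_tensors="pt").to(model.device)
    with torch.no_grad():
        next_token_logits = model(**inputs).logits[0, -1]
    scores = {}
    for option in ["A", "B", "C", "D"]:
        token_id = tokenizer(option, add_special_tokens=False).input_ids[0]
        scores[option] = next_token_logits[token_id].item()
    return max(scores, key=scores.get)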
@@ -84,12 +86,13 @@ def evaluate_model_accuracy_by_subject(model_name, num_examples):
     else:
         model = model.cpu()
 
-    # Load
-
-
+    # Load fixed questions from JSON file
+    fixed_questions_path = os.path.join(EVAL_RESULTS_PATH, FIXED_QUESTIONS_FILE)
+    if not os.path.exists(fixed_questions_path):
+        return "Fixed questions file not found. Please run the preselection step.", {}
 
-
-
+    with open(fixed_questions_path, 'r') as f:
+        fixed_questions = json.load(f)
 
     # Define prompt template
     template = """Answer the following multiple choice question by giving the most appropriate response. Answer should be one among [A, B, C, D].
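The error message above assumes a separate preselection step has already written FIXED_QUESTIONS_FILE under EVAL_RESULTS_PATH; that step is not part of this diff. A minimal sketch of what it could look like (the dataset name, per-subject budget, and seed are assumptions):

import json
import os

from datasets import load_dataset

# Assumed values; in the real repo these would come from src.envs.
EVAL_RESULTS_PATH = "eval-results"
FIXED_QUESTIONS_FILE = "fixed_questions.json"
NUM_PER_SUBJECT = 25  # assumed fixed budget per subject

def preselect(dataset_name: str, seed: int = 42) -> None:
    # Draw a reproducible per-subject sample once and save it, so every
    # submission is evaluated on the identical question set.
    dataset = load_dataset(dataset_name, split="test")
    fixed_questions = {}
    for subject in dataset.unique("Subject"):
        subject_data = dataset.filter(lambda x: x["Subject"] == subject)
        subject_data = subject_data.shuffle(seed=seed).select(
            range(min(NUM_PER_SUBJECT, len(subject_data)))
        )
        fixed_questions[subject] = subject_data.to_list()

    os.makedirs(EVAL_RESULTS_PATH, exist_ok=True)
    with open(os.path.join(EVAL_RESULTS_PATH, FIXED_QUESTIONS_FILE), "w") as f:
        json.dump(fixed_questions, f, indent=4)

Fixing the seed is what makes the sample reproducible, which is the point of replacing the old per-submission shuffle in the next hunk.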
@@ -104,23 +107,15 @@ Answer:"""
 
     # Initialize results storage
     subject_results = {}
-
-    subjects = dataset.unique('Subject')
     overall_correct_predictions = 0
     overall_total_questions = 0
 
-    for subject in subjects:
-        subject_data = dataset.filter(lambda x: x['Subject'] == subject)
-
-        # Sample num_examples from each subject
-        if num_examples > 0:
-            subject_data = subject_data.shuffle().select(range(min(num_examples, len(subject_data))))
-
+    for subject, questions in fixed_questions.items():
         correct_predictions = 0
         total_questions = 0
         results = []
 
-        for data in subject_data:
+        for data in questions:
             # Prepare text input
             text = prompt_template.format(
                 Question=data['Question'],
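The rewritten loop iterates fixed_questions.items(), so the JSON file is expected to map each subject to a list of question records. Only the 'Question' field is confirmed by this hunk; the option and answer fields below are inferred from the [A, B, C, D] prompt template and are an assumed schema:

# Assumed shape of the fixed-questions file (illustrative values).
fixed_questions = {
    "Mathematics": [
        {
            "Question": "What is 2 + 2?",
            "A": "3",
            "B": "4",
            "C": "5",
            "D": "6",
            "Answer": "B",
        },
        # ...more preselected questions for this subject
    ],
    # ...one key per subject
}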
@@ -171,8 +166,7 @@ def add_new_eval(
     revision: str,
     precision: str,
     weight_type: str,
-    model_type: str,
-    num_examples: int
+    model_type: str
 ):
     global REQUESTED_MODELS
     global USERS_TO_SUBMISSION_DATES
@@ -230,7 +224,7 @@ def add_new_eval(
 
     # Now, perform the evaluation
     try:
-        overall_accuracy, subject_results = evaluate_model_accuracy_by_subject(model, num_examples)
+        overall_accuracy, subject_results = evaluate_model_accuracy_by_subject(model)
         if isinstance(overall_accuracy, str) and overall_accuracy.startswith("Error"):
             return styled_error(overall_accuracy)
     except Exception as e:
@@ -239,17 +233,17 @@ def add_new_eval(
     # Prepare results for storage
     results_dict = {
         "config": {
-            "
-            "
-            "
-            "
-            "model_type": model_type,
+            "model": model,
+            "base_model": base_model,
+            "revision": revision,
+            "precision": precision,
             "weight_type": weight_type,
+            "model_type": model_type,
+            "submitted_time": current_time,
             "license": license,
             "likes": model_info.likes,
             "params": model_size,
             "still_on_hub": True,
-            "precision": precision,
         },
         "results": {
             "average": overall_accuracy,
@@ -264,7 +258,7 @@ def add_new_eval(
     # Save results to a JSON file
     results_file_path = f"{EVAL_RESULTS_PATH}/{model.replace('/', '_')}_results.json"
     with open(results_file_path, "w") as f:
-        json.dump(results_dict, f)
+        json.dump(results_dict, f, indent=4)
 
     # Upload the results file
     API.upload_file(
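With indent=4, the per-model results file becomes human-readable before upload. Reading it back might look like this (the model name and EVAL_RESULTS_PATH value are hypothetical; the key layout follows results_dict above):

import json

EVAL_RESULTS_PATH = "eval-results"  # assumed value of src.envs.EVAL_RESULTS_PATH
model = "org/model"                 # hypothetical submission name

results_file_path = f"{EVAL_RESULTS_PATH}/{model.replace('/', '_')}_results.json"
with open(results_file_path) as f:
    results = json.load(f)

print(results["config"]["precision"], results["results"]["average"])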