Spaces:

HiTZ
/

Critical_Questions_Leaderboard

Running

App Files Files Community

Blanca committed on Sep 18, 2025

Commit

0ef86f0

verified ·

1 Parent(s): c4da416

Update app.py

Browse files

Files changed (1) hide show

app.py +39 -75

app.py CHANGED Viewed

@@ -22,10 +22,10 @@ TOKEN = os.environ.get("TOKEN", None)
 OWNER="Blanca"
 DATA_DATASET = f"{OWNER}/CQs-Gen_test"
-INTERNAL_DATA_DATASET = f"{OWNER}/CQs-Gen_test"
 SUBMISSION_DATASET = f"{OWNER}/submissions_internal"
 SUBMISSION_DATASET_PUBLIC = f"{OWNER}/submissions_public"
-#CONTACT_DATASET = f"{OWNER}/contact_info"
 RESULTS_DATASET = f"{OWNER}/results_public"
 LEADERBOARD_PATH = f"HiTZ/Critical_Questions_Leaderboard"
 METRIC = 'similarity'
@@ -34,13 +34,12 @@ api = HfApi()
 if METRIC == 'similarity':
     similarity_model = SentenceTransformer("stsb-mpnet-base-v2")
-if METRIC == 'gemma':
-    model = AutoModelForCausalLM.from_pretrained('google/gemma-2-9b-it', device_map="auto", attn_implementation='eager')
-    tokenizer = AutoTokenizer.from_pretrained('google/gemma-2-9b-it')
 YEAR_VERSION = "2025"
 ref_scores_len = {"test": 34}
-#ref_level_len = {"validation": {1: 53, 2: 86, 3: 26}, "test": {1: 93, 2: 159, 3: 49}}
 os.makedirs("scored", exist_ok=True)
@@ -58,6 +57,7 @@ test_results = load_dataset(
     trust_remote_code=True,
 )
 eval_results = {"test": test_results}
 #contact_infos = load_dataset(CONTACT_DATASET, YEAR_VERSION, token=TOKEN, download_mode="force_redownload", verification_mode=VerificationMode.NO_CHECKS, trust_remote_code=True)
 def get_dataframe_from_results(eval_results, split):
     local_df = eval_results[split]
@@ -70,8 +70,6 @@ def get_dataframe_from_results(eval_results, split):
     df = pd.DataFrame(local_df)
     df = df.sort_values(by=["Score (%)"], ascending=False)
-    #df["Score (%)"] = df["Score (%)"].multiply(100).round(2)
     return df
@@ -80,8 +78,6 @@ eval_dataframe_test = get_dataframe_from_results(eval_results=eval_results, spli
 # Gold answers
 gold_results = {}
 gold_dataset = load_dataset(INTERNAL_DATA_DATASET, "test", token=TOKEN, trust_remote_code=True)['test']
-#gold_results = {"test": {row["cqs"]: row for row in gold_dataset["test"]}}
 def restart_space():
     api.restart_space(repo_id=LEADERBOARD_PATH, token=TOKEN)
@@ -99,18 +95,13 @@ def run_model(model, tokenizer, prompt):
     generated_ids = model.generate(**inputs, max_new_tokens=512)
-    #generated_ids = [output_ids[len(input_ids):] for input_ids, output_ids in zip(inputs.input_ids, generated_ids)] # this does not work for Gemma
     out = tokenizer.batch_decode(generated_ids, skip_special_tokens=True)[0]
-    #print(out, flush=True)
     try:
         output = out.split('model\n')[1].replace('\n', '')
     except IndexError:
         print('EVAL ERROR: '+output, flush=True)
-    #import pdb; pdb.set_trace()
     output = output.strip()
     return output
@@ -159,7 +150,7 @@ def add_new_eval(
     if datetime.datetime.now() - datetime.datetime.strptime(creation_date, '%Y-%m-%dT%H:%M:%S.%fZ') < datetime.timedelta(days=60):
         return format_error("This account is not authorized to submit on this leaderboard.")
     #contact_infos = load_dataset(CONTACT_DATASET, YEAR_VERSION, token=TOKEN, download_mode="force_redownload", verification_mode=VerificationMode.NO_CHECKS, trust_remote_code=True)
     #user_submission_dates = sorted(row["date"] for row in contact_infos[val_or_test] if row["username"] == profile.username)
     #if len(user_submission_dates) > 0 and user_submission_dates[-1] == datetime.datetime.today().strftime('%Y-%m-%d'):
@@ -188,7 +179,7 @@ def add_new_eval(
         api.upload_file(
             repo_id=SUBMISSION_DATASET,
             path_or_fileobj=path_to_file.name,
-            path_in_repo=f"{organisation}/{model}/{YEAR_VERSION}_{val_or_test}_raw_{datetime.datetime.today()}.jsonl",
             repo_type="dataset",
             token=TOKEN
         )
@@ -203,6 +194,8 @@ def add_new_eval(
         "mail": mail,
         "date": datetime.datetime.today().strftime('%Y-%m-%d')
     }
     #contact_infos[val_or_test]= contact_infos[val_or_test].add_item(contact_info)
     #if LOCAL_DEBUG:
     #    print("mock uploaded contact info")
@@ -215,26 +208,28 @@ def add_new_eval(
     num_questions = 0
     task_ids = []
-    with open(f"scored/{organisation}_{model}.jsonl", "w") as scored_file: # I am not sure where this is being saved
         with open(file_path, 'r') as f:
             data = json.load(f)
             for id_to_eval, line in data.items():
-                score = 0
                 for indx, intervention_id in enumerate(gold_dataset['intervention_id']):
                     if id_to_eval == intervention_id:
                         references = gold_dataset['cqs']
                         reference_set = [row['cq'] for row in references[indx]]
                         #print(reference_set, flush=True)
-                        for cq in line['cqs']:
                             cq_text = cq['cq']
-                            #print(cq_text, flush=True)
                             if METRIC == 'similarity':
                                 sentence_embedding = similarity_model.encode(cq_text)
-                                reference_embedding = similarity_model.encode(reference_set)
                                 sims = similarity_model.similarity(sentence_embedding, reference_embedding).tolist()[0]
-                                #print(sims, flush=True)
                                 winner = np.argmax(sims)
                                 # make sure the similarity of the winning reference sentence is at least 0.65
@@ -261,49 +256,37 @@ def add_new_eval(
                             print(label, flush=True)
                             if label == 'Useful':
-                                score += 1/3
-                print(id_to_eval, score, flush=True)
-                #return format_error(score)
-                #try:
-                #    task = json.loads(line)
-                #except Exception:
-                #    return format_error(f"Line {ix} is incorrectly formatted. Please fix it and resubmit your file.")
-                #if "model_answer" not in task:
-                #    return format_error(f"Line {ix} missing 'model_answer'.")
-                #answer = task["model_answer"]
-                #task_id = task["task_id"]
-                #if task_id not in gold_results[val_or_test]:
-                #    return format_error(f"{task_id} not found in gold set.")
-                #score = question_scorer(answer, gold_results[val_or_test][task_id]["Final answer"])
-                #score = 1
                 scored_file.write(
                     json.dumps({
                         "id": intervention_id,
                         #"model_answer": answer,
-                        "score": score
                     }) + "\n"
                 )
                 task_ids.append(intervention_id)
-                scores += score
-                num_questions += 1
-                break
     # Check if there's any duplicate in the submission
     if len(task_ids) != len(set(task_ids)):
         return format_error("There are duplicates in your submission. Please check your file and resubmit it.")
-    #if any([num_questions[level] != ref_level_len[val_or_test][level] for level in [1, 2, 3]]):
-    #    return format_error(f"Your submission has {num_questions[1]} questions for level 1, {num_questions[2]} for level 2, and {num_questions[3]} for level 3, but it should have {ref_level_len[val_or_test][1]}, {ref_level_len[val_or_test][2]}, and {ref_level_len[val_or_test][3]} respectively. Please check your submission.")
     # SAVE SCORED SUBMISSION
     if LOCAL_DEBUG:
         print("mock uploaded scored submission")
@@ -316,15 +299,6 @@ def add_new_eval(
             token=TOKEN
         )
-        # Save scored file
-        if is_validation:
-            api.upload_file(
-                repo_id=SUBMISSION_DATASET_PUBLIC,
-                path_or_fileobj=f"scored/{organisation}_{model}.jsonl",
-                path_in_repo=f"{organisation}/{model}/{YEAR_VERSION}_{val_or_test}_scored_{datetime.datetime.today()}.jsonl",
-                repo_type="dataset",
-                token=TOKEN
-            )
     # SAVE TO LEADERBOARD DATA
     eval_entry = {
@@ -333,18 +307,13 @@ def add_new_eval(
         "system_prompt": system_prompt,
         "url": url,
         "organisation": organisation,
-        "score": scores / ref_scores_len,#[val_or_test],
-        #"score_level1": scores[1]/num_questions[1],
-        #"score_level2": scores[2]/num_questions[2],
-        #"score_level3": scores[3]/num_questions[3],
         "date": datetime.datetime.today().strftime('%Y-%m-%d')
     }
-    #if num_questions[1] + num_questions[2] + num_questions[3] != ref_scores_len[val_or_test]:
-    #    return format_error(f"Your submission has {len(scores['all'])} questions for the {val_or_test} set, but it should have {ref_scores_len[val_or_test]}. Please check your submission.")
-    # Catching spam submissions of 100%
-    #if all((eval_entry[k] == 1 for k in ["score_level1", "score_level2", "score_level3"])):
-    #    return format_error(f"There was a problem with your submission. Please open a discussion.")
     # Testing for duplicates - to see if we want to add something like it as it would allow people to try to see the content of other submissions
     #eval_entry_no_date = {k: v for k, v in eval_entry if k != "date"}
     #columns_no_date = [c for c in eval_results[val_or_test].column_names if c != "date"]
@@ -400,11 +369,6 @@ with demo:
             value=eval_dataframe_test, datatype=TYPES, interactive=False,
             column_widths=["20%"]
         )
-    #with gr.Tab("Results: Validation"):
-    #    leaderboard_table_val = gr.components.Dataframe(
-    #        value=eval_dataframe_val, datatype=TYPES, interactive=False,
-    #        column_widths=["20%"]
-    #    )
     refresh_button = gr.Button("Refresh")
     refresh_button.click(refresh, inputs=[], outputs=[leaderboard_table_test])

 OWNER="Blanca"
 DATA_DATASET = f"{OWNER}/CQs-Gen_test"
+INTERNAL_DATA_DATASET = f"{OWNER}/CQs-Gen_test" # TODO: change to the dataset that contains the embeddings
 SUBMISSION_DATASET = f"{OWNER}/submissions_internal"
 SUBMISSION_DATASET_PUBLIC = f"{OWNER}/submissions_public"
+#CONTACT_DATASET = f"{OWNER}/contact_info" # TODO: I should reactivate this
 RESULTS_DATASET = f"{OWNER}/results_public"
 LEADERBOARD_PATH = f"HiTZ/Critical_Questions_Leaderboard"
 METRIC = 'similarity'
 if METRIC == 'similarity':
     similarity_model = SentenceTransformer("stsb-mpnet-base-v2")
+if METRIC == 'gemma': # WARNING: this can't be used because I do not have GPU in HF
+    model = AutoModelForCausalLM.from_pretrained('google/gemma-3-12b-it', device_map="auto", attn_implementation='eager')
+    tokenizer = AutoTokenizer.from_pretrained('google/gemma-3-12b-it')
 YEAR_VERSION = "2025"
 ref_scores_len = {"test": 34}
 os.makedirs("scored", exist_ok=True)
     trust_remote_code=True,
 )
 eval_results = {"test": test_results}
+# TODO: I should reactivate saving contact infos
 #contact_infos = load_dataset(CONTACT_DATASET, YEAR_VERSION, token=TOKEN, download_mode="force_redownload", verification_mode=VerificationMode.NO_CHECKS, trust_remote_code=True)
 def get_dataframe_from_results(eval_results, split):
     local_df = eval_results[split]
     df = pd.DataFrame(local_df)
     df = df.sort_values(by=["Score (%)"], ascending=False)
     return df
 # Gold answers
 gold_results = {}
 gold_dataset = load_dataset(INTERNAL_DATA_DATASET, "test", token=TOKEN, trust_remote_code=True)['test']
 def restart_space():
     api.restart_space(repo_id=LEADERBOARD_PATH, token=TOKEN)
     generated_ids = model.generate(**inputs, max_new_tokens=512)
     out = tokenizer.batch_decode(generated_ids, skip_special_tokens=True)[0]
     try:
         output = out.split('model\n')[1].replace('\n', '')
     except IndexError:
         print('EVAL ERROR: '+output, flush=True)
     output = output.strip()
     return output
     if datetime.datetime.now() - datetime.datetime.strptime(creation_date, '%Y-%m-%dT%H:%M:%S.%fZ') < datetime.timedelta(days=60):
         return format_error("This account is not authorized to submit on this leaderboard.")
+    # TODO: I should reactivate this check
     #contact_infos = load_dataset(CONTACT_DATASET, YEAR_VERSION, token=TOKEN, download_mode="force_redownload", verification_mode=VerificationMode.NO_CHECKS, trust_remote_code=True)
     #user_submission_dates = sorted(row["date"] for row in contact_infos[val_or_test] if row["username"] == profile.username)
     #if len(user_submission_dates) > 0 and user_submission_dates[-1] == datetime.datetime.today().strftime('%Y-%m-%d'):
         api.upload_file(
             repo_id=SUBMISSION_DATASET,
             path_or_fileobj=path_to_file.name,
+            path_in_repo=f"{organisation}/{model}/{YEAR_VERSION}_{val_or_test}_raw_{datetime.datetime.today()}.json",
             repo_type="dataset",
             token=TOKEN
         )
         "mail": mail,
         "date": datetime.datetime.today().strftime('%Y-%m-%d')
     }
+    # TODO: reactivate this
     #contact_infos[val_or_test]= contact_infos[val_or_test].add_item(contact_info)
     #if LOCAL_DEBUG:
     #    print("mock uploaded contact info")
     num_questions = 0
     task_ids = []
+    with open(f"scored/{organisation}_{model}.jsonl", "w") as scored_file: # check where is this saved
         with open(file_path, 'r') as f:
             data = json.load(f)
+            scores = []
             for id_to_eval, line in data.items():
+                intervention_score = 0
                 for indx, intervention_id in enumerate(gold_dataset['intervention_id']):
                     if id_to_eval == intervention_id:
                         references = gold_dataset['cqs']
                         reference_set = [row['cq'] for row in references[indx]]
+                        # TODO: load the saved reference embeddings here, so they can be used in the similarity evaluation
                         #print(reference_set, flush=True)
+                        if len(line['cqs']) < 3: # make sure there are at least 3 cqs
+                            return format_warning("Make sure that there are at least 3 questions per intervention, or check that the format is right.")
+                        for cq in line['cqs'][:2]: # here only take the first 3 cqs
                             cq_text = cq['cq']
                             if METRIC == 'similarity':
                                 sentence_embedding = similarity_model.encode(cq_text)
+                                reference_embedding = similarity_model.encode(reference_set) # TODO: use precomputed reference embeddings here; do not recalculate them each time
                                 sims = similarity_model.similarity(sentence_embedding, reference_embedding).tolist()[0]
                                 winner = np.argmax(sims)
                                 # make sure the similarity of the winning reference sentence is at least 0.65
                             print(label, flush=True)
                             if label == 'Useful':
+                                intervention_score += 1/3
+                print(id_to_eval, intervention_score, flush=True)
+                scores.append(intervention_score)
                 scored_file.write(
                     json.dumps({
                         "id": intervention_id,
                         #"model_answer": answer,
+                        "score": intervention_score
                     }) + "\n"
                 )
                 task_ids.append(intervention_id)
+                #scores += score
+                #num_questions += 1
+                #break
+                #return format_error(score)
+            score = sum(scores)/len(score)*10
+            print(score, flush=True)
     # Check if there's any duplicate in the submission
     if len(task_ids) != len(set(task_ids)):
         return format_error("There are duplicates in your submission. Please check your file and resubmit it.")
     # SAVE SCORED SUBMISSION
     if LOCAL_DEBUG:
         print("mock uploaded scored submission")
             token=TOKEN
         )
     # SAVE TO LEADERBOARD DATA
     eval_entry = {
         "system_prompt": system_prompt,
         "url": url,
         "organisation": organisation,
+        "score": score  #s / ref_scores_len,#[val_or_test],
         "date": datetime.datetime.today().strftime('%Y-%m-%d')
     }
+    #TODO: if I find potential errors, I should check them here and maybe suggest that they open a discussion
+    # TODO: I should reactivate this
     # Testing for duplicates - to see if we want to add something like it as it would allow people to try to see the content of other submissions
     #eval_entry_no_date = {k: v for k, v in eval_entry if k != "date"}
     #columns_no_date = [c for c in eval_results[val_or_test].column_names if c != "date"]
             value=eval_dataframe_test, datatype=TYPES, interactive=False,
             column_widths=["20%"]
         )
     refresh_button = gr.Button("Refresh")
     refresh_button.click(refresh, inputs=[], outputs=[leaderboard_table_test])