Update app.py
Browse files
app.py
CHANGED
|
@@ -22,10 +22,10 @@ TOKEN = os.environ.get("TOKEN", None)
|
|
| 22 |
|
| 23 |
OWNER="Blanca"
|
| 24 |
DATA_DATASET = f"{OWNER}/CQs-Gen_test"
|
| 25 |
-
INTERNAL_DATA_DATASET = f"{OWNER}/CQs-Gen_test"
|
| 26 |
SUBMISSION_DATASET = f"{OWNER}/submissions_internal"
|
| 27 |
SUBMISSION_DATASET_PUBLIC = f"{OWNER}/submissions_public"
|
| 28 |
-
#CONTACT_DATASET = f"{OWNER}/contact_info"
|
| 29 |
RESULTS_DATASET = f"{OWNER}/results_public"
|
| 30 |
LEADERBOARD_PATH = f"HiTZ/Critical_Questions_Leaderboard"
|
| 31 |
METRIC = 'similarity'
|
|
@@ -34,13 +34,12 @@ api = HfApi()
|
|
| 34 |
if METRIC == 'similarity':
|
| 35 |
similarity_model = SentenceTransformer("stsb-mpnet-base-v2")
|
| 36 |
|
| 37 |
-
if METRIC == 'gemma':
|
| 38 |
-
model = AutoModelForCausalLM.from_pretrained('google/gemma-
|
| 39 |
-
tokenizer = AutoTokenizer.from_pretrained('google/gemma-
|
| 40 |
|
| 41 |
YEAR_VERSION = "2025"
|
| 42 |
ref_scores_len = {"test": 34}
|
| 43 |
-
#ref_level_len = {"validation": {1: 53, 2: 86, 3: 26}, "test": {1: 93, 2: 159, 3: 49}}
|
| 44 |
|
| 45 |
os.makedirs("scored", exist_ok=True)
|
| 46 |
|
|
@@ -58,6 +57,7 @@ test_results = load_dataset(
|
|
| 58 |
trust_remote_code=True,
|
| 59 |
)
|
| 60 |
eval_results = {"test": test_results}
|
|
|
|
| 61 |
#contact_infos = load_dataset(CONTACT_DATASET, YEAR_VERSION, token=TOKEN, download_mode="force_redownload", verification_mode=VerificationMode.NO_CHECKS, trust_remote_code=True)
|
| 62 |
def get_dataframe_from_results(eval_results, split):
|
| 63 |
local_df = eval_results[split]
|
|
@@ -70,8 +70,6 @@ def get_dataframe_from_results(eval_results, split):
|
|
| 70 |
df = pd.DataFrame(local_df)
|
| 71 |
df = df.sort_values(by=["Score (%)"], ascending=False)
|
| 72 |
|
| 73 |
-
#df["Score (%)"] = df["Score (%)"].multiply(100).round(2)
|
| 74 |
-
|
| 75 |
return df
|
| 76 |
|
| 77 |
|
|
@@ -80,8 +78,6 @@ eval_dataframe_test = get_dataframe_from_results(eval_results=eval_results, spli
|
|
| 80 |
# Gold answers
|
| 81 |
gold_results = {}
|
| 82 |
gold_dataset = load_dataset(INTERNAL_DATA_DATASET, "test", token=TOKEN, trust_remote_code=True)['test']
|
| 83 |
-
#gold_results = {"test": {row["cqs"]: row for row in gold_dataset["test"]}}
|
| 84 |
-
|
| 85 |
|
| 86 |
def restart_space():
|
| 87 |
api.restart_space(repo_id=LEADERBOARD_PATH, token=TOKEN)
|
|
@@ -99,18 +95,13 @@ def run_model(model, tokenizer, prompt):
|
|
| 99 |
|
| 100 |
generated_ids = model.generate(**inputs, max_new_tokens=512)
|
| 101 |
|
| 102 |
-
#generated_ids = [output_ids[len(input_ids):] for input_ids, output_ids in zip(inputs.input_ids, generated_ids)] # this does not work for Gemma
|
| 103 |
-
|
| 104 |
out = tokenizer.batch_decode(generated_ids, skip_special_tokens=True)[0]
|
| 105 |
|
| 106 |
-
#print(out, flush=True)
|
| 107 |
-
|
| 108 |
try:
|
| 109 |
output = out.split('model\n')[1].replace('\n', '')
|
| 110 |
except IndexError:
|
| 111 |
print('EVAL ERROR: '+output, flush=True)
|
| 112 |
|
| 113 |
-
#import pdb; pdb.set_trace()
|
| 114 |
output = output.strip()
|
| 115 |
|
| 116 |
return output
|
|
@@ -159,7 +150,7 @@ def add_new_eval(
|
|
| 159 |
if datetime.datetime.now() - datetime.datetime.strptime(creation_date, '%Y-%m-%dT%H:%M:%S.%fZ') < datetime.timedelta(days=60):
|
| 160 |
return format_error("This account is not authorized to submit on this leaderboard.")
|
| 161 |
|
| 162 |
-
|
| 163 |
#contact_infos = load_dataset(CONTACT_DATASET, YEAR_VERSION, token=TOKEN, download_mode="force_redownload", verification_mode=VerificationMode.NO_CHECKS, trust_remote_code=True)
|
| 164 |
#user_submission_dates = sorted(row["date"] for row in contact_infos[val_or_test] if row["username"] == profile.username)
|
| 165 |
#if len(user_submission_dates) > 0 and user_submission_dates[-1] == datetime.datetime.today().strftime('%Y-%m-%d'):
|
|
@@ -188,7 +179,7 @@ def add_new_eval(
|
|
| 188 |
api.upload_file(
|
| 189 |
repo_id=SUBMISSION_DATASET,
|
| 190 |
path_or_fileobj=path_to_file.name,
|
| 191 |
-
path_in_repo=f"{organisation}/{model}/{YEAR_VERSION}_{val_or_test}_raw_{datetime.datetime.today()}.
|
| 192 |
repo_type="dataset",
|
| 193 |
token=TOKEN
|
| 194 |
)
|
|
@@ -203,6 +194,8 @@ def add_new_eval(
|
|
| 203 |
"mail": mail,
|
| 204 |
"date": datetime.datetime.today().strftime('%Y-%m-%d')
|
| 205 |
}
|
|
|
|
|
|
|
| 206 |
#contact_infos[val_or_test]= contact_infos[val_or_test].add_item(contact_info)
|
| 207 |
#if LOCAL_DEBUG:
|
| 208 |
# print("mock uploaded contact info")
|
|
@@ -215,26 +208,28 @@ def add_new_eval(
|
|
| 215 |
num_questions = 0
|
| 216 |
task_ids = []
|
| 217 |
|
| 218 |
-
with open(f"scored/{organisation}_{model}.jsonl", "w") as scored_file: #
|
| 219 |
with open(file_path, 'r') as f:
|
| 220 |
data = json.load(f)
|
|
|
|
| 221 |
for id_to_eval, line in data.items():
|
| 222 |
-
|
| 223 |
for indx, intervention_id in enumerate(gold_dataset['intervention_id']):
|
| 224 |
if id_to_eval == intervention_id:
|
| 225 |
references = gold_dataset['cqs']
|
| 226 |
reference_set = [row['cq'] for row in references[indx]]
|
|
|
|
|
|
|
| 227 |
#print(reference_set, flush=True)
|
| 228 |
-
|
|
|
|
|
|
|
| 229 |
cq_text = cq['cq']
|
| 230 |
-
#print(cq_text, flush=True)
|
| 231 |
-
|
| 232 |
|
| 233 |
if METRIC == 'similarity':
|
| 234 |
sentence_embedding = similarity_model.encode(cq_text)
|
| 235 |
-
reference_embedding = similarity_model.encode(reference_set)
|
| 236 |
sims = similarity_model.similarity(sentence_embedding, reference_embedding).tolist()[0]
|
| 237 |
-
#print(sims, flush=True)
|
| 238 |
|
| 239 |
winner = np.argmax(sims)
|
| 240 |
# make sure the similarity of the winning reference sentence is at least 0.65
|
|
@@ -261,49 +256,37 @@ def add_new_eval(
|
|
| 261 |
|
| 262 |
print(label, flush=True)
|
| 263 |
if label == 'Useful':
|
| 264 |
-
|
| 265 |
-
|
| 266 |
-
print(id_to_eval, score, flush=True)
|
| 267 |
-
#return format_error(score)
|
| 268 |
-
|
| 269 |
-
|
| 270 |
-
|
| 271 |
-
#try:
|
| 272 |
-
# task = json.loads(line)
|
| 273 |
-
#except Exception:
|
| 274 |
-
# return format_error(f"Line {ix} is incorrectly formatted. Please fix it and resubmit your file.")
|
| 275 |
-
#if "model_answer" not in task:
|
| 276 |
-
# return format_error(f"Line {ix} missing 'model_answer'.")
|
| 277 |
-
#answer = task["model_answer"]
|
| 278 |
-
#task_id = task["task_id"]
|
| 279 |
-
|
| 280 |
-
#if task_id not in gold_results[val_or_test]:
|
| 281 |
-
# return format_error(f"{task_id} not found in gold set.")
|
| 282 |
|
| 283 |
-
|
| 284 |
-
|
| 285 |
|
| 286 |
scored_file.write(
|
| 287 |
json.dumps({
|
| 288 |
"id": intervention_id,
|
| 289 |
#"model_answer": answer,
|
| 290 |
-
"score":
|
| 291 |
}) + "\n"
|
| 292 |
)
|
| 293 |
|
| 294 |
task_ids.append(intervention_id)
|
| 295 |
-
scores += score
|
| 296 |
-
num_questions += 1
|
| 297 |
-
break
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 298 |
|
| 299 |
|
| 300 |
# Check if there's any duplicate in the submission
|
| 301 |
if len(task_ids) != len(set(task_ids)):
|
| 302 |
return format_error("There are duplicates in your submission. Please check your file and resubmit it.")
|
| 303 |
-
|
| 304 |
-
#if any([num_questions[level] != ref_level_len[val_or_test][level] for level in [1, 2, 3]]):
|
| 305 |
-
# return format_error(f"Your submission has {num_questions[1]} questions for level 1, {num_questions[2]} for level 2, and {num_questions[3]} for level 3, but it should have {ref_level_len[val_or_test][1]}, {ref_level_len[val_or_test][2]}, and {ref_level_len[val_or_test][3]} respectively. Please check your submission.")
|
| 306 |
-
|
| 307 |
# SAVE SCORED SUBMISSION
|
| 308 |
if LOCAL_DEBUG:
|
| 309 |
print("mock uploaded scored submission")
|
|
@@ -316,15 +299,6 @@ def add_new_eval(
|
|
| 316 |
token=TOKEN
|
| 317 |
)
|
| 318 |
|
| 319 |
-
# Save scored file
|
| 320 |
-
if is_validation:
|
| 321 |
-
api.upload_file(
|
| 322 |
-
repo_id=SUBMISSION_DATASET_PUBLIC,
|
| 323 |
-
path_or_fileobj=f"scored/{organisation}_{model}.jsonl",
|
| 324 |
-
path_in_repo=f"{organisation}/{model}/{YEAR_VERSION}_{val_or_test}_scored_{datetime.datetime.today()}.jsonl",
|
| 325 |
-
repo_type="dataset",
|
| 326 |
-
token=TOKEN
|
| 327 |
-
)
|
| 328 |
|
| 329 |
# SAVE TO LEADERBOARD DATA
|
| 330 |
eval_entry = {
|
|
@@ -333,18 +307,13 @@ def add_new_eval(
|
|
| 333 |
"system_prompt": system_prompt,
|
| 334 |
"url": url,
|
| 335 |
"organisation": organisation,
|
| 336 |
-
"score":
|
| 337 |
-
#"score_level1": scores[1]/num_questions[1],
|
| 338 |
-
#"score_level2": scores[2]/num_questions[2],
|
| 339 |
-
#"score_level3": scores[3]/num_questions[3],
|
| 340 |
"date": datetime.datetime.today().strftime('%Y-%m-%d')
|
| 341 |
}
|
| 342 |
-
|
| 343 |
-
#
|
| 344 |
-
# Catching spam submissions of 100%
|
| 345 |
-
#if all((eval_entry[k] == 1 for k in ["score_level1", "score_level2", "score_level3"])):
|
| 346 |
-
# return format_error(f"There was a problem with your submission. Please open a discussion.")
|
| 347 |
|
|
|
|
| 348 |
# Testing for duplicates - to see if we want to add something like it as it would allow people to try to see the content of other submissions
|
| 349 |
#eval_entry_no_date = {k: v for k, v in eval_entry if k != "date"}
|
| 350 |
#columns_no_date = [c for c in eval_results[val_or_test].column_names if c != "date"]
|
|
@@ -400,11 +369,6 @@ with demo:
|
|
| 400 |
value=eval_dataframe_test, datatype=TYPES, interactive=False,
|
| 401 |
column_widths=["20%"]
|
| 402 |
)
|
| 403 |
-
#with gr.Tab("Results: Validation"):
|
| 404 |
-
# leaderboard_table_val = gr.components.Dataframe(
|
| 405 |
-
# value=eval_dataframe_val, datatype=TYPES, interactive=False,
|
| 406 |
-
# column_widths=["20%"]
|
| 407 |
-
# )
|
| 408 |
|
| 409 |
refresh_button = gr.Button("Refresh")
|
| 410 |
refresh_button.click(refresh, inputs=[], outputs=[leaderboard_table_test])
|
|
|
|
| 22 |
|
| 23 |
OWNER="Blanca"
|
| 24 |
DATA_DATASET = f"{OWNER}/CQs-Gen_test"
|
| 25 |
+
INTERNAL_DATA_DATASET = f"{OWNER}/CQs-Gen_test" # TODO: change to the dataset that contains the embeddings
|
| 26 |
SUBMISSION_DATASET = f"{OWNER}/submissions_internal"
|
| 27 |
SUBMISSION_DATASET_PUBLIC = f"{OWNER}/submissions_public"
|
| 28 |
+
#CONTACT_DATASET = f"{OWNER}/contact_info" # TODO: I should reactivate this
|
| 29 |
RESULTS_DATASET = f"{OWNER}/results_public"
|
| 30 |
LEADERBOARD_PATH = f"HiTZ/Critical_Questions_Leaderboard"
|
| 31 |
METRIC = 'similarity'
|
|
|
|
| 34 |
if METRIC == 'similarity':
|
| 35 |
similarity_model = SentenceTransformer("stsb-mpnet-base-v2")
|
| 36 |
|
| 37 |
+
if METRIC == 'gemma': # WARNING: this can't be used because I do not have GPU in HF
|
| 38 |
+
model = AutoModelForCausalLM.from_pretrained('google/gemma-3-12b-it', device_map="auto", attn_implementation='eager')
|
| 39 |
+
tokenizer = AutoTokenizer.from_pretrained('google/gemma-3-12b-it')
|
| 40 |
|
| 41 |
YEAR_VERSION = "2025"
|
| 42 |
ref_scores_len = {"test": 34}
|
|
|
|
| 43 |
|
| 44 |
os.makedirs("scored", exist_ok=True)
|
| 45 |
|
|
|
|
| 57 |
trust_remote_code=True,
|
| 58 |
)
|
| 59 |
eval_results = {"test": test_results}
|
| 60 |
+
# TODO: I should reactivate saving contact infos
|
| 61 |
#contact_infos = load_dataset(CONTACT_DATASET, YEAR_VERSION, token=TOKEN, download_mode="force_redownload", verification_mode=VerificationMode.NO_CHECKS, trust_remote_code=True)
|
| 62 |
def get_dataframe_from_results(eval_results, split):
|
| 63 |
local_df = eval_results[split]
|
|
|
|
| 70 |
df = pd.DataFrame(local_df)
|
| 71 |
df = df.sort_values(by=["Score (%)"], ascending=False)
|
| 72 |
|
|
|
|
|
|
|
| 73 |
return df
|
| 74 |
|
| 75 |
|
|
|
|
| 78 |
# Gold answers
|
| 79 |
gold_results = {}
|
| 80 |
gold_dataset = load_dataset(INTERNAL_DATA_DATASET, "test", token=TOKEN, trust_remote_code=True)['test']
|
|
|
|
|
|
|
| 81 |
|
| 82 |
def restart_space():
|
| 83 |
api.restart_space(repo_id=LEADERBOARD_PATH, token=TOKEN)
|
|
|
|
| 95 |
|
| 96 |
generated_ids = model.generate(**inputs, max_new_tokens=512)
|
| 97 |
|
|
|
|
|
|
|
| 98 |
out = tokenizer.batch_decode(generated_ids, skip_special_tokens=True)[0]
|
| 99 |
|
|
|
|
|
|
|
| 100 |
try:
|
| 101 |
output = out.split('model\n')[1].replace('\n', '')
|
| 102 |
except IndexError:
|
| 103 |
# In the visible code `output` is only assigned inside the try above, so it is
# unbound when the split raises IndexError. Report the raw decoded text instead
# and fall back to it so the strip()/return below can still run.
# NOTE(review): confirm that the raw decoded text is the desired fallback value.
print('EVAL ERROR: '+out, flush=True)
output = out
|
| 104 |
|
|
|
|
| 105 |
output = output.strip()
|
| 106 |
|
| 107 |
return output
|
|
|
|
| 150 |
if datetime.datetime.now() - datetime.datetime.strptime(creation_date, '%Y-%m-%dT%H:%M:%S.%fZ') < datetime.timedelta(days=60):
|
| 151 |
return format_error("This account is not authorized to submit on this leaderboard.")
|
| 152 |
|
| 153 |
+
# TODO: I should reactivate this check
|
| 154 |
#contact_infos = load_dataset(CONTACT_DATASET, YEAR_VERSION, token=TOKEN, download_mode="force_redownload", verification_mode=VerificationMode.NO_CHECKS, trust_remote_code=True)
|
| 155 |
#user_submission_dates = sorted(row["date"] for row in contact_infos[val_or_test] if row["username"] == profile.username)
|
| 156 |
#if len(user_submission_dates) > 0 and user_submission_dates[-1] == datetime.datetime.today().strftime('%Y-%m-%d'):
|
|
|
|
| 179 |
api.upload_file(
|
| 180 |
repo_id=SUBMISSION_DATASET,
|
| 181 |
path_or_fileobj=path_to_file.name,
|
| 182 |
+
path_in_repo=f"{organisation}/{model}/{YEAR_VERSION}_{val_or_test}_raw_{datetime.datetime.today()}.json",
|
| 183 |
repo_type="dataset",
|
| 184 |
token=TOKEN
|
| 185 |
)
|
|
|
|
| 194 |
"mail": mail,
|
| 195 |
"date": datetime.datetime.today().strftime('%Y-%m-%d')
|
| 196 |
}
|
| 197 |
+
|
| 198 |
+
# TODO: reactivate this
|
| 199 |
#contact_infos[val_or_test]= contact_infos[val_or_test].add_item(contact_info)
|
| 200 |
#if LOCAL_DEBUG:
|
| 201 |
# print("mock uploaded contact info")
|
|
|
|
| 208 |
num_questions = 0
|
| 209 |
task_ids = []
|
| 210 |
|
| 211 |
+
with open(f"scored/{organisation}_{model}.jsonl", "w") as scored_file: # TODO: check where this file is saved
|
| 212 |
with open(file_path, 'r') as f:
|
| 213 |
data = json.load(f)
|
| 214 |
+
scores = []
|
| 215 |
for id_to_eval, line in data.items():
|
| 216 |
+
intervention_score = 0
|
| 217 |
for indx, intervention_id in enumerate(gold_dataset['intervention_id']):
|
| 218 |
if id_to_eval == intervention_id:
|
| 219 |
references = gold_dataset['cqs']
|
| 220 |
reference_set = [row['cq'] for row in references[indx]]
|
| 221 |
+
# TODO: load the saved reference embeddings here, so they can be used in the similarity evaluation
|
| 222 |
+
|
| 223 |
#print(reference_set, flush=True)
|
| 224 |
+
if len(line['cqs']) < 3: # make sure there are at least 3 cqs
|
| 225 |
+
return format_warning("Make sure that there are at least 3 questions per intervention, or check that the format is right.")
|
| 226 |
+
for cq in line['cqs'][:2]: # here only take the first 3 cqs
|
| 227 |
cq_text = cq['cq']
|
|
|
|
|
|
|
| 228 |
|
| 229 |
if METRIC == 'similarity':
|
| 230 |
sentence_embedding = similarity_model.encode(cq_text)
|
| 231 |
+
reference_embedding = similarity_model.encode(reference_set) # TODO: use precomputed reference embeddings here; do not recalculate them each time
|
| 232 |
sims = similarity_model.similarity(sentence_embedding, reference_embedding).tolist()[0]
|
|
|
|
| 233 |
|
| 234 |
winner = np.argmax(sims)
|
| 235 |
# make sure the similarity of the winning reference sentence is at least 0.65
|
|
|
|
| 256 |
|
| 257 |
print(label, flush=True)
|
| 258 |
if label == 'Useful':
|
| 259 |
+
intervention_score += 1/3
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 260 |
|
| 261 |
+
print(id_to_eval, intervention_score, flush=True)
|
| 262 |
+
scores.append(intervention_score)
|
| 263 |
|
| 264 |
scored_file.write(
|
| 265 |
json.dumps({
|
| 266 |
"id": intervention_id,
|
| 267 |
#"model_answer": answer,
|
| 268 |
+
"score": intervention_score
|
| 269 |
}) + "\n"
|
| 270 |
)
|
| 271 |
|
| 272 |
task_ids.append(intervention_id)
|
| 273 |
+
#scores += score
|
| 274 |
+
#num_questions += 1
|
| 275 |
+
#break
|
| 276 |
+
|
| 277 |
+
|
| 278 |
+
#return format_error(score)
|
| 279 |
+
# Mean of the per-intervention scores collected in `scores`. The divisor must be
# len(scores): `len(score)` read the result variable before it was assigned.
# NOTE(review): the x10 factor is kept unchanged - confirm the intended scale.
score = sum(scores)/len(scores)*10
|
| 280 |
+
print(score, flush=True)
|
| 281 |
+
|
| 282 |
+
|
| 283 |
+
|
| 284 |
|
| 285 |
|
| 286 |
# Check if there's any duplicate in the submission
|
| 287 |
if len(task_ids) != len(set(task_ids)):
|
| 288 |
return format_error("There are duplicates in your submission. Please check your file and resubmit it.")
|
| 289 |
+
|
|
|
|
|
|
|
|
|
|
| 290 |
# SAVE SCORED SUBMISSION
|
| 291 |
if LOCAL_DEBUG:
|
| 292 |
print("mock uploaded scored submission")
|
|
|
|
| 299 |
token=TOKEN
|
| 300 |
)
|
| 301 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 302 |
|
| 303 |
# SAVE TO LEADERBOARD DATA
|
| 304 |
eval_entry = {
|
|
|
|
| 307 |
"system_prompt": system_prompt,
|
| 308 |
"url": url,
|
| 309 |
"organisation": organisation,
|
| 310 |
+
"score": score #s / ref_scores_len,#[val_or_test],
|
|
|
|
|
|
|
|
|
|
| 311 |
"date": datetime.datetime.today().strftime('%Y-%m-%d')
|
| 312 |
}
|
| 313 |
+
|
| 314 |
+
#TODO: if I find potential errors, I should check them here and maybe suggest that they open a discussion
|
|
|
|
|
|
|
|
|
|
| 315 |
|
| 316 |
+
# TODO: I should reactivate this
|
| 317 |
# Testing for duplicates - to see if we want to add something like it as it would allow people to try to see the content of other submissions
|
| 318 |
#eval_entry_no_date = {k: v for k, v in eval_entry if k != "date"}
|
| 319 |
#columns_no_date = [c for c in eval_results[val_or_test].column_names if c != "date"]
|
|
|
|
| 369 |
value=eval_dataframe_test, datatype=TYPES, interactive=False,
|
| 370 |
column_widths=["20%"]
|
| 371 |
)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 372 |
|
| 373 |
refresh_button = gr.Button("Refresh")
|
| 374 |
refresh_button.click(refresh, inputs=[], outputs=[leaderboard_table_test])
|