Blanca committed on
Commit
0ef86f0
·
verified ·
1 Parent(s): c4da416

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +39 -75
app.py CHANGED
@@ -22,10 +22,10 @@ TOKEN = os.environ.get("TOKEN", None)
22
 
23
  OWNER="Blanca"
24
  DATA_DATASET = f"{OWNER}/CQs-Gen_test"
25
- INTERNAL_DATA_DATASET = f"{OWNER}/CQs-Gen_test"
26
  SUBMISSION_DATASET = f"{OWNER}/submissions_internal"
27
  SUBMISSION_DATASET_PUBLIC = f"{OWNER}/submissions_public"
28
- #CONTACT_DATASET = f"{OWNER}/contact_info"
29
  RESULTS_DATASET = f"{OWNER}/results_public"
30
  LEADERBOARD_PATH = f"HiTZ/Critical_Questions_Leaderboard"
31
  METRIC = 'similarity'
@@ -34,13 +34,12 @@ api = HfApi()
34
  if METRIC == 'similarity':
35
  similarity_model = SentenceTransformer("stsb-mpnet-base-v2")
36
 
37
- if METRIC == 'gemma':
38
- model = AutoModelForCausalLM.from_pretrained('google/gemma-2-9b-it', device_map="auto", attn_implementation='eager')
39
- tokenizer = AutoTokenizer.from_pretrained('google/gemma-2-9b-it')
40
 
41
  YEAR_VERSION = "2025"
42
  ref_scores_len = {"test": 34}
43
- #ref_level_len = {"validation": {1: 53, 2: 86, 3: 26}, "test": {1: 93, 2: 159, 3: 49}}
44
 
45
  os.makedirs("scored", exist_ok=True)
46
 
@@ -58,6 +57,7 @@ test_results = load_dataset(
58
  trust_remote_code=True,
59
  )
60
  eval_results = {"test": test_results}
 
61
  #contact_infos = load_dataset(CONTACT_DATASET, YEAR_VERSION, token=TOKEN, download_mode="force_redownload", verification_mode=VerificationMode.NO_CHECKS, trust_remote_code=True)
62
  def get_dataframe_from_results(eval_results, split):
63
  local_df = eval_results[split]
@@ -70,8 +70,6 @@ def get_dataframe_from_results(eval_results, split):
70
  df = pd.DataFrame(local_df)
71
  df = df.sort_values(by=["Score (%)"], ascending=False)
72
 
73
- #df["Score (%)"] = df["Score (%)"].multiply(100).round(2)
74
-
75
  return df
76
 
77
 
@@ -80,8 +78,6 @@ eval_dataframe_test = get_dataframe_from_results(eval_results=eval_results, spli
80
  # Gold answers
81
  gold_results = {}
82
  gold_dataset = load_dataset(INTERNAL_DATA_DATASET, "test", token=TOKEN, trust_remote_code=True)['test']
83
- #gold_results = {"test": {row["cqs"]: row for row in gold_dataset["test"]}}
84
-
85
 
86
  def restart_space():
87
  api.restart_space(repo_id=LEADERBOARD_PATH, token=TOKEN)
@@ -99,18 +95,13 @@ def run_model(model, tokenizer, prompt):
99
 
100
  generated_ids = model.generate(**inputs, max_new_tokens=512)
101
 
102
- #generated_ids = [output_ids[len(input_ids):] for input_ids, output_ids in zip(inputs.input_ids, generated_ids)] # this does not work for Gemma
103
-
104
  out = tokenizer.batch_decode(generated_ids, skip_special_tokens=True)[0]
105
 
106
- #print(out, flush=True)
107
-
108
  try:
109
  output = out.split('model\n')[1].replace('\n', '')
110
  except IndexError:
111
  print('EVAL ERROR: '+output, flush=True)
112
 
113
- #import pdb; pdb.set_trace()
114
  output = output.strip()
115
 
116
  return output
@@ -159,7 +150,7 @@ def add_new_eval(
159
  if datetime.datetime.now() - datetime.datetime.strptime(creation_date, '%Y-%m-%dT%H:%M:%S.%fZ') < datetime.timedelta(days=60):
160
  return format_error("This account is not authorized to submit on this leaderboard.")
161
 
162
-
163
  #contact_infos = load_dataset(CONTACT_DATASET, YEAR_VERSION, token=TOKEN, download_mode="force_redownload", verification_mode=VerificationMode.NO_CHECKS, trust_remote_code=True)
164
  #user_submission_dates = sorted(row["date"] for row in contact_infos[val_or_test] if row["username"] == profile.username)
165
  #if len(user_submission_dates) > 0 and user_submission_dates[-1] == datetime.datetime.today().strftime('%Y-%m-%d'):
@@ -188,7 +179,7 @@ def add_new_eval(
188
  api.upload_file(
189
  repo_id=SUBMISSION_DATASET,
190
  path_or_fileobj=path_to_file.name,
191
- path_in_repo=f"{organisation}/{model}/{YEAR_VERSION}_{val_or_test}_raw_{datetime.datetime.today()}.jsonl",
192
  repo_type="dataset",
193
  token=TOKEN
194
  )
@@ -203,6 +194,8 @@ def add_new_eval(
203
  "mail": mail,
204
  "date": datetime.datetime.today().strftime('%Y-%m-%d')
205
  }
 
 
206
  #contact_infos[val_or_test]= contact_infos[val_or_test].add_item(contact_info)
207
  #if LOCAL_DEBUG:
208
  # print("mock uploaded contact info")
@@ -215,26 +208,28 @@ def add_new_eval(
215
  num_questions = 0
216
  task_ids = []
217
 
218
- with open(f"scored/{organisation}_{model}.jsonl", "w") as scored_file: # I am not sure where this is being saved
219
  with open(file_path, 'r') as f:
220
  data = json.load(f)
 
221
  for id_to_eval, line in data.items():
222
- score = 0
223
  for indx, intervention_id in enumerate(gold_dataset['intervention_id']):
224
  if id_to_eval == intervention_id:
225
  references = gold_dataset['cqs']
226
  reference_set = [row['cq'] for row in references[indx]]
 
 
227
  #print(reference_set, flush=True)
228
- for cq in line['cqs']:
 
 
229
  cq_text = cq['cq']
230
- #print(cq_text, flush=True)
231
-
232
 
233
  if METRIC == 'similarity':
234
  sentence_embedding = similarity_model.encode(cq_text)
235
- reference_embedding = similarity_model.encode(reference_set)
236
  sims = similarity_model.similarity(sentence_embedding, reference_embedding).tolist()[0]
237
- #print(sims, flush=True)
238
 
239
  winner = np.argmax(sims)
240
  # make sure the similarity of the winning reference sentence is at least 0.65
@@ -261,49 +256,37 @@ def add_new_eval(
261
 
262
  print(label, flush=True)
263
  if label == 'Useful':
264
- score += 1/3
265
-
266
- print(id_to_eval, score, flush=True)
267
- #return format_error(score)
268
-
269
-
270
-
271
- #try:
272
- # task = json.loads(line)
273
- #except Exception:
274
- # return format_error(f"Line {ix} is incorrectly formatted. Please fix it and resubmit your file.")
275
- #if "model_answer" not in task:
276
- # return format_error(f"Line {ix} missing 'model_answer'.")
277
- #answer = task["model_answer"]
278
- #task_id = task["task_id"]
279
-
280
- #if task_id not in gold_results[val_or_test]:
281
- # return format_error(f"{task_id} not found in gold set.")
282
 
283
- #score = question_scorer(answer, gold_results[val_or_test][task_id]["Final answer"])
284
- #score = 1
285
 
286
  scored_file.write(
287
  json.dumps({
288
  "id": intervention_id,
289
  #"model_answer": answer,
290
- "score": score
291
  }) + "\n"
292
  )
293
 
294
  task_ids.append(intervention_id)
295
- scores += score
296
- num_questions += 1
297
- break
 
 
 
 
 
 
 
 
298
 
299
 
300
  # Check if there's any duplicate in the submission
301
  if len(task_ids) != len(set(task_ids)):
302
  return format_error("There are duplicates in your submission. Please check your file and resubmit it.")
303
-
304
- #if any([num_questions[level] != ref_level_len[val_or_test][level] for level in [1, 2, 3]]):
305
- # return format_error(f"Your submission has {num_questions[1]} questions for level 1, {num_questions[2]} for level 2, and {num_questions[3]} for level 3, but it should have {ref_level_len[val_or_test][1]}, {ref_level_len[val_or_test][2]}, and {ref_level_len[val_or_test][3]} respectively. Please check your submission.")
306
-
307
  # SAVE SCORED SUBMISSION
308
  if LOCAL_DEBUG:
309
  print("mock uploaded scored submission")
@@ -316,15 +299,6 @@ def add_new_eval(
316
  token=TOKEN
317
  )
318
 
319
- # Save scored file
320
- if is_validation:
321
- api.upload_file(
322
- repo_id=SUBMISSION_DATASET_PUBLIC,
323
- path_or_fileobj=f"scored/{organisation}_{model}.jsonl",
324
- path_in_repo=f"{organisation}/{model}/{YEAR_VERSION}_{val_or_test}_scored_{datetime.datetime.today()}.jsonl",
325
- repo_type="dataset",
326
- token=TOKEN
327
- )
328
 
329
  # SAVE TO LEADERBOARD DATA
330
  eval_entry = {
@@ -333,18 +307,13 @@ def add_new_eval(
333
  "system_prompt": system_prompt,
334
  "url": url,
335
  "organisation": organisation,
336
- "score": scores / ref_scores_len,#[val_or_test],
337
- #"score_level1": scores[1]/num_questions[1],
338
- #"score_level2": scores[2]/num_questions[2],
339
- #"score_level3": scores[3]/num_questions[3],
340
  "date": datetime.datetime.today().strftime('%Y-%m-%d')
341
  }
342
- #if num_questions[1] + num_questions[2] + num_questions[3] != ref_scores_len[val_or_test]:
343
- # return format_error(f"Your submission has {len(scores['all'])} questions for the {val_or_test} set, but it should have {ref_scores_len[val_or_test]}. Please check your submission.")
344
- # Catching spam submissions of 100%
345
- #if all((eval_entry[k] == 1 for k in ["score_level1", "score_level2", "score_level3"])):
346
- # return format_error(f"There was a problem with your submission. Please open a discussion.")
347
 
 
348
  # Testing for duplicates - to see if we want to add something like it as it would allow people to try to see the content of other submissions
349
  #eval_entry_no_date = {k: v for k, v in eval_entry if k != "date"}
350
  #columns_no_date = [c for c in eval_results[val_or_test].column_names if c != "date"]
@@ -400,11 +369,6 @@ with demo:
400
  value=eval_dataframe_test, datatype=TYPES, interactive=False,
401
  column_widths=["20%"]
402
  )
403
- #with gr.Tab("Results: Validation"):
404
- # leaderboard_table_val = gr.components.Dataframe(
405
- # value=eval_dataframe_val, datatype=TYPES, interactive=False,
406
- # column_widths=["20%"]
407
- # )
408
 
409
  refresh_button = gr.Button("Refresh")
410
  refresh_button.click(refresh, inputs=[], outputs=[leaderboard_table_test])
 
22
 
23
  OWNER="Blanca"
24
  DATA_DATASET = f"{OWNER}/CQs-Gen_test"
25
+ INTERNAL_DATA_DATASET = f"{OWNER}/CQs-Gen_test" # TODO: change to the dataset that contains the embeddings
26
  SUBMISSION_DATASET = f"{OWNER}/submissions_internal"
27
  SUBMISSION_DATASET_PUBLIC = f"{OWNER}/submissions_public"
28
+ #CONTACT_DATASET = f"{OWNER}/contact_info" # TODO: I should reactivate this
29
  RESULTS_DATASET = f"{OWNER}/results_public"
30
  LEADERBOARD_PATH = f"HiTZ/Critical_Questions_Leaderboard"
31
  METRIC = 'similarity'
 
34
  if METRIC == 'similarity':
35
  similarity_model = SentenceTransformer("stsb-mpnet-base-v2")
36
 
37
+ if METRIC == 'gemma': # WARNING: this can't be used because I do not have GPU in HF
38
+ model = AutoModelForCausalLM.from_pretrained('google/gemma-3-12b-it', device_map="auto", attn_implementation='eager')
39
+ tokenizer = AutoTokenizer.from_pretrained('google/gemma-3-12b-it')
40
 
41
  YEAR_VERSION = "2025"
42
  ref_scores_len = {"test": 34}
 
43
 
44
  os.makedirs("scored", exist_ok=True)
45
 
 
57
  trust_remote_code=True,
58
  )
59
  eval_results = {"test": test_results}
60
+ # TODO: I should reactivate saving contact infos
61
  #contact_infos = load_dataset(CONTACT_DATASET, YEAR_VERSION, token=TOKEN, download_mode="force_redownload", verification_mode=VerificationMode.NO_CHECKS, trust_remote_code=True)
62
  def get_dataframe_from_results(eval_results, split):
63
  local_df = eval_results[split]
 
70
  df = pd.DataFrame(local_df)
71
  df = df.sort_values(by=["Score (%)"], ascending=False)
72
 
 
 
73
  return df
74
 
75
 
 
78
  # Gold answers
79
  gold_results = {}
80
  gold_dataset = load_dataset(INTERNAL_DATA_DATASET, "test", token=TOKEN, trust_remote_code=True)['test']
 
 
81
 
82
  def restart_space():
83
  api.restart_space(repo_id=LEADERBOARD_PATH, token=TOKEN)
 
95
 
96
  generated_ids = model.generate(**inputs, max_new_tokens=512)
97
 
 
 
98
  out = tokenizer.batch_decode(generated_ids, skip_special_tokens=True)[0]
99
 
 
 
100
  try:
101
  output = out.split('model\n')[1].replace('\n', '')
102
  except IndexError:
103
  print('EVAL ERROR: '+output, flush=True)
104
 
 
105
  output = output.strip()
106
 
107
  return output
 
150
  if datetime.datetime.now() - datetime.datetime.strptime(creation_date, '%Y-%m-%dT%H:%M:%S.%fZ') < datetime.timedelta(days=60):
151
  return format_error("This account is not authorized to submit on this leaderboard.")
152
 
153
+ # TODO: I should reactivate this check
154
  #contact_infos = load_dataset(CONTACT_DATASET, YEAR_VERSION, token=TOKEN, download_mode="force_redownload", verification_mode=VerificationMode.NO_CHECKS, trust_remote_code=True)
155
  #user_submission_dates = sorted(row["date"] for row in contact_infos[val_or_test] if row["username"] == profile.username)
156
  #if len(user_submission_dates) > 0 and user_submission_dates[-1] == datetime.datetime.today().strftime('%Y-%m-%d'):
 
179
  api.upload_file(
180
  repo_id=SUBMISSION_DATASET,
181
  path_or_fileobj=path_to_file.name,
182
+ path_in_repo=f"{organisation}/{model}/{YEAR_VERSION}_{val_or_test}_raw_{datetime.datetime.today()}.json",
183
  repo_type="dataset",
184
  token=TOKEN
185
  )
 
194
  "mail": mail,
195
  "date": datetime.datetime.today().strftime('%Y-%m-%d')
196
  }
197
+
198
+ # TODO: reactivate this
199
  #contact_infos[val_or_test]= contact_infos[val_or_test].add_item(contact_info)
200
  #if LOCAL_DEBUG:
201
  # print("mock uploaded contact info")
 
208
  num_questions = 0
209
  task_ids = []
210
 
211
+ with open(f"scored/{organisation}_{model}.jsonl", "w") as scored_file: # check where is this saved
212
  with open(file_path, 'r') as f:
213
  data = json.load(f)
214
+ scores = []
215
  for id_to_eval, line in data.items():
216
+ intervention_score = 0
217
  for indx, intervention_id in enumerate(gold_dataset['intervention_id']):
218
  if id_to_eval == intervention_id:
219
  references = gold_dataset['cqs']
220
  reference_set = [row['cq'] for row in references[indx]]
221
+ # TODO: here upload the embedding that I have saved, so they can be used in similarity evaluation
222
+
223
  #print(reference_set, flush=True)
224
+ if len(line['cqs']) < 3: # make sure there are at least 3 cqs
225
+ return format_warning("Make sure that there are at least 3 questions per intervention, or check that the format is right.")
226
+ for cq in line['cqs'][:3]: # here only take the first 3 cqs
227
  cq_text = cq['cq']
 
 
228
 
229
  if METRIC == 'similarity':
230
  sentence_embedding = similarity_model.encode(cq_text)
231
+ reference_embedding = similarity_model.encode(reference_set) # TODO: here have the embeddings directly, do not recalculate each time
232
  sims = similarity_model.similarity(sentence_embedding, reference_embedding).tolist()[0]
 
233
 
234
  winner = np.argmax(sims)
235
  # make sure the similarity of the winning reference sentence is at least 0.65
 
256
 
257
  print(label, flush=True)
258
  if label == 'Useful':
259
+ intervention_score += 1/3
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
260
 
261
+ print(id_to_eval, intervention_score, flush=True)
262
+ scores.append(intervention_score)
263
 
264
  scored_file.write(
265
  json.dumps({
266
  "id": intervention_id,
267
  #"model_answer": answer,
268
+ "score": intervention_score
269
  }) + "\n"
270
  )
271
 
272
  task_ids.append(intervention_id)
273
+ #scores += score
274
+ #num_questions += 1
275
+ #break
276
+
277
+
278
+ #return format_error(score)
279
+ score = sum(scores)/len(scores)*10
280
+ print(score, flush=True)
281
+
282
+
283
+
284
 
285
 
286
  # Check if there's any duplicate in the submission
287
  if len(task_ids) != len(set(task_ids)):
288
  return format_error("There are duplicates in your submission. Please check your file and resubmit it.")
289
+
 
 
 
290
  # SAVE SCORED SUBMISSION
291
  if LOCAL_DEBUG:
292
  print("mock uploaded scored submission")
 
299
  token=TOKEN
300
  )
301
 
 
 
 
 
 
 
 
 
 
302
 
303
  # SAVE TO LEADERBOARD DATA
304
  eval_entry = {
 
307
  "system_prompt": system_prompt,
308
  "url": url,
309
  "organisation": organisation,
310
+ "score": score, #scores / ref_scores_len[val_or_test]
 
 
 
311
  "date": datetime.datetime.today().strftime('%Y-%m-%d')
312
  }
313
+
314
+ #TODO: if I find potential errors, I should check them here and maybe suggest that they open a discussion
 
 
 
315
 
316
+ # TODO: I should reactivate this
317
  # Testing for duplicates - to see if we want to add something like it as it would allow people to try to see the content of other submissions
318
  #eval_entry_no_date = {k: v for k, v in eval_entry if k != "date"}
319
  #columns_no_date = [c for c in eval_results[val_or_test].column_names if c != "date"]
 
369
  value=eval_dataframe_test, datatype=TYPES, interactive=False,
370
  column_widths=["20%"]
371
  )
 
 
 
 
 
372
 
373
  refresh_button = gr.Button("Refresh")
374
  refresh_button.click(refresh, inputs=[], outputs=[leaderboard_table_test])