ha251 committed on
Commit
36c5bc9
·
verified ·
1 Parent(s): e035fc6

Delete app.py

Browse files
Files changed (1) hide show
  1. app.py +0 -775
app.py DELETED
@@ -1,775 +0,0 @@
1
- import datetime
2
- import io
3
- import json
4
- import os
5
- import re
6
- from urllib.parse import urlparse
7
-
8
- import gradio as gr
9
- import pandas as pd
10
- from huggingface_hub import HfApi, hf_hub_download
11
-
12
-
13
APP_NAME = "miniapp"

# Configure via Space Secrets:
# - HF_TOKEN: token with dataset write permission (Settings -> Variables and secrets -> Secrets)
# - LEADERBOARD_DATASET: e.g. "your-username/miniapp-leaderboard" (repo_type=dataset)
HF_TOKEN = os.environ.get("HF_TOKEN") or os.environ.get("TOKEN") or os.environ.get("HUGGINGFACE_TOKEN")
LEADERBOARD_DATASET = os.environ.get("LEADERBOARD_DATASET", "").strip()

# Detect whether we are running on Hugging Face Spaces.
IN_SPACES = bool(
    os.environ.get("SPACE_ID")
    or os.environ.get("SPACE_REPO_NAME")
    or os.environ.get("SPACE_AUTHOR_NAME")
    or os.environ.get("system", "") == "spaces"
)

# Maximum number of submission files loaded into the leaderboard table.
MAX_ENTRIES = int(os.environ.get("MAX_ENTRIES", "200"))
31
-
32
-
33
def _is_valid_http_url(url: str) -> bool:
    """Return True if *url* parses as an absolute http(s) URL with a host."""
    try:
        parts = urlparse(url)
    except Exception:
        return False
    return parts.scheme in ("http", "https") and bool(parts.netloc)
39
-
40
-
41
def _slug(s: str, max_len: int = 60) -> str:
    """Lower-case *s* and collapse non-alphanumeric runs to single dashes.

    Falls back to "model" when nothing survives the cleanup.
    """
    text = (s or "").strip().lower()
    text = re.sub(r"[^a-z0-9]+", "-", text)
    text = re.sub(r"-{2,}", "-", text).strip("-")
    return text[:max_len] or "model"
46
-
47
-
48
def _api() -> HfApi:
    """Build an HfApi client authenticated with the configured token."""
    client = HfApi(token=HF_TOKEN)
    return client
50
-
51
-
52
def _ensure_dataset_repo():
    """Ensure the leaderboard dataset repo exists, creating it if needed.

    Raises:
        RuntimeError: if HF_TOKEN or LEADERBOARD_DATASET is not configured.
    """
    if not HF_TOKEN:
        raise RuntimeError("未配置 HF_TOKEN(Space Secrets)。")
    if not LEADERBOARD_DATASET:
        raise RuntimeError("未配置 LEADERBOARD_DATASET(例如:your-username/miniapp-leaderboard)。")
    api = _api()
    try:
        api.repo_info(repo_id=LEADERBOARD_DATASET, repo_type="dataset")
    except Exception:
        # Repo lookup failed (most likely it does not exist): create it as a
        # public dataset. You may instead create it manually and keep it private.
        api.create_repo(repo_id=LEADERBOARD_DATASET, repo_type="dataset", private=False, exist_ok=True)
63
-
64
-
65
def _empty_df() -> pd.DataFrame:
    """Return an empty leaderboard frame with the canonical column order."""
    columns = ["submitted_at", "username", "model_name", "model_api", "notes"]
    return pd.DataFrame(columns=columns)
67
-
68
-
69
def _load_submissions_df() -> pd.DataFrame:
    """Download every submissions/*.json file from the dataset repo and build the table.

    Returns an empty frame when the Space is not configured, the repo listing
    fails, or no submission file could be read.
    """
    if not HF_TOKEN or not LEADERBOARD_DATASET:
        return _empty_df()

    api = _api()
    try:
        files = api.list_repo_files(repo_id=LEADERBOARD_DATASET, repo_type="dataset")
    except Exception:
        return _empty_df()

    # Newest first: the paths embed an ISO timestamp, so a reverse
    # lexicographic sort is chronological. Capped at MAX_ENTRIES.
    sub_files = sorted(
        [f for f in files if f.startswith("submissions/") and f.endswith(".json")],
        reverse=True,
    )[:MAX_ENTRIES]

    rows = []
    for filename in sub_files:
        try:
            path = hf_hub_download(
                repo_id=LEADERBOARD_DATASET,
                repo_type="dataset",
                filename=filename,
                token=HF_TOKEN,
            )
            with open(path, "r", encoding="utf-8") as fp:
                rows.append(json.load(fp))
        except Exception:
            # Skip unreadable or corrupt entries rather than failing the table.
            continue

    if not rows:
        return _empty_df()

    df = pd.DataFrame(rows)
    # Backfill any missing column with empty strings and enforce column order.
    for col in ["submitted_at", "username", "model_name", "model_api", "notes"]:
        if col not in df.columns:
            df[col] = ""
    df = df[["submitted_at", "username", "model_name", "model_api", "notes"]]
    df = df.sort_values(by=["submitted_at"], ascending=False, kind="stable")
    return df
108
-
109
-
110
def refresh():
    """Return a freshly loaded submissions dataframe for the UI table."""
    latest = _load_submissions_df()
    return latest
112
-
113
-
114
def submit(model_name: str, model_api: str, notes: str, username: str | None):
    """Validate a submission and append it as a JSON file to the dataset repo.

    Returns:
        A (status_markdown, dataframe) tuple matching the Gradio outputs.
    """
    model_name = (model_name or "").strip()
    model_api = (model_api or "").strip()
    notes = (notes or "").strip()
    username = (username or "").strip() or "anonymous"

    if not model_name:
        return "请填写 **模型名称**。", _load_submissions_df()
    if not model_api:
        return "请填写 **模型 API**。", _load_submissions_df()
    if not _is_valid_http_url(model_api):
        return "**模型 API** 需要是合法的 `http(s)://...` URL。", _load_submissions_df()

    if not HF_TOKEN:
        return "Space 未配置 **HF_TOKEN**(Secrets),无法写入排行榜。", _load_submissions_df()
    if not LEADERBOARD_DATASET:
        return "Space 未配置 **LEADERBOARD_DATASET**(例如:`your-username/miniapp-leaderboard`)。", _load_submissions_df()

    _ensure_dataset_repo()
    api = _api()

    # datetime.utcnow() is deprecated since Python 3.12; take an aware UTC time
    # and strip tzinfo so the rendered string keeps the same "...Z" shape.
    now = (
        datetime.datetime.now(datetime.timezone.utc)
        .replace(microsecond=0, tzinfo=None)
        .isoformat()
        + "Z"
    )
    safe_model = _slug(model_name)
    safe_user = _slug(username)
    path_in_repo = f"submissions/{now[:10]}/{now}-{safe_user}-{safe_model}.json"

    payload = {
        "submitted_at": now,
        "username": username,
        "model_name": model_name,
        "model_api": model_api,
        "notes": notes,
    }
    data = (json.dumps(payload, ensure_ascii=False, indent=2) + "\n").encode("utf-8")
    bio = io.BytesIO(data)

    api.upload_file(
        repo_id=LEADERBOARD_DATASET,
        repo_type="dataset",
        path_or_fileobj=bio,
        path_in_repo=path_in_repo,
        commit_message=f"miniapp: submit {username}/{model_name}",
        token=HF_TOKEN,
    )

    return "已提交并写入 leaderboard。", _load_submissions_df()
160
-
161
-
162
def build_demo() -> gr.Blocks:
    """Assemble the Gradio UI: submission form (left) and leaderboard table (right)."""
    with gr.Blocks(title=f"{APP_NAME} leaderboard") as demo:
        gr.Markdown(
            f"## {APP_NAME} leaderboard\n\n"
            "提交你的模型信息后,会写入一个 Hugging Face **Dataset**,并在下方表格展示。\n\n"
            f"- 当前 `LEADERBOARD_DATASET`: `{LEADERBOARD_DATASET or '(未配置)'}`\n"
        )

        with gr.Row():
            with gr.Column(scale=2):
                model_name = gr.Textbox(label="模型名称(必填)", placeholder="例如:my-agent-v1")
                model_api = gr.Textbox(
                    label="模型 API(必填)",
                    placeholder="例如:https://api.example.com/v1/chat/completions",
                )
                notes = gr.Textbox(label="备注(可选)", lines=4)

                # Frontend-only version: OAuth is not enforced. Add a
                # LoginButton later if only logged-in users should submit.
                if IN_SPACES:
                    username = gr.Textbox(
                        label="用户名(可选)",
                        placeholder="建议填你的 HF 用户名(也可留空)",
                    )
                else:
                    username = gr.Textbox(label="用户名(本地调试用)", value="local")

                submit_btn = gr.Button("提交", variant="primary")
                status = gr.Markdown()

            with gr.Column(scale=3):
                leaderboard = gr.Dataframe(
                    label="Leaderboard(按提交时间倒序)",
                    value=_load_submissions_df(),
                    interactive=False,
                    wrap=True,
                )
                refresh_btn = gr.Button("刷新")

        submit_btn.click(
            submit,
            inputs=[model_name, model_api, notes, username],
            outputs=[status, leaderboard],
        )
        refresh_btn.click(refresh, inputs=[], outputs=[leaderboard])

    return demo
208
-
209
-
210
# Build the UI at import time so Spaces can serve `demo` directly.
demo = build_demo()


def main():
    """Launch the Gradio app (local entry point)."""
    demo.launch()


if __name__ == "__main__":
    main()
219
-
220
- import datetime
221
- import io
222
- import json
223
- import os
224
- import re
225
- from urllib.parse import urlparse
226
-
227
- import gradio as gr
228
- import pandas as pd
229
- from huggingface_hub import HfApi, hf_hub_download
230
-
231
-
232
# NOTE(review): from the duplicated import block above down to the end of the
# file, this module is pasted a second time (and later followed by an unrelated
# script); these assignments re-execute and shadow the earlier definitions.
APP_NAME = "miniapp"

# Configure via Space Secrets:
# - HF_TOKEN: token with dataset write permission (Settings -> Variables and secrets -> Secrets)
# - LEADERBOARD_DATASET: e.g. "your-username/miniapp-leaderboard" (repo_type=dataset)
HF_TOKEN = os.environ.get("HF_TOKEN") or os.environ.get("TOKEN") or os.environ.get("HUGGINGFACE_TOKEN")
LEADERBOARD_DATASET = os.environ.get("LEADERBOARD_DATASET", "").strip()

# Detect whether we are running on Hugging Face Spaces.
IN_SPACES = bool(
    os.environ.get("SPACE_ID")
    or os.environ.get("SPACE_REPO_NAME")
    or os.environ.get("SPACE_AUTHOR_NAME")
    or os.environ.get("system", "") == "spaces"
)

# Maximum number of submission files loaded into the leaderboard table.
MAX_ENTRIES = int(os.environ.get("MAX_ENTRIES", "200"))
249
-
250
-
251
def _is_valid_http_url(url: str) -> bool:
    """Check that *url* is a well-formed absolute http or https URL."""
    try:
        result = urlparse(url)
        has_host = bool(result.netloc)
        return has_host and result.scheme in ("http", "https")
    except Exception:
        return False
257
-
258
-
259
def _slug(s: str, max_len: int = 60) -> str:
    """Normalize *s* into a dash-separated lower-case slug (fallback "model")."""
    cleaned = re.sub(r"[^a-z0-9]+", "-", (s or "").strip().lower())
    cleaned = re.sub(r"-{2,}", "-", cleaned).strip("-")
    return cleaned[:max_len] or "model"
264
-
265
-
266
def _api() -> HfApi:
    """Return an HfApi client authenticated with the module-level token."""
    return HfApi(token=HF_TOKEN)
268
-
269
-
270
def _ensure_dataset_repo():
    """Ensure the leaderboard dataset repo exists, creating it if needed.

    Raises:
        RuntimeError: if HF_TOKEN or LEADERBOARD_DATASET is not configured.
    """
    if not HF_TOKEN:
        raise RuntimeError("未配置 HF_TOKEN(Space Secrets)。")
    if not LEADERBOARD_DATASET:
        raise RuntimeError("未配置 LEADERBOARD_DATASET(例如:your-username/miniapp-leaderboard)。")
    api = _api()
    try:
        api.repo_info(repo_id=LEADERBOARD_DATASET, repo_type="dataset")
    except Exception:
        # Repo lookup failed (most likely it does not exist): create it as a
        # public dataset. You may instead create it manually and keep it private.
        api.create_repo(repo_id=LEADERBOARD_DATASET, repo_type="dataset", private=False, exist_ok=True)
281
-
282
-
283
def _load_submissions_df() -> pd.DataFrame:
    """Download every submissions/*.json file and assemble the leaderboard table.

    Returns an empty frame (with the canonical columns) when the Space is not
    configured, the repo listing fails, or no submission file could be read.
    """
    # Single source of truth for the schema: this literal was repeated four
    # times in the original body.
    columns = ["submitted_at", "username", "model_name", "model_api", "notes"]

    if not HF_TOKEN or not LEADERBOARD_DATASET:
        return pd.DataFrame(columns=columns)

    api = _api()
    try:
        files = api.list_repo_files(repo_id=LEADERBOARD_DATASET, repo_type="dataset")
    except Exception:
        return pd.DataFrame(columns=columns)

    # Newest first: the paths embed an ISO timestamp, so a reverse
    # lexicographic sort is chronological. Capped at MAX_ENTRIES.
    sub_files = sorted(
        [f for f in files if f.startswith("submissions/") and f.endswith(".json")],
        reverse=True,
    )[:MAX_ENTRIES]

    rows = []
    for filename in sub_files:
        try:
            path = hf_hub_download(
                repo_id=LEADERBOARD_DATASET,
                repo_type="dataset",
                filename=filename,
                token=HF_TOKEN,
            )
            with open(path, "r", encoding="utf-8") as fp:
                rows.append(json.load(fp))
        except Exception:
            # Skip unreadable or corrupt entries rather than failing the table.
            continue

    if not rows:
        return pd.DataFrame(columns=columns)

    df = pd.DataFrame(rows)
    # Backfill any missing column with empty strings and enforce column order.
    for col in columns:
        if col not in df.columns:
            df[col] = ""
    df = df[columns]
    df = df.sort_values(by=["submitted_at"], ascending=False, kind="stable")
    return df
323
-
324
-
325
def refresh():
    """Reload the submissions table (wired to the UI refresh button)."""
    table = _load_submissions_df()
    return table
327
-
328
-
329
def submit(model_name: str, model_api: str, notes: str, username: str | None):
    """Validate a submission and append it as a JSON file to the dataset repo.

    Returns:
        A (status_markdown, dataframe) tuple matching the Gradio outputs.
    """
    model_name = (model_name or "").strip()
    model_api = (model_api or "").strip()
    notes = (notes or "").strip()
    username = (username or "").strip() or "anonymous"

    if not model_name:
        return "请填写 **模型名称**。", _load_submissions_df()
    if not model_api:
        return "请填写 **模型 API**。", _load_submissions_df()
    if not _is_valid_http_url(model_api):
        return "**模型 API** 需要是合法的 `http(s)://...` URL。", _load_submissions_df()

    if not HF_TOKEN:
        return "Space 未配置 **HF_TOKEN**(Secrets),无法写入排行榜。", _load_submissions_df()
    if not LEADERBOARD_DATASET:
        return "Space 未配置 **LEADERBOARD_DATASET**(例如:`your-username/miniapp-leaderboard`)。", _load_submissions_df()

    _ensure_dataset_repo()
    api = _api()

    # datetime.utcnow() is deprecated since Python 3.12; take an aware UTC time
    # and strip tzinfo so the rendered string keeps the same "...Z" shape.
    now = (
        datetime.datetime.now(datetime.timezone.utc)
        .replace(microsecond=0, tzinfo=None)
        .isoformat()
        + "Z"
    )
    safe_model = _slug(model_name)
    safe_user = _slug(username)
    path_in_repo = f"submissions/{now[:10]}/{now}-{safe_user}-{safe_model}.json"

    payload = {
        "submitted_at": now,
        "username": username,
        "model_name": model_name,
        "model_api": model_api,
        "notes": notes,
    }
    data = (json.dumps(payload, ensure_ascii=False, indent=2) + "\n").encode("utf-8")
    bio = io.BytesIO(data)

    api.upload_file(
        repo_id=LEADERBOARD_DATASET,
        repo_type="dataset",
        path_or_fileobj=bio,
        path_in_repo=path_in_repo,
        commit_message=f"miniapp: submit {username}/{model_name}",
        token=HF_TOKEN,
    )

    return "已提交并写入 leaderboard。", _load_submissions_df()
375
-
376
-
377
# Top-level UI (duplicate copy): submission form (left), leaderboard (right).
with gr.Blocks(title=f"{APP_NAME} leaderboard") as demo:
    gr.Markdown(
        f"## {APP_NAME} leaderboard\n\n"
        "提交你的模型信息后,会写入一个 Hugging Face **Dataset**,并在下方表格展示。\n\n"
        f"- 当前 `LEADERBOARD_DATASET`: `{LEADERBOARD_DATASET or '(未配置)'}`\n"
    )

    with gr.Row():
        with gr.Column(scale=2):
            model_name = gr.Textbox(label="模型名称(必填)", placeholder="例如:my-agent-v1")
            model_api = gr.Textbox(
                label="模型 API(必填)",
                placeholder="例如:https://api.example.com/v1/chat/completions",
            )
            notes = gr.Textbox(label="备注(可选)", lines=4)

            # Frontend-only version: OAuth is not enforced; add a LoginButton
            # in the Space if submissions should require authentication.
            if IN_SPACES:
                username = gr.Textbox(
                    label="用户名(可选)",
                    placeholder="建议填你的 HF 用户名(也可留空)",
                )
            else:
                username = gr.Textbox(label="用户名(本地调试用)", value="local")

            submit_btn = gr.Button("提交", variant="primary")
            status = gr.Markdown()
        with gr.Column(scale=3):
            leaderboard = gr.Dataframe(
                label="Leaderboard(按提交时间倒序)",
                value=_load_submissions_df(),
                interactive=False,
                wrap=True,
            )
            refresh_btn = gr.Button("刷新")

    submit_btn.click(
        submit,
        inputs=[model_name, model_api, notes, username],
        outputs=[status, leaderboard],
    )
    refresh_btn.click(refresh, inputs=[], outputs=[leaderboard])
419
-
420
def main():
    """Launch the Gradio app (local entry point)."""
    demo.launch()


if __name__ == "__main__":
    main()
426
-
427
# Display the results
# NOTE(review): HAS_TOKEN, LOCAL_DEBUG, load_dataset, RESULTS_DATASET,
# YEAR_VERSION, TOKEN, VerificationMode and CONTACT_DATASET are not defined
# anywhere in this file — this section appears pasted from a different app
# (a GAIA-style leaderboard) and will raise NameError as-is. Confirm intent.
if HAS_TOKEN and not LOCAL_DEBUG:
    try:
        eval_results = load_dataset(
            RESULTS_DATASET,
            YEAR_VERSION,
            token=TOKEN,
            download_mode="force_redownload",
            verification_mode=VerificationMode.NO_CHECKS,
        )
    except Exception as e:
        # Best effort: fall back to an empty leaderboard on download failure.
        print(e)
        eval_results = None

    try:
        contact_infos = load_dataset(
            CONTACT_DATASET,
            YEAR_VERSION,
            token=TOKEN,
            download_mode="force_redownload",
            verification_mode=VerificationMode.NO_CHECKS,
        )
    except Exception as e:
        print(e)
        contact_infos = None
else:
    # Local/unauthenticated mode: no remote data.
    eval_results = None
    contact_infos = None
455
-
456
def get_dataframe_from_results(eval_results, split):
    """Convert one split of the results dataset into a display-ready DataFrame.

    NOTE(review): EMPTY_LEADERBOARD_COLUMNS and model_hyperlink are not defined
    in this file; eval_results is presumably a datasets.DatasetDict — confirm.
    """
    if eval_results is None:
        return pd.DataFrame(columns=EMPTY_LEADERBOARD_COLUMNS)
    local_df = eval_results[split]
    # Render the model name as a markdown hyperlink, then drop the raw fields.
    local_df = local_df.map(lambda row: {"model": model_hyperlink(row["url"], row["model"])})
    local_df = local_df.remove_columns(["system_prompt", "url"])
    local_df = local_df.rename_column("model", "Agent name")
    local_df = local_df.rename_column("model_family", "Model family")
    local_df = local_df.rename_column("score", "Average score (%)")
    for i in [1, 2, 3]:
        local_df = local_df.rename_column(f"score_level{i}", f"Level {i} score (%)")
    local_df = local_df.rename_column("date", "Submission date")
    df = pd.DataFrame(local_df)
    df = df.sort_values(by=["Average score (%)"], ascending=False)

    # Scores are stored as fractions; display them as percentages.
    numeric_cols = [c for c in local_df.column_names if "score" in c]
    df[numeric_cols] = df[numeric_cols].multiply(100).round(decimals=2)
    #df = df.style.format("{:.2%}", subset=numeric_cols)

    return df
476
-
477
#eval_dataframe_val = get_dataframe_from_results(eval_results=eval_results, split="validation")
eval_dataframe_test = get_dataframe_from_results(eval_results=eval_results, split="test")

# Gold answers: build a task_id -> gold-row index per split, used for scoring.
# NOTE(review): INTERNAL_DATA_DATASET, YEAR_VERSION, TOKEN and load_dataset are
# not defined in this file (pasted script) — confirm before relying on this.
if HAS_TOKEN and not LOCAL_DEBUG:
    gold_dataset = load_dataset(
        INTERNAL_DATA_DATASET,
        f"{YEAR_VERSION}_all",
        token=TOKEN,
    )
    gold_results = {
        split: {row["task_id"]: row for row in gold_dataset[split]}
        for split in ["test", "validation"]
    }
else:
    # No credentials: scoring lookups will be empty.
    gold_results = {"test": {}, "validation": {}}
493
-
494
-
495
def restart_space():
    """Restart the Space so cached leaderboard data refreshes (no-op locally).

    NOTE(review): api, LEADERBOARD_PATH and TOKEN are not defined in this file.
    """
    if IN_SPACES and HAS_TOKEN:
        api.restart_space(repo_id=LEADERBOARD_PATH, token=TOKEN)

# Column datatypes for the Gradio Dataframe rendered below.
TYPES = ["markdown", "number", "number", "number", "number", "str", "str", "str"]
500
-
501
def add_new_eval(
    #val_or_test: str,
    model: str,
    model_family: str,
    system_prompt: str,
    url: str,
    path_to_file: str,
    organisation: str,
    mail: str,
    profile: gr.OAuthProfile,
):
    """Score an uploaded submission file and append the result to the leaderboard.

    Validates the submitting account and file, scores each line against the
    gold answers, uploads raw/scored copies, records contact info, and pushes
    the aggregated entry to the results dataset. Returns a formatted status
    string for the Gradio output.

    NOTE(review): HAS_TOKEN, LOCAL_DEBUG, requests, format_error/format_warning/
    format_log, load_dataset, VerificationMode, parseaddr, eval_results,
    gold_results, question_scorer, api, ref_level_len, ref_scores_len and the
    *_DATASET constants are not defined in this file — this function was pasted
    from a different app and will not run here as-is.
    """
    val_or_test = "test"
    try:
        if not HAS_TOKEN or LOCAL_DEBUG:
            return format_error(
                "Submissions are disabled in local mode. Set env TOKEN (Hugging Face token) and rerun to enable submissions."
            )
        # Reject accounts created less than 2 months ago (basic anti-abuse).
        user_data = requests.get(f"https://huggingface.co/api/users/{profile.username}/overview")
        creation_date = json.loads(user_data.content)["createdAt"]
        if datetime.datetime.now() - datetime.datetime.strptime(creation_date, '%Y-%m-%dT%H:%M:%S.%fZ') < datetime.timedelta(days=60):
            return format_error("This account is not authorized to submit on GAIA.")

        # Shadows the module-level contact_infos with a fresh download.
        contact_infos = load_dataset(CONTACT_DATASET, YEAR_VERSION, token=TOKEN, download_mode="force_redownload", verification_mode=VerificationMode.NO_CHECKS, trust_remote_code=True)
        user_submission_dates = sorted(row["date"] for row in contact_infos[val_or_test] if row["username"] == profile.username)
        # if len(user_submission_dates) > 0 and user_submission_dates[-1] == datetime.datetime.today().strftime('%Y-%m-%d'):
        #     return format_error("You already submitted once today, please try again tomorrow.")

        is_validation = val_or_test == "validation"
        # Very basic email parsing
        _, parsed_mail = parseaddr(mail)
        if "@" not in parsed_mail:
            return format_warning("Please provide a valid email address.")

        print("Adding new eval")

        # Reject if the model/organisation combination was already submitted.
        if model.lower() in set([m.lower() for m in eval_results[val_or_test]["model"]]) and organisation.lower() in set([o.lower() for o in eval_results[val_or_test]["organisation"]]):
            return format_warning("This model has been already submitted.")

        if path_to_file is None:
            return format_warning("Please attach a file.")

        # SAVE UNSCORED SUBMISSION
        if LOCAL_DEBUG:
            print("mock uploaded submission")
        else:
            api.upload_file(
                repo_id=SUBMISSION_DATASET,
                path_or_fileobj=path_to_file.name,
                path_in_repo=f"{organisation}/{model}/{YEAR_VERSION}_{val_or_test}_raw_{datetime.datetime.today()}.jsonl",
                repo_type="dataset",
                token=TOKEN
            )

        # SAVE CONTACT
        contact_info = {
            "model": model,
            "model_family": model_family,
            "url": url,
            "organisation": organisation,
            "username": profile.username,
            "mail": mail,
            "date": datetime.datetime.today().strftime('%Y-%m-%d')
        }
        contact_infos[val_or_test] = contact_infos[val_or_test].add_item(contact_info)
        if LOCAL_DEBUG:
            print("mock uploaded contact info")
        else:
            contact_infos.push_to_hub(CONTACT_DATASET, config_name=YEAR_VERSION, token=TOKEN)

        # SCORE SUBMISSION: one JSONL line per task, scored against gold answers.
        file_path = path_to_file.name
        scores = {"all": 0, 1: 0, 2: 0, 3: 0}
        num_questions = {"all": 0, 1: 0, 2: 0, 3: 0}
        task_ids = []
        with open(f"scored/{organisation}_{model}.jsonl", "w") as scored_file:
            with open(file_path, 'r') as f:
                for ix, line in enumerate(f):
                    try:
                        task = json.loads(line)
                    except Exception:
                        return format_error(f"Line {ix} is incorrectly formatted. Please fix it and resubmit your file.")

                    if "model_answer" not in task:
                        return format_error(f"Line {ix} contains no model_answer key. Please fix it and resubmit your file.")
                    answer = task["model_answer"]
                    task_id = task["task_id"]
                    try:
                        level = int(gold_results[val_or_test][task_id]["Level"])
                    except KeyError:
                        return format_error(f"{task_id} not found in split {val_or_test}. Are you sure you submitted the correct file?")

                    score = question_scorer(task['model_answer'], gold_results[val_or_test][task_id]["Final answer"])

                    scored_file.write(
                        json.dumps({
                            "id": task_id,
                            "model_answer": answer,
                            "score": score,
                            "level": level
                        }) + "\n"
                    )
                    task_ids.append(task_id)

                    scores["all"] += score
                    scores[level] += score
                    num_questions["all"] += 1
                    num_questions[level] += 1

        # Reject files containing the same task more than once.
        if len(task_ids) != len(set(task_ids)):
            return format_error("There are duplicates in your submission. Please check your file and resubmit it.")

        if any([num_questions[level] != ref_level_len[val_or_test][level] for level in [1, 2, 3]]):
            return format_error(f"Your submission has {num_questions[1]} questions for level 1, {num_questions[2]} for level 2, and {num_questions[3]} for level 3, but it should have {ref_level_len[val_or_test][1]}, {ref_level_len[val_or_test][2]}, and {ref_level_len[val_or_test][3]} respectively. Please check your submission.")

        # SAVE SCORED SUBMISSION
        if LOCAL_DEBUG:
            print("mock uploaded scored submission")
        else:
            api.upload_file(
                repo_id=SUBMISSION_DATASET,
                path_or_fileobj=f"scored/{organisation}_{model}.jsonl",
                path_in_repo=f"{organisation}/{model}/{YEAR_VERSION}_{val_or_test}_scored_{datetime.datetime.today()}.jsonl",
                repo_type="dataset",
                token=TOKEN
            )

        # Validation-split scored files are additionally published publicly.
        if is_validation:
            api.upload_file(
                repo_id=SUBMISSION_DATASET_PUBLIC,
                path_or_fileobj=f"scored/{organisation}_{model}.jsonl",
                path_in_repo=f"{organisation}/{model}/{YEAR_VERSION}_{val_or_test}_scored_{datetime.datetime.today()}.jsonl",
                repo_type="dataset",
                token=TOKEN
            )

        # SAVE TO LEADERBOARD DATA
        eval_entry = {
            "model": model,
            "model_family": model_family,
            "system_prompt": system_prompt,
            "url": url,
            "organisation": organisation,
            "score": scores["all"] / ref_scores_len[val_or_test],
            "score_level1": scores[1] / num_questions[1],
            "score_level2": scores[2] / num_questions[2],
            "score_level3": scores[3] / num_questions[3],
            "date": datetime.datetime.today().strftime('%Y-%m-%d')
        }
        if num_questions[1] + num_questions[2] + num_questions[3] != ref_scores_len[val_or_test]:
            # BUGFIX: the original rendered len(scores['all']) here, which raises
            # TypeError because scores['all'] is a number; report the question
            # count instead.
            return format_error(f"Your submission has {num_questions['all']} questions for the {val_or_test} set, but it should have {ref_scores_len[val_or_test]}. Please check your submission.")
        # Catching spam submissions of 100%
        if all((eval_entry[k] == 1 for k in ["score_level1", "score_level2", "score_level3"])):
            return format_error("There was a problem with your submission. Please open a discussion.")

        # Testing for duplicates - to see if we want to add something like it as it would allow people to try to see the content of other submissions
        #eval_entry_no_date = {k: v for k, v in eval_entry if k != "date"}
        #columns_no_date = [c for c in eval_results[val_or_test].column_names if c != "date"]
        #if eval_entry_no_date in eval_results[val_or_test].select_columns(columns_no_date):
        #    return format_error(f"Your submission is an exact duplicate from an existing submission.")

        eval_results[val_or_test] = eval_results[val_or_test].add_item(eval_entry)
        print(eval_results)
        if LOCAL_DEBUG:
            print("mock uploaded results to lb")
        else:
            eval_results.push_to_hub(RESULTS_DATASET, config_name=YEAR_VERSION, token=TOKEN)

        return format_log(f"Model {model} submitted by {organisation} successfully.\nPlease wait a few hours and refresh the leaderboard to see your score displayed.")
    except Exception as e:
        # Broad catch: surface a generic message to the user, log the details.
        print(e)
        return format_error("An error occurred, please open a discussion and indicate at what time you encountered the error.\n")
679
-
680
-
681
def refresh():
    """Re-download the results dataset and rebuild the test-split table.

    NOTE(review): this shadows the earlier refresh() definitions in this file,
    and HAS_TOKEN / LOCAL_DEBUG / load_dataset / RESULTS_DATASET / YEAR_VERSION /
    TOKEN / VerificationMode are not defined here (pasted script).
    """
    if HAS_TOKEN and not LOCAL_DEBUG:
        try:
            eval_results = load_dataset(
                RESULTS_DATASET,
                YEAR_VERSION,
                token=TOKEN,
                download_mode="force_redownload",
                verification_mode=VerificationMode.NO_CHECKS,
            )
        except Exception as e:
            # Best effort: show an empty table if the download fails.
            print(e)
            eval_results = None
    else:
        eval_results = None
    return get_dataframe_from_results(eval_results=eval_results, split="test")
697
-
698
def upload_file(files):
    """Map uploaded Gradio file objects to their local filesystem paths."""
    return [item.name for item in files]
701
-
702
-
703
# GAIA-style leaderboard UI (pasted script): results table + submission form.
# NOTE(review): TITLE, INTRODUCTION_TEXT, CITATION_BUTTON_TEXT,
# CITATION_BUTTON_LABEL, SUBMISSION_TEXT and BackgroundScheduler are not
# defined/imported in this file; `demo` also rebinds the earlier app's Blocks.
demo = gr.Blocks()
with demo:
    gr.HTML(TITLE)
    gr.Markdown(INTRODUCTION_TEXT, elem_classes="markdown-text")

    with gr.Row():
        with gr.Accordion("📙 Citation", open=False):
            citation_button = gr.Textbox(
                value=CITATION_BUTTON_TEXT,
                label=CITATION_BUTTON_LABEL,
                elem_id="citation-button",
            ) #.style(show_copy_button=True)

    gr.Markdown("Results: Test")
    leaderboard_table_test = gr.components.Dataframe(
        value=eval_dataframe_test, datatype=TYPES, interactive=False,
        column_widths=["20%"]
    )
    #with gr.Tab("Results: Validation"):
    #    leaderboard_table_val = gr.components.Dataframe(
    #        value=eval_dataframe_val, datatype=TYPES, interactive=False,
    #        column_widths=["20%"]
    #    )

    refresh_button = gr.Button("Refresh")
    refresh_button.click(
        refresh,
        inputs=[],
        outputs=[
            #leaderboard_table_val,
            leaderboard_table_test,
        ],
    )
    with gr.Accordion("Submit a new model for evaluation"):
        with gr.Row():
            gr.Markdown(SUBMISSION_TEXT, elem_classes="markdown-text")
        with gr.Row():
            with gr.Column():
                #level_of_test = gr.Radio(["test"], value="test", label="Split")
                model_name_textbox = gr.Textbox(label="Agent name")
                model_family_textbox = gr.Textbox(label="Model family")
                system_prompt_textbox = gr.Textbox(label="System prompt example")
                url_textbox = gr.Textbox(label="Url to model information")
            with gr.Column():
                organisation = gr.Textbox(label="Organisation")
                mail = gr.Textbox(label="Contact email (will be stored privately, & used if there is an issue with your submission)")
                file_output = gr.File()


        with gr.Row():
            # The OAuth login supplies the gr.OAuthProfile argument of add_new_eval.
            gr.LoginButton()
            submit_button = gr.Button("Submit Eval On Test")
        submission_result = gr.Markdown()
        submit_button.click(
            add_new_eval,
            [
                #level_of_test,
                model_name_textbox,
                model_family_textbox,
                system_prompt_textbox,
                url_textbox,
                file_output,
                organisation,
                mail
            ],
            submission_result,
        )

# Periodically restart the Space so cached leaderboard data refreshes.
if IN_SPACES and HAS_TOKEN:
    scheduler = BackgroundScheduler()
    scheduler.add_job(restart_space, "interval", seconds=3600)
    scheduler.start()
demo.launch(debug=True)