pulsia committed
Commit 542e9df · verified · 1 Parent(s): b16de72

first_test (#1)


- feat(leaderboard): first test with results (e9a9cf04d4702e7669dc6528ca7a17e767914fd1)

Files changed (3)
  1. src/about.py +17 -4
  2. src/envs.py +4 -4
  3. src/leaderboard/read_evals.py +2 -2
src/about.py CHANGED
@@ -12,8 +12,12 @@ class Task:
 # ---------------------------------------------------
 class Tasks(Enum):
     # task_key in the json file, metric_key in the json file, name to display in the leaderboard
-    task0 = Task("anli_r1", "acc", "ANLI")
-    task1 = Task("logiqa", "acc_norm", "LogiQA")
+    task0 = Task("forms", "acc", "Forms")
+    task1 = Task("graphics", "acc", "Charts")
+    task2 = Task("handwritten", "acc", "Handwritten Texts")
+    task3 = Task("long_table", "acc", "Tables")
+    task4 = Task("tiny_texts", "acc", "Tiny Texts")
+    task5 = Task("multi_column", "acc", "Multiple Column Layout Texts")

 NUM_FEWSHOT = 0 # Change with your few shot
 # ---------------------------------------------------
@@ -21,11 +25,20 @@ NUM_FEWSHOT = 0 # Change with your few shot


 # Your leaderboard name
-TITLE = """<h1 align="center" id="space-title">Demo leaderboard</h1>"""
+TITLE = """<h1 align="center" id="space-title">FR Benchmark for PDF to MD parsing</h1>"""

 # What does your leaderboard evaluate?
 INTRODUCTION_TEXT = """
-Intro text
+This leaderboard evaluates different VLMs on a document parsing benchmark
+specifically crafted for the French language.
+Results are reported on 6 different tasks to provide an extensive view of the tasks
+expected in a business context. The tasks are the following:
+* Forms
+* Charts
+* Handwritten Texts
+* Tables
+* Tiny Texts
+* Multiple Column Layout Texts
 """

 # Which evaluations are you running? how can people reproduce what you have?
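
Note on the Tasks hunk above: each entry maps a task key and a metric key in the per-model results json to a display column. A minimal sketch of the surrounding definitions this hunk relies on, using the field names from the stock demo-leaderboard template (benchmark / metric / col_name are assumptions, not shown in this diff):

from dataclasses import dataclass
from enum import Enum

@dataclass
class Task:
    benchmark: str   # task_key in the results json (e.g. "forms")
    metric: str      # metric_key in the results json (e.g. "acc")
    col_name: str    # column name displayed in the leaderboard (e.g. "Forms")

class Tasks(Enum):
    task0 = Task("forms", "acc", "Forms")
    task1 = Task("graphics", "acc", "Charts")
    task2 = Task("handwritten", "acc", "Handwritten Texts")
    task3 = Task("long_table", "acc", "Tables")
    task4 = Task("tiny_texts", "acc", "Tiny Texts")
    task5 = Task("multi_column", "acc", "Multiple Column Layout Texts")

With this shape, each results file is expected to expose an accuracy per task key, e.g. results["forms"]["acc"].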
src/envs.py CHANGED
@@ -6,12 +6,12 @@ from huggingface_hub import HfApi
 # ----------------------------------
 TOKEN = os.environ.get("HF_TOKEN") # A read/write token for your org

-OWNER = "demo-leaderboard-backend" # Change to your org - don't forget to create a results and request dataset, with the correct format!
+OWNER = "pulsia" # Change to your org - don't forget to create a results and request dataset, with the correct format!
 # ----------------------------------

-REPO_ID = f"{OWNER}/leaderboard"
-QUEUE_REPO = f"{OWNER}/requests"
-RESULTS_REPO = f"{OWNER}/results"
+REPO_ID = f"{OWNER}/fr-bench-pdf2md"
+# QUEUE_REPO = f"{OWNER}/requests"
+RESULTS_REPO = f"{OWNER}/fr-bench-pdf2md-results"

 # If you setup a cache later, just change HF_HOME
 CACHE_PATH=os.getenv("HF_HOME", ".")
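
Note on the repo constants above: QUEUE_REPO is now commented out, so any module that still imports it from src.envs needs the same treatment. The usual consumption pattern in this template is to snapshot the results dataset locally before reading it; a hedged sketch under that assumption (EVAL_RESULTS_PATH is illustrative, the real path constant lives elsewhere in the repo):

import os
from huggingface_hub import snapshot_download

from src.envs import CACHE_PATH, RESULTS_REPO, TOKEN

# Illustrative local directory; the template normally derives it from CACHE_PATH.
EVAL_RESULTS_PATH = os.path.join(CACHE_PATH, "eval-results")

# Pull pulsia/fr-bench-pdf2md-results so the leaderboard can read the
# per-model json files from disk.
snapshot_download(
    repo_id=RESULTS_REPO,
    local_dir=EVAL_RESULTS_PATH,
    repo_type="dataset",
    token=TOKEN,
)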
src/leaderboard/read_evals.py CHANGED
@@ -57,7 +57,7 @@ class EvalResult:
         result_key = f"{org}_{model}_{precision.value.name}"
         full_model = "/".join(org_and_model)

-        still_on_hub, _, model_config = is_model_on_hub(
+        still_on_hub, model_config = is_model_on_hub(
             full_model, config.get("model_sha", "main"), trust_remote_code=True, test_tokenizer=False
         )
         architecture = "?"
@@ -176,7 +176,7 @@ def get_raw_eval_results(results_path: str, requests_path: str) -> list[EvalResult]:
     for model_result_filepath in model_result_filepaths:
         # Creation of result
         eval_result = EvalResult.init_from_json_file(model_result_filepath)
-        eval_result.update_with_request_file(requests_path)
+        # eval_result.update_with_request_file(requests_path)

         # Store results of same eval together
         eval_name = eval_result.eval_name
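
Note on the read_evals hunks above: the first change unpacks two values from is_model_on_hub instead of three, which only holds if that helper (defined elsewhere in the repo, typically src/submission/check_validity.py in the template) returns (still_on_hub, config) rather than (still_on_hub, error_message, config). A hypothetical sketch of the two-value shape this call site now assumes, not the repo's actual implementation:

from transformers import AutoConfig

def is_model_on_hub(model_name: str, revision: str,
                    trust_remote_code: bool = False,
                    test_tokenizer: bool = False):
    # Hypothetical two-value variant matching the new call site:
    # returns (still_on_hub, config) instead of (still_on_hub, error, config).
    # test_tokenizer is kept for signature compatibility; unused in this sketch.
    try:
        config = AutoConfig.from_pretrained(
            model_name, revision=revision, trust_remote_code=trust_remote_code
        )
        return True, config
    except Exception:
        # Model is missing, gated, or otherwise unreachable on the Hub.
        return False, None

The second change comments out update_with_request_file, so metadata from the request files is no longer merged into each EvalResult.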