Fhrozen commited on
Commit
1a72d3c
·
1 Parent(s): 8eaaf2d

updates on files path

Browse files
espn_ldbd/display/tabs.py CHANGED
@@ -2,11 +2,8 @@ import gradio as gr
2
  import math
3
  import re
4
 
5
- from espn_ldbd.leaderboard.data import (
6
- LeaderboardData,
7
- LeaderboardDataset,
8
- )
9
- from espn_ldbd.submission.submit import submit_repo
10
 
11
 
12
  def create_leaderboard_tab(
 
2
  import math
3
  import re
4
 
5
+ from espn_ldbd.leaderboard.data import LeaderboardData
6
+ from espn_ldbd.leaderboard.dataset import LeaderboardDataset
 
 
 
7
 
8
 
9
  def create_leaderboard_tab(
espn_ldbd/leaderboard/data.py CHANGED
@@ -1,15 +1,8 @@
1
- import os
2
- import threading
3
  import math
4
  from typing import Dict, Tuple
5
 
6
  import pandas as pd
7
 
8
- from datasets import Dataset, load_dataset, concatenate_datasets
9
-
10
- from huggingface_hub import HfApi
11
- from huggingface_hub.utils import HfHubHTTPError
12
-
13
 
14
  class LeaderboardData:
15
  """Class to manage leaderboard data from datasets"""
@@ -116,128 +109,3 @@ class LeaderboardData:
116
  paginated_df = df.iloc[start_idx:end_idx].copy()
117
 
118
  return paginated_df, page, total_pages
119
-
120
-
121
- class LeaderboardDataset:
122
- def __init__(self, min_time_submit: float = 10.0):
123
- repo_id = os.environ.get("ESPNET_DB", None)
124
- assert repo_id is not None
125
- self._repo_id = repo_id
126
- self._tasks = load_dataset(repo_id, "task_db", split="train")
127
- self._subtasks_db = load_dataset(repo_id, "sub_task_db", split="train")
128
- self._subtasks_cache = {}
129
- self._new_submits = []
130
-
131
- self._stop_event = threading.Event()
132
- self._background_submission = None
133
- self._hf_api = HfApi()
134
- self._submit_lock = threading.Lock()
135
- self._submit_timer = None
136
- self._submit_period = min_time_submit
137
- return
138
-
139
- @property
140
- def get_tasks(self):
141
- return self._tasks
142
-
143
- def get_subtasks(self, task_id: str):
144
- if task_id not in self._subtasks_cache:
145
- subtasks = self._subtasks_db.filter(
146
- lambda sample: sample["task_id"] == task_id
147
- )
148
- self._subtasks_cache[task_id] = subtasks
149
- return self._subtasks_cache[task_id]
150
-
151
- def submit_repoid(self, repo_id: str) -> bool:
152
- # Try to acquire lock without blocking
153
- if not self._submit_lock.acquire(blocking=False):
154
- return False, "Submission already in progress."
155
-
156
- message = f"Error in submitting {repo_id}."
157
- try:
158
- # Cancel any existing timer
159
- if self._submit_timer is not None:
160
- self._submit_timer.cancel()
161
-
162
- # Set a timer to release the lock after execution
163
- def _release_lock():
164
- self._submit_lock.release()
165
- # print(f"[Dataset] Submit lock released after timeout")
166
-
167
- # Validate existance of repository
168
- try:
169
- self._hf_api.model_info(repo_id)
170
- except HfHubHTTPError as e:
171
- return False, "The submitted repository does not exist."
172
-
173
- # TODO(Fhrozen): add validation of repo_id for ESPnet format
174
- self._new_submits.append({
175
- "model_id": repo_id,
176
- "reviewed": False,
177
- "date_review": "",
178
- "commit_version": "",
179
- "valid_repo": True,
180
- "need_review": True,
181
- })
182
- # Set timer to auto-release lock after 5 seconds
183
- self._submit_timer = threading.Timer(self._submit_period, _release_lock)
184
- self._submit_timer.start()
185
-
186
- return True, ""
187
- except Exception as e:
188
- # Release lock on any error
189
- self._submit_lock.release()
190
- # print(f"[Dataset] Error in submit_repoid: {e}")
191
- message += f" {e}"
192
- return False, message
193
-
194
- def _periodic_submissions_update(self):
195
- if len(self._new_submits) == 0:
196
- print("[Dataset] No additional submits were found.")
197
- return
198
-
199
- request_ds = load_dataset(self._repo_id, "request_db", split="train")
200
- num_rows = len(self._new_submits)
201
- new_ds = Dataset.from_list(self._new_submits)
202
- request_ds = concatenate_datasets([request_ds, new_ds])
203
- print(f"[Dataset] Adding {num_rows} new rows to request db.")
204
- request_ds.push_to_hub(self._repo_id, config_name="request_db")
205
- self._new_submits = []
206
- return
207
-
208
- def _scheduler_loop(self, interval_seconds):
209
- print(
210
- "[Dataset] Background Register submissions thread started. "
211
- f"Running every {interval_seconds} seconds."
212
- )
213
- while not self._stop_event.is_set():
214
- self._periodic_submissions_update()
215
- self._stop_event.wait(interval_seconds)
216
- return
217
-
218
- def start_register_submission(self, hours=0, minutes=0, seconds=0):
219
- total_interval = (hours * 3600) + (minutes * 60) + seconds
220
- assert total_interval > 0, "Interval must be greater than 0"
221
-
222
- if self._background_submission is not None and self._background_submission.is_alive():
223
- print("[Dataset] Register submissions is already running.")
224
-
225
- self._stop_event.clear()
226
- # Create the thread
227
- # target: the function to run
228
- # args: arguments to pass to that function
229
- # daemon=True: ensures the thread dies if the main program crashes hard
230
- self._background_submission = threading.Thread(
231
- target=self._scheduler_loop,
232
- args=(total_interval,),
233
- daemon=True
234
- )
235
- self._background_submission.start()
236
-
237
- def stop_register_submission(self):
238
- print("[Dataset] Stopping Register submissions task...")
239
- self._stop_event.set() # This breaks the wait() in the loop immediately
240
- if self._background_submission:
241
- self._background_submission.join() # Wait for the thread to clean up
242
- print("[Dataset] Register submissions task stopped.")
243
- return
 
 
 
1
  import math
2
  from typing import Dict, Tuple
3
 
4
  import pandas as pd
5
 
 
 
 
 
 
6
 
7
  class LeaderboardData:
8
  """Class to manage leaderboard data from datasets"""
 
109
  paginated_df = df.iloc[start_idx:end_idx].copy()
110
 
111
  return paginated_df, page, total_pages
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
espn_ldbd/leaderboard/dataset.py ADDED
@@ -0,0 +1,132 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import os
2
+ import threading
3
+
4
+ from datasets import Dataset, load_dataset, concatenate_datasets
5
+
6
+ from huggingface_hub import HfApi
7
+ from huggingface_hub.utils import HfHubHTTPError
8
+
9
+
10
+ class LeaderboardDataset:
11
+ def __init__(self, min_time_submit: float = 10.0):
12
+ repo_id = os.environ.get("ESPNET_DB", None)
13
+ assert repo_id is not None
14
+ self._repo_id = repo_id
15
+ self._tasks = load_dataset(repo_id, "task_db", split="train")
16
+ self._subtasks_db = load_dataset(repo_id, "sub_task_db", split="train")
17
+ self._subtasks_cache = {}
18
+ self._new_submits = []
19
+
20
+ self._stop_event = threading.Event()
21
+ self._background_submission = None
22
+ self._hf_api = HfApi()
23
+ self._submit_lock = threading.Lock()
24
+ self._submit_timer = None
25
+ self._submit_period = min_time_submit
26
+ return
27
+
28
+ @property
29
+ def get_tasks(self):
30
+ return self._tasks
31
+
32
+ def get_subtasks(self, task_id: str):
33
+ if task_id not in self._subtasks_cache:
34
+ subtasks = self._subtasks_db.filter(
35
+ lambda sample: sample["task_id"] == task_id
36
+ )
37
+ self._subtasks_cache[task_id] = subtasks
38
+ return self._subtasks_cache[task_id]
39
+
40
+ def submit_repoid(self, repo_id: str) -> bool:
41
+ # Try to acquire lock without blocking
42
+ if not self._submit_lock.acquire(blocking=False):
43
+ return False, "Submission already in progress."
44
+
45
+ message = f"Error in submitting {repo_id}."
46
+ try:
47
+ # Cancel any existing timer
48
+ if self._submit_timer is not None:
49
+ self._submit_timer.cancel()
50
+
51
+ # Set a timer to release the lock after execution
52
+ def _release_lock():
53
+ self._submit_lock.release()
54
+ # print(f"[Dataset] Submit lock released after timeout")
55
+
56
+ # Validate existance of repository
57
+ try:
58
+ self._hf_api.model_info(repo_id)
59
+ except HfHubHTTPError as e:
60
+ return False, "The submitted repository does not exist."
61
+
62
+ # TODO(Fhrozen): add validation of repo_id for ESPnet format
63
+ self._new_submits.append({
64
+ "model_id": repo_id,
65
+ "reviewed": False,
66
+ "date_review": "",
67
+ "commit_version": "",
68
+ "valid_repo": True,
69
+ "need_review": True,
70
+ })
71
+ # Set timer to auto-release lock after 5 seconds
72
+ self._submit_timer = threading.Timer(self._submit_period, _release_lock)
73
+ self._submit_timer.start()
74
+
75
+ return True, ""
76
+ except Exception as e:
77
+ # Release lock on any error
78
+ self._submit_lock.release()
79
+ # print(f"[Dataset] Error in submit_repoid: {e}")
80
+ message += f" {e}"
81
+ return False, message
82
+
83
+ def _periodic_submissions_update(self):
84
+ if len(self._new_submits) == 0:
85
+ print("[Dataset] No additional submits were found.")
86
+ return
87
+
88
+ request_ds = load_dataset(self._repo_id, "request_db", split="train")
89
+ num_rows = len(self._new_submits)
90
+ new_ds = Dataset.from_list(self._new_submits)
91
+ request_ds = concatenate_datasets([request_ds, new_ds])
92
+ print(f"[Dataset] Adding {num_rows} new rows to request db.")
93
+ request_ds.push_to_hub(self._repo_id, config_name="request_db")
94
+ self._new_submits = []
95
+ return
96
+
97
+ def _scheduler_loop(self, interval_seconds):
98
+ print(
99
+ "[Dataset] Background Register submissions thread started. "
100
+ f"Running every {interval_seconds} seconds."
101
+ )
102
+ while not self._stop_event.is_set():
103
+ self._periodic_submissions_update()
104
+ self._stop_event.wait(interval_seconds)
105
+ return
106
+
107
+ def start_register_submission(self, hours=0, minutes=0, seconds=0):
108
+ total_interval = (hours * 3600) + (minutes * 60) + seconds
109
+ assert total_interval > 0, "Interval must be greater than 0"
110
+
111
+ if self._background_submission is not None and self._background_submission.is_alive():
112
+ print("[Dataset] Register submissions is already running.")
113
+
114
+ self._stop_event.clear()
115
+ # Create the thread
116
+ # target: the function to run
117
+ # args: arguments to pass to that function
118
+ # daemon=True: ensures the thread dies if the main program crashes hard
119
+ self._background_submission = threading.Thread(
120
+ target=self._scheduler_loop,
121
+ args=(total_interval,),
122
+ daemon=True
123
+ )
124
+ self._background_submission.start()
125
+
126
+ def stop_register_submission(self):
127
+ print("[Dataset] Stopping Register submissions task...")
128
+ self._stop_event.set() # This breaks the wait() in the loop immediately
129
+ if self._background_submission:
130
+ self._background_submission.join() # Wait for the thread to clean up
131
+ print("[Dataset] Register submissions task stopped.")
132
+ return
espn_ldbd/submission/__init__.py DELETED
File without changes