Fix the first submission for the task case and tidy up code a bit
Browse files- src/submission_uploader.py +33 -38
src/submission_uploader.py
CHANGED
|
@@ -3,7 +3,7 @@ import logging
|
|
| 3 |
import os
|
| 4 |
import time
|
| 5 |
from tempfile import TemporaryDirectory
|
| 6 |
-
from typing import Dict, List, Optional
|
| 7 |
|
| 8 |
import jsonlines
|
| 9 |
from huggingface_hub import CommitOperationAdd # type: ignore[import]
|
|
@@ -30,13 +30,13 @@ class SubmissionUploader:
|
|
| 30 |
def __init__(self, dataset_id: str, private_dataset_id: str):
|
| 31 |
self._api = HfApi(token=os.environ["HF_TOKEN"])
|
| 32 |
self._fs = HfFileSystem(token=os.environ["HF_TOKEN"])
|
| 33 |
-
self.
|
| 34 |
-
self.
|
| 35 |
|
| 36 |
def _get_previous_pr(self, pr_title: str) -> Optional[Discussion]:
|
| 37 |
-
"""Searches among discussions of dataset
|
| 38 |
try:
|
| 39 |
-
discussions = self._api.get_repo_discussions(repo_id=self.
|
| 40 |
except Exception:
|
| 41 |
return None
|
| 42 |
for discussion in discussions:
|
|
@@ -44,22 +44,6 @@ class SubmissionUploader:
|
|
| 44 |
return discussion
|
| 45 |
return None
|
| 46 |
|
| 47 |
-
def _get_metadata(
|
| 48 |
-
self,
|
| 49 |
-
model_name_pretty: str,
|
| 50 |
-
model_availability: str,
|
| 51 |
-
urls: Optional[str],
|
| 52 |
-
context_size: str,
|
| 53 |
-
submitted_by: str,
|
| 54 |
-
) -> Dict[str, Optional[str]]:
|
| 55 |
-
return {
|
| 56 |
-
"model_name": model_name_pretty,
|
| 57 |
-
"model_availability": model_availability,
|
| 58 |
-
"urls": urls,
|
| 59 |
-
"context_size": context_size,
|
| 60 |
-
"submitted_by": submitted_by,
|
| 61 |
-
}
|
| 62 |
-
|
| 63 |
def _upload_request(
|
| 64 |
self,
|
| 65 |
task_id: str,
|
|
@@ -74,6 +58,7 @@ class SubmissionUploader:
|
|
| 74 |
pr_url: str,
|
| 75 |
temp_directory: str,
|
| 76 |
) -> List[CommitOperationAdd]:
|
|
|
|
| 77 |
request_metadata = {
|
| 78 |
"model_folder": model_folder,
|
| 79 |
"model_name_pretty": model_name_pretty,
|
|
@@ -90,7 +75,11 @@ class SubmissionUploader:
|
|
| 90 |
with open(os.path.join(temp_directory, "request_metadata.json"), "w") as f:
|
| 91 |
json.dump(request_metadata, f)
|
| 92 |
|
| 93 |
-
num_requests_already_present =
|
|
|
|
|
|
|
|
|
|
|
|
|
| 94 |
commit_operations = [
|
| 95 |
CommitOperationAdd(
|
| 96 |
path_in_repo=f"{task_id}/{num_requests_already_present}_{model_folder}.json",
|
|
@@ -105,6 +94,7 @@ class SubmissionUploader:
|
|
| 105 |
model_folder: str,
|
| 106 |
filenames: List[str],
|
| 107 |
) -> List[CommitOperationAdd]:
|
|
|
|
| 108 |
commit_operations = [
|
| 109 |
CommitOperationAdd(
|
| 110 |
path_in_repo=f"{task_id}/predictions/{model_folder}/{os.path.basename(filename)}",
|
|
@@ -115,6 +105,7 @@ class SubmissionUploader:
|
|
| 115 |
return commit_operations
|
| 116 |
|
| 117 |
def _compute_metrics_for_predictions(self, task_id: str, filenames: List[str], temp_directory: str) -> None:
|
|
|
|
| 118 |
metrics_module = METRICS[task_id]
|
| 119 |
assert metrics_module is not None, f"Computing metrics for {task_id} is not supported."
|
| 120 |
metrics_module.reset()
|
|
@@ -153,18 +144,20 @@ class SubmissionUploader:
|
|
| 153 |
submitted_by: str,
|
| 154 |
temp_directory: str,
|
| 155 |
) -> List[CommitOperationAdd]:
|
|
|
|
| 156 |
final_results = {}
|
| 157 |
with open(os.path.join(temp_directory, "final_metrics.json"), "r") as f:
|
| 158 |
metrics = json.load(f)
|
| 159 |
final_results.update(metrics)
|
| 160 |
-
|
| 161 |
-
|
| 162 |
-
|
| 163 |
-
|
| 164 |
-
|
| 165 |
-
|
|
|
|
|
|
|
| 166 |
)
|
| 167 |
-
final_results.update(metadata_dict)
|
| 168 |
|
| 169 |
with jsonlines.open(os.path.join(temp_directory, "final_results.jsonl"), "w") as writer:
|
| 170 |
writer.write(final_results)
|
|
@@ -189,6 +182,7 @@ class SubmissionUploader:
|
|
| 189 |
comment: Optional[str],
|
| 190 |
filenames: Optional[List[str]],
|
| 191 |
):
|
|
|
|
| 192 |
assert task_pretty and task_pretty in TASKS_PRETTY_REVERSE, "Please, select one of the supported tasks."
|
| 193 |
assert model_folder, "Please, specify non-empty name for a directory with a model's results."
|
| 194 |
assert model_name_pretty, "Please, specify non-empty name for a model."
|
|
@@ -238,15 +232,17 @@ class SubmissionUploader:
|
|
| 238 |
|
| 239 |
logging.info("Checking if this request has already been submitted...")
|
| 240 |
if not force:
|
| 241 |
-
if
|
| 242 |
return styled_warning(
|
| 243 |
-
f"{model_folder} is already present in {self.
|
| 244 |
)
|
| 245 |
|
| 246 |
prev_pr = self._get_previous_pr(pr_title)
|
| 247 |
if prev_pr is not None:
|
| 248 |
-
url = f"https://huggingface.co/datasets/{self.
|
| 249 |
-
return styled_warning(
|
|
|
|
|
|
|
| 250 |
|
| 251 |
logging.info("Processing predictions...")
|
| 252 |
predictions_commit_operations = self._upload_predictions(
|
|
@@ -271,9 +267,9 @@ class SubmissionUploader:
|
|
| 271 |
temp_directory=str(d),
|
| 272 |
)
|
| 273 |
|
| 274 |
-
logging.info(
|
| 275 |
new_pr = self._api.create_commit(
|
| 276 |
-
repo_id=self.
|
| 277 |
operations=predictions_commit_operations + results_commit_operations,
|
| 278 |
commit_message=pr_title,
|
| 279 |
commit_description=f"""New submission to {task_pretty} task in 🏟️ Long Code Arena benchmark!\n* Model name: {model_name_pretty}\n* Model availability: {model_availability}\n* Context Size: {context_size}\n* Relevant URLs: {urls}\n* Submitted By: {submitted_by}""",
|
|
@@ -281,7 +277,7 @@ class SubmissionUploader:
|
|
| 281 |
repo_type="dataset",
|
| 282 |
)
|
| 283 |
|
| 284 |
-
logging.info(
|
| 285 |
request_commit_operations = self._upload_request(
|
| 286 |
task_id=task_id,
|
| 287 |
model_folder=model_folder,
|
|
@@ -296,7 +292,7 @@ class SubmissionUploader:
|
|
| 296 |
pr_url=new_pr.pr_url,
|
| 297 |
)
|
| 298 |
self._api.create_commit(
|
| 299 |
-
repo_id=self.
|
| 300 |
operations=request_commit_operations,
|
| 301 |
commit_message=pr_title,
|
| 302 |
commit_description=f"""New submission to {task_pretty} task in 🏟️ Long Code Arena benchmark!\n* Model name: {model_name_pretty}\n* Model availability: {model_availability}\n* Context Size: {context_size}\n* Relevant URLs: {urls}\n* Submitted By: {submitted_by}\n* PR: {new_pr.pr_url}\n* Contact information: {contact_information}\n* Comment: {comment}""",
|
|
@@ -307,7 +303,6 @@ class SubmissionUploader:
|
|
| 307 |
return styled_message(f"🎉 PR created at {new_pr.pr_url}.")
|
| 308 |
|
| 309 |
except Exception as e:
|
| 310 |
-
logging.exception(e)
|
| 311 |
exception_msg = str(e)
|
| 312 |
if exception_msg and os.environ["PRIVATE_DATASET_ID"] in exception_msg:
|
| 313 |
exception_msg = exception_msg.replace(os.environ["PRIVATE_DATASET_ID"], "{private_dataset}")
|
|
|
|
| 3 |
import os
|
| 4 |
import time
|
| 5 |
from tempfile import TemporaryDirectory
|
| 6 |
+
from typing import List, Optional
|
| 7 |
|
| 8 |
import jsonlines
|
| 9 |
from huggingface_hub import CommitOperationAdd # type: ignore[import]
|
|
|
|
| 30 |
def __init__(self, dataset_id: str, private_dataset_id: str):
|
| 31 |
self._api = HfApi(token=os.environ["HF_TOKEN"])
|
| 32 |
self._fs = HfFileSystem(token=os.environ["HF_TOKEN"])
|
| 33 |
+
self._results_dataset_id = dataset_id
|
| 34 |
+
self._requests_dataset_id = private_dataset_id
|
| 35 |
|
| 36 |
def _get_previous_pr(self, pr_title: str) -> Optional[Discussion]:
|
| 37 |
+
"""Searches among discussions of the results dataset for a PR with the given title."""
|
| 38 |
try:
|
| 39 |
+
discussions = self._api.get_repo_discussions(repo_id=self._results_dataset_id, repo_type="dataset")
|
| 40 |
except Exception:
|
| 41 |
return None
|
| 42 |
for discussion in discussions:
|
|
|
|
| 44 |
return discussion
|
| 45 |
return None
|
| 46 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 47 |
def _upload_request(
|
| 48 |
self,
|
| 49 |
task_id: str,
|
|
|
|
| 58 |
pr_url: str,
|
| 59 |
temp_directory: str,
|
| 60 |
) -> List[CommitOperationAdd]:
|
| 61 |
+
"""Adds a file with metadata about the current request to the requests dataset."""
|
| 62 |
request_metadata = {
|
| 63 |
"model_folder": model_folder,
|
| 64 |
"model_name_pretty": model_name_pretty,
|
|
|
|
| 75 |
with open(os.path.join(temp_directory, "request_metadata.json"), "w") as f:
|
| 76 |
json.dump(request_metadata, f)
|
| 77 |
|
| 78 |
+
num_requests_already_present = (
|
| 79 |
+
len(self._fs.ls(f"datasets/{self._requests_dataset_id}/{task_id}/"))
|
| 80 |
+
if self._fs.isdir(f"datasets/{self._requests_dataset_id}/{task_id}/")
|
| 81 |
+
else 0
|
| 82 |
+
)
|
| 83 |
commit_operations = [
|
| 84 |
CommitOperationAdd(
|
| 85 |
path_in_repo=f"{task_id}/{num_requests_already_present}_{model_folder}.json",
|
|
|
|
| 94 |
model_folder: str,
|
| 95 |
filenames: List[str],
|
| 96 |
) -> List[CommitOperationAdd]:
|
| 97 |
+
"""Adds all files with current model's predictions to the results dataset."""
|
| 98 |
commit_operations = [
|
| 99 |
CommitOperationAdd(
|
| 100 |
path_in_repo=f"{task_id}/predictions/{model_folder}/{os.path.basename(filename)}",
|
|
|
|
| 105 |
return commit_operations
|
| 106 |
|
| 107 |
def _compute_metrics_for_predictions(self, task_id: str, filenames: List[str], temp_directory: str) -> None:
|
| 108 |
+
"""Computes metrics for each submitted file with the current model's predictions."""
|
| 109 |
metrics_module = METRICS[task_id]
|
| 110 |
assert metrics_module is not None, f"Computing metrics for {task_id} is not supported."
|
| 111 |
metrics_module.reset()
|
|
|
|
| 144 |
submitted_by: str,
|
| 145 |
temp_directory: str,
|
| 146 |
) -> List[CommitOperationAdd]:
|
| 147 |
+
"""Adds files with the current model's metrics values to the results dataset."""
|
| 148 |
final_results = {}
|
| 149 |
with open(os.path.join(temp_directory, "final_metrics.json"), "r") as f:
|
| 150 |
metrics = json.load(f)
|
| 151 |
final_results.update(metrics)
|
| 152 |
+
final_results.update(
|
| 153 |
+
{
|
| 154 |
+
"model_name": model_name_pretty,
|
| 155 |
+
"model_availability": model_availability,
|
| 156 |
+
"urls": urls,
|
| 157 |
+
"context_size": context_size,
|
| 158 |
+
"submitted_by": submitted_by,
|
| 159 |
+
}
|
| 160 |
)
|
|
|
|
| 161 |
|
| 162 |
with jsonlines.open(os.path.join(temp_directory, "final_results.jsonl"), "w") as writer:
|
| 163 |
writer.write(final_results)
|
|
|
|
| 182 |
comment: Optional[str],
|
| 183 |
filenames: Optional[List[str]],
|
| 184 |
):
|
| 185 |
+
"""Verifies that all necessary arguments are not None (and also runs other sanity checks)."""
|
| 186 |
assert task_pretty and task_pretty in TASKS_PRETTY_REVERSE, "Please, select one of the supported tasks."
|
| 187 |
assert model_folder, "Please, specify non-empty name for a directory with a model's results."
|
| 188 |
assert model_name_pretty, "Please, specify non-empty name for a model."
|
|
|
|
| 232 |
|
| 233 |
logging.info("Checking if this request has already been submitted...")
|
| 234 |
if not force:
|
| 235 |
+
if self._fs.isdir(f"datasets/{self._results_dataset_id}/{task_id}/predictions/{model_folder}"):
|
| 236 |
return styled_warning(
|
| 237 |
+
f"{model_folder} is already present in {self._results_dataset_id}, please, select another folder name."
|
| 238 |
)
|
| 239 |
|
| 240 |
prev_pr = self._get_previous_pr(pr_title)
|
| 241 |
if prev_pr is not None:
|
| 242 |
+
url = f"https://huggingface.co/datasets/{self._results_dataset_id}/discussions/{prev_pr.num}"
|
| 243 |
+
return styled_warning(
|
| 244 |
+
f"{self._results_dataset_id} already has an open PR for this submission: {url}."
|
| 245 |
+
)
|
| 246 |
|
| 247 |
logging.info("Processing predictions...")
|
| 248 |
predictions_commit_operations = self._upload_predictions(
|
|
|
|
| 267 |
temp_directory=str(d),
|
| 268 |
)
|
| 269 |
|
| 270 |
+
logging.info("Creating commit to the results dataset...")
|
| 271 |
new_pr = self._api.create_commit(
|
| 272 |
+
repo_id=self._results_dataset_id,
|
| 273 |
operations=predictions_commit_operations + results_commit_operations,
|
| 274 |
commit_message=pr_title,
|
| 275 |
commit_description=f"""New submission to {task_pretty} task in 🏟️ Long Code Arena benchmark!\n* Model name: {model_name_pretty}\n* Model availability: {model_availability}\n* Context Size: {context_size}\n* Relevant URLs: {urls}\n* Submitted By: {submitted_by}""",
|
|
|
|
| 277 |
repo_type="dataset",
|
| 278 |
)
|
| 279 |
|
| 280 |
+
logging.info("Creating commit to the requests dataset...")
|
| 281 |
request_commit_operations = self._upload_request(
|
| 282 |
task_id=task_id,
|
| 283 |
model_folder=model_folder,
|
|
|
|
| 292 |
pr_url=new_pr.pr_url,
|
| 293 |
)
|
| 294 |
self._api.create_commit(
|
| 295 |
+
repo_id=self._requests_dataset_id,
|
| 296 |
operations=request_commit_operations,
|
| 297 |
commit_message=pr_title,
|
| 298 |
commit_description=f"""New submission to {task_pretty} task in 🏟️ Long Code Arena benchmark!\n* Model name: {model_name_pretty}\n* Model availability: {model_availability}\n* Context Size: {context_size}\n* Relevant URLs: {urls}\n* Submitted By: {submitted_by}\n* PR: {new_pr.pr_url}\n* Contact information: {contact_information}\n* Comment: {comment}""",
|
|
|
|
| 303 |
return styled_message(f"🎉 PR created at {new_pr.pr_url}.")
|
| 304 |
|
| 305 |
except Exception as e:
|
|
|
|
| 306 |
exception_msg = str(e)
|
| 307 |
if exception_msg and os.environ["PRIVATE_DATASET_ID"] in exception_msg:
|
| 308 |
exception_msg = exception_msg.replace(os.environ["PRIVATE_DATASET_ID"], "{private_dataset}")
|