jonswain commited on
Commit
e125a0c
·
1 Parent(s): 4d51140

feat(hf-space): enforce submission cooldown and switch leaderboard reads to boto3

Browse files

- add 8-hour per-track submission cooldown checks in submit flow
- add helper to fetch last submission timestamp from S3 metadata objects
- replace pandas+s3fs leaderboard reads with boto3 get_object + in-memory CSV parsing (Avoids 403 Forbidden error)
- change leaderboard config paths to S3 keys (bucket configured separately)
- remove s3fs dependency from hf_space requirements
- grant hf-space IAM user s3:ListBucket on submissions prefixes for cooldown lookup

Files changed (4) hide show
  1. app.py +116 -18
  2. config.py +3 -3
  3. requirements.txt +1 -2
  4. submission_store.py +52 -1
app.py CHANGED
@@ -1,7 +1,10 @@
 
1
  import tempfile
2
  import zipfile
 
3
  from pathlib import Path
4
 
 
5
  import gradio as gr
6
  import numpy as np
7
  import pandas as pd
@@ -9,14 +12,18 @@ from config import (
9
  ACTIVITY_DATASET_SIZE,
10
  ACTIVITY_LEADERBOARD_S3,
11
  REQUIRED_ACTIVITY_COLUMNS,
 
12
  STRUCTURE_DATASET_SIZE,
13
  STRUCTURE_LEADERBOARD_S3,
 
14
  )
15
  from gradio.themes.utils import sizes
16
  from gradio_leaderboard import Leaderboard
17
  from loguru import logger
18
- from models import Submission
19
- from submission_store import upload_submission
 
 
20
 
21
 
22
  def make_user_clickable(name: str) -> str:
@@ -48,6 +55,7 @@ def _collapse_mean_std(df: pd.DataFrame, metrics: list[str]) -> pd.DataFrame:
48
 
49
  Returns:
50
  DataFrame with combined columns replacing the original pairs.
 
51
  """
52
  df = df.copy()
53
  for metric in metrics:
@@ -55,19 +63,35 @@ def _collapse_mean_std(df: pd.DataFrame, metrics: list[str]) -> pd.DataFrame:
55
  std_col = f"{metric}_std"
56
  if mean_col in df.columns and std_col in df.columns:
57
  df[metric] = (
58
- df[mean_col].map(_fmt_metric)
59
- + "±"
60
- + df[std_col].map(_fmt_metric)
61
  )
62
  df = df.drop(columns=[mean_col, std_col])
63
  return df
64
 
65
 
66
  _ACTIVITY_EMPTY = pd.DataFrame(
67
- columns=["rank", "username", "Submitted", "MAE", "RAE", "R2", "Spearman ρ", "Kendall's τ"]
 
 
 
 
 
 
 
 
 
68
  )
69
  _STRUCTURE_EMPTY = pd.DataFrame(
70
- columns=["rank", "username", "Submitted", "model_report_link", "LDDT-PLI", "BiSyRMSD", "Ligand RMSD", "LDDT-LP"]
 
 
 
 
 
 
 
 
 
71
  )
72
 
73
 
@@ -75,12 +99,16 @@ def _prepare_activity_df(df: pd.DataFrame) -> pd.DataFrame:
75
  """Sort, collapse, and rename activity leaderboard columns (no HTML)."""
76
  df = df.sort_values("RAE_mean", ascending=True).reset_index(drop=True)
77
  df = _collapse_mean_std(df, ["MAE", "RAE", "R2", "Spearman_R", "Kendall's_Tau"])
78
- df = df.rename(columns={
79
- "Spearman_R": "Spearman ρ",
80
- "Kendall's_Tau": "Kendall's τ",
81
- "submitted_at": "Submitted",
82
- })
83
- df["Submitted"] = pd.to_datetime(df["Submitted"], utc=True).dt.strftime("%Y-%m-%d %H:%M UTC")
 
 
 
 
84
  return df
85
 
86
 
@@ -89,7 +117,9 @@ def _prepare_structure_df(df: pd.DataFrame) -> pd.DataFrame:
89
  df = df.sort_values("LDDT-PLI_mean", ascending=False).reset_index(drop=True)
90
  df = _collapse_mean_std(df, ["LDDT-PLI", "BiSyRMSD", "Ligand_RMSD", "LDDT-LP"])
91
  df = df.rename(columns={"Ligand_RMSD": "Ligand RMSD", "submitted_at": "Submitted"})
92
- df["Submitted"] = pd.to_datetime(df["Submitted"], utc=True).dt.strftime("%Y-%m-%d %H:%M UTC")
 
 
93
  df["model_report_link"] = df["model_report_link"].fillna("")
94
  return df
95
 
@@ -98,7 +128,11 @@ def load_activity_leaderboard() -> pd.DataFrame:
98
  """Load the activity leaderboard from S3."""
99
  logger.info("Refreshing activity leaderboard...")
100
  try:
101
- df = pd.read_csv(ACTIVITY_LEADERBOARD_S3)
 
 
 
 
102
  except Exception as exc:
103
  logger.warning("Could not load activity leaderboard: {}", exc)
104
  return _ACTIVITY_EMPTY
@@ -112,7 +146,11 @@ def load_structure_leaderboard() -> pd.DataFrame:
112
  """Load the structure leaderboard from S3."""
113
  logger.info("Refreshing structure leaderboard...")
114
  try:
115
- df = pd.read_csv(STRUCTURE_LEADERBOARD_S3)
 
 
 
 
116
  except Exception as exc:
117
  logger.warning("Could not load structure leaderboard: {}", exc)
118
  return _STRUCTURE_EMPTY
@@ -126,7 +164,11 @@ def load_structure_leaderboard() -> pd.DataFrame:
126
  def download_activity_leaderboard() -> str:
127
  """Write the activity leaderboard to a temp CSV and return the file path."""
128
  try:
129
- df = pd.read_csv(ACTIVITY_LEADERBOARD_S3)
 
 
 
 
130
  except Exception as exc:
131
  logger.warning("Could not load activity leaderboard for download: {}", exc)
132
  df = _ACTIVITY_EMPTY
@@ -141,7 +183,11 @@ def download_activity_leaderboard() -> str:
141
  def download_structure_leaderboard() -> str:
142
  """Write the structure leaderboard to a temp CSV and return the file path."""
143
  try:
144
- df = pd.read_csv(STRUCTURE_LEADERBOARD_S3)
 
 
 
 
145
  except Exception as exc:
146
  logger.warning("Could not load structure leaderboard for download: {}", exc)
147
  df = _STRUCTURE_EMPTY
@@ -153,6 +199,24 @@ def download_structure_leaderboard() -> str:
153
  return f.name
154
 
155
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
156
  def submit_predictions(
157
  username,
158
  user_alias,
@@ -239,6 +303,23 @@ def submit_predictions(
239
  return gr.update(
240
  value="Error: pEC50 column contains infinite values.", visible=True
241
  )
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
242
 
243
  elif track_select == "Structure Prediction":
244
  if file_path.suffix.lower() != ".zip":
@@ -257,6 +338,23 @@ def submit_predictions(
257
  value=f"Error: Expected {STRUCTURE_DATASET_SIZE} files in zip, got {n_files}.",
258
  visible=True,
259
  )
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
260
 
261
  # --- build submission model and persist to S3 ---
262
  submission = Submission(
 
1
+ import io
2
  import tempfile
3
  import zipfile
4
+ from datetime import datetime, timezone
5
  from pathlib import Path
6
 
7
+ import boto3
8
  import gradio as gr
9
  import numpy as np
10
  import pandas as pd
 
12
  ACTIVITY_DATASET_SIZE,
13
  ACTIVITY_LEADERBOARD_S3,
14
  REQUIRED_ACTIVITY_COLUMNS,
15
+ S3_BUCKET,
16
  STRUCTURE_DATASET_SIZE,
17
  STRUCTURE_LEADERBOARD_S3,
18
+ TIME_BETWEEN_SUBMISSIONS,
19
  )
20
  from gradio.themes.utils import sizes
21
  from gradio_leaderboard import Leaderboard
22
  from loguru import logger
23
+ from models import Submission, _safeify_username
24
+ from submission_store import _fetch_last_submission_date, upload_submission
25
+
26
+ s3_client = boto3.client("s3", region_name="us-east-1")
27
 
28
 
29
  def make_user_clickable(name: str) -> str:
 
55
 
56
  Returns:
57
  DataFrame with combined columns replacing the original pairs.
58
+
59
  """
60
  df = df.copy()
61
  for metric in metrics:
 
63
  std_col = f"{metric}_std"
64
  if mean_col in df.columns and std_col in df.columns:
65
  df[metric] = (
66
+ df[mean_col].map(_fmt_metric) + "±" + df[std_col].map(_fmt_metric)
 
 
67
  )
68
  df = df.drop(columns=[mean_col, std_col])
69
  return df
70
 
71
 
72
  _ACTIVITY_EMPTY = pd.DataFrame(
73
+ columns=[
74
+ "rank",
75
+ "username",
76
+ "Submitted",
77
+ "MAE",
78
+ "RAE",
79
+ "R2",
80
+ "Spearman ρ",
81
+ "Kendall's τ",
82
+ ]
83
  )
84
  _STRUCTURE_EMPTY = pd.DataFrame(
85
+ columns=[
86
+ "rank",
87
+ "username",
88
+ "Submitted",
89
+ "model_report_link",
90
+ "LDDT-PLI",
91
+ "BiSyRMSD",
92
+ "Ligand RMSD",
93
+ "LDDT-LP",
94
+ ]
95
  )
96
 
97
 
 
99
  """Sort, collapse, and rename activity leaderboard columns (no HTML)."""
100
  df = df.sort_values("RAE_mean", ascending=True).reset_index(drop=True)
101
  df = _collapse_mean_std(df, ["MAE", "RAE", "R2", "Spearman_R", "Kendall's_Tau"])
102
+ df = df.rename(
103
+ columns={
104
+ "Spearman_R": "Spearman ρ",
105
+ "Kendall's_Tau": "Kendall's τ",
106
+ "submitted_at": "Submitted",
107
+ }
108
+ )
109
+ df["Submitted"] = pd.to_datetime(df["Submitted"], utc=True).dt.strftime(
110
+ "%Y-%m-%d %H:%M UTC"
111
+ )
112
  return df
113
 
114
 
 
117
  df = df.sort_values("LDDT-PLI_mean", ascending=False).reset_index(drop=True)
118
  df = _collapse_mean_std(df, ["LDDT-PLI", "BiSyRMSD", "Ligand_RMSD", "LDDT-LP"])
119
  df = df.rename(columns={"Ligand_RMSD": "Ligand RMSD", "submitted_at": "Submitted"})
120
+ df["Submitted"] = pd.to_datetime(df["Submitted"], utc=True).dt.strftime(
121
+ "%Y-%m-%d %H:%M UTC"
122
+ )
123
  df["model_report_link"] = df["model_report_link"].fillna("")
124
  return df
125
 
 
128
  """Load the activity leaderboard from S3."""
129
  logger.info("Refreshing activity leaderboard...")
130
  try:
131
+ obj = s3_client.get_object(
132
+ Bucket=S3_BUCKET,
133
+ Key=ACTIVITY_LEADERBOARD_S3,
134
+ )
135
+ df = pd.read_csv(io.BytesIO(obj["Body"].read()))
136
  except Exception as exc:
137
  logger.warning("Could not load activity leaderboard: {}", exc)
138
  return _ACTIVITY_EMPTY
 
146
  """Load the structure leaderboard from S3."""
147
  logger.info("Refreshing structure leaderboard...")
148
  try:
149
+ obj = s3_client.get_object(
150
+ Bucket=S3_BUCKET,
151
+ Key=STRUCTURE_LEADERBOARD_S3,
152
+ )
153
+ df = pd.read_csv(io.BytesIO(obj["Body"].read()))
154
  except Exception as exc:
155
  logger.warning("Could not load structure leaderboard: {}", exc)
156
  return _STRUCTURE_EMPTY
 
164
  def download_activity_leaderboard() -> str:
165
  """Write the activity leaderboard to a temp CSV and return the file path."""
166
  try:
167
+ obj = s3_client.get_object(
168
+ Bucket=S3_BUCKET,
169
+ Key=ACTIVITY_LEADERBOARD_S3,
170
+ )
171
+ df = pd.read_csv(io.BytesIO(obj["Body"].read()))
172
  except Exception as exc:
173
  logger.warning("Could not load activity leaderboard for download: {}", exc)
174
  df = _ACTIVITY_EMPTY
 
183
  def download_structure_leaderboard() -> str:
184
  """Write the structure leaderboard to a temp CSV and return the file path."""
185
  try:
186
+ obj = s3_client.get_object(
187
+ Bucket=S3_BUCKET,
188
+ Key=STRUCTURE_LEADERBOARD_S3,
189
+ )
190
+ df = pd.read_csv(io.BytesIO(obj["Body"].read()))
191
  except Exception as exc:
192
  logger.warning("Could not load structure leaderboard for download: {}", exc)
193
  df = _STRUCTURE_EMPTY
 
199
  return f.name
200
 
201
 
202
+ def _format_submission_time_message(last_submission: datetime, track: str) -> str:
203
+ """Format a message indicating when the user can next submit."""
204
+ track_name = "an activity" if track == "activity" else "a structure"
205
+ next_submission_time = last_submission + pd.Timedelta(
206
+ seconds=TIME_BETWEEN_SUBMISSIONS
207
+ )
208
+ time_remaining = next_submission_time - datetime.now(timezone.utc)
209
+ seconds_left = max(0, int(time_remaining.total_seconds()))
210
+ hours, rem = divmod(seconds_left, 3600)
211
+ minutes, seconds = divmod(rem, 60)
212
+ wait_str = f"{hours:02d}:{minutes:02d}:{seconds:02d}"
213
+ return (
214
+ f"Error: You submitted {track_name} prediction on "
215
+ f"{last_submission.strftime('%Y-%m-%d %H:%M:%S (UTC)')}.\n"
216
+ f"Please wait {wait_str} before submitting again."
217
+ )
218
+
219
+
220
  def submit_predictions(
221
  username,
222
  user_alias,
 
303
  return gr.update(
304
  value="Error: pEC50 column contains infinite values.", visible=True
305
  )
306
+ last_submission = _fetch_last_submission_date(
307
+ "activity", _safeify_username(username.strip())
308
+ )
309
+ logger.info(
310
+ f"Last submission date for user {username.strip()!r}: {last_submission}"
311
+ )
312
+ if (
313
+ last_submission
314
+ and (datetime.now(timezone.utc) - last_submission).total_seconds()
315
+ < TIME_BETWEEN_SUBMISSIONS
316
+ ):
317
+ return gr.update(
318
+ value=_format_submission_time_message(
319
+ last_submission, track="activity"
320
+ ),
321
+ visible=True,
322
+ )
323
 
324
  elif track_select == "Structure Prediction":
325
  if file_path.suffix.lower() != ".zip":
 
338
  value=f"Error: Expected {STRUCTURE_DATASET_SIZE} files in zip, got {n_files}.",
339
  visible=True,
340
  )
341
+ last_submission = _fetch_last_submission_date(
342
+ "structure", _safeify_username(username.strip())
343
+ )
344
+ logger.info(
345
+ f"Last submission date for user {username.strip()!r}: {last_submission}"
346
+ )
347
+ if (
348
+ last_submission
349
+ and (datetime.now(timezone.utc) - last_submission).total_seconds()
350
+ < TIME_BETWEEN_SUBMISSIONS
351
+ ):
352
+ return gr.update(
353
+ value=_format_submission_time_message(
354
+ last_submission, track="structure"
355
+ ),
356
+ visible=True,
357
+ )
358
 
359
  # --- build submission model and persist to S3 ---
360
  submission = Submission(
config.py CHANGED
@@ -5,8 +5,8 @@ import os
5
  ACTIVITY_DATASET_SIZE = 531
6
  STRUCTURE_DATASET_SIZE = 125
7
  REQUIRED_ACTIVITY_COLUMNS = {"SMILES", "Molecule Name", "pEC50"}
 
8
 
9
  S3_BUCKET: str = os.environ.get("S3_BUCKET", "")
10
- ACTIVITY_LEADERBOARD_S3 = f"s3://{S3_BUCKET}/leaderboard/interim/activity/leaderboard_latest.csv"
11
-
12
- STRUCTURE_LEADERBOARD_S3 = f"s3://{S3_BUCKET}/leaderboard/interim/structure/leaderboard_latest.csv"
 
5
  ACTIVITY_DATASET_SIZE = 531
6
  STRUCTURE_DATASET_SIZE = 125
7
  REQUIRED_ACTIVITY_COLUMNS = {"SMILES", "Molecule Name", "pEC50"}
8
+ TIME_BETWEEN_SUBMISSIONS = 28800 # 8 hours in seconds
9
 
10
  S3_BUCKET: str = os.environ.get("S3_BUCKET", "")
11
+ ACTIVITY_LEADERBOARD_S3 = "leaderboard/interim/activity/leaderboard_latest.csv"
12
+ STRUCTURE_LEADERBOARD_S3 = "leaderboard/interim/structure/leaderboard_latest.csv"
 
requirements.txt CHANGED
@@ -8,5 +8,4 @@ scikit-learn
8
  loguru
9
  statsmodels
10
  tqdm
11
- boto3
12
- s3fs
 
8
  loguru
9
  statsmodels
10
  tqdm
11
+ boto3
 
submission_store.py CHANGED
@@ -19,8 +19,8 @@ AWS_ACCESS_KEY_ID / AWS_SECRET_ACCESS_KEY / AWS_DEFAULT_REGION
19
  Standard boto3 credentials — set via HuggingFace Space secrets.
20
  """
21
 
22
- import json
23
  import os
 
24
  from pathlib import Path
25
 
26
  import boto3
@@ -89,3 +89,54 @@ def upload_submission(submission: Submission, file_path: Path) -> Submission:
89
  )
90
 
91
  return submission
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
19
  Standard boto3 credentials — set via HuggingFace Space secrets.
20
  """
21
 
 
22
  import os
23
+ from datetime import datetime, timezone
24
  from pathlib import Path
25
 
26
  import boto3
 
89
  )
90
 
91
  return submission
92
+
93
+
94
+ def _fetch_last_submission_date(track: str, user_id: str) -> datetime | None:
95
+ """Fetch the submission date of the most recent submission for a track and user.
96
+
97
+ Args:
98
+ track (str): The track name (e.g., "activity" or "structure").
99
+ user_id (str): The user ID to check for previous submissions.
100
+
101
+ Returns:
102
+ datetime | None: The submission date of the most recent submission, or None if
103
+ no previous submissions are found.
104
+
105
+ """
106
+ bucket = os.environ.get("S3_BUCKET")
107
+ if not bucket:
108
+ logger.warning(
109
+ "S3_BUCKET not set — cannot fetch last submission date. "
110
+ "Set S3_BUCKET and AWS credentials as Space secrets to enable this feature."
111
+ )
112
+ return None
113
+
114
+ s3 = boto3.client("s3")
115
+ prefix = f"submissions/{track}/{user_id}/"
116
+ try:
117
+ response = s3.list_objects_v2(Bucket=bucket, Prefix=prefix)
118
+ if response["IsTruncated"]: # Unlikely to be > 1000 submissions per user
119
+ logger.warning(
120
+ f"ListObjectsV2 response truncated for prefix {prefix!r}. "
121
+ "Only the first 1000 objects will be considered."
122
+ )
123
+ if "Contents" not in response:
124
+ return None # No submissions found
125
+
126
+ logger.info(
127
+ f"Found {len(response['Contents'])} objects under prefix {prefix!r}."
128
+ )
129
+ submission_dates = []
130
+ for obj in response["Contents"]:
131
+ if obj["Key"].endswith("metadata.json"):
132
+ submission_dates.append(obj["LastModified"])
133
+
134
+ if not submission_dates: # Shouldn't be possible
135
+ logger.warning(f"No metadata.json files found under prefix {prefix!r}.")
136
+ return None
137
+
138
+ return max(submission_dates).astimezone(timezone.utc)
139
+
140
+ except Exception as exc:
141
+ logger.error(f"Failed to fetch last submission date for {user_id!r}: {exc}")
142
+ return None