| import gradio as gr |
| import pandas as pd |
| from pathlib import Path |
| from scipy.stats import spearmanr, kendalltau |
| from sklearn.metrics import mean_absolute_error, r2_score |
| from typing import Optional |
| from about import ENDPOINTS, API, submissions_repo, results_repo, test_repo |
| from huggingface_hub import hf_hub_download |
| import datetime |
| import io |
| import json, tempfile |
| import pydantic |
|
|
|
|
class ParticipantRecord(pydantic.BaseModel):
    """Contact/identity details for one challenge participant.

    Every field is optional and defaults to None; values come straight from
    the submission form in `submit_data` and are stored verbatim in the
    submission's JSON metadata sidecar.
    """
    hf_username: Optional[str] = pydantic.Field(default=None, description="Hugging Face username")
    participant_name: Optional[str] = pydantic.Field(default=None, description="Participant's real name")
    discord_username: Optional[str] = pydantic.Field(default=None, description="Discord username")
    email: Optional[str] = pydantic.Field(default=None, description="Email address")
    affiliation: Optional[str] = pydantic.Field(default=None, description="Affiliation")
    model_tag: Optional[str] = pydantic.Field(default=None, description="Model tag")
|
|
|
|
class SubmissionMetadata(pydantic.BaseModel):
    """Metadata record stored as a .json sidecar next to each submission CSV.

    Written by `submit_data` and read back by `evaluate_data`.
    """
    # Accepts an ISO-8601 string at construction; pydantic parses it to datetime.
    submission_time_utc: datetime.datetime
    # Required — construction raises ValidationError if omitted.
    user: str
    original_filename: str
    evaluated: bool
    participant: ParticipantRecord
|
|
|
|
| def _safeify_username(username: str) -> str: |
| return str(username.strip()).replace("/", "_").replace(" ", "_") |
|
|
| def _unsafify_username(username: str) -> str: |
| return str(username.strip()).replace("/", "_").replace(" ", "_") |
|
|
def submit_data(predictions_file: str,
                user_state,
                participant_name: str = "",
                discord_username: str = "",
                email: str = "",
                affiliation: str = ""
                ):
    """Validate a predictions CSV and upload it, plus a JSON metadata
    sidecar, to the submissions dataset repo.

    Parameters
    ----------
    predictions_file : path of the uploaded CSV of per-endpoint predictions.
    user_state : Hugging Face username/alias from the Gradio session state.
    participant_name, discord_username, email, affiliation :
        optional contact details stored in the submission metadata.

    Returns
    -------
    (status_message, destination_csv) on success, where destination_csv is
    the path of the CSV inside the submissions repo.

    Raises
    ------
    gr.Error
        On every validation/upload failure. All failure paths raise
        consistently so Gradio surfaces the message (returning a gr.Error
        object, or a lone string where callers expect a 2-tuple, would not).
    """
    if user_state is None:
        raise gr.Error("Username or alias is required for submission.")

    file_path = Path(predictions_file).resolve()
    if not file_path.exists():
        raise gr.Error("Uploaded file object does not have a valid file path.")

    try:
        results_df = pd.read_csv(file_path)
    except Exception as e:
        raise gr.Error(f"Error reading results file: {e}")

    if results_df.empty:
        raise gr.Error("The uploaded file is empty.")
    if not set(ENDPOINTS).issubset(set(results_df.columns)):
        raise gr.Error(f"The uploaded file must contain all endpoint predictions {ENDPOINTS} as columns.")

    # Seconds precision keeps the timestamp compact enough for a filename.
    ts = datetime.datetime.now(datetime.timezone.utc).isoformat(timespec="seconds")
    safe_user = _safeify_username(user_state)

    destination_csv = f"submissions/{safe_user}_{ts}.csv"
    destination_json = destination_csv.replace(".csv", ".json")

    API.upload_file(
        path_or_fileobj=str(file_path),
        path_in_repo=destination_csv,
        repo_id=submissions_repo,
        repo_type="dataset",
        commit_message=f"Add submission for {safe_user} at {ts}"
    )

    try:
        participant_record = ParticipantRecord(
            hf_username=user_state,
            participant_name=participant_name,
            discord_username=discord_username,
            email=email,
            affiliation=affiliation,
        )
        meta = SubmissionMetadata(
            submission_time_utc=ts,
            # `user` is a required field of SubmissionMetadata; previously it
            # was omitted, so every submission failed validation here.
            user=user_state,
            original_filename=file_path.name,
            evaluated=False,
            participant=participant_record,
        )
    except pydantic.ValidationError as e:
        raise gr.Error(f"Error in participant information: {e}")

    # mode="json" serializes the datetime to an ISO string; a plain
    # model_dump() leaves a datetime object and json.dumps() raises TypeError.
    meta_bytes = io.BytesIO(
        json.dumps(meta.model_dump(mode="json"), indent=2).encode("utf-8")
    )

    API.upload_file(
        path_or_fileobj=meta_bytes,
        path_in_repo=destination_json,
        repo_id=submissions_repo,
        repo_type="dataset",
        commit_message=f"Add metadata for {user_state} submission at {ts}"
    )

    return ("✅ Your submission has been received! Your scores will appear on the leaderboard shortly.",
            destination_csv)
|
|
def evaluate_data(filename: str) -> None:
    """Download a submission CSV, score it against the held-out test set,
    and upload the per-endpoint metrics to the results dataset repo.

    Parameters
    ----------
    filename : path of the submission CSV inside the submissions repo;
        its ".json" sibling is expected to hold the submission metadata.

    Raises
    ------
    gr.Error
        If any download, evaluation, or metadata step fails. Nothing is
        written to the results repo in that case.
    """
    try:
        local_path = hf_hub_download(
            repo_id=submissions_repo,
            repo_type="dataset",
            filename=filename,
        )
    except Exception as e:
        raise gr.Error(f"Failed to download submission file: {e}")

    try:
        test_path = hf_hub_download(
            repo_id=test_repo,
            repo_type="dataset",
            filename="data/test_dataset.csv",
        )
    except Exception as e:
        raise gr.Error(f"Failed to download test file: {e}")

    data_df = pd.read_csv(local_path)
    test_df = pd.read_csv(test_path)
    try:
        results_df = calculate_metrics(data_df, test_df)
    except Exception as e:
        raise gr.Error(f'Evaluation failed: {e}. No results written to results dataset.')
    # Checked outside the try/except so this gr.Error is not caught and
    # re-wrapped by the generic "Evaluation failed" handler above.
    if not isinstance(results_df, pd.DataFrame) or results_df.empty:
        raise gr.Error("Evaluation produced no results.")

    meta_filename = filename.replace(".csv", ".json")
    try:
        meta_path = hf_hub_download(
            repo_id=submissions_repo,
            repo_type="dataset",
            filename=meta_filename,
        )
        with open(meta_path, "r", encoding="utf-8") as f:
            meta = SubmissionMetadata(**json.load(f))
        username = meta.participant.hf_username
        # isoformat() keeps the results filename consistent with the
        # submission filename; str(datetime) would insert a space instead.
        timestamp = meta.submission_time_utc.isoformat(timespec="seconds")
    except Exception as e:
        raise gr.Error(f"Failed to load metadata file: {e}. No results written to results dataset.")

    results_df['user'] = username
    # _safeify_username (not _unsafify_username): the intent here is to
    # sanitize the name for use in a repo path.
    safe_user = _safeify_username(username)
    destination_path = f"results/{safe_user}_{timestamp}_results.csv"
    tmp_name = None
    try:
        with tempfile.NamedTemporaryFile(mode="w", suffix=".csv", delete=False) as tmp:
            results_df.to_csv(tmp, index=False)
            tmp_name = tmp.name
        API.upload_file(
            path_or_fileobj=tmp_name,
            path_in_repo=destination_path,
            repo_id=results_repo,
            repo_type="dataset",
            commit_message=f"Add result data for {username}"
        )
    finally:
        # Clean up even when the upload raises; the original leaked the
        # temp file on upload errors.
        if tmp_name is not None:
            Path(tmp_name).unlink(missing_ok=True)
|
|
|
|
def calculate_metrics(
        results_dataframe: pd.DataFrame,
        test_dataframe: pd.DataFrame
):
    """Score predictions against ground truth for every endpoint.

    Predictions and ground truth are aligned with an inner merge on
    'Molecule Name', so molecules present in only one frame are dropped
    instead of silently pairing unrelated rows. (The original sorted each
    frame independently and zipped by position, which misaligns — or
    crashes on length mismatch — as soon as the molecule sets differ
    after dropping NaNs.)

    Parameters
    ----------
    results_dataframe : submitted predictions; must contain 'Molecule Name'
        and one column per endpoint in ENDPOINTS.
    test_dataframe : ground-truth values with the same columns.

    Returns
    -------
    pd.DataFrame with one row per endpoint and columns
    ["endpoint", "MAE", "R2", "Spearman R", "Kendall's Tau"].
    """

    def _metrics_per_ep(pred, true):
        # spearmanr/kendalltau return (statistic, p-value); keep the statistic.
        mae = mean_absolute_error(true, pred)
        r2 = r2_score(true, pred)
        spr, _ = spearmanr(true, pred)
        ktau, _ = kendalltau(true, pred)
        return mae, r2, spr, ktau

    rows = []
    for measurement in ENDPOINTS:
        merged = results_dataframe[['Molecule Name', measurement]].merge(
            test_dataframe[['Molecule Name', measurement]],
            on='Molecule Name',
            suffixes=('_pred', '_true'),
        ).dropna()
        pred = merged[f'{measurement}_pred']
        true = merged[f'{measurement}_true']
        mae, r2, spearman, ktau = _metrics_per_ep(pred, true)
        rows.append({
            'endpoint': measurement,
            'MAE': mae,
            'R2': r2,
            'Spearman R': spearman,
            "Kendall's Tau": ktau,
        })

    return pd.DataFrame(rows, columns=["endpoint", "MAE", "R2", "Spearman R", "Kendall's Tau"])
|
|