Nicolas Wagner committed on
Commit
30f0c04
·
1 Parent(s): bc714de

proper metric evaluation

Browse files
app.py CHANGED
@@ -84,15 +84,15 @@ def init_leaderboard(dataframe):
84
  if not valid_cols:
85
  valid_cols = [
86
  "Team Name",
87
- "Best Accuracy ⬆️",
88
  "Best F1 Score ⬆️",
 
89
  "Best Precision ⬆️",
90
  "Best Recall ⬆️",
91
  "Best TP ⬆️",
92
  "Best FP ⬇️",
93
  "Best FN ⬇️",
94
  "Best TN ⬆️",
95
- "Last Submission",
96
  ]
97
 
98
  if dataframe is None or dataframe.empty:
@@ -290,7 +290,7 @@ with demo:
290
  gr.Markdown("## Submit Your Predictions", elem_classes="markdown-text")
291
  gr.Markdown(
292
  "Upload a CSV file with your predictions. The CSV must have two columns: "
293
- "`index` and `prediction`. Predictions should be binary (0/1 or 'real'/'fake').",
294
  elem_classes="markdown-text",
295
  )
296
 
 
84
  if not valid_cols:
85
  valid_cols = [
86
  "Team Name",
 
87
  "Best F1 Score ⬆️",
88
+ "Best Accuracy ⬆️",
89
  "Best Precision ⬆️",
90
  "Best Recall ⬆️",
91
  "Best TP ⬆️",
92
  "Best FP ⬇️",
93
  "Best FN ⬇️",
94
  "Best TN ⬆️",
95
+ "Submission Date",
96
  ]
97
 
98
  if dataframe is None or dataframe.empty:
 
290
  gr.Markdown("## Submit Your Predictions", elem_classes="markdown-text")
291
  gr.Markdown(
292
  "Upload a CSV file with your predictions. The CSV must have two columns: "
293
+ "`id` (UUID) and `label` (must be exactly `0.0` or `1.0`). All IDs from the test set must be included.",
294
  elem_classes="markdown-text",
295
  )
296
 
src/about.py CHANGED
@@ -13,62 +13,88 @@ LLM_BENCHMARKS_TEXT = """
13
  - **Save your token immediately** - you'll need it to submit predictions
14
  - You won't be able to see your token again after registration
15
 
16
- ### 2. Prepare Your Predictions
 
 
 
17
  Create a CSV file with two columns:
18
- - `index`: The index of the test sample (must match the test set)
19
- - `prediction`: Your prediction (binary: 0/1, or "real"/"fake")
20
 
21
  Example CSV format:
22
  ```csv
23
- index,prediction
24
- 43555,0
25
- 43556,1
26
- 43557,real
27
- 43558,fake
28
  ```
29
 
30
- ### 3. Submit Your Predictions
31
  - Go to the "Submit Predictions" tab
32
  - Enter your team token
33
  - Upload your CSV file
34
  - Your submission will be automatically evaluated
35
 
36
- ### 4. Evaluation Metrics
37
  Your predictions are evaluated on:
38
  - **Accuracy**: Percentage of correct predictions
39
  - **F1 Score**: Harmonic mean of precision and recall
40
- - **Error Rate**: Percentage of incorrect predictions
 
41
 
42
- ### 5. Leaderboard Updates
43
  - Only your **best** scores are displayed on the leaderboard
44
- - A submission is accepted only if it improves at least one metric
45
- - The leaderboard is sorted by best accuracy (primary metric)
46
- - If accuracy is tied, F1 score is used as a tiebreaker
 
 
 
 
 
 
 
 
 
 
47
 
48
  ## Important Notes
49
  - True labels are kept private and not accessible to participants
50
  - You can submit multiple times - only your best scores count
51
  - Make sure your CSV file format is correct before submitting
52
- - Indices in your CSV must exactly match the test set indices
53
  """
54
 
55
  EVALUATION_QUEUE_TEXT = """
56
  ## Submission Guidelines
57
 
58
  ### CSV File Requirements
59
- - Must contain exactly two columns: `index` and `prediction`
60
- - `index` must match the test set indices exactly
61
- - `prediction` must be binary: 0/1 or "real"/"fake"
62
  - No missing values allowed
 
 
63
 
64
- ### Prediction Format
65
- Accepted formats for predictions:
66
- - Numeric: `0` (real) or `1` (fake)
67
- - String: `"real"` or `"fake"` (case-insensitive)
68
 
69
  ### Scoring
70
  - Submissions are evaluated immediately upon upload
71
- - Scores are computed using accuracy, F1 score, and error rate
72
- - Only submissions that improve your best scores are accepted
73
  - Rejected submissions are logged but don't update the leaderboard
 
 
 
 
 
 
 
 
 
 
74
  """
 
13
  - **Save your token immediately** - you'll need it to submit predictions
14
  - You won't be able to see your token again after registration
15
 
16
+ ### 2. Explore the Data
17
+ Check out this [Exploratory Notebook](https://colab.research.google.com/drive/16O_P901xLdjkka8Xi4CfysF6h8l8q28H?usp=sharing) to understand the dataset and get started with your analysis.
18
+
19
+ ### 3. Prepare Your Predictions
20
  Create a CSV file with two columns:
21
+ - `id`: The UUID identifier of the test sample (must match the test set)
22
+ - `label`: Your prediction (must be exactly `0.0` for real or `1.0` for fake)
23
 
24
  Example CSV format:
25
  ```csv
26
+ id,label
27
+ 550e8400-e29b-41d4-a716-446655440000,0.0
28
+ 550e8400-e29b-41d4-a716-446655440001,1.0
29
+ 550e8400-e29b-41d4-a716-446655440002,0.0
30
+ 550e8400-e29b-41d4-a716-446655440003,1.0
31
  ```
32
 
33
+ ### 4. Submit Your Predictions
34
  - Go to the "Submit Predictions" tab
35
  - Enter your team token
36
  - Upload your CSV file
37
  - Your submission will be automatically evaluated
38
 
39
+ ### 5. Evaluation Metrics
40
  Your predictions are evaluated on:
41
  - **Accuracy**: Percentage of correct predictions
42
  - **F1 Score**: Harmonic mean of precision and recall
43
+ - **Precision**: True positives / (True positives + False positives)
44
+ - **Recall**: True positives / (True positives + False negatives)
45
 
46
+ ### 6. Leaderboard Updates
47
  - Only your **best** scores are displayed on the leaderboard
48
+ - A submission is accepted only if it improves your accuracy or F1 score
49
+ - The leaderboard is sorted by best F1 score (primary metric)
50
+ - If F1 score is tied, earlier submission date is used as a tiebreaker
51
+
52
+ ## 🏆 Prize Distribution & Evaluation Criteria
53
+
54
+ Prizes are awarded based on the **F1 Score** metric:
55
+
56
+ - **1st Prize**: Team with the highest F1 score
57
+ - **2nd Prize**: Team with the second highest F1 score
58
+ - **Tiebreaker**: In case of equal F1 scores, the team that submitted their winning score **earlier** will be ranked higher
59
+
60
+ The final rankings will be determined at the end of the hackathon based on each team's best F1 score.
61
 
62
  ## Important Notes
63
  - True labels are kept private and not accessible to participants
64
  - You can submit multiple times - only your best scores count
65
  - Make sure your CSV file format is correct before submitting
66
+ - **All IDs from the test set must be present in your submission**
67
  """
68
 
69
  EVALUATION_QUEUE_TEXT = """
70
  ## Submission Guidelines
71
 
72
  ### CSV File Requirements
73
+ - Must contain exactly two columns: `id` and `label`
74
+ - `id` must be UUID strings matching the test set exactly
75
+ - `label` must be exactly `0.0` (real) or `1.0` (fake)
76
  - No missing values allowed
77
+ - **All IDs from the test set must be included** in your submission
78
+ - No unknown IDs are allowed (only IDs from the test set)
79
 
80
+ ### Label Format
81
+ Accepted formats for labels:
82
+ - **Only**: `0.0` (real) or `1.0` (fake)
83
+ - Any other format will be rejected
84
 
85
  ### Scoring
86
  - Submissions are evaluated immediately upon upload
87
+ - Scores are computed using accuracy, F1 score, precision, and recall
88
+ - Only submissions that improve your best accuracy or F1 score are accepted
89
  - Rejected submissions are logged but don't update the leaderboard
90
+
91
+ ## 🏆 Prize Distribution & Evaluation Criteria
92
+
93
+ Prizes are awarded based on the **F1 Score** metric:
94
+
95
+ - **1st Prize**: Team with the highest F1 score
96
+ - **2nd Prize**: Team with the second highest F1 score
97
+ - **Tiebreaker**: In case of equal F1 scores, the team that submitted their winning score **earlier** will be ranked higher
98
+
99
+ The final rankings will be determined at the end of the hackathon based on each team's best F1 score.
100
  """
src/display/utils.py CHANGED
@@ -24,23 +24,23 @@ class ColumnContent:
24
  @dataclass(frozen=True)
25
  class TeamColumn:
26
  team_name = ColumnContent("Team Name", "str", True, never_hidden=True)
27
- best_accuracy = ColumnContent("Best Accuracy ⬆️", "number", True)
28
  best_f1 = ColumnContent("Best F1 Score ⬆️", "number", True)
 
29
  best_precision = ColumnContent("Best Precision ⬆️", "number", True)
30
  best_recall = ColumnContent("Best Recall ⬆️", "number", True)
31
  best_tp = ColumnContent("Best TP ⬆️", "number", True)
32
  best_fp = ColumnContent("Best FP ⬇️", "number", True)
33
  best_fn = ColumnContent("Best FN ⬇️", "number", True)
34
  best_tn = ColumnContent("Best TN ⬆️", "number", True)
35
- last_submission_date = ColumnContent("Last Submission", "str", True)
36
 
37
 
38
  @dataclass(frozen=True)
39
  class SubmissionQueueColumn:
40
  team_name = ColumnContent("Team Name", "str", True)
41
  submission_date = ColumnContent("Submission Date", "str", True)
42
- accuracy = ColumnContent("Accuracy ⬆️", "number", True)
43
  f1 = ColumnContent("F1 Score ⬆️", "number", True)
 
44
  precision = ColumnContent("Precision ⬆️", "number", True)
45
  recall = ColumnContent("Recall ⬆️", "number", True)
46
  tp = ColumnContent("TP ⬆️", "number", True)
 
24
  @dataclass(frozen=True)
25
  class TeamColumn:
26
  team_name = ColumnContent("Team Name", "str", True, never_hidden=True)
 
27
  best_f1 = ColumnContent("Best F1 Score ⬆️", "number", True)
28
+ best_accuracy = ColumnContent("Best Accuracy ⬆️", "number", True)
29
  best_precision = ColumnContent("Best Precision ⬆️", "number", True)
30
  best_recall = ColumnContent("Best Recall ⬆️", "number", True)
31
  best_tp = ColumnContent("Best TP ⬆️", "number", True)
32
  best_fp = ColumnContent("Best FP ⬇️", "number", True)
33
  best_fn = ColumnContent("Best FN ⬇️", "number", True)
34
  best_tn = ColumnContent("Best TN ⬆️", "number", True)
35
+ best_submission_date = ColumnContent("Submission Date", "str", True)
36
 
37
 
38
  @dataclass(frozen=True)
39
  class SubmissionQueueColumn:
40
  team_name = ColumnContent("Team Name", "str", True)
41
  submission_date = ColumnContent("Submission Date", "str", True)
 
42
  f1 = ColumnContent("F1 Score ⬆️", "number", True)
43
+ accuracy = ColumnContent("Accuracy ⬆️", "number", True)
44
  precision = ColumnContent("Precision ⬆️", "number", True)
45
  recall = ColumnContent("Recall ⬆️", "number", True)
46
  tp = ColumnContent("TP ⬆️", "number", True)
src/leaderboard/read_team_results.py CHANGED
@@ -16,7 +16,7 @@ class TeamResult:
16
  best_fp: int
17
  best_fn: int
18
  best_tn: int
19
- last_submission_date: str
20
 
21
  def to_dict(self):
22
  return {
@@ -29,7 +29,7 @@ class TeamResult:
29
  TeamColumn.best_fp.name: self.best_fp,
30
  TeamColumn.best_fn.name: self.best_fn,
31
  TeamColumn.best_tn.name: self.best_tn,
32
- TeamColumn.last_submission_date.name: self.last_submission_date,
33
  }
34
 
35
 
@@ -58,7 +58,7 @@ def get_team_results(results_path: str) -> list[TeamResult]:
58
  best_fp=data.get("best_fp", 0),
59
  best_fn=data.get("best_fn", 0),
60
  best_tn=data.get("best_tn", 0),
61
- last_submission_date=data.get("last_submission_date", ""),
62
  )
63
  results.append(result)
64
  except Exception:
 
16
  best_fp: int
17
  best_fn: int
18
  best_tn: int
19
+ best_submission_date: str
20
 
21
  def to_dict(self):
22
  return {
 
29
  TeamColumn.best_fp.name: self.best_fp,
30
  TeamColumn.best_fn.name: self.best_fn,
31
  TeamColumn.best_tn.name: self.best_tn,
32
+ TeamColumn.best_submission_date.name: self.best_submission_date,
33
  }
34
 
35
 
 
58
  best_fp=data.get("best_fp", 0),
59
  best_fn=data.get("best_fn", 0),
60
  best_tn=data.get("best_tn", 0),
61
+ best_submission_date=data.get("best_submission_date", ""),
62
  )
63
  results.append(result)
64
  except Exception:
src/populate.py CHANGED
@@ -15,7 +15,10 @@ def get_leaderboard_df(results_path: str, cols: list) -> pd.DataFrame:
15
  return pd.DataFrame(columns=cols)
16
 
17
  df = pd.DataFrame.from_records(all_data_json)
18
- df = df.sort_values(by=[TeamColumn.best_accuracy.name], ascending=False)
 
 
 
19
  df = df[cols].round(decimals=4)
20
  return df
21
 
 
15
  return pd.DataFrame(columns=cols)
16
 
17
  df = pd.DataFrame.from_records(all_data_json)
18
+ df = df.sort_values(
19
+ by=[TeamColumn.best_f1.name, TeamColumn.best_submission_date.name],
20
+ ascending=[False, True],
21
+ )
22
  df = df[cols].round(decimals=4)
23
  return df
24
 
src/submission/submit_csv.py CHANGED
@@ -162,7 +162,7 @@ def submit_csv(token: str, csv_content: str) -> tuple[bool, str]:
162
  "best_fp": scores["fp"],
163
  "best_fn": scores["fn"],
164
  "best_tn": scores["tn"],
165
- "last_submission_date": timestamp,
166
  }
167
  save_team_best_scores(team_name, updated_scores)
168
  status = "ACCEPTED"
 
162
  "best_fp": scores["fp"],
163
  "best_fn": scores["fn"],
164
  "best_tn": scores["tn"],
165
+ "best_submission_date": timestamp,
166
  }
167
  save_team_best_scores(team_name, updated_scores)
168
  status = "ACCEPTED"