Spaces:
Sleeping
Sleeping
Nicolas Wagner committed on
Commit ·
30f0c04
1
Parent(s): bc714de
proper metric evaluation
Browse files
- app.py +3 -3
- src/about.py +51 -25
- src/display/utils.py +3 -3
- src/leaderboard/read_team_results.py +3 -3
- src/populate.py +4 -1
- src/submission/submit_csv.py +1 -1
app.py
CHANGED
|
@@ -84,15 +84,15 @@ def init_leaderboard(dataframe):
|
|
| 84 |
if not valid_cols:
|
| 85 |
valid_cols = [
|
| 86 |
"Team Name",
|
| 87 |
-
"Best Accuracy ⬆️",
|
| 88 |
"Best F1 Score ⬆️",
|
|
|
|
| 89 |
"Best Precision ⬆️",
|
| 90 |
"Best Recall ⬆️",
|
| 91 |
"Best TP ⬆️",
|
| 92 |
"Best FP ⬇️",
|
| 93 |
"Best FN ⬇️",
|
| 94 |
"Best TN ⬆️",
|
| 95 |
-
"
|
| 96 |
]
|
| 97 |
|
| 98 |
if dataframe is None or dataframe.empty:
|
|
@@ -290,7 +290,7 @@ with demo:
|
|
| 290 |
gr.Markdown("## Submit Your Predictions", elem_classes="markdown-text")
|
| 291 |
gr.Markdown(
|
| 292 |
"Upload a CSV file with your predictions. The CSV must have two columns: "
|
| 293 |
-
"`
|
| 294 |
elem_classes="markdown-text",
|
| 295 |
)
|
| 296 |
|
|
|
|
| 84 |
if not valid_cols:
|
| 85 |
valid_cols = [
|
| 86 |
"Team Name",
|
|
|
|
| 87 |
"Best F1 Score ⬆️",
|
| 88 |
+
"Best Accuracy ⬆️",
|
| 89 |
"Best Precision ⬆️",
|
| 90 |
"Best Recall ⬆️",
|
| 91 |
"Best TP ⬆️",
|
| 92 |
"Best FP ⬇️",
|
| 93 |
"Best FN ⬇️",
|
| 94 |
"Best TN ⬆️",
|
| 95 |
+
"Submission Date",
|
| 96 |
]
|
| 97 |
|
| 98 |
if dataframe is None or dataframe.empty:
|
|
|
|
| 290 |
gr.Markdown("## Submit Your Predictions", elem_classes="markdown-text")
|
| 291 |
gr.Markdown(
|
| 292 |
"Upload a CSV file with your predictions. The CSV must have two columns: "
|
| 293 |
+
"`id` (UUID) and `label` (must be exactly `0.0` or `1.0`). All IDs from the test set must be included.",
|
| 294 |
elem_classes="markdown-text",
|
| 295 |
)
|
| 296 |
|
src/about.py
CHANGED
|
@@ -13,62 +13,88 @@ LLM_BENCHMARKS_TEXT = """
|
|
| 13 |
- **Save your token immediately** - you'll need it to submit predictions
|
| 14 |
- You won't be able to see your token again after registration
|
| 15 |
|
| 16 |
-
### 2.
|
|
|
|
|
|
|
|
|
|
| 17 |
Create a CSV file with two columns:
|
| 18 |
-
- `
|
| 19 |
-
- `
|
| 20 |
|
| 21 |
Example CSV format:
|
| 22 |
```csv
|
| 23 |
-
|
| 24 |
-
|
| 25 |
-
|
| 26 |
-
|
| 27 |
-
|
| 28 |
```
|
| 29 |
|
| 30 |
-
###
|
| 31 |
- Go to the "Submit Predictions" tab
|
| 32 |
- Enter your team token
|
| 33 |
- Upload your CSV file
|
| 34 |
- Your submission will be automatically evaluated
|
| 35 |
|
| 36 |
-
###
|
| 37 |
Your predictions are evaluated on:
|
| 38 |
- **Accuracy**: Percentage of correct predictions
|
| 39 |
- **F1 Score**: Harmonic mean of precision and recall
|
| 40 |
-
- **
|
|
|
|
| 41 |
|
| 42 |
-
###
|
| 43 |
- Only your **best** scores are displayed on the leaderboard
|
| 44 |
-
- A submission is accepted only if it improves
|
| 45 |
-
- The leaderboard is sorted by best
|
| 46 |
-
- If
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 47 |
|
| 48 |
## Important Notes
|
| 49 |
- True labels are kept private and not accessible to participants
|
| 50 |
- You can submit multiple times - only your best scores count
|
| 51 |
- Make sure your CSV file format is correct before submitting
|
| 52 |
-
-
|
| 53 |
"""
|
| 54 |
|
| 55 |
EVALUATION_QUEUE_TEXT = """
|
| 56 |
## Submission Guidelines
|
| 57 |
|
| 58 |
### CSV File Requirements
|
| 59 |
-
- Must contain exactly two columns: `
|
| 60 |
-
- `
|
| 61 |
-
- `
|
| 62 |
- No missing values allowed
|
|
|
|
|
|
|
| 63 |
|
| 64 |
-
###
|
| 65 |
-
Accepted formats for
|
| 66 |
-
-
|
| 67 |
-
-
|
| 68 |
|
| 69 |
### Scoring
|
| 70 |
- Submissions are evaluated immediately upon upload
|
| 71 |
-
- Scores are computed using accuracy, F1 score, and
|
| 72 |
-
- Only submissions that improve your best
|
| 73 |
- Rejected submissions are logged but don't update the leaderboard
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 74 |
"""
|
|
|
|
| 13 |
- **Save your token immediately** - you'll need it to submit predictions
|
| 14 |
- You won't be able to see your token again after registration
|
| 15 |
|
| 16 |
+
### 2. Explore the Data
|
| 17 |
+
Check out this [Exploratory Notebook](https://colab.research.google.com/drive/16O_P901xLdjkka8Xi4CfysF6h8l8q28H?usp=sharing) to understand the dataset and get started with your analysis.
|
| 18 |
+
|
| 19 |
+
### 3. Prepare Your Predictions
|
| 20 |
Create a CSV file with two columns:
|
| 21 |
+
- `id`: The UUID identifier of the test sample (must match the test set)
|
| 22 |
+
- `label`: Your prediction (must be exactly `0.0` for real or `1.0` for fake)
|
| 23 |
|
| 24 |
Example CSV format:
|
| 25 |
```csv
|
| 26 |
+
id,label
|
| 27 |
+
550e8400-e29b-41d4-a716-446655440000,0.0
|
| 28 |
+
550e8400-e29b-41d4-a716-446655440001,1.0
|
| 29 |
+
550e8400-e29b-41d4-a716-446655440002,0.0
|
| 30 |
+
550e8400-e29b-41d4-a716-446655440003,1.0
|
| 31 |
```
|
| 32 |
|
| 33 |
+
### 4. Submit Your Predictions
|
| 34 |
- Go to the "Submit Predictions" tab
|
| 35 |
- Enter your team token
|
| 36 |
- Upload your CSV file
|
| 37 |
- Your submission will be automatically evaluated
|
| 38 |
|
| 39 |
+
### 5. Evaluation Metrics
|
| 40 |
Your predictions are evaluated on:
|
| 41 |
- **Accuracy**: Percentage of correct predictions
|
| 42 |
- **F1 Score**: Harmonic mean of precision and recall
|
| 43 |
+
- **Precision**: True positives / (True positives + False positives)
|
| 44 |
+
- **Recall**: True positives / (True positives + False negatives)
|
| 45 |
|
| 46 |
+
### 6. Leaderboard Updates
|
| 47 |
- Only your **best** scores are displayed on the leaderboard
|
| 48 |
+
- A submission is accepted only if it improves your accuracy or F1 score
|
| 49 |
+
- The leaderboard is sorted by best F1 score (primary metric)
|
| 50 |
+
- If F1 score is tied, earlier submission date is used as a tiebreaker
|
| 51 |
+
|
| 52 |
+
## 🏆 Prize Distribution & Evaluation Criteria
|
| 53 |
+
|
| 54 |
+
Prizes are awarded based on the **F1 Score** metric:
|
| 55 |
+
|
| 56 |
+
- **1st Prize**: Team with the highest F1 score
|
| 57 |
+
- **2nd Prize**: Team with the second highest F1 score
|
| 58 |
+
- **Tiebreaker**: In case of equal F1 scores, the team that submitted their winning score **earlier** will be ranked higher
|
| 59 |
+
|
| 60 |
+
The final rankings will be determined at the end of the hackathon based on each team's best F1 score.
|
| 61 |
|
| 62 |
## Important Notes
|
| 63 |
- True labels are kept private and not accessible to participants
|
| 64 |
- You can submit multiple times - only your best scores count
|
| 65 |
- Make sure your CSV file format is correct before submitting
|
| 66 |
+
- **All IDs from the test set must be present in your submission**
|
| 67 |
"""
|
| 68 |
|
| 69 |
EVALUATION_QUEUE_TEXT = """
|
| 70 |
## Submission Guidelines
|
| 71 |
|
| 72 |
### CSV File Requirements
|
| 73 |
+
- Must contain exactly two columns: `id` and `label`
|
| 74 |
+
- `id` must be UUID strings matching the test set exactly
|
| 75 |
+
- `label` must be exactly `0.0` (real) or `1.0` (fake)
|
| 76 |
- No missing values allowed
|
| 77 |
+
- **All IDs from the test set must be included** in your submission
|
| 78 |
+
- No unknown IDs are allowed (only IDs from the test set)
|
| 79 |
|
| 80 |
+
### Label Format
|
| 81 |
+
Accepted formats for labels:
|
| 82 |
+
- **Only**: `0.0` (real) or `1.0` (fake)
|
| 83 |
+
- Any other format will be rejected
|
| 84 |
|
| 85 |
### Scoring
|
| 86 |
- Submissions are evaluated immediately upon upload
|
| 87 |
+
- Scores are computed using accuracy, F1 score, precision, and recall
|
| 88 |
+
- Only submissions that improve your best accuracy or F1 score are accepted
|
| 89 |
- Rejected submissions are logged but don't update the leaderboard
|
| 90 |
+
|
| 91 |
+
## 🏆 Prize Distribution & Evaluation Criteria
|
| 92 |
+
|
| 93 |
+
Prizes are awarded based on the **F1 Score** metric:
|
| 94 |
+
|
| 95 |
+
- **1st Prize**: Team with the highest F1 score
|
| 96 |
+
- **2nd Prize**: Team with the second highest F1 score
|
| 97 |
+
- **Tiebreaker**: In case of equal F1 scores, the team that submitted their winning score **earlier** will be ranked higher
|
| 98 |
+
|
| 99 |
+
The final rankings will be determined at the end of the hackathon based on each team's best F1 score.
|
| 100 |
"""
|
src/display/utils.py
CHANGED
|
@@ -24,23 +24,23 @@ class ColumnContent:
|
|
| 24 |
@dataclass(frozen=True)
|
| 25 |
class TeamColumn:
|
| 26 |
team_name = ColumnContent("Team Name", "str", True, never_hidden=True)
|
| 27 |
-
best_accuracy = ColumnContent("Best Accuracy ⬆️", "number", True)
|
| 28 |
best_f1 = ColumnContent("Best F1 Score ⬆️", "number", True)
|
|
|
|
| 29 |
best_precision = ColumnContent("Best Precision ⬆️", "number", True)
|
| 30 |
best_recall = ColumnContent("Best Recall ⬆️", "number", True)
|
| 31 |
best_tp = ColumnContent("Best TP ⬆️", "number", True)
|
| 32 |
best_fp = ColumnContent("Best FP ⬇️", "number", True)
|
| 33 |
best_fn = ColumnContent("Best FN ⬇️", "number", True)
|
| 34 |
best_tn = ColumnContent("Best TN ⬆️", "number", True)
|
| 35 |
-
|
| 36 |
|
| 37 |
|
| 38 |
@dataclass(frozen=True)
|
| 39 |
class SubmissionQueueColumn:
|
| 40 |
team_name = ColumnContent("Team Name", "str", True)
|
| 41 |
submission_date = ColumnContent("Submission Date", "str", True)
|
| 42 |
-
accuracy = ColumnContent("Accuracy ⬆️", "number", True)
|
| 43 |
f1 = ColumnContent("F1 Score ⬆️", "number", True)
|
|
|
|
| 44 |
precision = ColumnContent("Precision ⬆️", "number", True)
|
| 45 |
recall = ColumnContent("Recall ⬆️", "number", True)
|
| 46 |
tp = ColumnContent("TP ⬆️", "number", True)
|
|
|
|
| 24 |
@dataclass(frozen=True)
|
| 25 |
class TeamColumn:
|
| 26 |
team_name = ColumnContent("Team Name", "str", True, never_hidden=True)
|
|
|
|
| 27 |
best_f1 = ColumnContent("Best F1 Score ⬆️", "number", True)
|
| 28 |
+
best_accuracy = ColumnContent("Best Accuracy ⬆️", "number", True)
|
| 29 |
best_precision = ColumnContent("Best Precision ⬆️", "number", True)
|
| 30 |
best_recall = ColumnContent("Best Recall ⬆️", "number", True)
|
| 31 |
best_tp = ColumnContent("Best TP ⬆️", "number", True)
|
| 32 |
best_fp = ColumnContent("Best FP ⬇️", "number", True)
|
| 33 |
best_fn = ColumnContent("Best FN ⬇️", "number", True)
|
| 34 |
best_tn = ColumnContent("Best TN ⬆️", "number", True)
|
| 35 |
+
best_submission_date = ColumnContent("Submission Date", "str", True)
|
| 36 |
|
| 37 |
|
| 38 |
@dataclass(frozen=True)
|
| 39 |
class SubmissionQueueColumn:
|
| 40 |
team_name = ColumnContent("Team Name", "str", True)
|
| 41 |
submission_date = ColumnContent("Submission Date", "str", True)
|
|
|
|
| 42 |
f1 = ColumnContent("F1 Score ⬆️", "number", True)
|
| 43 |
+
accuracy = ColumnContent("Accuracy ⬆️", "number", True)
|
| 44 |
precision = ColumnContent("Precision ⬆️", "number", True)
|
| 45 |
recall = ColumnContent("Recall ⬆️", "number", True)
|
| 46 |
tp = ColumnContent("TP ⬆️", "number", True)
|
src/leaderboard/read_team_results.py
CHANGED
|
@@ -16,7 +16,7 @@ class TeamResult:
|
|
| 16 |
best_fp: int
|
| 17 |
best_fn: int
|
| 18 |
best_tn: int
|
| 19 |
-
|
| 20 |
|
| 21 |
def to_dict(self):
|
| 22 |
return {
|
|
@@ -29,7 +29,7 @@ class TeamResult:
|
|
| 29 |
TeamColumn.best_fp.name: self.best_fp,
|
| 30 |
TeamColumn.best_fn.name: self.best_fn,
|
| 31 |
TeamColumn.best_tn.name: self.best_tn,
|
| 32 |
-
TeamColumn.
|
| 33 |
}
|
| 34 |
|
| 35 |
|
|
@@ -58,7 +58,7 @@ def get_team_results(results_path: str) -> list[TeamResult]:
|
|
| 58 |
best_fp=data.get("best_fp", 0),
|
| 59 |
best_fn=data.get("best_fn", 0),
|
| 60 |
best_tn=data.get("best_tn", 0),
|
| 61 |
-
|
| 62 |
)
|
| 63 |
results.append(result)
|
| 64 |
except Exception:
|
|
|
|
| 16 |
best_fp: int
|
| 17 |
best_fn: int
|
| 18 |
best_tn: int
|
| 19 |
+
best_submission_date: str
|
| 20 |
|
| 21 |
def to_dict(self):
|
| 22 |
return {
|
|
|
|
| 29 |
TeamColumn.best_fp.name: self.best_fp,
|
| 30 |
TeamColumn.best_fn.name: self.best_fn,
|
| 31 |
TeamColumn.best_tn.name: self.best_tn,
|
| 32 |
+
TeamColumn.best_submission_date.name: self.best_submission_date,
|
| 33 |
}
|
| 34 |
|
| 35 |
|
|
|
|
| 58 |
best_fp=data.get("best_fp", 0),
|
| 59 |
best_fn=data.get("best_fn", 0),
|
| 60 |
best_tn=data.get("best_tn", 0),
|
| 61 |
+
best_submission_date=data.get("best_submission_date", ""),
|
| 62 |
)
|
| 63 |
results.append(result)
|
| 64 |
except Exception:
|
src/populate.py
CHANGED
|
@@ -15,7 +15,10 @@ def get_leaderboard_df(results_path: str, cols: list) -> pd.DataFrame:
|
|
| 15 |
return pd.DataFrame(columns=cols)
|
| 16 |
|
| 17 |
df = pd.DataFrame.from_records(all_data_json)
|
| 18 |
-
df = df.sort_values(
|
|
|
|
|
|
|
|
|
|
| 19 |
df = df[cols].round(decimals=4)
|
| 20 |
return df
|
| 21 |
|
|
|
|
| 15 |
return pd.DataFrame(columns=cols)
|
| 16 |
|
| 17 |
df = pd.DataFrame.from_records(all_data_json)
|
| 18 |
+
df = df.sort_values(
|
| 19 |
+
by=[TeamColumn.best_f1.name, TeamColumn.best_submission_date.name],
|
| 20 |
+
ascending=[False, True],
|
| 21 |
+
)
|
| 22 |
df = df[cols].round(decimals=4)
|
| 23 |
return df
|
| 24 |
|
src/submission/submit_csv.py
CHANGED
|
@@ -162,7 +162,7 @@ def submit_csv(token: str, csv_content: str) -> tuple[bool, str]:
|
|
| 162 |
"best_fp": scores["fp"],
|
| 163 |
"best_fn": scores["fn"],
|
| 164 |
"best_tn": scores["tn"],
|
| 165 |
-
"
|
| 166 |
}
|
| 167 |
save_team_best_scores(team_name, updated_scores)
|
| 168 |
status = "ACCEPTED"
|
|
|
|
| 162 |
"best_fp": scores["fp"],
|
| 163 |
"best_fn": scores["fn"],
|
| 164 |
"best_tn": scores["tn"],
|
| 165 |
+
"best_submission_date": timestamp,
|
| 166 |
}
|
| 167 |
save_team_best_scores(team_name, updated_scores)
|
| 168 |
status = "ACCEPTED"
|