Update app.py
Browse files
app.py
CHANGED
|
@@ -13,7 +13,7 @@ token = os.environ.get("HG_TOKEN")
|
|
| 13 |
if token:
|
| 14 |
login(token)
|
| 15 |
|
| 16 |
-
|
| 17 |
try:
|
| 18 |
dataset = load_dataset("sudoping01/bambara-speech-recognition-benchmark", name="default")["eval"]
|
| 19 |
references = {row["id"]: row["text"] for row in dataset}
|
|
@@ -22,10 +22,10 @@ except Exception as e:
|
|
| 22 |
print(f"Error loading dataset: {str(e)}")
|
| 23 |
references = {}
|
| 24 |
|
| 25 |
-
|
| 26 |
leaderboard_file = "leaderboard.csv"
|
| 27 |
if not os.path.exists(leaderboard_file):
|
| 28 |
-
|
| 29 |
sample_data = []
|
| 30 |
# ["MALIBA-AI/bambara-asr-v1", 0.2264, 0.1094, 0.1922, "2025-03-15 10:30:45"],
|
| 31 |
# ["whisper-large-v3-bambara", 0.3120, 0.1870, 0.2745, "2025-02-20 14:22:33"]
|
|
@@ -36,7 +36,7 @@ if not os.path.exists(leaderboard_file):
|
|
| 36 |
else:
|
| 37 |
leaderboard_df = pd.read_csv(leaderboard_file)
|
| 38 |
|
| 39 |
-
|
| 40 |
if "Combined_Score" not in leaderboard_df.columns:
|
| 41 |
leaderboard_df["Combined_Score"] = leaderboard_df["WER"] * 0.7 + leaderboard_df["CER"] * 0.3
|
| 42 |
leaderboard_df.to_csv(leaderboard_file, index=False)
|
|
@@ -78,7 +78,6 @@ def calculate_metrics(predictions_df):
|
|
| 78 |
sample_wer = wer(reference, hypothesis)
|
| 79 |
sample_cer = cer(reference, hypothesis)
|
| 80 |
|
| 81 |
-
# Cap extreme values to prevent outliers from skewing results
|
| 82 |
sample_wer = min(sample_wer, 2.0)
|
| 83 |
sample_cer = min(sample_cer, 2.0)
|
| 84 |
|
|
@@ -104,7 +103,7 @@ def calculate_metrics(predictions_df):
|
|
| 104 |
avg_wer = sum(item["wer"] for item in results) / len(results)
|
| 105 |
avg_cer = sum(item["cer"] for item in results) / len(results)
|
| 106 |
|
| 107 |
-
|
| 108 |
weighted_wer = sum(item["wer"] * item["ref_word_count"] for item in results) / total_ref_words
|
| 109 |
weighted_cer = sum(item["cer"] * item["ref_char_count"] for item in results) / total_ref_chars
|
| 110 |
|
|
@@ -119,22 +118,19 @@ def prepare_leaderboard_for_display(df, sort_by="Combined_Score"):
|
|
| 119 |
if df is None or len(df) == 0:
|
| 120 |
return pd.DataFrame(columns=["Rank", "Model_Name", "WER (%)", "CER (%)", "Combined_Score (%)", "timestamp"])
|
| 121 |
|
| 122 |
-
|
| 123 |
display_df = df.copy()
|
| 124 |
|
| 125 |
-
|
| 126 |
display_df = display_df.sort_values(sort_by)
|
| 127 |
|
| 128 |
-
# Add ranking column
|
| 129 |
display_df.insert(0, "Rank", range(1, len(display_df) + 1))
|
| 130 |
|
| 131 |
-
# Format numeric columns as percentages
|
| 132 |
for col in ["WER", "CER", "Combined_Score"]:
|
| 133 |
if col in display_df.columns:
|
| 134 |
display_df[f"{col} (%)"] = display_df[col].apply(lambda x: f"{x * 100:.2f}")
|
| 135 |
|
| 136 |
-
|
| 137 |
-
# This allows for proper sorting while showing formatted values
|
| 138 |
|
| 139 |
return display_df
|
| 140 |
|
|
@@ -198,16 +194,13 @@ def process_submission(model_name, csv_file):
|
|
| 198 |
except Exception as e:
|
| 199 |
return f"Error calculating metrics: {str(e)}", None
|
| 200 |
|
| 201 |
-
|
| 202 |
leaderboard = pd.read_csv(leaderboard_file)
|
| 203 |
timestamp = datetime.now().strftime("%Y-%m-%d %H:%M:%S")
|
| 204 |
|
| 205 |
-
# Calculate combined score (70% WER, 30% CER)
|
| 206 |
combined_score = avg_wer * 0.7 + avg_cer * 0.3
|
| 207 |
|
| 208 |
-
# Check if model already exists
|
| 209 |
if model_name in leaderboard["Model_Name"].values:
|
| 210 |
-
# Update existing entry
|
| 211 |
idx = leaderboard[leaderboard["Model_Name"] == model_name].index
|
| 212 |
leaderboard.loc[idx, "WER"] = avg_wer
|
| 213 |
leaderboard.loc[idx, "CER"] = avg_cer
|
|
@@ -215,18 +208,15 @@ def process_submission(model_name, csv_file):
|
|
| 215 |
leaderboard.loc[idx, "timestamp"] = timestamp
|
| 216 |
updated_leaderboard = leaderboard
|
| 217 |
else:
|
| 218 |
-
# Add new entry
|
| 219 |
new_entry = pd.DataFrame(
|
| 220 |
[[model_name, avg_wer, avg_cer, combined_score, timestamp]],
|
| 221 |
columns=["Model_Name", "WER", "CER", "Combined_Score", "timestamp"]
|
| 222 |
)
|
| 223 |
updated_leaderboard = pd.concat([leaderboard, new_entry])
|
| 224 |
|
| 225 |
-
# Sort and save updated leaderboard
|
| 226 |
updated_leaderboard = updated_leaderboard.sort_values("Combined_Score")
|
| 227 |
updated_leaderboard.to_csv(leaderboard_file, index=False)
|
| 228 |
|
| 229 |
-
# Prepare for display
|
| 230 |
display_leaderboard = prepare_leaderboard_for_display(updated_leaderboard)
|
| 231 |
|
| 232 |
return f"Submission processed successfully! WER: {format_as_percentage(avg_wer)}, CER: {format_as_percentage(avg_cer)}, Combined Score: {format_as_percentage(combined_score)}", display_leaderboard
|
|
@@ -270,7 +260,7 @@ with gr.Blocks(title="Bambara ASR Leaderboard") as demo:
|
|
| 270 |
|
| 271 |
current_data = get_current_leaderboard()
|
| 272 |
|
| 273 |
-
|
| 274 |
if len(current_data) > 0:
|
| 275 |
best_model = current_data.sort_values("Combined_Score").iloc[0]
|
| 276 |
gr.Markdown(f"""
|
|
@@ -282,7 +272,7 @@ with gr.Blocks(title="Bambara ASR Leaderboard") as demo:
|
|
| 282 |
|
| 283 |
with gr.Tabs() as tabs:
|
| 284 |
with gr.TabItem("🏅 Model Rankings"):
|
| 285 |
-
|
| 286 |
initial_leaderboard = create_leaderboard_table()
|
| 287 |
|
| 288 |
ranking_method = gr.Radio(
|
|
@@ -373,12 +363,12 @@ with gr.Blocks(title="Bambara ASR Leaderboard") as demo:
|
|
| 373 |
"""
|
| 374 |
## About the Benchmark Dataset
|
| 375 |
|
| 376 |
-
This leaderboard uses the **[sudoping01/bambara-speech-recognition-benchmark](https://huggingface.co/datasets/
|
| 377 |
|
| 378 |
* Contains diverse Bambara speech samples
|
| 379 |
* Includes various speakers, accents, and dialects
|
| 380 |
* Covers different speech styles and recording conditions
|
| 381 |
-
*
|
| 382 |
|
| 383 |
### How to Generate Predictions
|
| 384 |
|
|
@@ -394,6 +384,8 @@ with gr.Blocks(title="Bambara ASR Leaderboard") as demo:
|
|
| 394 |
* Text is normalized (lowercase, punctuation removed) before metrics calculation
|
| 395 |
* Extreme outliers are capped to prevent skewing results
|
| 396 |
* All submissions are validated for format and completeness
|
|
|
|
|
|
|
| 397 |
"""
|
| 398 |
)
|
| 399 |
|
|
|
|
| 13 |
if token:
|
| 14 |
login(token)
|
| 15 |
|
| 16 |
+
|
| 17 |
try:
|
| 18 |
dataset = load_dataset("sudoping01/bambara-speech-recognition-benchmark", name="default")["eval"]
|
| 19 |
references = {row["id"]: row["text"] for row in dataset}
|
|
|
|
| 22 |
print(f"Error loading dataset: {str(e)}")
|
| 23 |
references = {}
|
| 24 |
|
| 25 |
+
|
| 26 |
leaderboard_file = "leaderboard.csv"
|
| 27 |
if not os.path.exists(leaderboard_file):
|
| 28 |
+
|
| 29 |
sample_data = []
|
| 30 |
# ["MALIBA-AI/bambara-asr-v1", 0.2264, 0.1094, 0.1922, "2025-03-15 10:30:45"],
|
| 31 |
# ["whisper-large-v3-bambara", 0.3120, 0.1870, 0.2745, "2025-02-20 14:22:33"]
|
|
|
|
| 36 |
else:
|
| 37 |
leaderboard_df = pd.read_csv(leaderboard_file)
|
| 38 |
|
| 39 |
+
|
| 40 |
if "Combined_Score" not in leaderboard_df.columns:
|
| 41 |
leaderboard_df["Combined_Score"] = leaderboard_df["WER"] * 0.7 + leaderboard_df["CER"] * 0.3
|
| 42 |
leaderboard_df.to_csv(leaderboard_file, index=False)
|
|
|
|
| 78 |
sample_wer = wer(reference, hypothesis)
|
| 79 |
sample_cer = cer(reference, hypothesis)
|
| 80 |
|
|
|
|
| 81 |
sample_wer = min(sample_wer, 2.0)
|
| 82 |
sample_cer = min(sample_cer, 2.0)
|
| 83 |
|
|
|
|
| 103 |
avg_wer = sum(item["wer"] for item in results) / len(results)
|
| 104 |
avg_cer = sum(item["cer"] for item in results) / len(results)
|
| 105 |
|
| 106 |
+
|
| 107 |
weighted_wer = sum(item["wer"] * item["ref_word_count"] for item in results) / total_ref_words
|
| 108 |
weighted_cer = sum(item["cer"] * item["ref_char_count"] for item in results) / total_ref_chars
|
| 109 |
|
|
|
|
| 118 |
if df is None or len(df) == 0:
|
| 119 |
return pd.DataFrame(columns=["Rank", "Model_Name", "WER (%)", "CER (%)", "Combined_Score (%)", "timestamp"])
|
| 120 |
|
| 121 |
+
|
| 122 |
display_df = df.copy()
|
| 123 |
|
| 124 |
+
|
| 125 |
display_df = display_df.sort_values(sort_by)
|
| 126 |
|
|
|
|
| 127 |
display_df.insert(0, "Rank", range(1, len(display_df) + 1))
|
| 128 |
|
|
|
|
| 129 |
for col in ["WER", "CER", "Combined_Score"]:
|
| 130 |
if col in display_df.columns:
|
| 131 |
display_df[f"{col} (%)"] = display_df[col].apply(lambda x: f"{x * 100:.2f}")
|
| 132 |
|
| 133 |
+
|
|
|
|
| 134 |
|
| 135 |
return display_df
|
| 136 |
|
|
|
|
| 194 |
except Exception as e:
|
| 195 |
return f"Error calculating metrics: {str(e)}", None
|
| 196 |
|
| 197 |
+
|
| 198 |
leaderboard = pd.read_csv(leaderboard_file)
|
| 199 |
timestamp = datetime.now().strftime("%Y-%m-%d %H:%M:%S")
|
| 200 |
|
|
|
|
| 201 |
combined_score = avg_wer * 0.7 + avg_cer * 0.3
|
| 202 |
|
|
|
|
| 203 |
if model_name in leaderboard["Model_Name"].values:
|
|
|
|
| 204 |
idx = leaderboard[leaderboard["Model_Name"] == model_name].index
|
| 205 |
leaderboard.loc[idx, "WER"] = avg_wer
|
| 206 |
leaderboard.loc[idx, "CER"] = avg_cer
|
|
|
|
| 208 |
leaderboard.loc[idx, "timestamp"] = timestamp
|
| 209 |
updated_leaderboard = leaderboard
|
| 210 |
else:
|
|
|
|
| 211 |
new_entry = pd.DataFrame(
|
| 212 |
[[model_name, avg_wer, avg_cer, combined_score, timestamp]],
|
| 213 |
columns=["Model_Name", "WER", "CER", "Combined_Score", "timestamp"]
|
| 214 |
)
|
| 215 |
updated_leaderboard = pd.concat([leaderboard, new_entry])
|
| 216 |
|
|
|
|
| 217 |
updated_leaderboard = updated_leaderboard.sort_values("Combined_Score")
|
| 218 |
updated_leaderboard.to_csv(leaderboard_file, index=False)
|
| 219 |
|
|
|
|
| 220 |
display_leaderboard = prepare_leaderboard_for_display(updated_leaderboard)
|
| 221 |
|
| 222 |
return f"Submission processed successfully! WER: {format_as_percentage(avg_wer)}, CER: {format_as_percentage(avg_cer)}, Combined Score: {format_as_percentage(combined_score)}", display_leaderboard
|
|
|
|
| 260 |
|
| 261 |
current_data = get_current_leaderboard()
|
| 262 |
|
| 263 |
+
|
| 264 |
if len(current_data) > 0:
|
| 265 |
best_model = current_data.sort_values("Combined_Score").iloc[0]
|
| 266 |
gr.Markdown(f"""
|
|
|
|
| 272 |
|
| 273 |
with gr.Tabs() as tabs:
|
| 274 |
with gr.TabItem("🏅 Model Rankings"):
|
| 275 |
+
|
| 276 |
initial_leaderboard = create_leaderboard_table()
|
| 277 |
|
| 278 |
ranking_method = gr.Radio(
|
|
|
|
| 363 |
"""
|
| 364 |
## About the Benchmark Dataset
|
| 365 |
|
| 366 |
+
This leaderboard uses the **[sudoping01/bambara-speech-recognition-benchmark](https://huggingface.co/datasets/MALIBA-AI/bambara-speech-recognition-leaderboard)** dataset:
|
| 367 |
|
| 368 |
* Contains diverse Bambara speech samples
|
| 369 |
* Includes various speakers, accents, and dialects
|
| 370 |
* Covers different speech styles and recording conditions
|
| 371 |
+
* Transcribed and validated
|
| 372 |
|
| 373 |
### How to Generate Predictions
|
| 374 |
|
|
|
|
| 384 |
* Text is normalized (lowercase, punctuation removed) before metrics calculation
|
| 385 |
* Extreme outliers are capped to prevent skewing results
|
| 386 |
* All submissions are validated for format and completeness
|
| 387 |
+
|
| 388 |
+
NB: This work is a collaboration between MALIBA-AI, RobotsMali AI4D-LAB and Djelia
|
| 389 |
"""
|
| 390 |
)
|
| 391 |
|