akera committed
Commit 3739a4f · verified · 1 Parent(s): ce626d3

Update app.py

Files changed (1):
  1. app.py +87 -106
app.py CHANGED
@@ -1,4 +1,4 @@
-# app.py
+# app.py - Fixed version with better formatting and display
 import subprocess
 import sys
 import os
@@ -57,10 +57,6 @@ if not setup_salt():
     print("💡 Please check that git is available and GitHub is accessible")
     sys.exit(1)
 
-
-
-
-
 import gradio as gr
 import pandas as pd
 import json
@@ -74,7 +70,7 @@ from src.validation import validate_submission_complete
 from src.evaluation import evaluate_predictions, generate_evaluation_report, get_google_translate_baseline
 from src.leaderboard import (
     load_leaderboard, add_model_to_leaderboard, get_leaderboard_stats,
-    filter_leaderboard, export_leaderboard, get_model_comparison
+    filter_leaderboard, export_leaderboard, get_model_comparison, prepare_leaderboard_display
 )
 from src.plotting import (
     create_leaderboard_ranking_plot, create_metrics_comparison_plot,
@@ -131,26 +127,26 @@ def download_test_set() -> Tuple[str, str]:
 
         # Create info message
         info_msg = f"""
-📥 **SALT Test Set Downloaded Successfully!**
-
-**Dataset Statistics:**
-- **Total Samples**: {stats['total_samples']:,}
-- **Language Pairs**: {stats['language_pairs']}
-- **Google Comparable**: {stats['google_comparable_samples']:,} samples
-- **Languages**: {', '.join(stats['languages'])}
-
-**File Format:**
-- `sample_id`: Unique identifier for each sample
-- `source_text`: Text to be translated
-- `source_language`: Source language code
-- `target_language`: Target language code
-- `domain`: Content domain (if available)
-- `google_comparable`: Whether this pair can be compared with Google Translate
-
-**Next Steps:**
-1. Run your model on the source texts
-2. Create a CSV/JSON file with columns: `sample_id`, `prediction`
-3. Upload your predictions using the "Submit Predictions" tab
+## 📥 SALT Test Set Downloaded Successfully!
+
+### Dataset Statistics:
+- **Total Samples**: {stats['total_samples']:,}
+- **Language Pairs**: {stats['language_pairs']}
+- **Google Comparable**: {stats['google_comparable_samples']:,} samples
+- **Languages**: {', '.join(stats['languages'])}
+
+### File Format:
+- `sample_id`: Unique identifier for each sample
+- `source_text`: Text to be translated
+- `source_language`: Source language code
+- `target_language`: Target language code
+- `domain`: Content domain (if available)
+- `google_comparable`: Whether this pair can be compared with Google Translate
+
+### Next Steps:
+1. Run your model on the source texts
+2. Create a CSV/JSON file with columns: `sample_id`, `prediction`
+3. Upload your predictions using the "Submit Predictions" tab
         """
 
         return download_path, info_msg
@@ -159,7 +155,6 @@ def download_test_set() -> Tuple[str, str]:
         error_msg = f"❌ Error creating test set download: {str(e)}"
         return None, error_msg
 
-
 def validate_submission(file, model_name: str, author: str, description: str) -> Tuple[str, Optional[pd.DataFrame]]:
     """Validate uploaded prediction file, supporting str paths, bytes, and Gradio wrappers."""
     try:
@@ -213,8 +208,6 @@ def validate_submission(file, model_name: str, author: str, description: str) ->
             None,
         )
 
-
-
 def evaluate_submission(
     predictions_df: pd.DataFrame,
     model_name: str,
@@ -268,24 +261,24 @@ def evaluate_submission(
         total_models = len(updated_leaderboard)
 
         success_msg = f"""
-🎉 **Evaluation Complete!**
-
-**Your Results:**
-- **Model**: {model_name}
-- **Rank**: #{rank} out of {total_models} models
-- **Quality Score**: {evaluation_results['averages'].get('quality_score', 0):.4f}
-- **BLEU**: {evaluation_results['averages'].get('bleu', 0):.2f}
-- **ChrF**: {evaluation_results['averages'].get('chrf', 0):.4f}
-
-**Coverage:**
-- **Samples Evaluated**: {evaluation_results['evaluated_samples']:,}
-- **Language Pairs**: {evaluation_results['summary']['language_pairs_covered']}
-- **Google Comparable**: {evaluation_results['summary']['google_comparable_pairs']} pairs
-
-{report}
+## 🎉 Evaluation Complete!
+
+### Your Results:
+- **Model**: {model_name}
+- **Rank**: #{rank} out of {total_models} models
+- **Quality Score**: {evaluation_results['averages'].get('quality_score', 0):.4f}
+- **BLEU**: {evaluation_results['averages'].get('bleu', 0):.2f}
+- **ChrF**: {evaluation_results['averages'].get('chrf', 0):.4f}
+
+### Coverage:
+- **Samples Evaluated**: {evaluation_results['evaluated_samples']:,}
+- **Language Pairs**: {evaluation_results['summary']['language_pairs_covered']}
+- **Google Comparable**: {evaluation_results['summary']['google_comparable_pairs']} pairs
+
+{report}
         """
 
-        return success_msg, updated_leaderboard, summary_plot, ranking_plot
+        return success_msg, prepare_leaderboard_display(updated_leaderboard), summary_plot, ranking_plot
 
     except Exception as e:
         error_msg = f"❌ Evaluation failed: {str(e)}\n\nTraceback:\n{traceback.format_exc()}"
@@ -313,6 +306,9 @@ def refresh_leaderboard_display(
             google_comparable_only=google_only
         )
 
+        # Prepare for display (removes detailed_metrics column)
+        display_df = prepare_leaderboard_display(filtered_df)
+
         # Create plots
         ranking_plot = create_leaderboard_ranking_plot(filtered_df)
         comparison_plot = create_metrics_comparison_plot(filtered_df)
@@ -320,17 +316,17 @@
         # Get stats
         stats = get_leaderboard_stats(filtered_df)
         stats_text = f"""
-📊 **Leaderboard Statistics**
-
-- **Total Models**: {stats['total_models']}
-- **Average Quality Score**: {stats['avg_quality_score']:.4f}
-- **Google Comparable Models**: {stats['google_comparable_models']}
-
-**Best Model**: {stats['best_model']['name'] if stats['best_model'] else 'None'}
-**Latest Submission**: {stats['latest_submission'][:10] if stats['latest_submission'] else 'None'}
+### 📊 Leaderboard Statistics
+
+- **Total Models**: {stats['total_models']}
+- **Average Quality Score**: {stats['avg_quality_score']:.4f}
+- **Google Comparable Models**: {stats['google_comparable_models']}
+
+**Best Model**: {stats['best_model']['name'] if stats['best_model'] else 'None'}
+**Latest Submission**: {stats['latest_submission'][:10] if stats['latest_submission'] else 'None'}
         """
 
-        return filtered_df, ranking_plot, comparison_plot, stats_text
+        return display_df, ranking_plot, comparison_plot, stats_text
 
     except Exception as e:
         error_msg = f"Error loading leaderboard: {str(e)}"
@@ -364,31 +360,31 @@ def get_model_details(model_name: str) -> Tuple[str, object]:
 
         # Format model details
         details_text = f"""
-# 🔍 Model Details: {model_name}
-
-**Basic Information:**
-- **Author**: {model_info['author']}
-- **Submission Date**: {model_info['submission_date'][:10]}
-- **Model Type**: {model_info['model_type']}
-- **Description**: {model_info['description'] or 'No description provided'}
-
-**Performance Metrics:**
-- **Quality Score**: {model_info['quality_score']:.4f}
-- **BLEU**: {model_info['bleu']:.2f}
-- **ChrF**: {model_info['chrf']:.4f}
-- **ROUGE-1**: {model_info['rouge1']:.4f}
-- **ROUGE-L**: {model_info['rougeL']:.4f}
-
-**Coverage Information:**
-- **Total Samples**: {model_info['total_samples']:,}
-- **Language Pairs Covered**: {model_info['language_pairs_covered']}
-- **Google Comparable Pairs**: {model_info['google_pairs_covered']}
-- **Coverage Rate**: {model_info['coverage_rate']:.1%}
-
-**Google Translate Comparison:**
-- **Google Quality Score**: {model_info['google_quality_score']:.4f}
-- **Google BLEU**: {model_info['google_bleu']:.2f}
-- **Google ChrF**: {model_info['google_chrf']:.4f}
+## 🔍 Model Details: {model_name}
+
+### Basic Information:
+- **Author**: {model_info['author']}
+- **Submission Date**: {model_info['submission_date'][:10]}
+- **Model Type**: {model_info['model_type']}
+- **Description**: {model_info['description'] or 'No description provided'}
+
+### Performance Metrics:
+- **Quality Score**: {model_info['quality_score']:.4f}
+- **BLEU**: {model_info['bleu']:.2f}
+- **ChrF**: {model_info['chrf']:.4f}
+- **ROUGE-1**: {model_info['rouge1']:.4f}
+- **ROUGE-L**: {model_info['rougeL']:.4f}
+
+### Coverage Information:
+- **Total Samples**: {model_info['total_samples']:,}
+- **Language Pairs Covered**: {model_info['language_pairs_covered']}
+- **Google Comparable Pairs**: {model_info['google_pairs_covered']}
+- **Coverage Rate**: {model_info['coverage_rate']:.1%}
+
+### Google Translate Comparison:
+- **Google Quality Score**: {model_info['google_quality_score']:.4f}
+- **Google BLEU**: {model_info['google_bleu']:.2f}
+- **Google ChrF**: {model_info['google_chrf']:.4f}
         """
 
        return details_text, detail_plot
@@ -443,15 +439,11 @@ with gr.Blocks(
 ) as demo:
 
     # Header
-    gr.Markdown(f"""
+    gr.HTML(f"""
     <div class="main-header">
-
-    # {TITLE}
-
-    {DESCRIPTION}
-
-    **Supported Languages**: {len(ALL_UG40_LANGUAGES)} Ugandan languages | **Google Comparable**: {len(GOOGLE_SUPPORTED_LANGUAGES)} languages
-
+        <h1>{TITLE}</h1>
+        <p>{DESCRIPTION}</p>
+        <p><strong>Supported Languages</strong>: {len(ALL_UG40_LANGUAGES)} Ugandan languages | <strong>Google Comparable</strong>: {len(GOOGLE_SUPPORTED_LANGUAGES)} languages</p>
     </div>
     """)
 
@@ -541,7 +533,6 @@
             gr.Markdown("### 📤 Upload Predictions")
             gr.Markdown("Upload a CSV/TSV/JSON file with your model's predictions")
 
-
             predictions_file = gr.File(
                 label="📂 Predictions File",
                 file_types=[".csv", ".tsv", ".json"]
@@ -645,10 +636,10 @@
 
     ## 🗣️ Supported Languages
 
-    **All UG40 Languages ({len(ALL_UG40_LANGUAGES)} total):**
+    **All UG40 Languages ({len(ALL_UG40_LANGUAGES)} total):**
     {', '.join([f"{code} ({LANGUAGE_NAMES.get(code, code)})" for code in ALL_UG40_LANGUAGES])}
 
-    **Google Translate Comparable ({len(GOOGLE_SUPPORTED_LANGUAGES)} languages):**
+    **Google Translate Comparable ({len(GOOGLE_SUPPORTED_LANGUAGES)} languages):**
     {', '.join([f"{code} ({LANGUAGE_NAMES.get(code, code)})" for code in GOOGLE_SUPPORTED_LANGUAGES])}
 
     ## 📊 Evaluation Metrics
@@ -720,7 +711,7 @@
 
     This leaderboard is maintained by [Sunbird AI](https://sunbird.ai).
 
-    **Contact**: [research@sunbird.ai](mailto:research@sunbird.ai)
+    **Contact**: [research@sunbird.ai](mailto:research@sunbird.ai)
     **GitHub**: [Sunbird AI GitHub](https://github.com/sunbirdai)
 
     ## 📄 Citation
@@ -753,21 +744,12 @@
         outputs=[download_file, download_info]
     )
 
-    # # Validate predictions
-    # def handle_validation(file, model_name, author, description):
-    #     report, predictions = validate_submission(file, model_name, author, description)
-    #     is_valid = predictions is not None
-    #     return report, predictions, predictions, is_valid
-
+    # Validate predictions
    def handle_validation(file, model_name, author, description):
        report, predictions = validate_submission(file, model_name, author, description)
        valid = predictions is not None

        # Build the four returns:
-        # 1) report Markdown
-        # 2) store predictions in state
-        # 3) store validation info in state
-        # 4) enable or disable the submit button
        if valid:
            return (
                report,
@@ -782,16 +764,12 @@
                None,
                gr.update(interactive=False)  # <— this *disables* the button
            )
-
 
    validate_btn.click(
        fn=handle_validation,
        inputs=[predictions_file, model_name_input, author_input, description_input],
        outputs=[validation_output, predictions_validated, validation_info_state, submit_btn]
    )
-
-
-
 
    # Submit for evaluation
    def handle_submission(predictions, model_name, author, description, validation_info):
@@ -817,7 +795,10 @@
        table, plot1, plot2, stats = refresh_leaderboard_display(*args)

        # Update model dropdown choices
-        model_choices = table['model_name'].tolist() if not table.empty else []
+        if current_leaderboard is not None and not current_leaderboard.empty:
+            model_choices = current_leaderboard['model_name'].tolist()
+        else:
+            model_choices = []

        return table, plot1, plot2, stats, gr.Dropdown(choices=model_choices)

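The pivotal addition in this commit is `prepare_leaderboard_display`, imported from `src.leaderboard` but defined outside `app.py`, so its body is not visible in this diff. Going only by the inline comment ("removes detailed_metrics column"), a minimal sketch might look like the following; the signature and column list are assumptions, not the repository's actual implementation:

```python
import pandas as pd

# Hypothetical sketch of src/leaderboard.py's prepare_leaderboard_display;
# the real implementation is not shown in this diff.
def prepare_leaderboard_display(df: pd.DataFrame) -> pd.DataFrame:
    """Return a copy of the leaderboard that a flat table widget can render."""
    if df is None or df.empty:
        return pd.DataFrame()
    # Assumed behaviour: drop nested/verbose columns such as detailed_metrics.
    drop_cols = [c for c in ["detailed_metrics"] if c in df.columns]
    return df.drop(columns=drop_cols).reset_index(drop=True)
```

Note the pattern the commit establishes: only values routed to the leaderboard table pass through the helper (in `evaluate_submission` and `refresh_leaderboard_display`), while the plots and stats keep consuming the unfiltered `filtered_df`.

Separately, the download message asks submitters for a file with `sample_id` and `prediction` columns. A sketch of producing one with pandas, where the test-set file name and the identity `translate` stub are placeholders for your own data and model:

```python
import pandas as pd

def translate(text: str) -> str:
    """Placeholder for your model's inference call."""
    return text  # identity output keeps the sketch runnable

test_df = pd.read_csv("salt_test_set.csv")  # file name assumed
submission = pd.DataFrame({
    "sample_id": test_df["sample_id"],
    "prediction": [translate(t) for t in test_df["source_text"]],
})
submission.to_csv("my_model_predictions.csv", index=False)
```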