ai-forever committed on
Commit
0c6a8e1
·
verified ·
1 Parent(s): 1e1a12f

Update leaderboard display

Browse files
Files changed (1) hide show
  1. app.py +1006 -803
app.py CHANGED
@@ -1,803 +1,1006 @@
1
- import gradio as gr
2
- import json
3
- import pandas as pd
4
- import numpy as np
5
- import plotly.express as px
6
- import plotly.graph_objects as go
7
- from plotly.subplots import make_subplots
8
- import os
9
- import traceback
10
- from datetime import datetime
11
- from packaging import version
12
-
13
# General-purpose qualitative palette taken from Plotly Express.
COLORS = px.colors.qualitative.Plotly

# Line colors for radar charts; create_radar_chart cycles through them by
# trace index (index modulo the list length).
line_colors = [
    "#EE4266",
    "#00a6ed",
    "#ECA72C",
    "#B42318",
    "#3CBBB1",
]

# Fill colors for radar charts: the same five colors as line_colors, written
# as rgba() with an alpha of 0.05.
fill_colors = [
    "rgba(238,66,102,0.05)",
    "rgba(0,166,237,0.05)",
    "rgba(236,167,44,0.05)",
    "rgba(180,35,24,0.05)",
    "rgba(60,187,177,0.05)",
]

# Question category codes. They key the per-category metrics in the results
# payload and prefix the "<category>_<metric_type>" DataFrame columns.
QUESTION_CATEGORIES = ["simple", "set", "mh", "cond", "comp"]
# Metric families looked up for every category.
METRIC_TYPES = ["retrieval", "generation"]
37
-
38
def load_results():
    """Read ``results.json`` located next to this script.

    Returns:
        The parsed JSON payload, or an empty leaderboard structure
        (``items`` / ``last_version`` / ``n_questions``) when the file is
        missing or cannot be read or parsed.
    """
    try:
        # results.json is looked up in the directory that contains this file.
        results_path = os.path.join(
            os.path.dirname(os.path.abspath(__file__)), 'results.json'
        )
        print(f"Loading results from: {results_path}")

        with open(results_path, 'r', encoding='utf-8') as results_file:
            payload = json.load(results_file)
        version_count = len(payload.get('items', {}))
        print(f"Successfully loaded results with {version_count} version(s)")
        return payload
    except FileNotFoundError:
        print("Results file not found, creating empty structure")
    except Exception as exc:
        print(f"Error loading results: {exc}")
        print(traceback.format_exc())

    # Both failure paths end with the same empty structure.
    return {"items": {}, "last_version": "1.0", "n_questions": "0"}
60
-
61
def _format_timestamp(raw):
    """Return the ISO-8601 string *raw* formatted as ``YYYY-MM-DD``.

    Falsy or unparseable values are returned unchanged, so a bad timestamp
    never prevents the row from being displayed.
    """
    if not raw:
        return raw
    try:
        # A trailing "Z" is rewritten as an explicit UTC offset before parsing.
        parsed = datetime.fromisoformat(raw.replace('Z', '+00:00'))
    except (AttributeError, TypeError, ValueError):
        return raw
    return parsed.strftime("%Y-%m-%d")


def filter_and_process_results(results, n_versions, only_actual_versions):
    """Filter results by version and aggregate them into one row per model.

    Args:
        results: Parsed results payload. ``results["items"]`` maps a version
            string to a mapping of item id -> item; each item is read for
            ``model_name``, ``config``, ``timestamp`` and ``metrics``.
            The payload is not modified.
        n_versions: How many versions to take into account.
        only_actual_versions: When true, keep only items that belong to the
            ``n_versions`` newest dataset versions present in ``results``;
            otherwise keep each model's own ``n_versions`` newest items.

    Returns:
        A 4-tuple ``(df, retrieval_metrics, generation_metrics,
        category_metrics)``. ``df`` has one row per model, with
        ``<category>_<metric_type>`` columns averaged over the kept versions
        and ``<metric_type>_avg`` columns. The two metric lists name the
        retrieval / generation columns present in ``df``.
        ``category_metrics`` is a list of ``(category, [column names])``.
        When ``results`` is empty or has no ``"items"`` key, an empty
        DataFrame and three empty lists are returned.
    """
    if not results or "items" not in results:
        return pd.DataFrame(), [], [], []

    all_items = results["items"]
    print(f"Last version: {results.get('last_version', '1.0')}")

    # Group items by model name. Version info travels alongside each item in
    # a tuple instead of being written into the shared payload.
    model_groups = {}
    for version_str, version_items in all_items.items():
        version_obj = version.parse(version_str)
        for item in version_items.values():
            model_name = item.get("model_name", "Unknown")
            model_groups.setdefault(model_name, []).append(
                (version_obj, version_str, item)
            )

    # The newest dataset versions are the same for every model, so they are
    # computed once here rather than inside the per-model loop.
    newest_versions = set(
        sorted((version.parse(v) for v in all_items), reverse=True)[:n_versions]
    )

    rows = []
    for model_name, entries in model_groups.items():
        # Newest version first.
        entries.sort(key=lambda entry: entry[0], reverse=True)

        if only_actual_versions:
            selected = [entry for entry in entries if entry[0] in newest_versions]
        else:
            selected = entries[:n_versions]

        if not selected:
            continue

        # Descriptive fields come from the most recent kept item.
        newest_item = selected[0][2]
        config = newest_item.get("config") or {}
        retrieval_config = config.get('retrieval_config') or {}

        row = {
            'Model': model_name,
            'Embeddings': config.get('embedding_model', 'N/A'),
            'Retriever': config.get('retriever_type', 'N/A'),
            'Top-K': retrieval_config.get('top_k', 'N/A'),
            'Versions': ", ".join(version_str for _, version_str, _ in selected),
            'Last Updated': _format_timestamp(newest_item.get("timestamp", "")),
        }

        # Running sum of per-version averages and the number of versions that
        # contributed, per (category, metric type).
        totals = {
            category: {
                metric_type: {"sum": 0.0, "count": 0}
                for metric_type in METRIC_TYPES
            }
            for category in QUESTION_CATEGORIES
        }

        for _, _, item in selected:
            metrics = item.get("metrics") or {}
            for category in QUESTION_CATEGORIES:
                per_type = metrics.get(category) or {}
                for metric_type in METRIC_TYPES:
                    metric_values = per_type.get(metric_type)
                    if not metric_values:
                        # Missing or empty metric dict: nothing to average.
                        continue
                    bucket = totals[category][metric_type]
                    bucket["sum"] += sum(metric_values.values()) / len(metric_values)
                    bucket["count"] += 1

        # Per-category averages (column order: category-major, as before).
        for category in QUESTION_CATEGORIES:
            for metric_type in METRIC_TYPES:
                bucket = totals[category][metric_type]
                if bucket["count"] > 0:
                    row[f"{category}_{metric_type}"] = round(
                        bucket["sum"] / bucket["count"], 4
                    )

        # Overall average per metric type across all categories.
        for metric_type in METRIC_TYPES:
            total_sum = 0
            total_count = 0
            for category in QUESTION_CATEGORIES:
                bucket = totals[category][metric_type]
                if bucket["count"] > 0:
                    total_sum += bucket["sum"]
                    total_count += bucket["count"]
            if total_count > 0:
                row[f"{metric_type}_avg"] = round(total_sum / total_count, 4)

        rows.append(row)

    df = pd.DataFrame(rows)

    # Columns actually present in df, grouped per category.
    category_columns = []
    for category in QUESTION_CATEGORIES:
        present = [
            f"{category}_{metric_type}"
            for metric_type in METRIC_TYPES
            if f"{category}_{metric_type}" in df.columns
        ]
        if present:
            category_columns.append((category, present))

    # Retrieval and generation columns used as radar-chart axes.
    retrieval_metrics = [
        f"{category}_retrieval"
        for category in QUESTION_CATEGORIES
        if f"{category}_retrieval" in df.columns
    ]
    generation_metrics = [
        f"{category}_generation"
        for category in QUESTION_CATEGORIES
        if f"{category}_generation" in df.columns
    ]

    return df, retrieval_metrics, generation_metrics, category_columns
200
-
201
def _empty_radar_figure(title):
    """Return a blank, titled figure used when there is nothing to plot."""
    fig = go.Figure()
    fig.update_layout(
        title=title,
        title_font_size=16,
        height=400,
        width=500,
        margin=dict(l=30, r=30, t=50, b=30)
    )
    return fig


def create_radar_chart(df, selected_models, metrics, title):
    """Create a radar chart for the selected models and metrics.

    Args:
        df: DataFrame with a ``Model`` column and one column per entry of
            ``metrics``.
        selected_models: Model names to draw; at most the first five matching
            rows of ``df`` are plotted.
        metrics: Column names of the form ``<category>_<metric_type>``; the
            part before the first underscore becomes the axis label.
        title: Title shown on the placeholder figure returned when there is
            nothing to plot.

    Returns:
        A ``plotly.graph_objects.Figure``. It is a blank titled figure when
        no metrics or models are given, or when no row matches.
    """
    if not metrics or not selected_models or df.empty:
        return _empty_radar_figure(title)

    # Keep only the rows of the selected models.
    filtered_df = df[df['Model'].isin(selected_models)]
    if filtered_df.empty:
        return _empty_radar_figure(title)

    # Limit to five models to keep the chart readable.
    filtered_df = filtered_df.head(5)

    # Axis labels: category code (simple, set, ...) taken from the column
    # name. The first label is repeated at the end to close the polygon.
    categories = [m.split('_', 1)[0] for m in metrics]
    categories_loop = categories + [categories[0]]

    fig = go.Figure()

    # One trace per model, in DataFrame row order.
    for i, (_, row) in enumerate(filtered_df.iterrows()):
        values = [row[m] for m in metrics]
        # Repeat the first value to close the polygon.
        values.append(values[0])

        fig.add_trace(go.Scatterpolar(
            name=row['Model'],
            r=values,
            theta=categories_loop,
            showlegend=True,
            mode="lines",
            line=dict(width=2, color=line_colors[i % len(line_colors)]),
            fill="toself",
            fillcolor=fill_colors[i % len(fill_colors)]
        ))

    fig.update_layout(
        font=dict(size=13, color="black"),
        template="plotly_white",
        polar=dict(
            radialaxis=dict(
                visible=True,
                gridcolor="black",
                linecolor="rgba(0,0,0,0)",
                gridwidth=1,
                showticklabels=False,
                ticks="",
                range=[0, 1]  # Ensure consistent range for scores
            ),
            angularaxis=dict(
                gridcolor="black",
                gridwidth=1.5,
                linecolor="rgba(0,0,0,0)"
            ),
        ),
        legend=dict(
            orientation="h",
            yanchor="bottom",
            y=-0.35,
            xanchor="center",
            x=0.4,
            itemwidth=30,
            font=dict(size=13),
            entrywidth=0.6,
            entrywidthmode="fraction",
        ),
        margin=dict(l=0, r=16, t=30, b=30),
        autosize=True,
    )

    return fig
292
-
293
def create_summary_df(df, retrieval_metrics, generation_metrics):
    """Create a summary dataframe with averaged metrics for display.

    Args:
        df: Per-model DataFrame containing at least ``Model``,
            ``Embeddings``, ``Retriever`` and ``Top-K`` plus the metric
            columns named in the two lists.
        retrieval_metrics: Retrieval column names to average per row.
        generation_metrics: Generation column names to average per row.

    Returns:
        A DataFrame with the descriptive columns followed by whichever of
        ``Retrieval (avg)``, ``Generation (avg)``, ``Total Score``,
        ``Versions`` and ``Last Updated`` are available. When both averages
        exist, rows are sorted by ``Total Score`` in descending order. An
        empty DataFrame is returned for empty input.
    """
    if df.empty:
        return pd.DataFrame()

    summary_df = df.copy()

    # Row-wise averages over the available metric columns.
    if retrieval_metrics:
        summary_df['Retrieval (avg)'] = summary_df[retrieval_metrics].mean(axis=1).round(4)
    if generation_metrics:
        summary_df['Generation (avg)'] = summary_df[generation_metrics].mean(axis=1).round(4)

    # Total score needs both averages.
    if 'Retrieval (avg)' in summary_df.columns and 'Generation (avg)' in summary_df.columns:
        # Round the sum as well: adding two rounded floats can produce
        # artefacts such as 0.7999999999999999 in the displayed table.
        summary_df['Total Score'] = (
            summary_df['Retrieval (avg)'] + summary_df['Generation (avg)']
        ).round(4)
        summary_df = summary_df.sort_values('Total Score', ascending=False)

    # Descriptive columns first, then the optional ones in display order.
    optional_cols = [
        'Retrieval (avg)',
        'Generation (avg)',
        'Total Score',
        'Versions',
        'Last Updated',
    ]
    summary_cols = ['Model', 'Embeddings', 'Retriever', 'Top-K']
    summary_cols += [col for col in optional_cols if col in summary_df.columns]

    return summary_df[summary_cols]
329
-
330
def create_category_df(df, category, retrieval_col, generation_col):
    """Create a dataframe for a specific category with detailed metrics.

    Args:
        df: Per-model DataFrame containing ``Model``, ``Embeddings``,
            ``Retriever`` and the two metric columns.
        category: Category code; used to name the ``"<category> Score"``
            column.
        retrieval_col: Name of the category's retrieval column in ``df``.
        generation_col: Name of the category's generation column in ``df``.

    Returns:
        A DataFrame with columns ``Model``, ``Embeddings``, ``Retriever``,
        ``Retrieval``, ``Generation`` and ``"<category> Score"``, sorted by
        score in descending order. An empty DataFrame is returned when
        ``df`` is empty or either metric column is missing.
    """
    if df.empty or retrieval_col not in df.columns or generation_col not in df.columns:
        return pd.DataFrame()

    score_col = f'{category} Score'
    category_df = df.copy()

    # Round the sum: adding two rounded floats can yield artefacts such as
    # 0.30000000000000004 in the displayed table.
    category_df[score_col] = (
        category_df[retrieval_col] + category_df[generation_col]
    ).round(4)
    category_df = category_df.sort_values(score_col, ascending=False)

    display_cols = ['Model', 'Embeddings', 'Retriever', retrieval_col, generation_col, score_col]

    # Show generic headers instead of the "<category>_<metric>" names.
    return category_df[display_cols].rename(columns={
        retrieval_col: 'Retrieval',
        generation_col: 'Generation'
    })
353
-
354
# Load the results payload once at import time.
results = load_results()
# Header fields for the version banner, with placeholders for absent keys.
last_version = results.get("last_version", "1.0")
n_questions = results.get("n_questions", "100")
date_title = results.get("date_title", "---")

# Initial data processing: restrict to the single newest dataset version.
df, retrieval_metrics, generation_metrics, category_metrics = filter_and_process_results(
    results, n_versions=1, only_actual_versions=True
)

# Pre-generate charts for initial display, using the models of the first
# five rows of df (or none when df is empty).
default_models = df['Model'].head(5).tolist() if not df.empty else []
initial_gen_chart = create_radar_chart(df, default_models, generation_metrics, "Performance on Generation Tasks")
initial_ret_chart = create_radar_chart(df, default_models, retrieval_metrics, "Performance on Retrieval Tasks")

# Summary table shown in the overall tab.
summary_df = create_summary_df(df, retrieval_metrics, generation_metrics)
372
-
373
- with gr.Blocks(css="""
374
- .title-container {
375
- text-align: center;
376
- margin-bottom: 10px;
377
- }
378
- .description-text {
379
- text-align: left;
380
- padding: 10px;
381
- margin-bottom: 0px;
382
- }
383
- .version-info {
384
- text-align: center;
385
- padding: 10px;
386
- background-color: #f0f0f0;
387
- border-radius: 8px;
388
- margin-bottom: 15px;
389
- }
390
- .version-selector {
391
- padding: 15px;
392
- border: 1px solid #ddd;
393
- border-radius: 8px;
394
- margin-bottom: 20px;
395
- background-color: #f9f9f9;
396
- height: 100%;
397
- }
398
- .citation-block {
399
- padding: 15px;
400
- border: 1px solid #ddd;
401
- border-radius: 8px;
402
- margin-bottom: 20px;
403
- background-color: #f9f9f9;
404
- font-family: monospace;
405
- font-size: 14px;
406
- overflow-x: auto;
407
- height: 100%;
408
- }
409
- .flex-row-container {
410
- display: flex;
411
- justify-content: space-between;
412
- gap: 20px;
413
- width: 100%;
414
- }
415
- .charts-container {
416
- display: flex;
417
- gap: 20px;
418
- margin-bottom: 20px;
419
- }
420
- .chart-box {
421
- flex: 1;
422
- border: 1px solid #eee;
423
- border-radius: 8px;
424
- padding: 10px;
425
- background-color: white;
426
- min-height: 550px; /* Increased height to accommodate legend at bottom */
427
- }
428
- .metrics-table {
429
- border: 1px solid #eee;
430
- border-radius: 8px;
431
- padding: 15px;
432
- background-color: white;
433
- }
434
- .info-text {
435
- font-size: 0.9em;
436
- font-style: italic;
437
- color: #666;
438
- margin-top: 5px;
439
- }
440
- footer {
441
- text-align: center;
442
- margin-top: 30px;
443
- font-size: 0.9em;
444
- color: #666;
445
- }
446
- /* Style for selected rows */
447
- table tbody tr.selected {
448
- background-color: rgba(25, 118, 210, 0.1) !important;
449
- border-left: 3px solid #1976d2;
450
- }
451
- /* Add this class via JavaScript */
452
- .gr-table tbody tr.selected td:first-child {
453
- font-weight: bold;
454
- color: #1976d2;
455
- }
456
- .category-tab {
457
- padding: 10px;
458
- }
459
- .chart-title {
460
- font-size: 1.2em;
461
- font-weight: bold;
462
- margin-bottom: 10px;
463
- text-align: center;
464
- }
465
- .clear-charts-button {
466
- display: flex;
467
- justify-content: center;
468
- margin-top: 10px;
469
- margin-bottom: 20px;
470
- }
471
- """) as demo:
472
- # Title
473
- with gr.Row(elem_classes=["title-container"]):
474
- gr.Markdown("# 🐙 Dynamic RAG Benchmark")
475
-
476
- # Version info
477
- with gr.Row(elem_classes=["description-text"]):
478
- gr.Markdown(f"На этом лидерборде можно сравнить RAG системы в разрезе генеративных и поисковых метрик моделей по вопросам разного типа (простые вопросы, сравнения, multi-hop, условные и др.). <li>Вопросы автоматичеки генерируются на основе новостных источников.</li><li>Обновление датасета с вопросами происходит регулярно, при этом пересчитываются все метрики для открытых моделей.</li><li>Для пользовательских сабмитов учитываются последние посчитанные для них метрики.</li><li>Чтобы посчитать ранее отправленную конфигурацию на последней версии данных, используйте submit_id, полученный при первой отправке через клиент (см. инструкцию ниже).</li>")
479
-
480
- # Version info
481
- with gr.Row(elem_classes=["version-info"]):
482
- gr.Markdown(f"## Версия {last_version} → {n_questions} вопросов, сгенерированных по новостным источникам → {date_title}")
483
-
484
- # Radar Charts
485
- with gr.Row(elem_classes=["charts-container"]):
486
- with gr.Column(elem_classes=["chart-box"]):
487
- gr.Markdown("### Генеративные метрики", elem_classes=["chart-title"])
488
- generation_chart = gr.Plot(value=initial_gen_chart)
489
-
490
- with gr.Column(elem_classes=["chart-box"]):
491
- gr.Markdown("### Метрики поиска", elem_classes=["chart-title"])
492
- retrieval_chart = gr.Plot(value=initial_ret_chart)
493
-
494
- # Clear Charts Button
495
- with gr.Row(elem_classes=["clear-charts-button"]):
496
- clear_charts_btn = gr.Button("Очистить графики", variant="secondary")
497
-
498
- # Metrics table with tabs
499
- with gr.Tabs(elem_classes=["metrics-table"]) as metrics_tabs:
500
- with gr.TabItem("Общая таблица"):
501
- selected_models = gr.State(default_models)
502
-
503
- # If dataframe is empty, show a message
504
- if df.empty:
505
- gr.Markdown("No data available. Please submit some results.")
506
- metrics_table = gr.DataFrame()
507
- else:
508
- metrics_table = gr.DataFrame(
509
- value=summary_df,
510
- headers=summary_df.columns.tolist(),
511
- datatype=["str"] * len(summary_df.columns),
512
- row_count=(min(10, len(summary_df)) if not summary_df.empty else 0),
513
- col_count=(len(summary_df.columns) if not summary_df.empty else 0),
514
- interactive=False,
515
- wrap=True
516
- )
517
-
518
- with gr.TabItem("По типам вопросов"):
519
- # Create tabs for each category
520
- category_tabs = gr.Tabs()
521
- category_tables = {}
522
-
523
- # Dictionary to map category codes to display names
524
- category_display_names = {
525
- "simple": "Simple Questions",
526
- "set": "Set-based",
527
- "mh": "Multi-hop",
528
- "cond": "Conditional",
529
- "comp": "Comparison"
530
- }
531
-
532
- with category_tabs:
533
- for category, _ in category_metrics:
534
- if f"{category}_retrieval" in df.columns and f"{category}_generation" in df.columns:
535
- with gr.TabItem(category_display_names.get(category, category.capitalize()), elem_classes=["category-tab"]):
536
- # Create dataframe for this category
537
- category_df = create_category_df(df, category, f"{category}_retrieval", f"{category}_generation")
538
-
539
- if category_df.empty:
540
- gr.Markdown(f"No data available for {category_display_names.get(category, category)} category.")
541
- category_tables[category] = gr.DataFrame()
542
- else:
543
- gr.Markdown(f"#### Performance on {category_display_names.get(category, category)}")
544
- category_tables[category] = gr.DataFrame(
545
- value=category_df,
546
- headers=category_df.columns.tolist(),
547
- datatype=["str"] * len(category_df.columns),
548
- row_count=(min(10, len(category_df)) if not category_df.empty else 0),
549
- col_count=(len(category_df.columns) if not category_df.empty else 0),
550
- interactive=False,
551
- wrap=True
552
- )
553
-
554
- # Version selector and Citation block in a flex container
555
- with gr.Row():
556
- # Citation block (left side)
557
- with gr.Column(scale=1, elem_classes=["citation-block"]):
558
- gr.Markdown("### Цитирование")
559
- gr.Markdown("""
560
- ```
561
- @article{dynamic-rag-benchmark,
562
- title={Dynamic RAG Benchmark},
563
- author={RAG Benchmark Team},
564
- journal={arXiv preprint},
565
- year={2024},
566
- url={https://github.com/rag-benchmark}
567
- }
568
- ```
569
-
570
- Шаблон для цитирования нашего бенча.
571
- """)
572
-
573
- # Version selector (right side)
574
- with gr.Column(scale=1, elem_classes=["version-selector"]):
575
- gr.Markdown("### Выбор версий")
576
- with gr.Column():
577
- with gr.Row():
578
- with gr.Column(scale=3):
579
- only_actual_versions = gr.Checkbox(
580
- label="Только актуальные версии",
581
- value=True,
582
- info="Считать, начиная с актуальной версии датасета"
583
- )
584
- with gr.Column(scale=5):
585
- n_versions_slider = gr.Slider(
586
- minimum=1,
587
- maximum=5,
588
- value=1,
589
- step=1,
590
- label="Взять n последних версий",
591
- info="Количество версий для подсчета метрик"
592
- )
593
- with gr.Row():
594
- filter_btn = gr.Button("Применить фильтр", variant="primary")
595
-
596
- gr.Markdown(
597
- "Кликайте на модели в таблице, чтобы добавить их в графики",
598
- elem_classes=["info-text"]
599
- )
600
-
601
- # Footer
602
- with gr.Row():
603
- gr.Markdown("""
604
- <footer>Dynamic RAG Benchmark Leaderboard</footer>
605
- """)
606
-
607
- # Handle row selection for radar charts
608
def update_charts(evt: gr.SelectData, selected_models):
    """Toggle the clicked table row's model in the radar-chart selection.

    Args:
        evt: Gradio selection event; ``evt.target`` is compared against the
            table components and ``evt.index`` supplies the clicked row.
        selected_models: Model names currently drawn on the charts. The list
            is copied, not modified in place.

    Returns:
        ``(selected_models, generation_chart, retrieval_chart)``. If the
        handler fails, the incoming selection and the chart components'
        ``.value`` are returned instead.
    """
    try:
        # Use the module-level data that update_data() publishes after each
        # filter. Re-filtering with n_versions_slider.value /
        # only_actual_versions.value is wrong: a component's .value is the
        # value it was constructed with, not the user's current choice, so
        # every click fell back to the default filter.
        current_df = df
        current_ret_metrics = retrieval_metrics
        current_gen_metrics = generation_metrics

        # Work on a copy so the State value is not mutated in place.
        selected_models = list(selected_models or [])

        # Debug info
        print(f"Selection event: {evt}, type: {type(evt)}")

        selected_model = None

        # Resolve the clicked row to a model name.
        try:
            component = evt.target

            row_idx = evt.index[0] if isinstance(evt.index, (list, tuple)) else evt.index
            print(f"Row index: {row_idx}")

            if component is metrics_table:
                # Rebuild the summary from the current data so the row order
                # matches the table produced by the last applied filter.
                shown_df = create_summary_df(current_df, current_ret_metrics, current_gen_metrics)
                if isinstance(shown_df, pd.DataFrame) and 0 <= row_idx < len(shown_df):
                    selected_model = shown_df.iloc[row_idx]['Model']
                    print(f"Selected from summary table: {selected_model}")
            else:
                # Otherwise look for the category table that was clicked.
                for category, table in category_tables.items():
                    if component is table:
                        category_df = create_category_df(
                            current_df,
                            category,
                            f"{category}_retrieval",
                            f"{category}_generation"
                        )

                        if isinstance(category_df, pd.DataFrame) and 0 <= row_idx < len(category_df):
                            selected_model = category_df.iloc[row_idx]['Model']
                            print(f"Selected from {category} table: {selected_model}")
                        break

            # Last resort: read the component's own value.
            # NOTE(review): this is the value the table was built with, so it
            # may not reflect a later filter - confirm it is still wanted.
            if selected_model is None and hasattr(component, "value"):
                table_value = component.value
                if isinstance(table_value, pd.DataFrame) and 0 <= row_idx < len(table_value):
                    selected_model = table_value.iloc[row_idx]['Model']
                elif isinstance(table_value, list) and 0 <= row_idx < len(table_value):
                    selected_model = table_value[row_idx][0]  # Assuming Model is the first column
                elif isinstance(table_value, dict) and 'data' in table_value and 0 <= row_idx < len(table_value['data']):
                    selected_model = table_value['data'][row_idx][0]
        except Exception as e:
            print(f"Error extracting model name: {e}")
            traceback.print_exc()

        available_models = current_df['Model'].tolist() if not current_df.empty else []

        # Toggle the clicked model if it exists in the current data.
        if selected_model:
            print(f"Selected model: {selected_model}")

            if selected_model in available_models:
                if selected_model in selected_models:
                    selected_models.remove(selected_model)
                else:
                    selected_models.append(selected_model)
            else:
                print(f"Model {selected_model} not found in current dataframe")

        # Drop selections that no longer exist in the current data.
        selected_models = [model for model in selected_models if model in available_models]

        # Never leave the charts empty while data is available.
        if not selected_models and available_models:
            selected_models = [available_models[0]]

        gen_chart = create_radar_chart(current_df, selected_models, current_gen_metrics, "Performance on Generation Tasks")
        ret_chart = create_radar_chart(current_df, selected_models, current_ret_metrics, "Performance on Retrieval Tasks")

        return selected_models, gen_chart, ret_chart
    except Exception as e:
        print(f"Error in update_charts: {e}")
        print(traceback.format_exc())
        return selected_models, generation_chart.value, retrieval_chart.value
699
-
700
- # Use custom event handler for row selection
701
- metrics_table.select(
702
- fn=update_charts,
703
- inputs=[selected_models],
704
- outputs=[selected_models, generation_chart, retrieval_chart]
705
- )
706
-
707
- # Add selection handlers for category tables too
708
- for category_table in category_tables.values():
709
- category_table.select(
710
- fn=update_charts,
711
- inputs=[selected_models],
712
- outputs=[selected_models, generation_chart, retrieval_chart]
713
- )
714
-
715
- # Handle version filter changes
716
def update_data(n_versions, only_actual, current_selected_models):
    """Re-filter the results and refresh every table and chart.

    Args:
        n_versions: Number of versions to take into account.
        only_actual: Whether to restrict to the newest dataset versions.
        current_selected_models: Model names currently shown on the charts.

    Returns:
        A list ``[summary_df, generation_chart, retrieval_chart,
        selected_models, *category_dataframes]`` with the category
        DataFrames in ``category_tables`` order. On failure, the previously
        published summary table, the chart components' ``.value``, the
        incoming selection and empty category DataFrames are returned.
    """
    # Declared up front so that summary_df in the except branch refers to the
    # last published table. As a function local it was unbound whenever the
    # failure happened before its assignment.
    global df, retrieval_metrics, generation_metrics, summary_df
    try:
        new_df, new_ret_metrics, new_gen_metrics, new_category_metrics = filter_and_process_results(
            results, n_versions=n_versions, only_actual_versions=only_actual
        )

        available_models = new_df['Model'].tolist() if not new_df.empty else []

        # Keep only the selected models that still exist after filtering.
        filtered_selected_models = [model for model in current_selected_models if model in available_models]

        # If none remain, fall back to the first five available models.
        if not filtered_selected_models and available_models:
            filtered_selected_models = available_models[:5]

        gen_chart = create_radar_chart(new_df, filtered_selected_models, new_gen_metrics, "Performance on Generation Tasks")
        ret_chart = create_radar_chart(new_df, filtered_selected_models, new_ret_metrics, "Performance on Retrieval Tasks")

        new_summary_df = create_summary_df(new_df, new_ret_metrics, new_gen_metrics)

        # Every existing category table gets an output. Tables without data
        # under the new filter are cleared.
        refreshed_tables = {category: pd.DataFrame() for category in category_tables}
        for category, _ in new_category_metrics:
            if category not in category_tables:
                continue
            retrieval_col = f"{category}_retrieval"
            generation_col = f"{category}_generation"
            if retrieval_col in new_df.columns and generation_col in new_df.columns:
                refreshed_tables[category] = create_category_df(
                    new_df, category, retrieval_col, generation_col
                )

        # Publish the new state for the other handlers only after everything
        # above has succeeded.
        df = new_df
        retrieval_metrics = new_ret_metrics
        generation_metrics = new_gen_metrics
        summary_df = new_summary_df

        outputs = [new_summary_df, gen_chart, ret_chart, filtered_selected_models]
        outputs.extend(refreshed_tables[category] for category in category_tables)
        return outputs
    except Exception as e:
        print(f"Error in update_data: {e}")
        print(traceback.format_exc())
        # Return the previously published values in case of error.
        empty_tables = [pd.DataFrame() for _ in category_tables]
        return summary_df, generation_chart.value, retrieval_chart.value, current_selected_models, *empty_tables
774
-
775
- # Define filter button outputs
776
- filter_outputs = [metrics_table, generation_chart, retrieval_chart, selected_models]
777
- # Add category tables to outputs
778
- for category_table in category_tables.values():
779
- filter_outputs.append(category_table)
780
-
781
- filter_btn.click(
782
- fn=update_data,
783
- inputs=[n_versions_slider, only_actual_versions, selected_models],
784
- outputs=filter_outputs
785
- )
786
-
787
- # Function to clear charts
788
def clear_charts():
    """Reset the model selection and return blank radar charts.

    Returns:
        ``(selected_models, generation_chart, retrieval_chart)`` with an
        empty selection and charts built for no models.
    """
    cleared_selection = []
    blank_generation = create_radar_chart(
        df, cleared_selection, generation_metrics, "Performance on Generation Tasks"
    )
    blank_retrieval = create_radar_chart(
        df, cleared_selection, retrieval_metrics, "Performance on Retrieval Tasks"
    )
    return cleared_selection, blank_generation, blank_retrieval
794
-
795
- # Connect clear charts button
796
- clear_charts_btn.click(
797
- fn=clear_charts,
798
- inputs=[],
799
- outputs=[selected_models, generation_chart, retrieval_chart]
800
- )
801
-
802
- if __name__ == "__main__":
803
- demo.launch()
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import gradio as gr
2
+ import json
3
+ import pandas as pd
4
+ import numpy as np
5
+ import plotly.express as px
6
+ import plotly.graph_objects as go
7
+ from plotly.subplots import make_subplots
8
+ import os
9
+ import traceback
10
+ from datetime import datetime
11
+ from packaging import version
12
+
13
# General-purpose qualitative palette taken from Plotly Express.
COLORS = px.colors.qualitative.Plotly

# Line colors for radar charts (five hex colors).
line_colors = [
    "#EE4266",
    "#00a6ed",
    "#ECA72C",
    "#B42318",
    "#3CBBB1",
]

# Fill colors for radar charts: the same five colors as line_colors, written
# as rgba() with an alpha of 0.05.
fill_colors = [
    "rgba(238,66,102,0.05)",
    "rgba(0,166,237,0.05)",
    "rgba(236,167,44,0.05)",
    "rgba(180,35,24,0.05)",
    "rgba(60,187,177,0.05)",
]
33
+
34
# Language definitions.
# Maps the language name shown in the selector to that language's UI strings.
# Both tables must expose exactly the same keys, because the UI looks strings
# up as LANGUAGES[<language>][<key>].
LANGUAGES = {
    "English": {
        "clear_charts": "Clear Charts",
        "lang_selector_label": "Language / Язык",
        "description": "This leaderboard allows comparing RAG systems based on generative and retrieval metrics across different question types (simple, comparison, multi-hop, conditional, etc.). <li>Questions are automatically generated from news sources.</li><li>The question dataset is updated regularly, and metrics for open models are recalculated.</li><li>User submissions use the latest calculated metrics for them.</li><li>To recalculate a previously submitted configuration with the latest data version, use the submit_id received during the initial submission via the client (see instructions below).</li>",
        # Placeholders, in order: dataset version, number of questions, date title.
        "version_info_template": "## Version {} → {} questions, generated from news sources → {}",
        "gen_metrics_title": "### Generation Metrics",
        "ret_metrics_title": "### Retrieval Metrics",
        "overall_tab_title": "Overall Table",
        "no_data_message": "No data available. Please submit some results.",
        "by_type_tab_title": "By Question Type",
        "category_display_names": {
            "simple": "Simple Questions",
            "set": "Set-based",
            "mh": "Multi-hop",
            "cond": "Conditional",
            "comp": "Comparison",
        },
        "no_data_category_template": "No data available for {} category.",
        "category_performance_template": "#### Performance on {}",
        "citation_title": "### Citation",
        "citation_description": """
```
@article{dynamic-rag-benchmark,
  title={Dynamic RAG Benchmark},
  author={RAG Benchmark Team},
  journal={arXiv preprint},
  year={2024},
  url={https://github.com/rag-benchmark}
}
```

Template for citing our benchmark.
""",
        "version_selector_title": "### Version Selection",
        "only_actual_label": "Only actual versions",
        "only_actual_info": "Start counting from the current dataset version",
        "n_versions_label": "Take n last versions",
        "n_versions_info": "Number of versions to calculate metrics for",
        "filter_button": "Apply Filter",
        "info_text": "Click on models in the table to add them to the charts",
        "footer_text": "<footer>Dynamic RAG Benchmark Leaderboard</footer>",
        "radar_gen_title": "Performance on Generation Tasks",
        "radar_ret_title": "Performance on Retrieval Tasks",
    },
    "Русский": {
        "clear_charts": "Очистить графики",
        # Kept bilingual so the selector stays readable in either language.
        "lang_selector_label": "Language / Язык",
        "description": "На этом лидерборде можно сравнить RAG системы в разрезе генеративных и поисковых метрик моделей по вопросам разного типа (простые вопросы, сравнения, multi-hop, условные и др.). <li>Вопросы автоматически генерируются на основе новостных источников.</li><li>Обновление датасета с вопросами происходит регулярно, при этом пересчитываются все метрики для открытых моделей.</li><li>Для пользовательских сабмитов учитываются последние посчитанные для них метрики.</li><li>Чтобы посчитать ранее отправленную конфигурацию на последней версии данных, используйте submit_id, полученный при первой отправке через клиент (см. инструкцию ниже).</li>",
        # Same placeholders and separators as the English template.
        "version_info_template": "## Версия {} → {} вопросов, сгенерированных по новостным источникам → {}",
        "gen_metrics_title": "### Генеративные метрики",
        "ret_metrics_title": "### Метрики поиска",
        "overall_tab_title": "Общая таблица",
        "no_data_message": "Нет данных. Пожалуйста, отправьте результаты.",
        "by_type_tab_title": "По типам вопросов",
        "category_display_names": {
            "simple": "Простые вопросы",
            "set": "На основе набора",
            "mh": "Multi-hop",
            "cond": "Условные",
            "comp": "Сравнение",
        },
        "no_data_category_template": "Нет данных для категории {}.",
        "category_performance_template": "#### Производительность на {}",
        "citation_title": "### Цитирование",
        "citation_description": """
```
@article{dynamic-rag-benchmark,
  title={Dynamic RAG Benchmark},
  author={RAG Benchmark Team},
  journal={arXiv preprint},
  year={2024},
  url={https://github.com/rag-benchmark}
}
```

Шаблон для цитирования нашего бенча.
""",
        "version_selector_title": "### Выбор версий",
        "only_actual_label": "Только актуальные версии",
        "only_actual_info": "Считать, начиная с актуальной версии датасета",
        "n_versions_label": "Взять n последних версий",
        "n_versions_info": "Количество версий для подсчета метрик",
        "filter_button": "Применить фильтр",
        "info_text": "Кликайте на модели в таблице, чтобы добавить их в графики",
        "footer_text": "<footer>Dynamic RAG Benchmark Leaderboard</footer>",
        "radar_gen_title": "Производительность на Генеративных Заданиях",
        "radar_ret_title": "Производительность на Поисковых Заданиях",
    },
}
# Language used for the initial render, before the user picks one.
DEFAULT_LANG = "English"
125
+
126
+ # Define the question categories
127
+ QUESTION_CATEGORIES = ["simple", "set", "mh", "cond", "comp"]
128
+ METRIC_TYPES = ["retrieval", "generation"]
129
+
130
def load_results():
    """Read the leaderboard data from ``results.json`` next to this script.

    Returns:
        The parsed JSON document. When the file does not exist, or when
        reading or parsing fails for any other reason, a new empty structure
        ``{"items": {}, "last_version": "1.0", "n_questions": "0"}`` is
        returned instead (the failure is printed, never raised).
    """
    try:
        # Resolve the path relative to this file rather than the working directory.
        here = os.path.dirname(os.path.abspath(__file__))
        results_path = os.path.join(here, 'results.json')

        print(f"Loading results from: {results_path}")

        with open(results_path, 'r', encoding='utf-8') as handle:
            loaded = json.load(handle)
        version_count = len(loaded.get('items', {}))
        print(f"Successfully loaded results with {version_count} version(s)")
        return loaded
    except FileNotFoundError:
        print("Results file not found, creating empty structure")
    except Exception as e:
        print(f"Error loading results: {e}")
        print(traceback.format_exc())

    # Shared fallback for both failure paths; built per call so callers can
    # mutate it freely.
    return {"items": {}, "last_version": "1.0", "n_questions": "0"}
152
+
153
def _format_timestamp(raw):
    """Return *raw* as ``YYYY-MM-DD`` if it is an ISO-8601 string, else unchanged."""
    if not raw:
        return raw
    try:
        # A trailing "Z" is rewritten as an explicit offset before parsing.
        return datetime.fromisoformat(raw.replace('Z', '+00:00')).strftime("%Y-%m-%d")
    except (AttributeError, TypeError, ValueError):
        # Not a string, or not ISO formatted: show the stored value as-is.
        return raw


def _mean_metric(metric_values):
    """Return the mean of a ``{metric name: score}`` mapping, or None if it is empty/missing."""
    if not metric_values:
        return None
    return sum(metric_values.values()) / len(metric_values)


def filter_and_process_results(results, n_versions, only_actual_versions):
    """Filter results by version and aggregate them into one row per model.

    Args:
        results: Loaded results document. ``results["items"]`` maps a version
            string to ``{item id: item}``; each item is read for
            ``model_name``, ``config``, ``timestamp`` and ``metrics``
            (``metrics[category][metric_type]`` being a ``{name: score}``
            mapping). The document is not modified.
        n_versions: How many versions to take into account.
        only_actual_versions: If true, keep only a model's items that belong
            to the ``n_versions`` newest dataset versions overall. Otherwise
            keep each model's own ``n_versions`` newest items.

    Returns:
        A 4-tuple ``(df, retrieval_metrics, generation_metrics,
        category_metrics)``:

        * ``df``: one row per model with the columns ``Model``,
          ``Embeddings``, ``Retriever``, ``Top-K``, ``Versions``,
          ``Last Updated``, ``<category>_<metric_type>`` averages and
          ``<metric_type>_avg`` overall averages (all rounded to 4 places).
        * ``retrieval_metrics`` / ``generation_metrics``: the
          ``<category>_retrieval`` / ``<category>_generation`` columns
          present in ``df``.
        * ``category_metrics``: ``(category, [columns])`` pairs for every
          category that has at least one column in ``df``.

        When ``results`` is empty or has no ``"items"`` key the result is
        ``(pd.DataFrame(), [], [], [])``.
    """
    if not results or "items" not in results:
        return pd.DataFrame(), [], [], []

    if n_versions is not None:
        # UI sliders may hand over a float; slicing below needs an int.
        n_versions = int(n_versions)

    all_items = results["items"]
    print(f"Last version: {results.get('last_version', '1.0')}")

    # Group items by model name. The parsed version travels alongside each
    # item in a tuple so the caller's dictionaries are left untouched.
    model_groups = {}
    dataset_versions = []
    for version_str, version_items in all_items.items():
        version_obj = version.parse(version_str)
        dataset_versions.append(version_obj)
        for item in version_items.values():
            model_name = item.get("model_name", "Unknown")
            model_groups.setdefault(model_name, []).append((version_obj, version_str, item))

    # The newest dataset versions are the same for every model: compute once.
    actual_versions = set(sorted(dataset_versions, reverse=True)[:n_versions])

    rows = []
    for model_name, entries in model_groups.items():
        # Newest version first.
        entries.sort(key=lambda entry: entry[0], reverse=True)

        if only_actual_versions:
            selected = [entry for entry in entries if entry[0] in actual_versions]
        else:
            selected = entries[:n_versions]

        if not selected:
            continue

        # Descriptive fields come from the model's most recent selected item.
        newest_item = selected[0][2]
        config = newest_item.get("config") or {}

        row = {
            'Model': model_name,
            'Embeddings': config.get('embedding_model', 'N/A'),
            'Retriever': config.get('retriever_type', 'N/A'),
            'Top-K': (config.get('retrieval_config') or {}).get('top_k', 'N/A'),
            'Versions': ", ".join(version_str for _, version_str, _ in selected),
            'Last Updated': _format_timestamp(newest_item.get("timestamp", "")),
        }

        # Running [sum of per-item means, number of items] per category/metric type.
        totals = {
            (category, metric_type): [0.0, 0]
            for category in QUESTION_CATEGORIES
            for metric_type in METRIC_TYPES
        }
        for _, _, item in selected:
            metrics = item.get("metrics") or {}
            for (category, metric_type), total in totals.items():
                mean = _mean_metric((metrics.get(category) or {}).get(metric_type))
                if mean is None:
                    # Absent or empty metric set: nothing to average.
                    continue
                total[0] += mean
                total[1] += 1

        # Per-category averages across the selected versions.
        for (category, metric_type), (score_sum, count) in totals.items():
            if count > 0:
                row[f"{category}_{metric_type}"] = round(score_sum / count, 4)

        # Overall average per metric type, pooled over all categories.
        for metric_type in METRIC_TYPES:
            pooled_sum = sum(totals[(category, metric_type)][0] for category in QUESTION_CATEGORIES)
            pooled_count = sum(totals[(category, metric_type)][1] for category in QUESTION_CATEGORIES)
            if pooled_count > 0:
                row[f"{metric_type}_avg"] = round(pooled_sum / pooled_count, 4)

        rows.append(row)

    df = pd.DataFrame(rows)

    # Columns actually present, grouped by category.
    category_metrics = []
    for category in QUESTION_CATEGORIES:
        present = [
            f"{category}_{metric_type}"
            for metric_type in METRIC_TYPES
            if f"{category}_{metric_type}" in df.columns
        ]
        if present:
            category_metrics.append((category, present))

    # Column lists used as radar-chart axes.
    retrieval_metrics = [f"{category}_retrieval" for category in QUESTION_CATEGORIES if f"{category}_retrieval" in df.columns]
    generation_metrics = [f"{category}_generation" for category in QUESTION_CATEGORIES if f"{category}_generation" in df.columns]

    return df, retrieval_metrics, generation_metrics, category_metrics
292
+
293
def create_radar_chart(df, selected_models, metrics, title):
    """Build a radar (polar) chart comparing models over the given metric columns.

    Args:
        df: Results table with a ``Model`` column and the ``metrics`` columns.
        selected_models: Model names to plot. Only the first five matching
            rows of ``df`` are drawn.
        metrics: Column names to use as axes. The text before the first
            underscore of each name is used as the axis label.
        title: Title shown on the placeholder figure only; the populated
            chart's layout does not set a title.

    Returns:
        A ``go.Figure``: one closed, filled trace per plotted model, or an
        empty titled placeholder when there is nothing to plot.
    """
    def placeholder():
        # Empty figure used whenever no trace can be drawn.
        blank = go.Figure()
        blank.update_layout(
            title=title,
            title_font_size=16,
            height=400,
            width=500,
            margin=dict(l=30, r=30, t=50, b=30)
        )
        return blank

    if not metrics or len(selected_models) == 0:
        return placeholder()

    shown = df[df['Model'].isin(selected_models)]
    if shown.empty:
        return placeholder()

    # At most five models are drawn to keep the chart readable.
    shown = shown.head(5)

    # Axis labels are the category part of each column name (simple, set, ...),
    # with the first label repeated so every polygon closes.
    axis_labels = [column.split('_', 1)[0] for column in metrics]
    closed_labels = axis_labels + axis_labels[:1]

    fig = go.Figure()
    score_rows = shown[metrics].values.tolist()
    for position, (model_name, scores) in enumerate(zip(shown['Model'], score_rows)):
        fig.add_trace(go.Scatterpolar(
            name=model_name,
            r=scores + scores[:1],
            theta=closed_labels,
            showlegend=True,
            mode="lines",
            line=dict(width=2, color=line_colors[position % len(line_colors)]),
            fill="toself",
            fillcolor=fill_colors[position % len(fill_colors)]
        ))

    radial_axis = dict(
        visible=True,
        gridcolor="black",
        linecolor="rgba(0,0,0,0)",
        gridwidth=1,
        showticklabels=False,
        ticks="",
        range=[0, 1]  # Same fixed score range for every chart
    )
    angular_axis = dict(
        gridcolor="black",
        gridwidth=1.5,
        linecolor="rgba(0,0,0,0)"
    )
    # Horizontal legend placed below the plot area.
    legend = dict(
        orientation="h",
        yanchor="bottom",
        y=-0.35,
        xanchor="center",
        x=0.4,
        itemwidth=30,
        font=dict(size=13),
        entrywidth=0.6,
        entrywidthmode="fraction",
    )
    fig.update_layout(
        font=dict(size=13, color="black"),
        template="plotly_white",
        polar=dict(radialaxis=radial_axis, angularaxis=angular_axis),
        legend=legend,
        margin=dict(l=0, r=16, t=30, b=30),
        autosize=True,
    )

    return fig
384
+
385
def create_summary_df(df, retrieval_metrics, generation_metrics):
    """Create the overall leaderboard table with averaged metrics.

    Args:
        df: One row per model; must contain ``Model``, ``Embeddings``,
            ``Retriever`` and ``Top-K`` plus the listed metric columns.
        retrieval_metrics: Columns averaged into ``Retrieval (avg)``.
        generation_metrics: Columns averaged into ``Generation (avg)``.

    Returns:
        A DataFrame with the descriptive columns, the averages that could be
        computed, ``Total Score`` (sum of both averages, present only when
        both exist; rows are then sorted by it, best first) and, when present
        in ``df``, ``Versions`` and ``Last Updated``. All computed values are
        rounded to 4 decimals. An empty ``df`` yields an empty DataFrame.
    """
    if df.empty:
        return pd.DataFrame()

    summary_df = df.copy()

    # Row-wise means over whichever metric columns were supplied.
    if retrieval_metrics:
        summary_df['Retrieval (avg)'] = summary_df[retrieval_metrics].mean(axis=1).round(4)
    if generation_metrics:
        summary_df['Generation (avg)'] = summary_df[generation_metrics].mean(axis=1).round(4)

    if 'Retrieval (avg)' in summary_df.columns and 'Generation (avg)' in summary_df.columns:
        # Round the sum as well: adding two rounded floats can still produce
        # artefacts such as 0.30000000000000004 in the displayed table.
        summary_df['Total Score'] = (
            summary_df['Retrieval (avg)'] + summary_df['Generation (avg)']
        ).round(4)
        summary_df = summary_df.sort_values('Total Score', ascending=False)

    # Fixed descriptive columns first, then every optional column that exists.
    summary_cols = ['Model', 'Embeddings', 'Retriever', 'Top-K']
    optional_cols = ('Retrieval (avg)', 'Generation (avg)', 'Total Score', 'Versions', 'Last Updated')
    summary_cols.extend(col for col in optional_cols if col in summary_df.columns)

    return summary_df[summary_cols]
421
+
422
def create_category_df(df, category, retrieval_col, generation_col):
    """Create the leaderboard table for a single question category.

    Args:
        df: One row per model; must contain ``Model``, ``Embeddings`` and
            ``Retriever``.
        category: Category key, used to name the ``"<category> Score"`` column.
        retrieval_col: Column holding the category's retrieval score.
        generation_col: Column holding the category's generation score.

    Returns:
        A DataFrame with the columns ``Model``, ``Embeddings``, ``Retriever``,
        ``Retrieval``, ``Generation`` and ``"<category> Score"`` (sum of both
        scores, rounded to 4 decimals), sorted by that score, best first.
        An empty DataFrame when ``df`` is empty or either score column is
        missing.
    """
    if df.empty or retrieval_col not in df.columns or generation_col not in df.columns:
        return pd.DataFrame()

    score_col = f'{category} Score'
    category_df = df.copy()

    # Round the sum so float artefacts (e.g. 0.7999999999999999) never reach
    # the displayed table.
    category_df[score_col] = (category_df[retrieval_col] + category_df[generation_col]).round(4)
    category_df = category_df.sort_values(score_col, ascending=False)

    # Keep only the display columns and give the metric columns generic names.
    display_cols = ['Model', 'Embeddings', 'Retriever', retrieval_col, generation_col, score_col]
    return category_df[display_cols].rename(columns={
        retrieval_col: 'Retrieval',
        generation_col: 'Generation'
    })
445
+
446
# Load the stored results once, at import time, and pull out the header values
# shown in the version banner.
results = load_results()
last_version = results.get("last_version", "1.0")
n_questions = results.get("n_questions", "100")
date_title = results.get("date_title", "---")

# Initial table data: newest dataset version only.
df, retrieval_metrics, generation_metrics, category_metrics = filter_and_process_results(
    results,
    n_versions=1,
    only_actual_versions=True,
)

# Models plotted before the user selects anything: the first five rows, if any.
default_models = [] if df.empty else df['Model'].head(5).tolist()

# Charts for the initial render, titled in the default language.
initial_gen_chart_title = LANGUAGES[DEFAULT_LANG]["radar_gen_title"]
initial_ret_chart_title = LANGUAGES[DEFAULT_LANG]["radar_ret_title"]
initial_gen_chart = create_radar_chart(
    df, default_models, generation_metrics, initial_gen_chart_title
)
initial_ret_chart = create_radar_chart(
    df, default_models, retrieval_metrics, initial_ret_chart_title
)

# Overall table for the initial render.
summary_df = create_summary_df(df, retrieval_metrics, generation_metrics)
466
+
467
+ with gr.Blocks(css="""
468
+ .title-container {
469
+ text-align: center;
470
+ margin-bottom: 10px;
471
+ }
472
+ .description-text {
473
+ text-align: left;
474
+ padding: 10px;
475
+ margin-bottom: 0px;
476
+ }
477
+ .version-info {
478
+ text-align: center;
479
+ padding: 10px;
480
+ background-color: #f0f0f0;
481
+ border-radius: 8px;
482
+ margin-bottom: 15px;
483
+ }
484
+ .version-selector {
485
+ padding: 15px;
486
+ border: 1px solid #ddd;
487
+ border-radius: 8px;
488
+ margin-bottom: 20px;
489
+ background-color: #f9f9f9;
490
+ height: 100%;
491
+ }
492
+ .citation-block {
493
+ padding: 15px;
494
+ border: 1px solid #ddd;
495
+ border-radius: 8px;
496
+ margin-bottom: 20px;
497
+ background-color: #f9f9f9;
498
+ font-family: monospace;
499
+ font-size: 14px;
500
+ overflow-x: auto;
501
+ height: 100%;
502
+ }
503
+ .flex-row-container {
504
+ display: flex;
505
+ justify-content: space-between;
506
+ gap: 20px;
507
+ width: 100%;
508
+ }
509
+ .charts-container {
510
+ display: flex;
511
+ gap: 20px;
512
+ margin-bottom: 20px;
513
+ }
514
+ .chart-box {
515
+ flex: 1;
516
+ border: 1px solid #eee;
517
+ border-radius: 8px;
518
+ padding: 10px;
519
+ background-color: white;
520
+ min-height: 550px; /* Increased height to accommodate legend at bottom */
521
+ }
522
+ .metrics-table {
523
+ border: 1px solid #eee;
524
+ border-radius: 8px;
525
+ padding: 15px;
526
+ background-color: white;
527
+ }
528
+ .info-text {
529
+ font-size: 0.9em;
530
+ font-style: italic;
531
+ color: #666;
532
+ margin-top: 5px;
533
+ }
534
+ footer {
535
+ text-align: center;
536
+ margin-top: 30px;
537
+ font-size: 0.9em;
538
+ color: #666;
539
+ }
540
+ /* Style for selected rows */
541
+ table tbody tr.selected {
542
+ background-color: rgba(25, 118, 210, 0.1) !important;
543
+ border-left: 3px solid #1976d2;
544
+ }
545
+ /* Add this class via JavaScript */
546
+ .gr-table tbody tr.selected td:first-child {
547
+ font-weight: bold;
548
+ color: #1976d2;
549
+ }
550
+ .category-tab {
551
+ padding: 10px;
552
+ }
553
+ .chart-title {
554
+ font-size: 1.2em;
555
+ font-weight: bold;
556
+ margin-bottom: 10px;
557
+ text-align: center;
558
+ }
559
+ .clear-charts-button {
560
+ display: flex;
561
+ justify-content: center;
562
+ margin-top: 10px;
563
+ margin-bottom: 20px;
564
+ }
565
+ .lang-selector {
566
+ width: fit-content; /* Adjust width to content */
567
+ margin-left: auto; /* Push to the right */
568
+ margin-right: 0; /* Keep it flush right */
569
+ margin-bottom: 15px; /* Keep bottom margin */
570
+ padding: 10px;
571
+ background-color: #f9f9f9;
572
+ border-radius: 8px;
573
+ border: none;
574
+ padding: 0 !important;
575
+ }
576
+ .lang-selector .form {
577
+ border: none !important;
578
+ }
579
+ """) as demo:
580
+ current_lang_dict = gr.State(LANGUAGES[DEFAULT_LANG])
581
+ current_language = gr.State(DEFAULT_LANG)
582
+
583
+ with gr.Row(elem_classes=["title-container"]):
584
+ main_title_md = gr.Markdown("# 🐙 Dynamic RAG Benchmark On News")
585
+
586
+ # Language Selector
587
+ with gr.Row(elem_classes=["lang-selector"]):
588
+ lang_selector = gr.Radio(
589
+ list(LANGUAGES.keys()),
590
+ label="",
591
+ value=DEFAULT_LANG,
592
+ interactive=True
593
+ )
594
+
595
+ # Description
596
+ with gr.Row(elem_classes=["description-text"]):
597
+ description_md = gr.Markdown(value=LANGUAGES[DEFAULT_LANG]["description"])
598
+
599
+ # Version info
600
+ with gr.Row(elem_classes=["version-info"]):
601
+ version_info_md = gr.Markdown(
602
+ value=LANGUAGES[DEFAULT_LANG]["version_info_template"].format(last_version, n_questions, date_title)
603
+ )
604
+
605
+ # Radar Charts
606
+ with gr.Row(elem_classes=["charts-container"]):
607
+ with gr.Column(elem_classes=["chart-box"]):
608
+ gen_chart_title_md = gr.Markdown(
609
+ value=LANGUAGES[DEFAULT_LANG]["gen_metrics_title"], elem_classes=["chart-title"]
610
+ )
611
+ generation_chart = gr.Plot(value=initial_gen_chart)
612
+
613
+ with gr.Column(elem_classes=["chart-box"]):
614
+ ret_chart_title_md = gr.Markdown(
615
+ value=LANGUAGES[DEFAULT_LANG]["ret_metrics_title"], elem_classes=["chart-title"]
616
+ )
617
+ retrieval_chart = gr.Plot(value=initial_ret_chart)
618
+
619
+ # Clear Charts Button
620
+ with gr.Row(elem_classes=["clear-charts-button"]):
621
+ clear_charts_btn = gr.Button(
622
+ value=LANGUAGES[DEFAULT_LANG]["clear_charts"],
623
+ variant="secondary"
624
+ )
625
+
626
+ # Metrics table with tabs
627
+ with gr.Tabs(elem_classes=["metrics-table"]) as metrics_tabs:
628
+ with gr.TabItem(label=LANGUAGES[DEFAULT_LANG]["overall_tab_title"]) as summary_tab:
629
+ selected_models = gr.State(default_models)
630
+ empty_data_md = gr.Markdown(
631
+ value=LANGUAGES[DEFAULT_LANG]["no_data_message"],
632
+ visible=df.empty # Initially visible only if df is empty
633
+ )
634
+ # Initialize metrics_table even if empty, but maybe hide it
635
+ metrics_table = gr.DataFrame(
636
+ value=summary_df if not df.empty else pd.DataFrame(),
637
+ headers=summary_df.columns.tolist() if not df.empty else [],
638
+ datatype=["str"] * (len(summary_df.columns) if not df.empty else 0),
639
+ row_count=(min(10, len(summary_df)) if not summary_df.empty else 0),
640
+ col_count=(len(summary_df.columns) if not summary_df.empty else 0),
641
+ interactive=False,
642
+ wrap=True,
643
+ visible=not df.empty # Initially visible only if df is not empty
644
+ )
645
+
646
+ with gr.TabItem(label=LANGUAGES[DEFAULT_LANG]["by_type_tab_title"]) as category_main_tab:
647
+ category_tabs = gr.Tabs()
648
+ category_tables = {}
649
+ category_tab_items = {} # Store TabItem components
650
+ category_no_data_mds = {} # Store "no data" Markdowns
651
+ category_title_mds = {} # Store category title Markdowns
652
+
653
+ # Get initial display names
654
+ initial_category_display_names = LANGUAGES[DEFAULT_LANG]["category_display_names"]
655
+
656
+ with category_tabs:
657
+ for category, _ in category_metrics:
658
+ display_name = initial_category_display_names.get(category, category.capitalize())
659
+ if f"{category}_retrieval" in df.columns and f"{category}_generation" in df.columns:
660
+ with gr.TabItem(label=display_name, elem_classes=["category-tab"]) as tab_item:
661
+ category_tab_items[category] = tab_item # Store the TabItem
662
+
663
+ # Create dataframe for this category
664
+ category_df = create_category_df(df, category, f"{category}_retrieval", f"{category}_generation")
665
+
666
+ category_no_data_mds[category] = gr.Markdown(
667
+ value=LANGUAGES[DEFAULT_LANG]["no_data_category_template"].format(display_name),
668
+ visible=category_df.empty
669
+ )
670
+ category_title_mds[category] = gr.Markdown(
671
+ value=LANGUAGES[DEFAULT_LANG]["category_performance_template"].format(display_name),
672
+ visible=not category_df.empty
673
+ )
674
+ category_tables[category] = gr.DataFrame(
675
+ value=category_df if not category_df.empty else pd.DataFrame(),
676
+ headers=category_df.columns.tolist() if not category_df.empty else [],
677
+ datatype=["str"] * (len(category_df.columns) if not category_df.empty else 0),
678
+ row_count=(min(10, len(category_df)) if not category_df.empty else 0),
679
+ col_count=(len(category_df.columns) if not category_df.empty else 0),
680
+ interactive=False,
681
+ wrap=True,
682
+ visible=not category_df.empty
683
+ )
684
+
685
+ # Version selector and Citation block in a flex container
686
+ with gr.Row():
687
+ # Citation block (left side)
688
+ with gr.Column(scale=1, elem_classes=["citation-block"]):
689
+ citation_title_md = gr.Markdown(value=LANGUAGES[DEFAULT_LANG]["citation_title"])
690
+ citation_desc_md = gr.Markdown(value=LANGUAGES[DEFAULT_LANG]["citation_description"])
691
+
692
+ # Version selector (right side)
693
+ with gr.Column(scale=1, elem_classes=["version-selector"]):
694
+ version_selector_title_md = gr.Markdown(value=LANGUAGES[DEFAULT_LANG]["version_selector_title"])
695
+ with gr.Column():
696
+ with gr.Row():
697
+ with gr.Column(scale=3):
698
+ only_actual_versions = gr.Checkbox(
699
+ label=LANGUAGES[DEFAULT_LANG]["only_actual_label"],
700
+ value=True,
701
+ info=LANGUAGES[DEFAULT_LANG]["only_actual_info"]
702
+ )
703
+ with gr.Column(scale=5):
704
+ n_versions_slider = gr.Slider(
705
+ minimum=1,
706
+ maximum=5,
707
+ value=1,
708
+ step=1,
709
+ label=LANGUAGES[DEFAULT_LANG]["n_versions_label"],
710
+ info=LANGUAGES[DEFAULT_LANG]["n_versions_info"]
711
+ )
712
+ with gr.Row():
713
+ filter_btn = gr.Button(value=LANGUAGES[DEFAULT_LANG]["filter_button"], variant="primary")
714
+
715
+ info_text_md = gr.Markdown(
716
+ value=LANGUAGES[DEFAULT_LANG]["info_text"],
717
+ elem_classes=["info-text"]
718
+ )
719
+
720
+ # Footer
721
+ with gr.Row():
722
+ footer_md = gr.Markdown(value=LANGUAGES[DEFAULT_LANG]["footer_text"])
723
+
724
+ # Handle row selection for radar charts
725
def update_charts(evt: gr.SelectData, selected_models, current_lang):
    """Toggle the clicked table row's model in the radar-chart selection.

    Args:
        evt: Select event from the summary table or one of the category
            tables. ``evt.target`` identifies the table and ``evt.index``
            the clicked cell.
        selected_models: Model names currently plotted (session state).
        current_lang: Key into ``LANGUAGES`` used for the chart titles.

    Returns:
        ``(selected_models, generation_chart, retrieval_chart)``. If the
        clicked model cannot be determined, or is not in the current data,
        the selection is returned unchanged with freshly rendered charts.
    """
    # Module-level data; update_data() rebinds these when the filter changes.
    current_df = df
    current_ret_metrics = retrieval_metrics
    current_gen_metrics = generation_metrics

    def render(models):
        # Single place that builds the handler's three outputs.
        titles = LANGUAGES[current_lang]
        return (
            models,
            create_radar_chart(current_df, models, current_gen_metrics, titles["radar_gen_title"]),
            create_radar_chart(current_df, models, current_ret_metrics, titles["radar_ret_title"]),
        )

    try:
        print(f"Selection event: {evt}, type: {type(evt)}")

        selected_model = None
        try:
            component = evt.target
            index = evt.index
            if isinstance(index, (list, tuple)):
                row_idx = index[0]
                col_idx = index[1] if len(index) > 1 else None
            else:
                row_idx, col_idx = index, None
            print(f"Row index: {row_idx}, Component: {component}")

            # Rebuild the table that was clicked so the row index can be
            # mapped back to a model name.
            table_df = None
            source = None
            if component is metrics_table:
                source = "summary"
                table_df = create_summary_df(current_df, current_ret_metrics, current_gen_metrics)
            else:
                for category, table in category_tables.items():
                    if component is table:
                        source = category
                        table_df = create_category_df(
                            current_df,
                            category,
                            f"{category}_retrieval",
                            f"{category}_generation"
                        )
                        break

            if isinstance(table_df, pd.DataFrame) and not table_df.empty and 0 <= row_idx < len(table_df):
                selected_model = table_df.iloc[row_idx]['Model']
                print(f"Selected from {source} table: {selected_model}")

            if selected_model is None:
                # Fallback on the event payload. evt.value is the clicked
                # cell's own value, so it names the model only when the first
                # column was clicked; row_value (when this Gradio version
                # provides it) carries the whole row.
                row_value = getattr(evt, "row_value", None)
                cell_value = getattr(evt, "value", None)
                if row_value:
                    selected_model = row_value[0]
                elif col_idx == 0 and isinstance(cell_value, str) and cell_value:
                    selected_model = cell_value
                if selected_model is not None:
                    print(f"Selected model using fallback event data: {selected_model}")
        except Exception as e:
            # Best effort: an unreadable event leaves the selection as it was.
            print(f"Error extracting model name: {e}")
            traceback.print_exc()

        if selected_model:
            print(f"Selected model: {selected_model}")
            available_models = current_df['Model'].tolist() if not current_df.empty else []

            if selected_model in available_models:
                # Work on a copy so the incoming state list is not mutated.
                new_selected_models = list(selected_models)
                if selected_model in new_selected_models:
                    new_selected_models.remove(selected_model)
                else:
                    new_selected_models.append(selected_model)

                # Drop models that are no longer part of the current data.
                new_selected_models = [model for model in new_selected_models if model in available_models]

                # Never leave the charts empty after a click: fall back to
                # the first available model.
                if not new_selected_models and available_models:
                    new_selected_models = [available_models[0]]

                selected_models = new_selected_models
            else:
                print(f"Model {selected_model} not found in current dataframe")

        return render(selected_models)
    except Exception as e:
        print(f"Error in update_charts: {e}")
        print(traceback.format_exc())
        # Re-render with whatever selection is current.
        return render(selected_models)
815
+
816
+
817
+ # Use custom event handler for row selection
818
+ # Make sure to pass current_language state
819
+ metrics_table.select(
820
+ fn=update_charts,
821
+ inputs=[selected_models, current_language],
822
+ outputs=[selected_models, generation_chart, retrieval_chart]
823
+ )
824
+
825
+ # Add selection handlers for category tables too
826
+ for category_table in category_tables.values():
827
+ category_table.select(
828
+ fn=update_charts,
829
+ inputs=[selected_models, current_language],
830
+ outputs=[selected_models, generation_chart, retrieval_chart]
831
+ )
832
+
833
+ # Handle version filter changes
834
def update_data(n_versions, only_actual, current_selected_models, current_lang):
    """Re-filter the results and refresh every table and chart.

    Args:
        n_versions: Number of versions to take into account.
        only_actual: Whether to restrict to the newest dataset versions.
        current_selected_models: Model names currently plotted.
        current_lang: Key into ``LANGUAGES`` used for the chart titles.

    Returns:
        A list of updates in the order of the filter button's outputs:
        summary table, "no data" message, generation chart, retrieval chart,
        selected models, then for each category table its table, "no data"
        message and title. On error a list of no-op ``gr.update()`` values of
        the same length is returned.
    """
    # NOTE(review): these are module-level names, so a filter change made in
    # one browser session also changes the data other sessions click on.
    global df, retrieval_metrics, generation_metrics
    try:
        df, retrieval_metrics, generation_metrics, _ = filter_and_process_results(
            results, n_versions=n_versions, only_actual_versions=only_actual
        )

        available_models = df['Model'].tolist() if not df.empty else []

        # Keep the models that still exist; if none survive, plot the first five.
        filtered_selected_models = [model for model in current_selected_models if model in available_models]
        if not filtered_selected_models and available_models:
            filtered_selected_models = available_models[:5]

        titles = LANGUAGES[current_lang]
        gen_chart_val = create_radar_chart(df, filtered_selected_models, generation_metrics, titles["radar_gen_title"])
        ret_chart_val = create_radar_chart(df, filtered_selected_models, retrieval_metrics, titles["radar_ret_title"])

        summary_df_val = create_summary_df(df, retrieval_metrics, generation_metrics)
        has_summary = not summary_df_val.empty

        output_list = [
            gr.update(value=summary_df_val if has_summary else pd.DataFrame(), visible=has_summary),
            gr.update(visible=not has_summary),
            gen_chart_val,
            ret_chart_val,
            filtered_selected_models,
        ]

        for category in category_tables.keys():
            # create_category_df() returns an empty frame when the category's
            # columns are missing under the current filter, so one code path
            # covers both "has data" and "no data".
            category_df_val = create_category_df(df, category, f"{category}_retrieval", f"{category}_generation")
            has_rows = not category_df_val.empty
            output_list.extend([
                gr.update(value=category_df_val if has_rows else pd.DataFrame(), visible=has_rows),
                gr.update(visible=not has_rows),  # "no data" message
                gr.update(visible=has_rows),      # category title
            ])

        return output_list
    except Exception as e:
        print(f"Error in update_data: {e}")
        print(traceback.format_exc())
        # One no-op update per output component: leave the UI as it is.
        return [gr.update()] * (5 + 3 * len(category_tables))
902
+
903
# Components refreshed by the filter button. The order here is the order in
# which update_data returns its values: five fixed components, then a
# (table, no-data message, title) triple for every category.
filter_outputs = [
    metrics_table,
    empty_data_md,
    generation_chart,
    retrieval_chart,
    selected_models,
    *(
        component
        for category in category_tables
        for component in (
            category_tables[category],
            category_no_data_mds[category],
            category_title_mds[category],
        )
    ),
]

filter_btn.click(
    fn=update_data,
    # The language state is passed so chart titles are localized.
    inputs=[n_versions_slider, only_actual_versions, selected_models, current_language],
    outputs=filter_outputs,
)
913
+
914
+ # Function to clear charts
915
def clear_charts_localized(current_lang):
    """Return an empty model selection plus both radar charts drawn for it.

    Args:
        current_lang: Key into ``LANGUAGES`` used to pick the chart titles.

    Returns:
        A tuple ``(models, generation_chart, retrieval_chart)`` where
        ``models`` is an empty list and both charts are built from the
        module-level ``df`` with no models selected.
    """
    no_models = []
    blank_generation = create_radar_chart(
        df, no_models, generation_metrics, LANGUAGES[current_lang]["radar_gen_title"]
    )
    blank_retrieval = create_radar_chart(
        df, no_models, retrieval_metrics, LANGUAGES[current_lang]["radar_ret_title"]
    )
    return no_models, blank_generation, blank_retrieval
921
+
922
# Wire the clear button: the handler receives the current language and its
# three return values go to the model selector and the two radar charts.
clear_charts_btn.click(
    clear_charts_localized,
    inputs=[current_language],
    outputs=[selected_models, generation_chart, retrieval_chart],
)
928
+
929
+ # Function to update language-specific elements
930
# Components refreshed when the interface language changes. update_language
# derives its return order from this list, so the two cannot drift apart.
lang_outputs = [
    current_language, current_lang_dict, description_md, version_info_md,
    gen_chart_title_md, ret_chart_title_md, clear_charts_btn, summary_tab, empty_data_md,
    category_main_tab, citation_title_md, citation_desc_md, version_selector_title_md,
    only_actual_versions, n_versions_slider, filter_btn, info_text_md, footer_md,
    # Charts are redrawn because their titles are localized.
    generation_chart, retrieval_chart,
]
# Per category (category_tables is the source of truth for existing
# categories): tab item, "no data" markdown, title markdown.
lang_outputs += [
    component_map[category]
    for category in category_tables
    for component_map in (category_tab_items, category_no_data_mds, category_title_mds)
    if category in component_map
]


def update_language(selected_lang, current_selected_models=None):
    """Build the updates that switch every localized element to ``selected_lang``.

    Args:
        selected_lang: Key into ``LANGUAGES``.
        current_selected_models: Models currently selected for the radar
            charts, as delivered by the event's inputs. When ``None`` (the
            parameter is omitted), the ``selected_models`` component's
            ``.value`` is used instead; that attribute holds the value the
            component was constructed with, not the user's live selection.

    Returns:
        A list of values/updates, one per component in ``lang_outputs`` and in
        the same order.
    """
    lang_dict = LANGUAGES[selected_lang]
    category_display_names = lang_dict.get("category_display_names", {})

    if current_selected_models is None:
        current_selected_models = selected_models.value

    def label_for(category):
        # Localized category name, falling back to the capitalized key.
        return category_display_names.get(category, category.capitalize())

    updates = {
        current_language: selected_lang,  # state holding the language key
        current_lang_dict: lang_dict,  # state holding the translations
        description_md: gr.update(value=lang_dict["description"]),
        version_info_md: gr.update(
            value=lang_dict["version_info_template"].format(last_version, n_questions, date_title)
        ),
        gen_chart_title_md: gr.update(value=lang_dict["gen_metrics_title"]),
        ret_chart_title_md: gr.update(value=lang_dict["ret_metrics_title"]),
        clear_charts_btn: gr.update(value=lang_dict["clear_charts"]),
        summary_tab: gr.update(label=lang_dict["overall_tab_title"]),
        empty_data_md: gr.update(value=lang_dict["no_data_message"]),
        category_main_tab: gr.update(label=lang_dict["by_type_tab_title"]),
        citation_title_md: gr.update(value=lang_dict["citation_title"]),
        citation_desc_md: gr.update(value=lang_dict["citation_description"]),
        version_selector_title_md: gr.update(value=lang_dict["version_selector_title"]),
        only_actual_versions: gr.update(
            label=lang_dict["only_actual_label"], info=lang_dict["only_actual_info"]
        ),
        n_versions_slider: gr.update(
            label=lang_dict["n_versions_label"], info=lang_dict["n_versions_info"]
        ),
        filter_btn: gr.update(value=lang_dict["filter_button"]),
        info_text_md: gr.update(value=lang_dict["info_text"]),
        footer_md: gr.update(value=lang_dict["footer_text"]),
    }

    # Category tab labels and the per-category message/title templates.
    for category, tab_item in category_tab_items.items():
        updates[tab_item] = gr.update(label=label_for(category))
    for category, no_data_md in category_no_data_mds.items():
        updates[no_data_md] = gr.update(
            value=lang_dict["no_data_category_template"].format(label_for(category))
        )
    for category, title_md in category_title_mds.items():
        updates[title_md] = gr.update(
            value=lang_dict["category_performance_template"].format(label_for(category))
        )

    # Re-plot so the chart titles follow the new language while keeping the
    # models the user currently has selected.
    updates[generation_chart] = create_radar_chart(
        df, current_selected_models, generation_metrics, lang_dict["radar_gen_title"]
    )
    updates[retrieval_chart] = create_radar_chart(
        df, current_selected_models, retrieval_metrics, lang_dict["radar_ret_title"]
    )

    return [updates[component] for component in lang_outputs]


# Connect the language selector. selected_models is passed as an input so the
# handler sees the live selection instead of the component's initial value.
lang_selector.change(
    fn=update_language,
    inputs=[lang_selector, selected_models],
    outputs=lang_outputs,
)
1004
+
1005
# Start the Gradio app only when this file is executed as a script.
if __name__ == "__main__":
    demo.launch()