npuliga committed on
Commit
4c2722d
·
1 Parent(s): f551b90

updated files

Browse files
Files changed (2) hide show
  1. app.py +85 -15
  2. config.py +1 -2
app.py CHANGED
@@ -49,10 +49,28 @@ def get_dataset_choices():
49
  return []
50
 
51
  def get_data_preview():
52
- """Returns the raw dataframe for inspection"""
53
  if "data" not in DB:
54
- return pd.DataFrame()
55
- return DB["data"].head(10)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
56
 
57
  def get_domain_state(dataset):
58
  empty_update = gr.update(visible=False, value=None, choices=[])
@@ -190,8 +208,8 @@ def plot_metrics_on_x_axis(dataset, f1_val, f2_val, f3_val):
190
 
191
  return fig_rmse, fig_perf
192
 
193
- def generate_inter_domain_comparison():
194
- """Generates comparison table and plot across all domains."""
195
  if "data" not in DB:
196
  return pd.DataFrame(), None
197
 
@@ -221,19 +239,51 @@ def generate_inter_domain_comparison():
221
 
222
  comp_df = pd.DataFrame(table_rows)
223
 
 
 
 
 
 
 
 
 
 
 
 
 
 
224
  best_results = []
225
  for ds in datasets:
226
  subset = df[df['dataset_name'] == ds]
227
- if 'f1_score' in subset.columns:
228
- max_f1 = subset['f1_score'].max()
229
- best_idx = subset['f1_score'].idxmax()
 
 
 
 
230
  best_row = subset.loc[best_idx]
231
  best_results.append({
232
  "Domain": ds,
233
- "Max F1 Score": max_f1,
234
  "Best Config": best_row['config_purpose']
235
  })
236
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
237
  if best_results:
238
  best_df = pd.DataFrame(best_results)
239
  fig_global = px.bar(
@@ -252,7 +302,7 @@ def generate_inter_domain_comparison():
252
  # --- 3. UI ---
253
  APP_VERSION = "v2.1.0-fixed" # Version stamp to verify code is updated
254
 
255
- with gr.Blocks(title="RAG Analytics Pro", theme=gr.themes.Soft()) as demo:
256
  gr.Markdown("## RAG Pipeline Analytics")
257
  gr.Markdown(f"**Data Source:** `{DATA_FOLDER}` | **Version:** {APP_VERSION}")
258
 
@@ -279,12 +329,32 @@ with gr.Blocks(title="RAG Analytics Pro", theme=gr.themes.Soft()) as demo:
279
 
280
  # TAB 2: Data Inspector
281
  with gr.TabItem("Data Preview"):
282
- gr.Markdown("### Verify your data loaded correctly here")
283
- preview_table = gr.Dataframe(interactive=False)
 
 
 
 
 
 
 
284
  preview_btn = gr.Button("Refresh Data Preview")
285
 
286
  # TAB 3: Comparison
287
  with gr.TabItem("Inter-Domain Comparison"):
 
 
 
 
 
 
 
 
 
 
 
 
 
288
  refresh_btn = gr.Button("Generate Comparison")
289
  gr.Markdown("### Configuration Differences")
290
  comp_table = gr.Dataframe(interactive=False)
@@ -317,11 +387,11 @@ with gr.Blocks(title="RAG Analytics Pro", theme=gr.themes.Soft()) as demo:
317
  )
318
 
319
  # Debug Preview Events
320
- preview_btn.click(get_data_preview, inputs=None, outputs=preview_table)
321
 
322
  refresh_btn.click(
323
  generate_inter_domain_comparison,
324
- inputs=None,
325
  outputs=[comp_table, global_plot]
326
  )
327
 
@@ -332,4 +402,4 @@ print(startup_status)
332
 
333
  # Launch Gradio app
334
  if __name__ == "__main__":
335
- demo.launch()
 
49
  return []
50
 
51
  def get_data_preview():
52
+ """Returns separate dataframes for each domain."""
53
  if "data" not in DB:
54
+ return {}, {}, {}, {}
55
+
56
+ df = DB["data"]
57
+ # Remove failed_samples column if it exists
58
+ if 'failed_samples' in df.columns:
59
+ df = df.drop(columns=['failed_samples'])
60
+
61
+ datasets = df['dataset_name'].unique()
62
+
63
+ # Create separate dataframes for each domain
64
+ results = {}
65
+ for ds in datasets:
66
+ results[ds] = df[df['dataset_name'] == ds]
67
+
68
+ # Return up to 4 domains (adjust if you have more)
69
+ domain_dfs = list(results.values())
70
+ while len(domain_dfs) < 4:
71
+ domain_dfs.append(pd.DataFrame())
72
+
73
+ return domain_dfs[0], domain_dfs[1], domain_dfs[2], domain_dfs[3]
74
 
75
  def get_domain_state(dataset):
76
  empty_update = gr.update(visible=False, value=None, choices=[])
 
208
 
209
  return fig_rmse, fig_perf
210
 
211
+ def generate_inter_domain_comparison(metric='f1_score'):
212
+ """Generates comparison table and plot across all domains for selected metric."""
213
  if "data" not in DB:
214
  return pd.DataFrame(), None
215
 
 
239
 
240
  comp_df = pd.DataFrame(table_rows)
241
 
242
+ # Metric display names
243
+ metric_names = {
244
+ 'rmse_relevance': 'RMSE Relevance',
245
+ 'rmse_utilization': 'RMSE Utilization',
246
+ 'rmse_completeness': 'RMSE Completeness',
247
+ 'f1_score': 'F1 Score',
248
+ 'aucroc': 'AUC-ROC'
249
+ }
250
+
251
+ metric_display = metric_names.get(metric, metric)
252
+ is_rmse = metric.startswith('rmse')
253
+ direction = "Lower is Better" if is_rmse else "Higher is Better"
254
+
255
  best_results = []
256
  for ds in datasets:
257
  subset = df[df['dataset_name'] == ds]
258
+ if metric in subset.columns:
259
+ if is_rmse:
260
+ best_val = subset[metric].min()
261
+ best_idx = subset[metric].idxmin()
262
+ else:
263
+ best_val = subset[metric].max()
264
+ best_idx = subset[metric].idxmax()
265
  best_row = subset.loc[best_idx]
266
  best_results.append({
267
  "Domain": ds,
268
+ metric_display: best_val,
269
  "Best Config": best_row['config_purpose']
270
  })
271
 
272
+ if best_results:
273
+ best_df = pd.DataFrame(best_results)
274
+ fig_global = px.bar(
275
+ best_df, x="Domain", y=metric_display,
276
+ color="Domain",
277
+ text_auto='.4f',
278
+ hover_data=["Best Config"],
279
+ title=f"Peak Performance per Domain: {metric_display} ({direction})"
280
+ )
281
+ fig_global.update_traces(textposition='outside')
282
+ else:
283
+ fig_global = None
284
+
285
+ return comp_df, fig_global
286
+
287
  if best_results:
288
  best_df = pd.DataFrame(best_results)
289
  fig_global = px.bar(
 
302
  # --- 3. UI ---
303
  APP_VERSION = "v2.1.0-fixed" # Version stamp to verify code is updated
304
 
305
+ with gr.Blocks(title="RAG Analytics Pro") as demo:
306
  gr.Markdown("## RAG Pipeline Analytics")
307
  gr.Markdown(f"**Data Source:** `{DATA_FOLDER}` | **Version:** {APP_VERSION}")
308
 
 
329
 
330
  # TAB 2: Data Inspector
331
  with gr.TabItem("Data Preview"):
332
+ gr.Markdown("### All Test Configurations by Domain")
333
+ gr.Markdown("**Biomedical (PubMedQA)**")
334
+ preview_table_1 = gr.Dataframe(interactive=False, wrap=True)
335
+ gr.Markdown("**Finance (FinQA)**")
336
+ preview_table_2 = gr.Dataframe(interactive=False, wrap=True)
337
+ gr.Markdown("**General (MS MARCO)**")
338
+ preview_table_3 = gr.Dataframe(interactive=False, wrap=True)
339
+ gr.Markdown("**Legal (CUAD)**")
340
+ preview_table_4 = gr.Dataframe(interactive=False, wrap=True)
341
  preview_btn = gr.Button("Refresh Data Preview")
342
 
343
  # TAB 3: Comparison
344
  with gr.TabItem("Inter-Domain Comparison"):
345
+ gr.Markdown("### Select Metric to Compare")
346
+ metric_dropdown = gr.Dropdown(
347
+ label="Comparison Metric",
348
+ choices=[
349
+ ("F1 Score (Higher is Better)", "f1_score"),
350
+ ("AUC-ROC (Higher is Better)", "aucroc"),
351
+ ("RMSE Relevance (Lower is Better)", "rmse_relevance"),
352
+ ("RMSE Utilization (Lower is Better)", "rmse_utilization"),
353
+ ("RMSE Completeness (Lower is Better)", "rmse_completeness")
354
+ ],
355
+ value="f1_score",
356
+ interactive=True
357
+ )
358
  refresh_btn = gr.Button("Generate Comparison")
359
  gr.Markdown("### Configuration Differences")
360
  comp_table = gr.Dataframe(interactive=False)
 
387
  )
388
 
389
  # Debug Preview Events
390
+ preview_btn.click(get_data_preview, inputs=None, outputs=[preview_table_1, preview_table_2, preview_table_3, preview_table_4])
391
 
392
  refresh_btn.click(
393
  generate_inter_domain_comparison,
394
+ inputs=[metric_dropdown],
395
  outputs=[comp_table, global_plot]
396
  )
397
 
 
402
 
403
  # Launch Gradio app
404
  if __name__ == "__main__":
405
+ demo.launch(ssr_mode=False)
config.py CHANGED
@@ -19,8 +19,7 @@ METRIC_COLUMNS = [
19
  'rmse_utilization',
20
  'rmse_completeness',
21
  'f1_score',
22
- 'aucroc',
23
- 'failed_samples'
24
  ]
25
 
26
  # Numeric configuration columns (also need float conversion)
 
19
  'rmse_utilization',
20
  'rmse_completeness',
21
  'f1_score',
22
+ 'aucroc'
 
23
  ]
24
 
25
  # Numeric configuration columns (also need float conversion)