Spaces:
Sleeping
Sleeping
updated files
Browse files
app.py
CHANGED
|
@@ -49,10 +49,28 @@ def get_dataset_choices():
|
|
| 49 |
return []
|
| 50 |
|
| 51 |
def get_data_preview():
|
| 52 |
-
"""Returns
|
| 53 |
if "data" not in DB:
|
| 54 |
-
return
|
| 55 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 56 |
|
| 57 |
def get_domain_state(dataset):
|
| 58 |
empty_update = gr.update(visible=False, value=None, choices=[])
|
|
@@ -190,8 +208,8 @@ def plot_metrics_on_x_axis(dataset, f1_val, f2_val, f3_val):
|
|
| 190 |
|
| 191 |
return fig_rmse, fig_perf
|
| 192 |
|
| 193 |
-
def generate_inter_domain_comparison():
|
| 194 |
-
"""Generates comparison table and plot across all domains."""
|
| 195 |
if "data" not in DB:
|
| 196 |
return pd.DataFrame(), None
|
| 197 |
|
|
@@ -221,19 +239,51 @@ def generate_inter_domain_comparison():
|
|
| 221 |
|
| 222 |
comp_df = pd.DataFrame(table_rows)
|
| 223 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 224 |
best_results = []
|
| 225 |
for ds in datasets:
|
| 226 |
subset = df[df['dataset_name'] == ds]
|
| 227 |
-
if
|
| 228 |
-
|
| 229 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
| 230 |
best_row = subset.loc[best_idx]
|
| 231 |
best_results.append({
|
| 232 |
"Domain": ds,
|
| 233 |
-
|
| 234 |
"Best Config": best_row['config_purpose']
|
| 235 |
})
|
| 236 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 237 |
if best_results:
|
| 238 |
best_df = pd.DataFrame(best_results)
|
| 239 |
fig_global = px.bar(
|
|
@@ -252,7 +302,7 @@ def generate_inter_domain_comparison():
|
|
| 252 |
# --- 3. UI ---
|
| 253 |
APP_VERSION = "v2.1.0-fixed" # Version stamp to verify code is updated
|
| 254 |
|
| 255 |
-
with gr.Blocks(title="RAG Analytics Pro"
|
| 256 |
gr.Markdown("## RAG Pipeline Analytics")
|
| 257 |
gr.Markdown(f"**Data Source:** `{DATA_FOLDER}` | **Version:** {APP_VERSION}")
|
| 258 |
|
|
@@ -279,12 +329,32 @@ with gr.Blocks(title="RAG Analytics Pro", theme=gr.themes.Soft()) as demo:
|
|
| 279 |
|
| 280 |
# TAB 2: Data Inspector
|
| 281 |
with gr.TabItem("Data Preview"):
|
| 282 |
-
gr.Markdown("###
|
| 283 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 284 |
preview_btn = gr.Button("Refresh Data Preview")
|
| 285 |
|
| 286 |
# TAB 3: Comparison
|
| 287 |
with gr.TabItem("Inter-Domain Comparison"):
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 288 |
refresh_btn = gr.Button("Generate Comparison")
|
| 289 |
gr.Markdown("### Configuration Differences")
|
| 290 |
comp_table = gr.Dataframe(interactive=False)
|
|
@@ -317,11 +387,11 @@ with gr.Blocks(title="RAG Analytics Pro", theme=gr.themes.Soft()) as demo:
|
|
| 317 |
)
|
| 318 |
|
| 319 |
# Debug Preview Events
|
| 320 |
-
preview_btn.click(get_data_preview, inputs=None, outputs=
|
| 321 |
|
| 322 |
refresh_btn.click(
|
| 323 |
generate_inter_domain_comparison,
|
| 324 |
-
inputs=
|
| 325 |
outputs=[comp_table, global_plot]
|
| 326 |
)
|
| 327 |
|
|
@@ -332,4 +402,4 @@ print(startup_status)
|
|
| 332 |
|
| 333 |
# Launch Gradio app
|
| 334 |
if __name__ == "__main__":
|
| 335 |
-
demo.launch()
|
|
|
|
| 49 |
return []
|
| 50 |
|
| 51 |
def get_data_preview():
    """Return one preview dataframe per domain, always as a 4-tuple.

    Reads the loaded results from ``DB["data"]``, drops the
    ``failed_samples`` column when it is present, and splits the remaining
    rows by their ``dataset_name`` value.

    Returns:
        A tuple of exactly four ``pandas.DataFrame`` objects. Domains are
        placed in the order their names first occur in the data. Unused
        slots hold empty dataframes, and any domain beyond the fourth is
        left out. When no data has been loaded, all four are empty.
    """
    # Return up to 4 domains (adjust if you have more)
    slot_count = 4

    if "data" not in DB:
        # Use the same type as the padding below so every code path yields
        # dataframes rather than a mix of dicts and dataframes.
        return tuple(pd.DataFrame() for _ in range(slot_count))

    df = DB["data"]
    # Remove failed_samples column if it exists
    if 'failed_samples' in df.columns:
        df = df.drop(columns=['failed_samples'])

    # Create separate dataframes for each domain, in first-appearance order.
    # NOTE(review): the UI labels its four preview tables with fixed domain
    # names, while the slots here follow the order of the data - confirm
    # that the two actually line up.
    domain_dfs = [
        df[df['dataset_name'] == ds]
        for ds in df['dataset_name'].unique()
    ]

    # Pad with empty frames when fewer than ``slot_count`` domains exist;
    # a negative range simply adds nothing.
    domain_dfs.extend(
        pd.DataFrame() for _ in range(slot_count - len(domain_dfs))
    )

    return tuple(domain_dfs[:slot_count])
|
| 74 |
|
| 75 |
def get_domain_state(dataset):
|
| 76 |
empty_update = gr.update(visible=False, value=None, choices=[])
|
|
|
|
| 208 |
|
| 209 |
return fig_rmse, fig_perf
|
| 210 |
|
| 211 |
+
def generate_inter_domain_comparison(metric='f1_score'):
|
| 212 |
+
"""Generates comparison table and plot across all domains for selected metric."""
|
| 213 |
if "data" not in DB:
|
| 214 |
return pd.DataFrame(), None
|
| 215 |
|
|
|
|
| 239 |
|
| 240 |
comp_df = pd.DataFrame(table_rows)
|
| 241 |
|
| 242 |
+
# Metric display names
|
| 243 |
+
metric_names = {
|
| 244 |
+
'rmse_relevance': 'RMSE Relevance',
|
| 245 |
+
'rmse_utilization': 'RMSE Utilization',
|
| 246 |
+
'rmse_completeness': 'RMSE Completeness',
|
| 247 |
+
'f1_score': 'F1 Score',
|
| 248 |
+
'aucroc': 'AUC-ROC'
|
| 249 |
+
}
|
| 250 |
+
|
| 251 |
+
metric_display = metric_names.get(metric, metric)
|
| 252 |
+
is_rmse = metric.startswith('rmse')
|
| 253 |
+
direction = "Lower is Better" if is_rmse else "Higher is Better"
|
| 254 |
+
|
| 255 |
best_results = []
|
| 256 |
for ds in datasets:
|
| 257 |
subset = df[df['dataset_name'] == ds]
|
| 258 |
+
if metric in subset.columns:
|
| 259 |
+
if is_rmse:
|
| 260 |
+
best_val = subset[metric].min()
|
| 261 |
+
best_idx = subset[metric].idxmin()
|
| 262 |
+
else:
|
| 263 |
+
best_val = subset[metric].max()
|
| 264 |
+
best_idx = subset[metric].idxmax()
|
| 265 |
best_row = subset.loc[best_idx]
|
| 266 |
best_results.append({
|
| 267 |
"Domain": ds,
|
| 268 |
+
metric_display: best_val,
|
| 269 |
"Best Config": best_row['config_purpose']
|
| 270 |
})
|
| 271 |
|
| 272 |
+
if best_results:
|
| 273 |
+
best_df = pd.DataFrame(best_results)
|
| 274 |
+
fig_global = px.bar(
|
| 275 |
+
best_df, x="Domain", y=metric_display,
|
| 276 |
+
color="Domain",
|
| 277 |
+
text_auto='.4f',
|
| 278 |
+
hover_data=["Best Config"],
|
| 279 |
+
title=f"Peak Performance per Domain: {metric_display} ({direction})"
|
| 280 |
+
)
|
| 281 |
+
fig_global.update_traces(textposition='outside')
|
| 282 |
+
else:
|
| 283 |
+
fig_global = None
|
| 284 |
+
|
| 285 |
+
return comp_df, fig_global
|
| 286 |
+
|
| 287 |
if best_results:
|
| 288 |
best_df = pd.DataFrame(best_results)
|
| 289 |
fig_global = px.bar(
|
|
|
|
| 302 |
# --- 3. UI ---
|
| 303 |
APP_VERSION = "v2.1.0-fixed" # Version stamp to verify code is updated
|
| 304 |
|
| 305 |
+
with gr.Blocks(title="RAG Analytics Pro") as demo:
|
| 306 |
gr.Markdown("## RAG Pipeline Analytics")
|
| 307 |
gr.Markdown(f"**Data Source:** `{DATA_FOLDER}` | **Version:** {APP_VERSION}")
|
| 308 |
|
|
|
|
| 329 |
|
| 330 |
# TAB 2: Data Inspector
|
| 331 |
with gr.TabItem("Data Preview"):
|
| 332 |
+
gr.Markdown("### All Test Configurations by Domain")
|
| 333 |
+
gr.Markdown("**Biomedical (PubMedQA)**")
|
| 334 |
+
preview_table_1 = gr.Dataframe(interactive=False, wrap=True)
|
| 335 |
+
gr.Markdown("**Finance (FinQA)**")
|
| 336 |
+
preview_table_2 = gr.Dataframe(interactive=False, wrap=True)
|
| 337 |
+
gr.Markdown("**General (MS MARCO)**")
|
| 338 |
+
preview_table_3 = gr.Dataframe(interactive=False, wrap=True)
|
| 339 |
+
gr.Markdown("**Legal (CUAD)**")
|
| 340 |
+
preview_table_4 = gr.Dataframe(interactive=False, wrap=True)
|
| 341 |
preview_btn = gr.Button("Refresh Data Preview")
|
| 342 |
|
| 343 |
# TAB 3: Comparison
|
| 344 |
with gr.TabItem("Inter-Domain Comparison"):
|
| 345 |
+
gr.Markdown("### Select Metric to Compare")
|
| 346 |
+
metric_dropdown = gr.Dropdown(
|
| 347 |
+
label="Comparison Metric",
|
| 348 |
+
choices=[
|
| 349 |
+
("F1 Score (Higher is Better)", "f1_score"),
|
| 350 |
+
("AUC-ROC (Higher is Better)", "aucroc"),
|
| 351 |
+
("RMSE Relevance (Lower is Better)", "rmse_relevance"),
|
| 352 |
+
("RMSE Utilization (Lower is Better)", "rmse_utilization"),
|
| 353 |
+
("RMSE Completeness (Lower is Better)", "rmse_completeness")
|
| 354 |
+
],
|
| 355 |
+
value="f1_score",
|
| 356 |
+
interactive=True
|
| 357 |
+
)
|
| 358 |
refresh_btn = gr.Button("Generate Comparison")
|
| 359 |
gr.Markdown("### Configuration Differences")
|
| 360 |
comp_table = gr.Dataframe(interactive=False)
|
|
|
|
| 387 |
)
|
| 388 |
|
| 389 |
# Debug Preview Events
|
| 390 |
+
preview_btn.click(get_data_preview, inputs=None, outputs=[preview_table_1, preview_table_2, preview_table_3, preview_table_4])
|
| 391 |
|
| 392 |
refresh_btn.click(
|
| 393 |
generate_inter_domain_comparison,
|
| 394 |
+
inputs=[metric_dropdown],
|
| 395 |
outputs=[comp_table, global_plot]
|
| 396 |
)
|
| 397 |
|
|
|
|
| 402 |
|
| 403 |
# Launch Gradio app
|
| 404 |
if __name__ == "__main__":
|
| 405 |
+
demo.launch(ssr_mode=False)
|
config.py
CHANGED
|
@@ -19,8 +19,7 @@ METRIC_COLUMNS = [
|
|
| 19 |
'rmse_utilization',
|
| 20 |
'rmse_completeness',
|
| 21 |
'f1_score',
|
| 22 |
-
'aucroc'
|
| 23 |
-
'failed_samples'
|
| 24 |
]
|
| 25 |
|
| 26 |
# Numeric configuration columns (also need float conversion)
|
|
|
|
| 19 |
'rmse_utilization',
|
| 20 |
'rmse_completeness',
|
| 21 |
'f1_score',
|
| 22 |
+
'aucroc'
|
|
|
|
| 23 |
]
|
| 24 |
|
| 25 |
# Numeric configuration columns (also need float conversion)
|