Spaces:
Sleeping
Sleeping
updated files
Browse files- app.py +21 -19
- config.py +2 -1
- data/Biomedical-pubmedqa.csv +7 -7
- data/Finance-finqa.csv +6 -6
- data/General-msmarco.csv +7 -7
- data/Legal-cuad.csv +5 -5
app.py
CHANGED
|
@@ -283,29 +283,32 @@ def generate_inter_domain_comparison(metric='f1_score'):
|
|
| 283 |
fig_global = None
|
| 284 |
|
| 285 |
return comp_df, fig_global
|
| 286 |
-
|
| 287 |
-
if best_results:
|
| 288 |
-
best_df = pd.DataFrame(best_results)
|
| 289 |
-
fig_global = px.bar(
|
| 290 |
-
best_df, x="Domain", y="Max F1 Score",
|
| 291 |
-
color="Domain",
|
| 292 |
-
text_auto='.4f',
|
| 293 |
-
hover_data=["Best Config"],
|
| 294 |
-
title="Peak Performance per Domain (Max F1 Score)"
|
| 295 |
-
)
|
| 296 |
-
fig_global.update_traces(textposition='outside')
|
| 297 |
-
else:
|
| 298 |
-
fig_global = None
|
| 299 |
-
|
| 300 |
-
return comp_df, fig_global
|
| 301 |
|
| 302 |
# --- 3. UI ---
|
| 303 |
-
APP_VERSION = "v2.
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 304 |
|
| 305 |
with gr.Blocks(title="RAG Analytics Pro") as demo:
|
| 306 |
gr.Markdown("## RAG Pipeline Analytics")
|
| 307 |
gr.Markdown(f"**Data Source:** `{DATA_FOLDER}` | **Version:** {APP_VERSION}")
|
| 308 |
|
|
|
|
|
|
|
|
|
|
| 309 |
with gr.Row():
|
| 310 |
refresh_data_btn = gr.Button("Load/Refresh Data", variant="primary")
|
| 311 |
status = gr.Textbox(label="Status (Check here for debug info)", interactive=False, scale=3)
|
|
@@ -400,6 +403,5 @@ print(f"Loading data from {DATA_FOLDER}...")
|
|
| 400 |
startup_status = load_data()
|
| 401 |
print(startup_status)
|
| 402 |
|
| 403 |
-
# Launch Gradio app
|
| 404 |
-
|
| 405 |
-
demo.launch(ssr_mode=False)
|
|
|
|
| 283 |
fig_global = None
|
| 284 |
|
| 285 |
return comp_df, fig_global
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 286 |
|
| 287 |
# --- 3. UI ---
|
| 288 |
+
APP_VERSION = "v2.2.0"
|
| 289 |
+
|
| 290 |
+
# Global constants used across all experiments
|
| 291 |
+
GLOBAL_CONSTANTS = """
|
| 292 |
+
**Global Constants (Applied to All Domains):**
|
| 293 |
+
- **Summarization Model:** fangyuan/nq_abstractive_compressor
|
| 294 |
+
- **Generator Model:** llama-3.1-8b-instant
|
| 295 |
+
- **Generator Max Tokens:** 512
|
| 296 |
+
- **Generator Temperature:** 0.2
|
| 297 |
+
- **Generator API Provider:** Groq
|
| 298 |
+
- **Generation LLM Context Budget:** 2000
|
| 299 |
+
- **Judge Model:** llama-3.3-70b-versatile
|
| 300 |
+
- **Judge Max Tokens:** 1024
|
| 301 |
+
- **Judge Temperature:** 0
|
| 302 |
+
- **Judge Sentence Attribution:** ENABLED
|
| 303 |
+
"""
|
| 304 |
|
| 305 |
with gr.Blocks(title="RAG Analytics Pro") as demo:
|
| 306 |
gr.Markdown("## RAG Pipeline Analytics")
|
| 307 |
gr.Markdown(f"**Data Source:** `{DATA_FOLDER}` | **Version:** {APP_VERSION}")
|
| 308 |
|
| 309 |
+
with gr.Accordion("Global Experiment Configuration", open=False):
|
| 310 |
+
gr.Markdown(GLOBAL_CONSTANTS)
|
| 311 |
+
|
| 312 |
with gr.Row():
|
| 313 |
refresh_data_btn = gr.Button("Load/Refresh Data", variant="primary")
|
| 314 |
status = gr.Textbox(label="Status (Check here for debug info)", interactive=False, scale=3)
|
|
|
|
| 403 |
startup_status = load_data()
|
| 404 |
print(startup_status)
|
| 405 |
|
| 406 |
+
# Launch Gradio app (for Hugging Face Spaces, this runs on import)
|
| 407 |
+
demo.launch(ssr_mode=False)
|
|
|
config.py
CHANGED
|
@@ -43,6 +43,7 @@ COLUMN_MAP = {
|
|
| 43 |
'embeddingmodel': 'embedding_model',
|
| 44 |
'rerankermodel': 'reranker_model',
|
| 45 |
'summarizationmodel': 'summarization_model',
|
|
|
|
| 46 |
'chunkingstrategy': 'chunking_strategy',
|
| 47 |
'chunksize': 'chunk_size',
|
| 48 |
'overlap': 'overlap',
|
|
@@ -69,7 +70,7 @@ COLUMN_MAP = {
|
|
| 69 |
# Metadata columns (excluded from constant/variable analysis)
|
| 70 |
METADATA_COLUMNS = [
|
| 71 |
'rmse_relevance', 'rmse_utilization', 'rmse_completeness',
|
| 72 |
-
'aucroc', 'f1_score',
|
| 73 |
'test_id', 'config_purpose', 'dataset_name'
|
| 74 |
]
|
| 75 |
|
|
|
|
| 43 |
'embeddingmodel': 'embedding_model',
|
| 44 |
'rerankermodel': 'reranker_model',
|
| 45 |
'summarizationmodel': 'summarization_model',
|
| 46 |
+
'summarization': 'summarization', # New column name (enabled/disabled)
|
| 47 |
'chunkingstrategy': 'chunking_strategy',
|
| 48 |
'chunksize': 'chunk_size',
|
| 49 |
'overlap': 'overlap',
|
|
|
|
| 70 |
# Metadata columns (excluded from constant/variable analysis)
|
| 71 |
METADATA_COLUMNS = [
|
| 72 |
'rmse_relevance', 'rmse_utilization', 'rmse_completeness',
|
| 73 |
+
'aucroc', 'f1_score',
|
| 74 |
'test_id', 'config_purpose', 'dataset_name'
|
| 75 |
]
|
| 76 |
|
data/Biomedical-pubmedqa.csv
CHANGED
|
@@ -1,7 +1,7 @@
|
|
| 1 |
-
Test #,Configuration purpose,Subset(s),Embedding Model,Reranker Model,Summarization
|
| 2 |
-
1,Efficiency Baseline,pubmedqa,NeuML/pubmedbert-base-embeddings,cross-encoder/ms-marco-MiniLM-L-6-v2,
|
| 3 |
-
2,Chunking Proof,pubmedqa,NeuML/pubmedbert-base-embeddings,cross-encoder/ms-marco-MiniLM-L-6-v2,
|
| 4 |
-
3,Reranking proof,pubmedqa,NeuML/pubmedbert-base-embeddings,BAAI/bge-reranker-base,
|
| 5 |
-
4,Repacking proof,pubmedqa,NeuML/pubmedbert-base-embeddings,BAAI/bge-reranker-base,
|
| 6 |
-
5,Prove Summarization,pubmedqa,NeuML/pubmedbert-base-embeddings,BAAI/bge-reranker-base,
|
| 7 |
-
6,Optimal Medical Hybrid,pubmedqa,NeuML/pubmedbert-base-embeddings,BAAI/bge-reranker-base,
|
|
|
|
| 1 |
+
Test #,Configuration purpose,Subset(s),Embedding Model,Reranker Model,Summarization,Chunking Strategy,Chunk size,Overlap,Stride,Retreival Strategy,Alpha,Retr. K,Final K,Repacking,Summ. Max,Summ. Min,8B GPT Label,RMSE=trace relevance,RMSE=trace utilization,RMSE=trace completeness,AUCROC,F1-score,# Failed/Total Samples
|
| 2 |
+
1,Efficiency Baseline,pubmedqa,NeuML/pubmedbert-base-embeddings,cross-encoder/ms-marco-MiniLM-L-6-v2,disabled,hard_cut,256,50,N/A,Hybrid,0.8,50,5,forward,N/A,N/A,short,0.3677,0.3011,0.5556,0.604,0.5049,32/100
|
| 3 |
+
2,Chunking Proof,pubmedqa,NeuML/pubmedbert-base-embeddings,cross-encoder/ms-marco-MiniLM-L-6-v2,disabled,sliding_window,256,50,206,Hybrid,0.8,50,5,forward,N/A,N/A,short,0.3632,0.2886,0.5074,0.604,0.5049,29/100
|
| 4 |
+
3,Reranking proof,pubmedqa,NeuML/pubmedbert-base-embeddings,BAAI/bge-reranker-base,disabled,sliding_window,256,50,206,Hybrid,0.8,50,5,forward,N/A,N/A,long,0.3289,0.2663,0.6015,0.482,0.38,8/100
|
| 5 |
+
4,Repacking proof,pubmedqa,NeuML/pubmedbert-base-embeddings,BAAI/bge-reranker-base,disabled,hard_cut,256,50,206,Hybrid,0.8,50,5,reverse,N/A,N/A,long,0.2752,0.252,0.6246,0.5951,0.449,8/100
|
| 6 |
+
5,Prove Summarization,pubmedqa,NeuML/pubmedbert-base-embeddings,BAAI/bge-reranker-base,enabled,sliding_window,256,50,206,Hybrid,0.8,50,5,reverse,150,20,long_cot,0.4934,1.0537,0.5161,cannot compute,0,9/100
|
| 7 |
+
6,Optimal Medical Hybrid,pubmedqa,NeuML/pubmedbert-base-embeddings,BAAI/bge-reranker-base,disabled,sliding_window,256,50,206,Hybrid,0.8,50,5,reverse,N/A,N/A,long,0.3223,0.2733,0.6561,0.5053,0.3542,13/100
|
data/Finance-finqa.csv
CHANGED
|
@@ -1,7 +1,7 @@
|
|
| 1 |
Test #,Configuration purpose,Subset(s),Embedding Model,Reranker Model,Summarization Model,Chunking Strategy,Chunk size,Overlap,Stride,Retreival Strategy,Alpha,Retr. K,Final K,Repacking,Summ. Max,Summ. Min,8B GPT Label,RMSE=trace relevance,RMSE=trace utilization,RMSE=trace completeness,AUCROC,F1-score,# Failed/Total Samples
|
| 2 |
-
1,Efficiency Baseline,finqa,BAAI/bge-m3,cross-encoder/ms-marco-MiniLM-L-6-v2,
|
| 3 |
-
2,Prove Chunking,finqa,BAAI/bge-m3,cross-encoder/ms-marco-MiniLM-L-6-v2,
|
| 4 |
-
3,Prove Hybrid/Rerank,finqa,BAAI/bge-m3,BAAI/bge-reranker-v2-m3,
|
| 5 |
-
4,Max Raw Context,finqa,BAAI/bge-m3,BAAI/bge-reranker-v2-m3,
|
| 6 |
-
5,Golden Setup,finqa,BAAI/bge-m3,BAAI/bge-reranker-v2-m3,
|
| 7 |
-
6,Optimized Financial,finqa,BAAI/bge-m3,BAAI/bge-reranker-v2-m3,
|
|
|
|
| 1 |
Test #,Configuration purpose,Subset(s),Embedding Model,Reranker Model,Summarization Model,Chunking Strategy,Chunk size,Overlap,Stride,Retreival Strategy,Alpha,Retr. K,Final K,Repacking,Summ. Max,Summ. Min,8B GPT Label,RMSE=trace relevance,RMSE=trace utilization,RMSE=trace completeness,AUCROC,F1-score,# Failed/Total Samples
|
| 2 |
+
1,Efficiency Baseline,finqa,BAAI/bge-m3,cross-encoder/ms-marco-MiniLM-L-6-v2,disabled,sentence-level,512,50,N/A,Hybrid,0.6,50,5,forward,N/A,N/A,short,0.1409,0.0831,0.6365,0.4263,0.099,18/100
|
| 3 |
+
2,Prove Chunking,finqa,BAAI/bge-m3,cross-encoder/ms-marco-MiniLM-L-6-v2,disabled,token-level,512,100,412,Hybrid,0.6,50,5,forward,N/A,N/A,short,0.1667,0.1188,0.6431,0.4316,0.1176,23/100
|
| 4 |
+
3,Prove Hybrid/Rerank,finqa,BAAI/bge-m3,BAAI/bge-reranker-v2-m3,disabled,token-level,512,100,412,Hybrid,0.6,50,5,forward,N/A,N/A,long,0.1316,0.0693,0.6763,0.4263,0.099,11/100
|
| 5 |
+
4,Max Raw Context,finqa,BAAI/bge-m3,BAAI/bge-reranker-v2-m3,disabled,sentence-level,512,100,412,Hybrid,0.6,50,5,reverse,N/A,N/A,long,0.1947,0.0795,0.7239,0.4316,0.1176,14/100
|
| 6 |
+
5,Golden Setup,finqa,BAAI/bge-m3,BAAI/bge-reranker-v2-m3,enabled,token-level,512,100,412,Hybrid,0.6,50,5,reverse,200,20,long_cot,0.4158,0.8363,0.7073,Cannot compute (insufficient class variance),0,4/100
|
| 7 |
+
6,Optimized Financial,finqa,BAAI/bge-m3,BAAI/bge-reranker-v2-m3,disabled,token-level,512,100,412,Hybrid,0.8,50,3,forward,N/A,N/A,long,0.2468,0.1679,0.6177,0.5474,0.1731,6/100
|
data/General-msmarco.csv
CHANGED
|
@@ -1,7 +1,7 @@
|
|
| 1 |
-
Test #,Configuration purpose,Subset(s),Embedding Model,Reranker Model,Summarization
|
| 2 |
-
1,Efficiency Baseline,msmarco,BAAI/bge-base-en-v1.5,cross-encoder/ms-marco-MiniLM-L-6-v2,
|
| 3 |
-
2,Prove Chunking,msmarco,BAAI/bge-base-en-v1.5,cross-encoder/ms-marco-MiniLM-L-6-v2,
|
| 4 |
-
3,Prove Hybrid/Rerank,msmarco,BAAI/bge-base-en-v1.5,BAAI/bge-reranker-base,
|
| 5 |
-
4,Prove Repacking,msmarco,BAAI/bge-base-en-v1.5,BAAI/bge-reranker-base,
|
| 6 |
-
5,Prove Summarization,msmarco,BAAI/bge-base-en-v1.5,BAAI/bge-reranker-base,
|
| 7 |
-
6,Optimized Hybrid,msmarco,BAAI/bge-base-en-v1.5,BAAI/bge-reranker-base,
|
|
|
|
| 1 |
+
Test #,Configuration purpose,Subset(s),Embedding Model,Reranker Model,Summarization,Chunking Strategy,Chunk size,Overlap,Stride,Retreival Strategy,Alpha,Retr. K,Final K,Repacking,Summ. Max,Summ. Min,8B GPT Label,RMSE=trace relevance,RMSE=trace utilization,RMSE=trace completeness,AUCROC,F1-score,# Failed/Total Samples
|
| 2 |
+
1,Efficiency Baseline,msmarco,BAAI/bge-base-en-v1.5,cross-encoder/ms-marco-MiniLM-L-6-v2,disabled,sentence-level,256,50,N/A,Hybrid,0.8,50,5,forward,N/A,N/A,short,0.3252,0.1998,0.4362,0.5125,0.5954,23/100
|
| 3 |
+
2,Prove Chunking,msmarco,BAAI/bge-base-en-v1.5,cross-encoder/ms-marco-MiniLM-L-6-v2,disabled,token-level,256,50,206,Hybrid,0.8,50,5,forward,N/A,N/A,short,0.3449,0.1947,0.4248,0.495,0.5625,30/100
|
| 4 |
+
3,Prove Hybrid/Rerank,msmarco,BAAI/bge-base-en-v1.5,BAAI/bge-reranker-base,disabled,token-level,256,50,206,Hybrid,0.8,50,5,forward,N/A,N/A,short,0.3183,0.1793,0.407,0.5183,0.6061,22/100
|
| 5 |
+
4,Prove Repacking,msmarco,BAAI/bge-base-en-v1.5,BAAI/bge-reranker-base,disabled,sentence-level,256,50,206,Hybrid,0.8,50,5,reverse,N/A,N/A,long,0.3416,0.1837,0.4491,0.559,0.6763,11/100
|
| 6 |
+
5,Prove Summarization,msmarco,BAAI/bge-base-en-v1.5,BAAI/bge-reranker-base,enabled,token-level,256,50,206,Hybrid,0.8,50,5,reverse,150,20,long_cot,0.5066,0.8781,0.5049,N/A,0,3/100
|
| 7 |
+
6,Optimized Hybrid,msmarco,BAAI/bge-base-en-v1.5,BAAI/bge-reranker-base,disabled,sentence-level,256,50,206,Hybrid,0.8,50,5,forward,N/A,N/A,long,0.3292,0.1754,0.5477,0.4842,0.4706,0/100
|
data/Legal-cuad.csv
CHANGED
|
@@ -1,6 +1,6 @@
|
|
| 1 |
Test #,Configuration purpose,Subset(s),Embedding Model,Reranker Model,Summarization Model,Chunking Strategy,Chunk size,Overlap,Stride,Retreival Strategy,Alpha,Retr. K,Final K,Repacking,Summ. Max,Summ. Min,8B GPT Label,RMSE=trace relevance,RMSE=trace utilization,RMSE=trace completeness,AUCROC,F1-score,% Failed Sample
|
| 2 |
-
1,Efficiency Baseline,cuad,BAAI/bge-m3,cross-encoder/ms-marco-MiniLM-L-6-v2,
|
| 3 |
-
2,Prove Chunking,cuad,BAAI/bge-m3,cross-encoder/ms-marco-MiniLM-L-6-v2,
|
| 4 |
-
3,Prove Hybrid/Rerank,cuad,BAAI/bge-m3,BAAI/bge-reranker-v2-m3,
|
| 5 |
-
4,Max Raw Context,cuad,BAAI/bge-m3,BAAI/bge-reranker-v2-m3,
|
| 6 |
-
5,Golden Setup,cuad,BAAI/bge-m3,BAAI/bge-reranker-v2-m3,
|
|
|
|
| 1 |
Test #,Configuration purpose,Subset(s),Embedding Model,Reranker Model,Summarization Model,Chunking Strategy,Chunk size,Overlap,Stride,Retreival Strategy,Alpha,Retr. K,Final K,Repacking,Summ. Max,Summ. Min,8B GPT Label,RMSE=trace relevance,RMSE=trace utilization,RMSE=trace completeness,AUCROC,F1-score,% Failed Sample
|
| 2 |
+
1,Efficiency Baseline,cuad,BAAI/bge-m3,cross-encoder/ms-marco-MiniLM-L-6-v2,disabled,sentence-level ,512,100,N/A,Hybrid,0.6,50,5,forward,N/A,N/A,short,0.2951,0.1697,0.6225,0.4321,0.3761,35.00%
|
| 3 |
+
2,Prove Chunking,cuad,BAAI/bge-m3,cross-encoder/ms-marco-MiniLM-L-6-v2,disabled,token-level,512,100,412,Hybrid,0.6,50,5,forward,N/A,N/A,short,0.2927,0.1623,0.5612,0.4065,0.2609,32.00%
|
| 4 |
+
3,Prove Hybrid/Rerank,cuad,BAAI/bge-m3,BAAI/bge-reranker-v2-m3,disabled,token-level,512,100,412,Hybrid,0.6,50,5,forward,N/A,N/A,long,0.3087,0.1296,0.5315,0.5197,0.5543,15.00%
|
| 5 |
+
4,Max Raw Context,cuad,BAAI/bge-m3,BAAI/bge-reranker-v2-m3,disabled,sentence-level,512,100,412,Hybrid,0.6,50,5,reverse,N/A,N/A,long,0.3287,0.1429,0.6583,0.4132,0.3859,17.00%
|
| 6 |
+
5,Golden Setup,cuad,BAAI/bge-m3,BAAI/bge-reranker-v2-m3,enabled,token-level,512,100,412,Hybrid,0.6,50,5,reverse,250,50,long_cot,0.5048,0.7648,0.4832,0.0215,0.5054,17.00%
|