npuliga committed on
Commit
b6f27fa
·
1 Parent(s): 4c2722d

updated files

Browse files
app.py CHANGED
@@ -283,29 +283,32 @@ def generate_inter_domain_comparison(metric='f1_score'):
283
  fig_global = None
284
 
285
  return comp_df, fig_global
286
-
287
- if best_results:
288
- best_df = pd.DataFrame(best_results)
289
- fig_global = px.bar(
290
- best_df, x="Domain", y="Max F1 Score",
291
- color="Domain",
292
- text_auto='.4f',
293
- hover_data=["Best Config"],
294
- title="Peak Performance per Domain (Max F1 Score)"
295
- )
296
- fig_global.update_traces(textposition='outside')
297
- else:
298
- fig_global = None
299
-
300
- return comp_df, fig_global
301
 
302
  # --- 3. UI ---
303
- APP_VERSION = "v2.1.0-fixed" # Version stamp to verify code is updated
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
304
 
305
  with gr.Blocks(title="RAG Analytics Pro") as demo:
306
  gr.Markdown("## RAG Pipeline Analytics")
307
  gr.Markdown(f"**Data Source:** `{DATA_FOLDER}` | **Version:** {APP_VERSION}")
308
 
 
 
 
309
  with gr.Row():
310
  refresh_data_btn = gr.Button("Load/Refresh Data", variant="primary")
311
  status = gr.Textbox(label="Status (Check here for debug info)", interactive=False, scale=3)
@@ -400,6 +403,5 @@ print(f"Loading data from {DATA_FOLDER}...")
400
  startup_status = load_data()
401
  print(startup_status)
402
 
403
- # Launch Gradio app
404
- if __name__ == "__main__":
405
- demo.launch(ssr_mode=False)
 
283
  fig_global = None
284
 
285
  return comp_df, fig_global
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
286
 
287
  # --- 3. UI ---
288
+ APP_VERSION = "v2.2.0"
289
+
290
+ # Global constants used across all experiments
291
+ GLOBAL_CONSTANTS = """
292
+ **Global Constants (Applied to All Domains):**
293
+ - **Summarization Model:** fangyuan/nq_abstractive_compressor
294
+ - **Generator Model:** llama-3.1-8b-instant
295
+ - **Generator Max Tokens:** 512
296
+ - **Generator Temperature:** 0.2
297
+ - **Generator API Provider:** Groq
298
+ - **Generation LLM Context Budget:** 2000
299
+ - **Judge Model:** llama-3.3-70b-versatile
300
+ - **Judge Max Tokens:** 1024
301
+ - **Judge Temperature:** 0
302
+ - **Judge Sentence Attribution:** ENABLED
303
+ """
304
 
305
  with gr.Blocks(title="RAG Analytics Pro") as demo:
306
  gr.Markdown("## RAG Pipeline Analytics")
307
  gr.Markdown(f"**Data Source:** `{DATA_FOLDER}` | **Version:** {APP_VERSION}")
308
 
309
+ with gr.Accordion("Global Experiment Configuration", open=False):
310
+ gr.Markdown(GLOBAL_CONSTANTS)
311
+
312
  with gr.Row():
313
  refresh_data_btn = gr.Button("Load/Refresh Data", variant="primary")
314
  status = gr.Textbox(label="Status (Check here for debug info)", interactive=False, scale=3)
 
403
  startup_status = load_data()
404
  print(startup_status)
405
 
406
+ # Launch Gradio app (for Hugging Face Spaces, this runs on import)
407
+ demo.launch(ssr_mode=False)
 
config.py CHANGED
@@ -43,6 +43,7 @@ COLUMN_MAP = {
43
  'embeddingmodel': 'embedding_model',
44
  'rerankermodel': 'reranker_model',
45
  'summarizationmodel': 'summarization_model',
 
46
  'chunkingstrategy': 'chunking_strategy',
47
  'chunksize': 'chunk_size',
48
  'overlap': 'overlap',
@@ -69,7 +70,7 @@ COLUMN_MAP = {
69
  # Metadata columns (excluded from constant/variable analysis)
70
  METADATA_COLUMNS = [
71
  'rmse_relevance', 'rmse_utilization', 'rmse_completeness',
72
- 'aucroc', 'f1_score', 'failed_samples',
73
  'test_id', 'config_purpose', 'dataset_name'
74
  ]
75
 
 
43
  'embeddingmodel': 'embedding_model',
44
  'rerankermodel': 'reranker_model',
45
  'summarizationmodel': 'summarization_model',
46
+ 'summarization': 'summarization', # New column name (enabled/disabled)
47
  'chunkingstrategy': 'chunking_strategy',
48
  'chunksize': 'chunk_size',
49
  'overlap': 'overlap',
 
70
  # Metadata columns (excluded from constant/variable analysis)
71
  METADATA_COLUMNS = [
72
  'rmse_relevance', 'rmse_utilization', 'rmse_completeness',
73
+ 'aucroc', 'f1_score',
74
  'test_id', 'config_purpose', 'dataset_name'
75
  ]
76
 
data/Biomedical-pubmedqa.csv CHANGED
@@ -1,7 +1,7 @@
1
- Test #,Configuration purpose,Subset(s),Embedding Model,Reranker Model,Summarization Model,Chunking Strategy,Chunk size,Overlap,Stride,Retreival Strategy,Alpha,Retr. K,Final K,Repacking,Summ. Max,Summ. Min,8B GPT Label,RMSE=trace relevance,RMSE=trace utilization,RMSE=trace completeness,AUCROC,F1-score,# Failed/Total Samples
2
- 1,Efficiency Baseline,pubmedqa,NeuML/pubmedbert-base-embeddings,cross-encoder/ms-marco-MiniLM-L-6-v2,None,hard_cut,256,50,N/A,Hybrid,0.8,50,5,forward,N/A,N/A,short,0.3677,0.3011,0.5556,0.604,0.5049,32/100
3
- 2,Chunking Proof,pubmedqa,NeuML/pubmedbert-base-embeddings,cross-encoder/ms-marco-MiniLM-L-6-v2,None,sliding_window,256,50,206,Hybrid,0.8,50,5,forward,N/A,N/A,short,0.3632,0.2886,0.5074,0.604,0.5049,29/100
4
- 3,Reranking proof,pubmedqa,NeuML/pubmedbert-base-embeddings,BAAI/bge-reranker-base,None,sliding_window,256,50,206,Hybrid,0.8,50,5,forward,N/A,N/A,long,0.3289,0.2663,0.6015,0.482,0.38,8/100
5
- 4,Repacking proof,pubmedqa,NeuML/pubmedbert-base-embeddings,BAAI/bge-reranker-base,None,hard_cut,256,50,206,Hybrid,0.8,50,5,reverse,N/A,N/A,long,0.2752,0.252,0.6246,0.5951,0.449,8/100
6
- 5,Prove Summarization,pubmedqa,NeuML/pubmedbert-base-embeddings,BAAI/bge-reranker-base,fangyuan/nq_abstractive_compressor,sliding_window,256,50,206,Hybrid,0.8,50,5,reverse,150,20,long_cot,0.4934,1.0537,0.5161,cannot compute,0,9/100
7
- 6,Optimal Medical Hybrid,pubmedqa,NeuML/pubmedbert-base-embeddings,BAAI/bge-reranker-base,N/A,sliding_window,256,50,206,Hybrid,0.8,50,5,reverse,N/A,N/A,long,0.3223,0.2733,0.6561,0.5053,0.3542,13/100
 
1
+ Test #,Configuration purpose,Subset(s),Embedding Model,Reranker Model,Summarization,Chunking Strategy,Chunk size,Overlap,Stride,Retreival Strategy,Alpha,Retr. K,Final K,Repacking,Summ. Max,Summ. Min,8B GPT Label,RMSE=trace relevance,RMSE=trace utilization,RMSE=trace completeness,AUCROC,F1-score,# Failed/Total Samples
2
+ 1,Efficiency Baseline,pubmedqa,NeuML/pubmedbert-base-embeddings,cross-encoder/ms-marco-MiniLM-L-6-v2,disabled,hard_cut,256,50,N/A,Hybrid,0.8,50,5,forward,N/A,N/A,short,0.3677,0.3011,0.5556,0.604,0.5049,32/100
3
+ 2,Chunking Proof,pubmedqa,NeuML/pubmedbert-base-embeddings,cross-encoder/ms-marco-MiniLM-L-6-v2,disabled,sliding_window,256,50,206,Hybrid,0.8,50,5,forward,N/A,N/A,short,0.3632,0.2886,0.5074,0.604,0.5049,29/100
4
+ 3,Reranking proof,pubmedqa,NeuML/pubmedbert-base-embeddings,BAAI/bge-reranker-base,disabled,sliding_window,256,50,206,Hybrid,0.8,50,5,forward,N/A,N/A,long,0.3289,0.2663,0.6015,0.482,0.38,8/100
5
+ 4,Repacking proof,pubmedqa,NeuML/pubmedbert-base-embeddings,BAAI/bge-reranker-base,disabled,hard_cut,256,50,206,Hybrid,0.8,50,5,reverse,N/A,N/A,long,0.2752,0.252,0.6246,0.5951,0.449,8/100
6
+ 5,Prove Summarization,pubmedqa,NeuML/pubmedbert-base-embeddings,BAAI/bge-reranker-base,enabled,sliding_window,256,50,206,Hybrid,0.8,50,5,reverse,150,20,long_cot,0.4934,1.0537,0.5161,cannot compute,0,9/100
7
+ 6,Optimal Medical Hybrid,pubmedqa,NeuML/pubmedbert-base-embeddings,BAAI/bge-reranker-base,disabled,sliding_window,256,50,206,Hybrid,0.8,50,5,reverse,N/A,N/A,long,0.3223,0.2733,0.6561,0.5053,0.3542,13/100
data/Finance-finqa.csv CHANGED
@@ -1,7 +1,7 @@
1
  Test #,Configuration purpose,Subset(s),Embedding Model,Reranker Model,Summarization Model,Chunking Strategy,Chunk size,Overlap,Stride,Retreival Strategy,Alpha,Retr. K,Final K,Repacking,Summ. Max,Summ. Min,8B GPT Label,RMSE=trace relevance,RMSE=trace utilization,RMSE=trace completeness,AUCROC,F1-score,# Failed/Total Samples
2
- 1,Efficiency Baseline,finqa,BAAI/bge-m3,cross-encoder/ms-marco-MiniLM-L-6-v2,N/A,hard-cut,512,50,N/A,Hybrid,0.6,50,5,forward,N/A,N/A,short,0.1409,0.0831,0.6365,0.4263,0.099,18/100
3
- 2,Prove Chunking,finqa,BAAI/bge-m3,cross-encoder/ms-marco-MiniLM-L-6-v2,N/A,sliding-window,512,100,412,Hybrid,0.6,50,5,forward,N/A,N/A,short,0.1667,0.1188,0.6431,0.4316,0.1176,23/100
4
- 3,Prove Hybrid/Rerank,finqa,BAAI/bge-m3,BAAI/bge-reranker-v2-m3,N/A,sliding-window,512,100,412,Hybrid,0.6,50,5,forward,N/A,N/A,long,0.1316,0.0693,0.6763,0.4263,0.099,11/100
5
- 4,Max Raw Context,finqa,BAAI/bge-m3,BAAI/bge-reranker-v2-m3,N/A,hard-cut,512,100,412,Hybrid,0.6,50,5,reverse,N/A,N/A,long,0.1947,0.0795,0.7239,0.4316,0.1176,14/100
6
- 5,Golden Setup,finqa,BAAI/bge-m3,BAAI/bge-reranker-v2-m3,fangyuan/nq_abstractive_compressor,sliding-window,512,100,412,Hybrid,0.6,50,5,reverse,200,20,long_cot,0.4158,0.8363,0.7073,Cannot compute (insufficient class variance),0,4/100
7
- 6,Optimized Financial,finqa,BAAI/bge-m3,BAAI/bge-reranker-v2-m3,N/A,sliding-window,512,100,412,Hybrid,0.8,50,3,forward,N/A,N/A,long,0.2468,0.1679,0.6177,0.5474,0.1731,6/100
 
1
  Test #,Configuration purpose,Subset(s),Embedding Model,Reranker Model,Summarization Model,Chunking Strategy,Chunk size,Overlap,Stride,Retreival Strategy,Alpha,Retr. K,Final K,Repacking,Summ. Max,Summ. Min,8B GPT Label,RMSE=trace relevance,RMSE=trace utilization,RMSE=trace completeness,AUCROC,F1-score,# Failed/Total Samples
2
+ 1,Efficiency Baseline,finqa,BAAI/bge-m3,cross-encoder/ms-marco-MiniLM-L-6-v2,disabled,sentence-level,512,50,N/A,Hybrid,0.6,50,5,forward,N/A,N/A,short,0.1409,0.0831,0.6365,0.4263,0.099,18/100
3
+ 2,Prove Chunking,finqa,BAAI/bge-m3,cross-encoder/ms-marco-MiniLM-L-6-v2,disabled,token-level,512,100,412,Hybrid,0.6,50,5,forward,N/A,N/A,short,0.1667,0.1188,0.6431,0.4316,0.1176,23/100
4
+ 3,Prove Hybrid/Rerank,finqa,BAAI/bge-m3,BAAI/bge-reranker-v2-m3,disabled,token-level,512,100,412,Hybrid,0.6,50,5,forward,N/A,N/A,long,0.1316,0.0693,0.6763,0.4263,0.099,11/100
5
+ 4,Max Raw Context,finqa,BAAI/bge-m3,BAAI/bge-reranker-v2-m3,disabled,sentence-level,512,100,412,Hybrid,0.6,50,5,reverse,N/A,N/A,long,0.1947,0.0795,0.7239,0.4316,0.1176,14/100
6
+ 5,Golden Setup,finqa,BAAI/bge-m3,BAAI/bge-reranker-v2-m3,enabled,token-level,512,100,412,Hybrid,0.6,50,5,reverse,200,20,long_cot,0.4158,0.8363,0.7073,Cannot compute (insufficient class variance),0,4/100
7
+ 6,Optimized Financial,finqa,BAAI/bge-m3,BAAI/bge-reranker-v2-m3,disabled,token-level,512,100,412,Hybrid,0.8,50,3,forward,N/A,N/A,long,0.2468,0.1679,0.6177,0.5474,0.1731,6/100
data/General-msmarco.csv CHANGED
@@ -1,7 +1,7 @@
1
- Test #,Configuration purpose,Subset(s),Embedding Model,Reranker Model,Summarization Model,Chunking Strategy,Chunk size,Overlap,Stride,Retreival Strategy,Alpha,Retr. K,Final K,Repacking,Summ. Max,Summ. Min,8B GPT Label,RMSE=trace relevance,RMSE=trace utilization,RMSE=trace completeness,AUCROC,F1-score,# Failed/Total Samples
2
- 1,Efficiency Baseline,msmarco,BAAI/bge-base-en-v1.5,cross-encoder/ms-marco-MiniLM-L-6-v2,N/A,hard-cut,256,50,N/A,Hybrid,0.8,50,5,forward,N/A,N/A,short,0.3252,0.1998,0.4362,0.5125,0.5954,23/100
3
- 2,Prove Chunking,msmarco,BAAI/bge-base-en-v1.5,cross-encoder/ms-marco-MiniLM-L-6-v2,N/A,sliding-window,256,50,206,Hybrid,0.8,50,5,forward,N/A,N/A,short,0.3449,0.1947,0.4248,0.495,0.5625,30/100
4
- 3,Prove Hybrid/Rerank,msmarco,BAAI/bge-base-en-v1.5,BAAI/bge-reranker-base,N/A,sliding-window,256,50,206,Hybrid,0.8,50,5,forward,N/A,N/A,short,0.3183,0.1793,0.407,0.5183,0.6061,22/100
5
- 4,Prove Repacking,msmarco,BAAI/bge-base-en-v1.5,BAAI/bge-reranker-base,N/A,hard-cut,256,50,206,Hybrid,0.8,50,5,reverse,N/A,N/A,long,0.3416,0.1837,0.4491,0.559,0.6763,11/100
6
- 5,Prove Summarization,msmarco,BAAI/bge-base-en-v1.5,BAAI/bge-reranker-base,fangyuan/nq_abstractive_compressor,sliding-window,256,50,206,Hybrid,0.8,50,5,reverse,150,20,long_cot,0.5066,0.8781,0.5049,N/A,0,3/100
7
- 6,Optimized Hybrid,msmarco,BAAI/bge-base-en-v1.5,BAAI/bge-reranker-base,N/A,hard-cut,256,50,206,Hybrid,0.8,50,5,forward,N/A,N/A,long,0.3292,0.1754,0.5477,0.4842,0.4706,0/100
 
1
+ Test #,Configuration purpose,Subset(s),Embedding Model,Reranker Model,Summarization,Chunking Strategy,Chunk size,Overlap,Stride,Retreival Strategy,Alpha,Retr. K,Final K,Repacking,Summ. Max,Summ. Min,8B GPT Label,RMSE=trace relevance,RMSE=trace utilization,RMSE=trace completeness,AUCROC,F1-score,# Failed/Total Samples
2
+ 1,Efficiency Baseline,msmarco,BAAI/bge-base-en-v1.5,cross-encoder/ms-marco-MiniLM-L-6-v2,disabled,sentence-level,256,50,N/A,Hybrid,0.8,50,5,forward,N/A,N/A,short,0.3252,0.1998,0.4362,0.5125,0.5954,23/100
3
+ 2,Prove Chunking,msmarco,BAAI/bge-base-en-v1.5,cross-encoder/ms-marco-MiniLM-L-6-v2,disabled,token-level,256,50,206,Hybrid,0.8,50,5,forward,N/A,N/A,short,0.3449,0.1947,0.4248,0.495,0.5625,30/100
4
+ 3,Prove Hybrid/Rerank,msmarco,BAAI/bge-base-en-v1.5,BAAI/bge-reranker-base,disabled,token-level,256,50,206,Hybrid,0.8,50,5,forward,N/A,N/A,short,0.3183,0.1793,0.407,0.5183,0.6061,22/100
5
+ 4,Prove Repacking,msmarco,BAAI/bge-base-en-v1.5,BAAI/bge-reranker-base,disabled,sentence-level,256,50,206,Hybrid,0.8,50,5,reverse,N/A,N/A,long,0.3416,0.1837,0.4491,0.559,0.6763,11/100
6
+ 5,Prove Summarization,msmarco,BAAI/bge-base-en-v1.5,BAAI/bge-reranker-base,enabled,token-level,256,50,206,Hybrid,0.8,50,5,reverse,150,20,long_cot,0.5066,0.8781,0.5049,N/A,0,3/100
7
+ 6,Optimized Hybrid,msmarco,BAAI/bge-base-en-v1.5,BAAI/bge-reranker-base,disabled,sentence-level,256,50,206,Hybrid,0.8,50,5,forward,N/A,N/A,long,0.3292,0.1754,0.5477,0.4842,0.4706,0/100
data/Legal-cuad.csv CHANGED
@@ -1,6 +1,6 @@
1
  Test #,Configuration purpose,Subset(s),Embedding Model,Reranker Model,Summarization Model,Chunking Strategy,Chunk size,Overlap,Stride,Retreival Strategy,Alpha,Retr. K,Final K,Repacking,Summ. Max,Summ. Min,8B GPT Label,RMSE=trace relevance,RMSE=trace utilization,RMSE=trace completeness,AUCROC,F1-score,% Failed Sample
2
- 1,Efficiency Baseline,cuad,BAAI/bge-m3,cross-encoder/ms-marco-MiniLM-L-6-v2,N/A,hard-cut/ token aware chunking ,512,100,N/A,Hybrid,0.6,50,5,forward,N/A,N/A,short,0.2951,0.1697,0.6225,0.4321,0.3761,35.00%
3
- 2,Prove Chunking,cuad,BAAI/bge-m3,cross-encoder/ms-marco-MiniLM-L-6-v2,N/A,sliding-window,512,100,412,Hybrid,0.6,50,5,forward,N/A,N/A,short,0.2927,0.1623,0.5612,0.4065,0.2609,32.00%
4
- 3,Prove Hybrid/Rerank,cuad,BAAI/bge-m3,BAAI/bge-reranker-v2-m3,N/A,sliding-window,512,100,412,Hybrid,0.6,50,5,forward,N/A,N/A,long,0.3087,0.1296,0.5315,0.5197,0.5543,15.00%
5
- 4,Max Raw Context,cuad,BAAI/bge-m3,BAAI/bge-reranker-v2-m3,N/A,hard-cut/ token aware chunking ,512,100,412,Hybrid,0.6,50,5,reverse,N/A,N/A,long,0.3287,0.1429,0.6583,0.4132,0.3859,17.00%
6
- 5,Golden Setup,cuad,BAAI/bge-m3,BAAI/bge-reranker-v2-m3,fangyuan/nq_abstractive_compressor,sliding-window,512,100,412,Hybrid,0.6,50,5,reverse,250,50,long_cot,0.5048,0.7648,0.4832,0.0215,0.5054,17.00%
 
1
  Test #,Configuration purpose,Subset(s),Embedding Model,Reranker Model,Summarization Model,Chunking Strategy,Chunk size,Overlap,Stride,Retreival Strategy,Alpha,Retr. K,Final K,Repacking,Summ. Max,Summ. Min,8B GPT Label,RMSE=trace relevance,RMSE=trace utilization,RMSE=trace completeness,AUCROC,F1-score,% Failed Sample
2
+ 1,Efficiency Baseline,cuad,BAAI/bge-m3,cross-encoder/ms-marco-MiniLM-L-6-v2,disabled,sentence-level ,512,100,N/A,Hybrid,0.6,50,5,forward,N/A,N/A,short,0.2951,0.1697,0.6225,0.4321,0.3761,35.00%
3
+ 2,Prove Chunking,cuad,BAAI/bge-m3,cross-encoder/ms-marco-MiniLM-L-6-v2,disabled,token-level,512,100,412,Hybrid,0.6,50,5,forward,N/A,N/A,short,0.2927,0.1623,0.5612,0.4065,0.2609,32.00%
4
+ 3,Prove Hybrid/Rerank,cuad,BAAI/bge-m3,BAAI/bge-reranker-v2-m3,disabled,token-level,512,100,412,Hybrid,0.6,50,5,forward,N/A,N/A,long,0.3087,0.1296,0.5315,0.5197,0.5543,15.00%
5
+ 4,Max Raw Context,cuad,BAAI/bge-m3,BAAI/bge-reranker-v2-m3,disabled,sentence-level,512,100,412,Hybrid,0.6,50,5,reverse,N/A,N/A,long,0.3287,0.1429,0.6583,0.4132,0.3859,17.00%
6
+ 5,Golden Setup,cuad,BAAI/bge-m3,BAAI/bge-reranker-v2-m3,enabled,token-level,512,100,412,Hybrid,0.6,50,5,reverse,250,50,long_cot,0.5048,0.7648,0.4832,0.0215,0.5054,17.00%