# PC-Bench leaderboard (Gradio app for a Hugging Face Space).
# NOTE(review): scraped Space-viewer chrome removed from the top of this file
# (status "Sleeping", file size 9,558 bytes, commit 291fb52, line-number gutter).
import gradio as gr
import pandas as pd
# Paper Retrieval Benchmark Data (from SemanticBench)
# One row per evaluated system (multi-agent model or non-agent retrieval
# baseline). "Hit Rate", "MRR", and "R@K" are retrieval-quality metrics in
# [0, 1]; higher is better. "Time (s)" and "Steps" are presumably per-query
# averages for agent runs (TODO confirm against the benchmark harness) and
# are None for the non-agent baselines (BM25, semantic embedding search).
retrieval_data = [
    {"Model": "Qwen3-Coder-30B-Q3_K_M", "Type": "Agent", "Hit Rate": 0.80, "MRR": 0.627, "R@1": 0.58, "R@5": 0.66, "R@10": 0.74, "R@20": 0.78, "R@50": 0.80, "Time (s)": 22.2, "Steps": 1.42},
    {"Model": "qwen3-coder:30b", "Type": "Agent", "Hit Rate": 0.80, "MRR": 0.518, "R@1": 0.46, "R@5": 0.52, "R@10": 0.72, "R@20": 0.76, "R@50": 0.80, "Time (s)": 21.1, "Steps": 1.34},
    {"Model": "BM25", "Type": "Baseline", "Hit Rate": 0.78, "MRR": 0.541, "R@1": 0.48, "R@5": 0.60, "R@10": 0.66, "R@20": 0.78, "R@50": 0.78, "Time (s)": None, "Steps": None},
    {"Model": "microcoder-deepseekr1-14.8b", "Type": "Agent", "Hit Rate": 0.73, "MRR": 0.453, "R@1": 0.38, "R@5": 0.46, "R@10": 0.65, "R@20": 0.69, "R@50": 0.73, "Time (s)": 107.4, "Steps": 4.15},
    {"Model": "deepseek-coder-v3:16b", "Type": "Agent", "Hit Rate": 0.66, "MRR": 0.396, "R@1": 0.32, "R@5": 0.46, "R@10": 0.52, "R@20": 0.60, "R@50": 0.66, "Time (s)": 47.9, "Steps": 1.54},
    {"Model": "qwen2.5-coder:3b", "Type": "Agent", "Hit Rate": 0.60, "MRR": 0.366, "R@1": 0.28, "R@5": 0.45, "R@10": 0.53, "R@20": 0.55, "R@50": 0.57, "Time (s)": 210.4, "Steps": 1.51},
    {"Model": "qwen2.5-coder:14b", "Type": "Agent", "Hit Rate": 0.56, "MRR": 0.461, "R@1": 0.41, "R@5": 0.51, "R@10": 0.51, "R@20": 0.56, "R@50": 0.56, "Time (s)": 73.4, "Steps": 1.05},
    {"Model": "Semantic (MiniLM-L6)", "Type": "Baseline", "Hit Rate": 0.54, "MRR": 0.279, "R@1": 0.22, "R@5": 0.32, "R@10": 0.38, "R@20": 0.52, "R@50": 0.54, "Time (s)": None, "Steps": None},
    {"Model": "qwen2.5-coder:7b", "Type": "Agent", "Hit Rate": 0.54, "MRR": 0.311, "R@1": 0.26, "R@5": 0.36, "R@10": 0.40, "R@20": 0.52, "R@50": 0.54, "Time (s)": 59.3, "Steps": 0.84},
    {"Model": "deepseek-coder:33b", "Type": "Agent", "Hit Rate": 0.12, "MRR": 0.087, "R@1": 0.08, "R@5": 0.08, "R@10": 0.12, "R@20": 0.12, "R@50": 0.12, "Time (s)": 180.4, "Steps": 0.14},
    {"Model": "granite-code:34b", "Type": "Agent", "Hit Rate": 0.02, "MRR": 0.010, "R@1": 0.00, "R@5": 0.02, "R@10": 0.02, "R@20": 0.02, "R@50": 0.02, "Time (s)": 111.3, "Steps": 0.04},
]
# RAbench Results (500 queries)
# Single configuration evaluated on the larger RAbench query set; same
# column schema as `retrieval_data`.
rabench_data = [
    {"Model": "Qwen3-Coder-30B-Q3_K_M", "Type": "Agent", "Hit Rate": 0.98, "MRR": 0.882, "R@1": 0.83, "R@5": 0.93, "R@10": 0.95, "R@20": 0.96, "R@50": 0.97, "Time (s)": 21.53, "Steps": 1.36},
]
# Ablation Study Data
# One row per agent configuration. "Queries" is the size of the evaluation
# set that configuration was run on (500 for the default, 50 for ablations),
# so rows are not all directly comparable sample-for-sample.
ablation_data = [
    {"Configuration": "Default (Full Agent)", "Queries": 500, "Hit Rate": 0.9818, "MRR": 0.8824, "R@1": 0.8381, "R@5": 0.9312, "Time (s)": 21.54},
    {"Configuration": "With Filters & Offline", "Queries": 50, "Hit Rate": 0.9600, "MRR": 0.8485, "R@1": 0.7800, "R@5": 0.9000, "Time (s)": 22.76},
    {"Configuration": "Offline Only", "Queries": 50, "Hit Rate": 0.9200, "MRR": 0.6476, "R@1": 0.5600, "R@5": 0.7400, "Time (s)": 41.45},
    {"Configuration": "No Mentions", "Queries": 50, "Hit Rate": 0.6400, "MRR": 0.4316, "R@1": 0.3600, "R@5": 0.5200, "Time (s)": 38.35},
    {"Configuration": "Online/Offline Mix", "Queries": 50, "Hit Rate": 0.6200, "MRR": 0.4595, "R@1": 0.4200, "R@5": 0.5000, "Time (s)": 38.50},
]
# Retrieval Baseline Ablations
# Grid over retrieval backend ("Baseline": bm25 / semantic / hybrid /
# bm25+reranker) and query structure ("Structure": full / no_intent /
# minimal).
baseline_ablation_data = [
    {"Configuration": "BM25 Full", "Baseline": "bm25", "Structure": "full", "Hit Rate": 0.96, "MRR": 0.8629, "R@1": 0.80, "R@5": 0.92, "Time (s)": 33.75},
    {"Configuration": "BM25 + Reranker", "Baseline": "bm25+reranker", "Structure": "full", "Hit Rate": 0.96, "MRR": 0.8692, "R@1": 0.80, "R@5": 0.94, "Time (s)": 935.07},
    {"Configuration": "Hybrid Full", "Baseline": "hybrid", "Structure": "full", "Hit Rate": 0.96, "MRR": 0.8620, "R@1": 0.80, "R@5": 0.92, "Time (s)": 31.65},
    {"Configuration": "Semantic Full", "Baseline": "semantic", "Structure": "full", "Hit Rate": 0.94, "MRR": 0.7097, "R@1": 0.62, "R@5": 0.88, "Time (s)": 31.28},
    {"Configuration": "BM25 No Intent", "Baseline": "bm25", "Structure": "no_intent", "Hit Rate": 0.96, "MRR": 0.8554, "R@1": 0.80, "R@5": 0.92, "Time (s)": 31.47},
    {"Configuration": "BM25 Minimal", "Baseline": "bm25", "Structure": "minimal", "Hit Rate": 0.96, "MRR": 0.8420, "R@1": 0.78, "R@5": 0.92, "Time (s)": 33.34},
]
# Dataset Statistics
# Paper counts per source conference; "Other" aggregates the long tail.
# The grand total is computed from these rows at render time.
dataset_stats = [
    {"Conference": "ICLR", "Count": 12},
    {"Conference": "NeurIPS", "Count": 39},
    {"Conference": "ICML", "Count": 13},
    {"Conference": "CVPR", "Count": 13},
    {"Conference": "IROS", "Count": 25},
    {"Conference": "ICRA", "Count": 25},
    {"Conference": "AAAI", "Count": 5},
    {"Conference": "ACL", "Count": 5},
    {"Conference": "ICCV", "Count": 7},
    {"Conference": "EMNLP", "Count": 4},
    {"Conference": "Other", "Count": 144},
]
def create_retrieval_df():
    """Build the SemanticBench leaderboard table, ordered by MRR (best first)."""
    leaderboard = pd.DataFrame(retrieval_data)
    return leaderboard.sort_values("MRR", ascending=False)
def create_ablation_df():
    """Return the query-configuration ablation results as a DataFrame."""
    return pd.DataFrame.from_records(ablation_data)
def create_baseline_ablation_df():
    """Return the retrieval-baseline ablation grid as a DataFrame."""
    return pd.DataFrame.from_records(baseline_ablation_data)
def create_dataset_df():
    """Return the per-conference corpus statistics as a DataFrame."""
    return pd.DataFrame.from_records(dataset_stats)
def filter_by_type(model_type):
    """Return the leaderboard restricted to one system type, sorted by MRR.

    ``model_type`` is one of "All", "Agent", or "Baseline"; "All" keeps
    every row. Rows are ordered by descending MRR, matching the default
    leaderboard view.
    """
    table = pd.DataFrame(retrieval_data)
    if model_type == "All":
        selected = table
    else:
        selected = table[table["Type"] == model_type]
    return selected.sort_values("MRR", ascending=False)
# ---------------------------------------------------------------------------
# Gradio UI: header banner, four tabs (leaderboard, RAbench, ablations,
# dataset) and an "About" footer. All tables are static except the
# leaderboard, which is re-rendered when the type filter changes.
# Triple-quoted Markdown/HTML payloads are kept verbatim from the original.
# ---------------------------------------------------------------------------
with gr.Blocks(title="PC-Bench: Paper Discovery Benchmark") as demo:
    # Page header: title, tagline, repository link, and stat badges.
    gr.HTML("""
<div style="text-align: center; margin-bottom: 20px;">
<h1>PC-Bench: Paper Discovery Benchmark</h1>
<p style="color: #666;">Evaluating AI agents for academic paper retrieval and analysis</p>
<p>
<a href="https://github.com/MAXNORM8650/papercircle" target="_blank">
<img src="https://img.shields.io/badge/GitHub-Repository-blue?logo=github" alt="GitHub"/>
</a>
<img src="https://img.shields.io/badge/Papers-292-green" alt="Papers"/>
<img src="https://img.shields.io/badge/Queries-500+-orange" alt="Queries"/>
</p>
</div>
""")
    with gr.Tabs():
        with gr.TabItem("Model Leaderboard"):
            gr.Markdown("### Multi-Agent Paper Retrieval (SemanticBench - 50 queries)")
            gr.Markdown("Models ranked by Mean Reciprocal Rank (MRR). Higher is better.")
            model_filter = gr.Dropdown(
                choices=["All", "Agent", "Baseline"],
                value="All",
                label="Filter by Type"
            )
            leaderboard_table = gr.Dataframe(
                value=create_retrieval_df(),
                headers=["Model", "Type", "Hit Rate", "MRR", "R@1", "R@5", "R@10", "R@20", "R@50", "Time (s)", "Steps"],
                interactive=False,
            )
            # Re-filter the leaderboard whenever the dropdown changes.
            model_filter.change(
                fn=filter_by_type,
                inputs=[model_filter],
                outputs=[leaderboard_table]
            )
            gr.Markdown("""
**Key Findings:**
- **Qwen3-Coder-30B** achieves best MRR (0.627) with 80% hit rate
- **BM25 baseline** remains competitive (78% hit rate, 0.541 MRR)
- Larger models (30B+) consistently outperform smaller variants
""")
        with gr.TabItem("RAbench Results"):
            gr.Markdown("### Extended Benchmark (RAbench - 500 queries)")
            gr.Markdown("LLM-perturbed natural language queries")
            gr.Dataframe(
                value=pd.DataFrame(rabench_data),
                headers=["Model", "Type", "Hit Rate", "MRR", "R@1", "R@5", "R@10", "R@20", "R@50", "Time (s)", "Steps"],
                interactive=False,
            )
            gr.Markdown("""
**Observation:** RAbench shows higher performance than SemanticBench,
suggesting LLM-perturbed queries are easier for multi-agent retrieval.
""")
        with gr.TabItem("Configuration Ablations"):
            gr.Markdown("### Query Configuration Impact")
            gr.Dataframe(
                value=create_ablation_df(),
                interactive=False,
            )
            gr.Markdown("### Retrieval Baseline Comparison")
            gr.Dataframe(
                value=create_baseline_ablation_df(),
                interactive=False,
            )
            gr.Markdown("""
**Key Insights:**
- BM25 + Reranker achieves highest MRR (0.869) but is 28x slower
- No Intent configuration is fastest while maintaining 96% hit rate
- Semantic-only retrieval shows significant R@1 drop (0.62 vs 0.80)
""")
        with gr.TabItem("Dataset"):
            gr.Markdown("### Database Corpus Statistics")
            gr.Markdown("Papers sourced from OpenReview across major ML/CS conferences")
            gr.Dataframe(
                value=create_dataset_df(),
                interactive=False,
            )
            # Total is derived from the data so it cannot drift out of sync
            # with the table above.
            total = sum(d["Count"] for d in dataset_stats)
            gr.Markdown(f"**Total Papers:** {total}")
    # About footer, shown below the tabs.
    # NOTE(review): the pipeline arrows below were mojibake ("β") in the
    # original source; restored to the intended "→".
    gr.Markdown("""
---
### About
**Paper Circle** is a multi-agent research pipeline for intelligent paper discovery and analysis.
**Pipeline:** Query → Intent Agent → Search Agent → Sort Agent → Analysis Agent → Export
**Metrics:**
- **MRR** (Mean Reciprocal Rank): Ranking quality
- **R@K** (Recall at K): Found in top K results
- **Hit Rate**: Successful retrieval percentage
Built with [smolagents](https://github.com/huggingface/smolagents) and [LiteLLM](https://github.com/BerriAI/litellm)
""")

if __name__ == "__main__":
    demo.launch()