# Scraped page header: "File size: 9,558 Bytes", "291fb52" (blank lines and the line-number gutter that followed were extraction residue).
import gradio as gr
import pandas as pd

# Paper Retrieval Benchmark Data (from SemanticBench)
# One record per evaluated system. "Type" is "Agent" or "Baseline"; the two
# baseline rows carry None for "Time (s)" and "Steps". Rows are stored
# positionally and expanded into dicts keyed by the column header below.
_RETRIEVAL_COLUMNS = (
    "Model", "Type", "Hit Rate", "MRR",
    "R@1", "R@5", "R@10", "R@20", "R@50",
    "Time (s)", "Steps",
)
_RETRIEVAL_ROWS = (
    ("Qwen3-Coder-30B-Q3_K_M", "Agent", 0.80, 0.627, 0.58, 0.66, 0.74, 0.78, 0.80, 22.2, 1.42),
    ("qwen3-coder:30b", "Agent", 0.80, 0.518, 0.46, 0.52, 0.72, 0.76, 0.80, 21.1, 1.34),
    ("BM25", "Baseline", 0.78, 0.541, 0.48, 0.60, 0.66, 0.78, 0.78, None, None),
    ("microcoder-deepseekr1-14.8b", "Agent", 0.73, 0.453, 0.38, 0.46, 0.65, 0.69, 0.73, 107.4, 4.15),
    ("deepseek-coder-v3:16b", "Agent", 0.66, 0.396, 0.32, 0.46, 0.52, 0.60, 0.66, 47.9, 1.54),
    ("qwen2.5-coder:3b", "Agent", 0.60, 0.366, 0.28, 0.45, 0.53, 0.55, 0.57, 210.4, 1.51),
    ("qwen2.5-coder:14b", "Agent", 0.56, 0.461, 0.41, 0.51, 0.51, 0.56, 0.56, 73.4, 1.05),
    ("Semantic (MiniLM-L6)", "Baseline", 0.54, 0.279, 0.22, 0.32, 0.38, 0.52, 0.54, None, None),
    ("qwen2.5-coder:7b", "Agent", 0.54, 0.311, 0.26, 0.36, 0.40, 0.52, 0.54, 59.3, 0.84),
    ("deepseek-coder:33b", "Agent", 0.12, 0.087, 0.08, 0.08, 0.12, 0.12, 0.12, 180.4, 0.14),
    ("granite-code:34b", "Agent", 0.02, 0.010, 0.00, 0.02, 0.02, 0.02, 0.02, 111.3, 0.04),
)
retrieval_data = [dict(zip(_RETRIEVAL_COLUMNS, row)) for row in _RETRIEVAL_ROWS]

# RAbench Results (500 queries)
# Same column layout as the SemanticBench leaderboard; currently a single
# system is reported. Rows are expanded into dicts keyed by the header tuple.
_RABENCH_COLUMNS = (
    "Model", "Type", "Hit Rate", "MRR",
    "R@1", "R@5", "R@10", "R@20", "R@50",
    "Time (s)", "Steps",
)
_RABENCH_ROWS = (
    ("Qwen3-Coder-30B-Q3_K_M", "Agent", 0.98, 0.882, 0.83, 0.93, 0.95, 0.96, 0.97, 21.53, 1.36),
)
rabench_data = [dict(zip(_RABENCH_COLUMNS, row)) for row in _RABENCH_ROWS]

# Ablation Study Data
# One record per query configuration; "Queries" is the number of queries the
# configuration was run on (500 for the default, 50 for the others).
_ABLATION_COLUMNS = (
    "Configuration", "Queries", "Hit Rate", "MRR", "R@1", "R@5", "Time (s)",
)
_ABLATION_ROWS = (
    ("Default (Full Agent)", 500, 0.9818, 0.8824, 0.8381, 0.9312, 21.54),
    ("With Filters & Offline", 50, 0.9600, 0.8485, 0.7800, 0.9000, 22.76),
    ("Offline Only", 50, 0.9200, 0.6476, 0.5600, 0.7400, 41.45),
    ("No Mentions", 50, 0.6400, 0.4316, 0.3600, 0.5200, 38.35),
    ("Online/Offline Mix", 50, 0.6200, 0.4595, 0.4200, 0.5000, 38.50),
)
ablation_data = [dict(zip(_ABLATION_COLUMNS, row)) for row in _ABLATION_ROWS]

# Retrieval Baseline Ablations
# One record per (retrieval baseline, query structure) combination; the
# "Baseline" and "Structure" columns hold the machine-style identifiers and
# "Configuration" the display label.
_BASELINE_ABLATION_COLUMNS = (
    "Configuration", "Baseline", "Structure",
    "Hit Rate", "MRR", "R@1", "R@5", "Time (s)",
)
_BASELINE_ABLATION_ROWS = (
    ("BM25 Full", "bm25", "full", 0.96, 0.8629, 0.80, 0.92, 33.75),
    ("BM25 + Reranker", "bm25+reranker", "full", 0.96, 0.8692, 0.80, 0.94, 935.07),
    ("Hybrid Full", "hybrid", "full", 0.96, 0.8620, 0.80, 0.92, 31.65),
    ("Semantic Full", "semantic", "full", 0.94, 0.7097, 0.62, 0.88, 31.28),
    ("BM25 No Intent", "bm25", "no_intent", 0.96, 0.8554, 0.80, 0.92, 31.47),
    ("BM25 Minimal", "bm25", "minimal", 0.96, 0.8420, 0.78, 0.92, 33.34),
)
baseline_ablation_data = [
    dict(zip(_BASELINE_ABLATION_COLUMNS, row)) for row in _BASELINE_ABLATION_ROWS
]

# Dataset Statistics
# Paper count per conference, kept as (venue, count) pairs and expanded into
# {"Conference": ..., "Count": ...} records in the same order.
_VENUE_COUNTS = (
    ("ICLR", 12),
    ("NeurIPS", 39),
    ("ICML", 13),
    ("CVPR", 13),
    ("IROS", 25),
    ("ICRA", 25),
    ("AAAI", 5),
    ("ACL", 5),
    ("ICCV", 7),
    ("EMNLP", 4),
    ("Other", 144),
)
dataset_stats = [
    {"Conference": venue, "Count": count} for venue, count in _VENUE_COUNTS
]

def create_retrieval_df():
    """Return the SemanticBench leaderboard as a DataFrame, best MRR first.

    A fresh frame is built from ``retrieval_data`` on every call, so callers
    may mutate the result freely.
    """
    return pd.DataFrame(retrieval_data).sort_values(by="MRR", ascending=False)

def create_ablation_df():
    """Return the query-configuration ablation table as a DataFrame.

    Rows keep the order in which they appear in ``ablation_data``.
    """
    ablation_table = pd.DataFrame(data=ablation_data)
    return ablation_table

def create_baseline_ablation_df():
    """Return the retrieval-baseline ablation table as a DataFrame.

    Rows keep the order in which they appear in ``baseline_ablation_data``.
    """
    baseline_table = pd.DataFrame(data=baseline_ablation_data)
    return baseline_table

def create_dataset_df():
    """Return the per-conference paper counts as a DataFrame.

    Rows keep the order in which they appear in ``dataset_stats``.
    """
    corpus_table = pd.DataFrame(data=dataset_stats)
    return corpus_table

def filter_by_type(model_type):
    """Return the leaderboard restricted to one model type, best MRR first.

    Args:
        model_type: ``"All"`` keeps every row; any other value keeps only the
            rows whose ``"Type"`` column equals it (a value that matches no
            row yields an empty frame).

    Returns:
        A new DataFrame built from ``retrieval_data``, sorted by descending MRR.
    """
    leaderboard = pd.DataFrame(retrieval_data)
    if model_type == "All":
        selected = leaderboard
    else:
        type_matches = leaderboard["Type"] == model_type
        selected = leaderboard[type_matches]
    return selected.sort_values(by="MRR", ascending=False)

# Build the Gradio UI: a header banner, four tabs (leaderboard, RAbench,
# ablations, dataset) and an "About" footer. The layout is constructed when the
# module is imported; the server itself is only started by the __main__ guard
# at the bottom.
with gr.Blocks(title="PC-Bench: Paper Discovery Benchmark") as demo:
    # Header banner with repository link and static badges.
    gr.HTML("""
        <div style="text-align: center; margin-bottom: 20px;">
            <h1>PC-Bench: Paper Discovery Benchmark</h1>
            <p style="color: #666;">Evaluating AI agents for academic paper retrieval and analysis</p>
            <p>
                <a href="https://github.com/MAXNORM8650/papercircle" target="_blank">
                    <img src="https://img.shields.io/badge/GitHub-Repository-blue?logo=github" alt="GitHub"/>
                </a>
                <img src="https://img.shields.io/badge/Papers-292-green" alt="Papers"/>
                <img src="https://img.shields.io/badge/Queries-500+-orange" alt="Queries"/>
            </p>
        </div>
    """)

    with gr.Tabs():
        with gr.TabItem("Model Leaderboard"):
            gr.Markdown("### Multi-Agent Paper Retrieval (SemanticBench - 50 queries)")
            gr.Markdown("Models ranked by Mean Reciprocal Rank (MRR). Higher is better.")

            # The dropdown choices mirror the values of the "Type" column in
            # retrieval_data, plus "All" for no filtering.
            model_filter = gr.Dropdown(
                choices=["All", "Agent", "Baseline"],
                value="All",
                label="Filter by Type"
            )

            leaderboard_table = gr.Dataframe(
                value=create_retrieval_df(),
                headers=["Model", "Type", "Hit Rate", "MRR", "R@1", "R@5", "R@10", "R@20", "R@50", "Time (s)", "Steps"],
                interactive=False,
            )

            # Re-render the table whenever the selected type changes.
            model_filter.change(
                fn=filter_by_type,
                inputs=[model_filter],
                outputs=[leaderboard_table]
            )

            # NOTE(review): the last bullet conflicts with retrieval_data, where
            # deepseek-coder:33b (MRR 0.087) and granite-code:34b (MRR 0.010)
            # rank below every smaller model. Confirm the wording with the authors.
            gr.Markdown("""
            **Key Findings:**
            - **Qwen3-Coder-30B** achieves best MRR (0.627) with 80% hit rate
            - **BM25 baseline** remains competitive (78% hit rate, 0.541 MRR)
            - Larger models (30B+) consistently outperform smaller variants
            """)

        with gr.TabItem("RAbench Results"):
            gr.Markdown("### Extended Benchmark (RAbench - 500 queries)")
            gr.Markdown("LLM-perturbed natural language queries")

            gr.Dataframe(
                value=pd.DataFrame(rabench_data),
                headers=["Model", "Type", "Hit Rate", "MRR", "R@1", "R@5", "R@10", "R@20", "R@50", "Time (s)", "Steps"],
                interactive=False,
            )

            gr.Markdown("""
            **Observation:** RAbench shows higher performance than SemanticBench,
            suggesting LLM-perturbed queries are easier for multi-agent retrieval.
            """)

        with gr.TabItem("Configuration Ablations"):
            gr.Markdown("### Query Configuration Impact")
            gr.Dataframe(
                value=create_ablation_df(),
                interactive=False,
            )

            gr.Markdown("### Retrieval Baseline Comparison")
            gr.Dataframe(
                value=create_baseline_ablation_df(),
                interactive=False,
            )

            # NOTE(review): "fastest" does not match baseline_ablation_data, where
            # Semantic Full (31.28 s) is quicker than BM25 No Intent (31.47 s).
            # Confirm whether "fastest BM25 configuration" was intended.
            gr.Markdown("""
            **Key Insights:**
            - BM25 + Reranker achieves highest MRR (0.869) but is 28x slower
            - No Intent configuration is fastest while maintaining 96% hit rate
            - Semantic-only retrieval shows significant R@1 drop (0.62 vs 0.80)
            """)

        with gr.TabItem("Dataset"):
            gr.Markdown("### Database Corpus Statistics")
            gr.Markdown("Papers sourced from OpenReview across major ML/CS conferences")

            gr.Dataframe(
                value=create_dataset_df(),
                interactive=False,
            )

            # Derive the total from the table so the two cannot drift apart.
            total = sum(d["Count"] for d in dataset_stats)
            gr.Markdown(f"**Total Papers:** {total}")

    # Footer. The pipeline arrows were mojibake ("β†’", i.e. UTF-8 "→" decoded
    # with the wrong code page) and are restored to the intended character.
    gr.Markdown("""
    ---
    ### About

    **Paper Circle** is a multi-agent research pipeline for intelligent paper discovery and analysis.

    **Pipeline:** Query → Intent Agent → Search Agent → Sort Agent → Analysis Agent → Export

    **Metrics:**
    - **MRR** (Mean Reciprocal Rank): Ranking quality
    - **R@K** (Recall at K): Found in top K results
    - **Hit Rate**: Successful retrieval percentage

    Built with [smolagents](https://github.com/huggingface/smolagents) and [LiteLLM](https://github.com/BerriAI/litellm)
    """)

# Start the Gradio server only when this file is executed as a script.
if __name__ == "__main__":
    demo.launch()