import streamlit as st
import pandas as pd
from PIL import Image

# Page configuration must be the first Streamlit call in the script.
st.set_page_config(
    page_title="FactBench Leaderboard",
    layout="wide"
)

# Static assets: the VERIFY pipeline diagram and the FactBench logo.
image = Image.open("factEvalSteps.png")
logo_image = Image.open("Factbench_logo.png")

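# Asset note: both PNGs are assumed to sit next to this script; Image.open
# raises FileNotFoundError otherwise. A minimal guard, placed before the
# Image.open calls above, could look like:
# from pathlib import Path
# for asset in ("factEvalSteps.png", "Factbench_logo.png"):
#     if not Path(asset).exists():
#         st.error(f"Missing asset: {asset}")
#         st.stop()
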
# Global styles: monospace font plus title, description, and table rules.
st.markdown(
    """
    <style>
    @import url('https://fonts.googleapis.com/css2?family=Courier+Prime:wght@400&display=swap');

    html, body, [class*="css"] {
        font-family: 'Courier Prime', monospace;
    }

    .title {
        font-size: 42px;
        font-weight: bold;
        text-align: center;
        color: #333;
        margin-bottom: 5px;
    }

    .description {
        font-size: 22px;
        text-align: center;
        margin-bottom: 30px;
        color: #555;
    }

    .container {
        max-width: 1000px;
        margin: 0 auto;
        padding: 20px;
    }

    table {
        width: 100%;
        border-collapse: collapse;
        border-radius: 10px;
        overflow: hidden;
    }

    th, td {
        padding: 8px;
        text-align: center;
        border: 1px solid #ddd;
        font-size: 14px;
        transition: background-color 0.3s;
    }

    th {
        background-color: #f2f2f2;
        font-weight: bold;
    }

    td:hover {
        background-color: #eaeaea;
    }
    </style>
    """,
    unsafe_allow_html=True
)

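# Note: the "tab-content" wrapper used in each tab below has no rule in the
# stylesheet above; add a .tab-content selector there if it needs styling.
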
# Header: logo and tagline inside the centered container.
st.markdown('<div class="container">', unsafe_allow_html=True)
# use_column_width is deprecated in newer Streamlit releases in favor of
# use_container_width; kept here for compatibility with older versions.
st.image(logo_image, use_column_width=True)

st.markdown('<div class="description">Benchmark for LM Factuality Evaluation</div>',
            unsafe_allow_html=True)
st.markdown('</div>', unsafe_allow_html=True)

# Load the per-model leaderboard data.
data_path = "tiered_models_data.csv"
df = pd.read_csv(data_path)

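# Expected columns in tiered_models_data.csv, inferred from how the frame is
# used below (adjust if the released file differs): model, tier,
# factuality_score, hallucination_score, avg_tokens, avg_factual_units,
# avg_undecidable_units, avg_unsupported_units.
#
# Optional sketch, assuming Streamlit >= 1.18: cache the load so the CSV is
# not re-read on every rerun.
# @st.cache_data
# def load_data(path: str) -> pd.DataFrame:
#     return pd.read_csv(path)
# df = load_data(data_path)
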
# Rank models within each tier by factuality score (1 = best). This assumes
# factuality_score has no missing values; rank().astype(int) raises on NaNs.
df['rank'] = df.groupby('tier')['factuality_score'].rank(
    ascending=False, method='min').astype(int)
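# Example: tier scores [90, 85, 85, 70] get ranks [1, 2, 2, 4];
# method='min' assigns tied rows the lowest rank in the tie.
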
# Replace remaining missing values with a dash for display.
df.fillna('-', inplace=True)

# Preserve the original row order within each tier for the unsorted view.
df['original_order'] = df.groupby('tier').cumcount()

# Top-level navigation.
tab1, tab2, tab3 = st.tabs(
    ["Leaderboard", "Benchmark Details", "Submit your models"])

with tab1:
    st.markdown('<div class="tab-content">', unsafe_allow_html=True)

    st.markdown('## Metric Explanation')
    st.markdown('@Farima populate here')  # TODO: placeholder copy, still to be written

    # Tier filter.
    tiers = ['All Tiers', 'Tier 1: Hard', 'Tier 2: Moderate', 'Tier 3: Easy']
    selected_tier = st.selectbox('Select Tier:', tiers)

    if selected_tier != 'All Tiers':
        filtered_df = df[df['tier'] == selected_tier]
    else:
        filtered_df = df

    sort_by_factuality = st.checkbox('Sort by Factuality Score')

    # Sort by score within each tier, or keep the original file order.
    if sort_by_factuality:
        updated_filtered_df = filtered_df.sort_values(
            by=['tier', 'factuality_score'], ascending=[True, False]
        )
    else:
        updated_filtered_df = filtered_df.sort_values(
            by=['tier', 'original_order']
        )

    # Build the table header; the Tier column only appears in the combined view.
    header_cols = ['Rank', 'Model', 'Factuality Score', 'Hallucination Score',
                   '# Tokens', '# Factual', '# Undecidable', '# Unsupported']
    if selected_tier == 'All Tiers':
        header_cols = ['Tier'] + header_cols

    html = '<table><thead><tr>'
    html += ''.join(f'<th>{col}</th>' for col in header_cols)
    html += '</tr></thead><tbody>'

    # Render one row per model. In the combined view, the tier cell spans all
    # rows of its tier, so compute per-tier row counts instead of hardcoding.
    tier_sizes = updated_filtered_df['tier'].value_counts()

    def fmt(value):
        # After fillna, missing values are the string '-'; format numbers only.
        return value if isinstance(value, str) else f'{value:.2f}'

    current_tier = None
    for _, row in updated_filtered_df.iterrows():
        html += '<tr>'

        if selected_tier == 'All Tiers' and row['tier'] != current_tier:
            current_tier = row['tier']
            html += (f'<td rowspan="{tier_sizes[current_tier]}" '
                     f'style="vertical-align: middle;">{current_tier}</td>')

        html += f'''
            <td>{row['rank']}</td>
            <td>{row['model']}</td>
            <td>{row['factuality_score']}</td>
            <td>{row['hallucination_score']}</td>
            <td>{row['avg_tokens']}</td>
            <td>{row['avg_factual_units']}</td>
            <td>{fmt(row['avg_undecidable_units'])}</td>
            <td>{fmt(row['avg_unsupported_units'])}</td>
        </tr>
        '''

    # Close the table and render it.
    html += '''
        </tbody>
    </table>
    '''

    st.markdown(html, unsafe_allow_html=True)

    st.markdown('</div>', unsafe_allow_html=True)

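    # Design note: st.dataframe(updated_filtered_df) would show the same data
    # with built-in interactive sorting, but hand-rolled HTML is used so the
    # tier cell can span rows (rowspan) and pick up the hover styles above.
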
with tab2:
    st.markdown('<div class="tab-content">', unsafe_allow_html=True)

    st.markdown('<div class="title">Benchmark Details</div>',
                unsafe_allow_html=True)
    st.image(image, use_column_width=True)

    st.markdown('### VERIFY: A Pipeline for Factuality Evaluation')
    st.write(
        "Language models (LMs) serve a rapidly growing number of users, "
        "underscoring the challenge of maintaining factual accuracy across a broad range of topics. "
        "We present VERIFY (Verification and Evidence Retrieval for Factuality evaluation), "
        "a pipeline to evaluate LMs' factual accuracy in real-world user interactions."
    )

    st.markdown('### Content Categorization')
    st.write(
        "VERIFY considers the verifiability of LM-generated content and categorizes content units as "
        "`supported`, `unsupported`, or `undecidable` based on the retrieved web evidence. "
        "Importantly, VERIFY's factuality judgments correlate better with human evaluations than existing methods."
    )

    st.markdown('### Hallucination Prompts & FactBench Dataset')
    st.write(
        "Using VERIFY, we identify 'hallucination prompts' across diverse topics: those eliciting the highest rates of "
        "incorrect or unverifiable LM responses. These prompts form FactBench, a dataset of 985 prompts across 213 "
        "fine-grained topics. The dataset captures emerging factuality challenges in real-world LM interactions and is "
        "regularly updated with new prompts."
    )

    st.markdown('</div>', unsafe_allow_html=True)

with tab3:
    st.markdown('<div class="tab-content">', unsafe_allow_html=True)

    st.markdown('<div class="title">Submit your model information on our GitHub</div>',
                unsafe_allow_html=True)

    st.markdown(
        '[Test your model locally!](https://github.com/FarimaFatahi/FactEval)')
    st.markdown(
        '[Submit results or issues!](https://github.com/FarimaFatahi/FactEval/issues/new)')

    st.markdown('</div>', unsafe_allow_html=True)