import json

import streamlit as st

import plotly.graph_objects as go
from plotly.subplots import make_subplots
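

# display_results expects scores shaped as follows (inferred from the
# collection loop below):
#   {model_name: {setting: {'entailment': float, 'neutral': float,
#                           'contradiction': float, 'abstain': float}}}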
def display_results(results, setting='avg', rank_metric='Entailment(↑)', is_auto=True):
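    """Render one leaderboard view as a horizontal stacked bar chart.

    `setting` picks the evaluation context ('avg', 'nq', 'msmarco' or
    'dolly'); `rank_metric` selects the sort key and stacking order;
    `is_auto` only affects the chart height.
    """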
    label_marker = {
        'Entailment': dict(color='rgba(102, 204, 0, 0.6)'),
        'Neutral': dict(color='rgba(255, 178, 102, 0.6)'),
        'Contradiction': dict(color='rgba(255, 51, 51, 0.6)'),
        'Abstain': dict(color='rgba(192, 192, 192, 0.6)')
    }
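
    # Gather the four per-label scores for every model under the chosen setting.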
    model_names = []
    entails = []
    neutrals = []
    contras = []
    abstains = []
    for k, v in results.items():
        model_names.append(k)
        entails.append(v[setting]['entailment'])
        neutrals.append(v[setting]['neutral'])
        contras.append(v[setting]['contradiction'])
        abstains.append(v[setting]['abstain'])
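
    # Sort by the chosen metric: ascending for Entailment (higher is better),
    # descending for the rest (lower is better). Plotly draws horizontal bars
    # bottom-up, so either way the best-ranked model lands at the top.
    # label_order sets the stacking order; Abstain gets its own panel below.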
    results = list(zip(model_names, entails, neutrals, contras, abstains))
    label_order = None
    if rank_metric == 'Entailment(↑)':
        results = sorted(results, key=lambda x: x[1])
        label_order = ['Entailment', 'Neutral', 'Contradiction']
    elif rank_metric == 'Neutral(↓)':
        results = sorted(results, key=lambda x: x[2], reverse=True)
        label_order = ['Neutral', 'Contradiction', 'Entailment']
    elif rank_metric == 'Contradiction(↓)':
        results = sorted(results, key=lambda x: x[3], reverse=True)
        label_order = ['Contradiction', 'Neutral', 'Entailment']
    elif rank_metric == 'Abstain(↓)':
        results = sorted(results, key=lambda x: x[4], reverse=True)
        label_order = ['Contradiction', 'Neutral', 'Entailment']
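
    # Position of each label's score in the zipped tuples (index 0 is the model name).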
    label_to_results_idx = {
        'Entailment': 1,
        'Neutral': 2,
        'Contradiction': 3,
        'Abstain': 4
    }
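
    # Two panels sharing the y-axis: a wide one (90%) for the stacked
    # Entailment/Neutral/Contradiction bars and a narrow one (10%) for Abstain.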
    fig = make_subplots(rows=1, cols=2, shared_yaxes=True, column_widths=[0.9, 0.1], horizontal_spacing=0)
    for label in label_order:
        fig.add_trace(
            go.Bar(
                y=[x[0] for x in results],
                x=[x[label_to_results_idx[label]] for x in results],
                name=label,
                orientation='h',
                marker=label_marker[label],
                text=[round(x[label_to_results_idx[label]], 1) for x in results]
            ),
            row=1,
            col=1
        )

    fig.add_trace(
        go.Bar(
            y=[x[0] for x in results],
            x=[x[label_to_results_idx['Abstain']] for x in results],
            name='Abstain',
            orientation='h',
            marker=label_marker['Abstain'],
            text=[round(x[label_to_results_idx['Abstain']], 1) for x in results]
        ),
        row=1,
        col=2
    )
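
    # barmode='stack' merges the per-label traces into a single bar per model.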
    fig.update_layout(
        barmode='stack',
        width=1000,
        height=900 if is_auto else 500,
        bargap=0.35,
        legend_font=dict(size=18),
    )
    fig.update_yaxes(tickfont=dict(size=19, color='black'))

    st.plotly_chart(fig)


if __name__ == '__main__':
    st.set_page_config(layout='wide')
    st.title('LLMHallucination Leaderboard')
    st.write('[GitHub repo of LLMHallucination](https://github.com/LuoXiaoHeics/LLMHallucination)')
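
    # A single tab for now; st.tabs takes a list of labels and returns a
    # matching list of tab containers, hence the [0].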
    tab1 = st.tabs(['Auto-checked Leaderboard'])[0]
    with tab1:
        col1, col2 = st.columns([1, 7])
        with col1:
            extractor = st.radio('Claim-Triplet Extractor', ['GPT-4', 'Claude 2'])
            checker = st.radio('Checker', ['Ensemble of 3 Checkers', 'GPT-4', 'Claude 2', 'RoBERTa-NLI'])
            model_map = {
                'Ensemble of 3 Checkers': 'ensemble',
                'GPT-4': 'gpt4',
                'Claude 2': 'claude2',
                'RoBERTa-NLI': 'nli'
            }
            extractor = model_map[extractor]
            checker = model_map[checker]

            rank_metric = st.radio('Rank By:', ['Contradiction(↓)', 'Neutral(↓)', 'Entailment(↑)', 'Abstain(↓)'])
        with col2:
            with open('auto_leaderboard_scores.json') as f:
                results = json.load(f)
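            # Score-file keys look like '<extractor>###<checker>', e.g. 'gpt4###nli'.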
            res_key = f'{extractor}###{checker}'
            if res_key not in results:
                st.write('Work in progress, please stay tuned!')
            else:
                results = results[res_key]
                tab_avg, tab_zero, tab_noisy, tab_accurate = \
                    st.tabs(['Average over Settings', 'Zero Context', 'Noisy Context', 'Accurate Context'])
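
                # One sub-tab per evaluation context; the score file keys them
                # as 'nq' (Zero), 'msmarco' (Noisy) and 'dolly' (Accurate).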
                with tab_avg:
                    display_results(results, setting='avg', rank_metric=rank_metric)
                with tab_zero:
                    display_results(results, setting='nq', rank_metric=rank_metric)
                with tab_noisy:
                    display_results(results, setting='msmarco', rank_metric=rank_metric)
                with tab_accurate:
                    display_results(results, setting='dolly', rank_metric=rank_metric)
        st.divider()
        st.write('\* The responses of Gemini Pro (Bard) were manually collected from [Google Bard](https://bard.google.com/) on December 7, 2023.')
        st.write('† The responses of Gemini Pro (API) were collected from its official API without tools.')
        st.write('‡ Our project is built on the [RefChecker](https://github.com/amazon-science/RefChecker) tool.')