| import gradio as gr |
| import pandas as pd |
| import random |
| import firebase_admin |
| from firebase_admin import credentials |
| from firebase_admin import firestore |
| from trueskill import Rating |
| import trueskill |
|
|
# Path to the CSV of question/answer pairs read by list_models()/list_questions().
CSV_FILE_PATH = "qa_pairs.csv"


# Initialise the Firebase Admin SDK from a local service-account key file.
# NOTE(review): assumes "unlpboard_f.json" sits next to this script — confirm deployment layout.
cred = credentials.Certificate("unlpboard_f.json")
firebase_admin.initialize_app(cred)
|
|
def list_models():
    """Return the distinct model names found in the Q&A CSV, in first-seen order."""
    qa_frame = pd.read_csv(CSV_FILE_PATH)
    return qa_frame['model'].unique().tolist()
|
|
|
|
def list_questions():
    """Return the distinct question strings found in the Q&A CSV, in first-seen order."""
    qa_frame = pd.read_csv(CSV_FILE_PATH)
    return qa_frame['question'].unique().tolist()
|
|
def fetch_questions():
    """Return every document of the Firestore 'questions' collection as a dict."""
    return [snapshot.to_dict() for snapshot in db.collection('questions').stream()]
|
|
|
|
def display_answers(question, model1, model2, df):
    """Look up each model's stored answer to *question* in *df*.

    Args:
        question: Question text to match against df['question'].
        model1, model2: Model names to match against df['model'].
        df: DataFrame with at least 'question', 'model' and 'answer' columns.

    Returns:
        A (text1, text2) pair: a markdown-formatted answer for each model,
        or a "No answer available for Model N" placeholder when absent.
    """
    # BUG FIX: the original stored fallbacks in a dict keyed by model name,
    # which collapsed to a single (mislabelled) entry when model1 == model2.
    texts = []
    for position, model in enumerate((model1, model2), start=1):
        match = df[(df['question'] == question) & (df['model'] == model)]
        if match.empty:
            texts.append(f"No answer available for Model {position}")
        else:
            texts.append(f"**Answer:**\n{match['answer'].iloc[0]}")
    return texts[0], texts[1]
|
|
|
|
def update_b(q, m1, a1, m2, a2):
    """Handle the START button: draw a fresh question/model pair and enable voting.

    Returns the refreshed (q, m1, a1, m2, a2) Markdown components plus the three
    vote buttons (now interactive) and the START button (now hidden).
    """
    # BUG FIX: both debug lines printed random_model2; the first now prints model 1.
    print('Model1: ', random_model1)
    print('Model2: ', random_model2)
    q, m1, a1, m2, a2 = update_symbols(q, m1, a1, m2, a2)
    b1 = gr.Button("Vote for Model 1", interactive=True)
    b2 = gr.Button("It’s a tie!", interactive=True)
    b3 = gr.Button("Vote for Model 2", interactive=True)
    b4 = gr.Button("START!", visible=False)
    return q, m1, a1, m2, a2, b1, b2, b3, b4
|
|
|
|
def update_symbols1(q, m1, a1, m2, a2):
    """Record a vote for Model 1 (m1 beats m2): log it, update win/loss counters
    and TrueSkill ratings in Firestore, then return a fresh question/answer pair.
    """
    print("Voted for Model 1")
    log_vote(
        model1=m1,
        model2=m2,
        question=q,
        output1=a1,
        output2=a2,
        outcome=m1
    )
    votes_ref = db.collection('votes')
    winner_doc = votes_ref.document(m1).get()
    loser_doc = votes_ref.document(m2).get()

    # BUG FIX: the original read 'elo_rating' from the snapshot BEFORE checking
    # vote_doc.exists, crashing on a missing document/field. Default to
    # TrueSkill's starting mu (25) instead.
    winner_mu = winner_doc.get('elo_rating') if winner_doc.exists else 25
    loser_mu = loser_doc.get('elo_rating') if loser_doc.exists else 25
    # rate_1vs1 takes (winner, loser) and returns their updated ratings.
    new_winner, new_loser = trueskill.rate_1vs1(Rating(winner_mu), Rating(loser_mu))

    if winner_doc.exists:
        votes_ref.document(m1).update({'win_count': firestore.Increment(1)})
    else:
        # BUG FIX: plain set() replaced the whole document, wiping other fields;
        # merge=True only writes the given field.
        votes_ref.document(m1).set({'win_count': 1}, merge=True)
    if loser_doc.exists:
        votes_ref.document(m2).update({'loss_count': firestore.Increment(1)})
    else:
        votes_ref.document(m2).set({'loss_count': 1}, merge=True)

    # Persist only mu; sigma is not tracked in Firestore (TODO confirm intent).
    votes_ref.document(m1).update({'elo_rating': new_winner.mu})
    votes_ref.document(m2).update({'elo_rating': new_loser.mu})

    return update_symbols(q, m1, a1, m2, a2)
|
|
|
|
def update_symbols2(q, m1, a1, m2, a2):
    """Record a tie vote (no rating change), then serve a fresh question/answer pair."""
    print("Voted for Spare")
    log_vote(
        model1=m1,
        model2=m2,
        question=q,
        output1=a1,
        output2=a2,
        outcome='tie',
    )
    return update_symbols(q, m1, a1, m2, a2)
|
|
def update_symbols3(q, m1, a1, m2, a2):
    """Record a vote for Model 2 (m2 beats m1): log it, update win/loss counters
    and TrueSkill ratings in Firestore, then return a fresh question/answer pair.
    """
    print("Voted for Model 2")
    log_vote(
        model1=m1,
        model2=m2,
        question=q,
        output1=a1,
        output2=a2,
        outcome=m2
    )
    votes_ref = db.collection('votes')
    winner_doc = votes_ref.document(m2).get()
    loser_doc = votes_ref.document(m1).get()

    # BUG FIX: the original read 'elo_rating' BEFORE checking exists (crash on
    # missing doc/field). Default to TrueSkill's starting mu (25).
    winner_mu = winner_doc.get('elo_rating') if winner_doc.exists else 25
    loser_mu = loser_doc.get('elo_rating') if loser_doc.exists else 25
    # BUG FIX: rate_1vs1 returns (new_winner, new_loser); the original unpacked
    # `elo1, elo2 = trueskill.rate_1vs1(elo2, elo1)`, assigning the winner's new
    # rating to the loser's variable and vice versa, so the writes were swapped.
    new_winner, new_loser = trueskill.rate_1vs1(Rating(winner_mu), Rating(loser_mu))

    if winner_doc.exists:
        votes_ref.document(m2).update({'win_count': firestore.Increment(1)})
    else:
        # BUG FIX: plain set() replaced the whole document; merge=True preserves
        # the other fields.
        votes_ref.document(m2).set({'win_count': 1}, merge=True)
    if loser_doc.exists:
        votes_ref.document(m1).update({'loss_count': firestore.Increment(1)})
    else:
        votes_ref.document(m1).set({'loss_count': 1}, merge=True)

    votes_ref.document(m2).update({'elo_rating': new_winner.mu})
    votes_ref.document(m1).update({'elo_rating': new_loser.mu})

    return update_symbols(q, m1, a1, m2, a2)
|
|
def update_symbols(q, m1, a1, m2, a2):
    """Draw a new random question and a random pair of distinct models, and
    return refreshed Markdown components: (question, model1-name [hidden],
    answer1, model2-name [hidden], answer2)."""
    next_question = random.choice(questions)
    next_model1, next_model2 = random.sample(models, 2)
    next_answer1, next_answer2 = display_answers(
        next_question, next_model1, next_model2, combined_df
    )
    return (
        gr.Markdown(f"{next_question}"),
        gr.Markdown(f"{next_model1}", visible=False),
        gr.Markdown(next_answer1),
        gr.Markdown(f"{next_model2}", visible=False),
        gr.Markdown(next_answer2),
    )
|
|
def update_total_votes():
    """Increment the global vote counter in votes/total, creating it on first use."""
    total_ref = db.collection('votes').document('total')
    if total_ref.get().exists:
        total_ref.update({'count': firestore.Increment(1)})
    else:
        total_ref.set({'count': 1})
|
|
def log_vote(model1, model2, question, output1, output2, outcome):
    """Append one vote record to the 'votes_log' collection.

    *outcome* is either the winning model's name or the literal 'tie'.
    The timestamp is assigned server-side by Firestore.
    """
    record = {
        'model1': model1,
        'model2': model2,
        'question': question,
        'output1': output1,
        'output2': output2,
        'outcome': outcome,
        'timestamp': firestore.SERVER_TIMESTAMP,
    }
    db.collection('votes_log').add(record)
|
|
|
|
def fetch_and_format_leaderboard():
    """Build the leaderboard DataFrame from the 'votes' collection.

    Returns a DataFrame with columns Rank, model, win_rate (percentage) and
    'TrueSkill rating', sorted by win rate descending. Note the 'total'
    counter document, if present, is included like any other row
    (pre-existing behaviour — TODO confirm whether it should be excluded).
    """
    rows = []
    for doc in db.collection('votes').stream():
        data = doc.to_dict()
        wins = data.get('win_count', 0)
        losses = data.get('loss_count', 0)
        matches = wins + losses
        rows.append({
            "model": doc.id,
            "win_rate": (wins / matches) * 100 if matches > 0 else 0,
            "TrueSkill rating": data.get('elo_rating', 0),
        })

    rows.sort(key=lambda r: r['win_rate'], reverse=True)
    # Explicit columns keep the frame well-formed even when rows is empty.
    leaderboard_df = pd.DataFrame(rows, columns=["model", "win_rate", "TrueSkill rating"])
    # BUG FIX: rank was hard-coded to [1,2,3,4,5,6], crashing for any other
    # number of models; derive it from the actual row count instead.
    leaderboard_df['Rank'] = range(1, len(leaderboard_df) + 1)

    return leaderboard_df[['Rank', 'model', 'win_rate', 'TrueSkill rating']]
|
|
| |
|
|
|
|
|
|
# Firestore client shared by all helpers; requires firebase_admin.initialize_app()
# to have run earlier in this module.
db = firestore.client()
|
|
def fetch_questions_c(collection):
    """Return every document of the given Firestore *collection* as a dict."""
    return [snapshot.to_dict() for snapshot in db.collection(collection).stream()]
|
|
# Pull each model's pre-generated answers from its own Firestore collection.
codekobzar = fetch_questions_c('codekobzar')
gpt = fetch_questions_c('gpt-4')
llama = fetch_questions_c('llama-2-70b-chat')
sherlocknorag = fetch_questions_c('sherlock-no-rag')
sherlockrag = fetch_questions_c('sherlock-rag')
ukrainenow = fetch_questions_c('ukrainenow')


# One DataFrame per model, tagged with a 'model' column so they can be concatenated.
df1 = pd.DataFrame(codekobzar)
df2 = pd.DataFrame(gpt)
df3 = pd.DataFrame(llama)
df4 = pd.DataFrame(sherlocknorag)
df5 = pd.DataFrame(sherlockrag)
df6 = pd.DataFrame(ukrainenow)
df1['model'] = 'codekobzar'
df2['model'] = 'gpt-4'
df3['model'] = 'llama-2-70b-chat'
df4['model'] = 'sherlock-no-rag'
df5['model'] = 'sherlock-rag'
df6['model'] = 'ukrainenow'


# Single lookup table used by display_answers(). The Firestore documents are
# assumed to carry 'instruction', 'input' and 'output' fields — TODO confirm schema;
# drop() raises KeyError if 'input' is ever absent.
combined_df = pd.concat([df1, df2, df3, df4, df5, df6], ignore_index=True)
combined_df.drop('input',axis=1,inplace=True)
combined_df.rename(columns={'instruction': 'question', 'output': 'answer'}, inplace=True)


# Canonical model list; must match the Firestore collection names above.
models = ['codekobzar','gpt-4','llama-2-70b-chat','sherlock-no-rag','sherlock-rag','ukrainenow']
|
|
# Ensure every model has a vote document with counters and a starting rating.
votes_ref = db.collection('votes')
for model in models:
    vote_doc = votes_ref.document(model).get()
    if vote_doc.exists:
        print("-------")
    else:
        # BUG FIX: three successive set() calls each REPLACED the document, so
        # only {'elo_rating': 25} survived and the counters were lost. Write all
        # fields in a single set().
        votes_ref.document(model).set({
            'win_count': 0,
            'loss_count': 0,
            'elo_rating': 25,  # TrueSkill default mu
        })
|
|
|
|
|
|
# Placeholder state shown before the first START click; '1'/'2' are dummy model
# names, so display_answers() yields the "No answer available" fallbacks.
random_question = 'Click any button to start!'
random_model1, random_model2 = '1', '2'
answer1, answer2 = display_answers(random_question, random_model1, random_model2,combined_df)


# Flatten the 'questions' collection into a plain list of question strings.
# NOTE(review): assumes every document has a 'question_text' field — confirm schema.
questions = []
questions_ = fetch_questions()
for question in questions_:
    questions.append(question['question_text'])


# Module-level handle onto the votes collection (re-created inside the handlers too).
votes_ref = db.collection('votes')
|
|
|
|
def create_app():
    """Build and return the Gradio Blocks UI: a question, two anonymous model
    answers side by side, three vote buttons, a START button and a leaderboard.
    """
    # Startup debug output.
    print('-----------------------')
    print(random_question)
    print(random_model1)
    print('-----!!!!!!!!!!!!!')

    with gr.Blocks() as app:
        # Question prompt shown above the two answers.
        q = gr.Markdown(f"### Question: {random_question}")

        with gr.Row():
            with gr.Column():
                # Model names stay hidden so votes are blind.
                m1 = gr.Markdown(f"{random_model1}", visible=False)
                a1 = gr.Markdown(answer1)

            with gr.Column():
                m2 = gr.Markdown(f"{random_model2}", visible=False)
                a2 = gr.Markdown(answer2)

        # Vote buttons start disabled; update_b() enables them on START.
        with gr.Row():
            b1 = gr.Button("Vote for Model 1",interactive=False)
            b2 = gr.Button("It’s a tie!",interactive=False)
            b3 = gr.Button("Vote for Model 2",interactive=False)
        with gr.Row():
            b4 = gr.Button("START!", interactive=True)

        # Leaderboard snapshot rendered at app-build time.
        initial_leaderboard_data = fetch_and_format_leaderboard()
        leaderboard_display = gr.Dataframe(value=initial_leaderboard_data, label="Leaderboard")

        # Wire the buttons: START swaps in a real question and enables voting;
        # each vote handler logs the result and serves the next matchup.
        b4.click(update_b, inputs=[q,m1,a1,m2,a2], outputs=[q,m1,a1,m2,a2,b1,b2,b3, b4])
        b1.click(update_symbols1, inputs=[q,m1,a1,m2,a2], outputs=[q,m1,a1,m2,a2])
        b2.click(update_symbols2, inputs=[q, m1, a1, m2, a2], outputs=[q, m1, a1, m2, a2])
        b3.click(update_symbols3, inputs=[q, m1, a1, m2, a2], outputs=[q, m1, a1, m2, a2])
        leaderboard_button = gr.Button("Refresh Leaderboard")
        leaderboard_button.click(fn=fetch_and_format_leaderboard, inputs=[], outputs=leaderboard_display)

    return app
|
|
# Build the UI and serve it (blocking call).
app = create_app()
app.launch()
|
|