Spaces:
Sleeping
Sleeping
| import pandas as pd | |
| import json | |
| import os | |
| import numpy as np | |
| import re | |
| import gradio as gr | |
# lm-evaluation-harness task names scored on the leaderboard
# ("hendrycks" is matched by substring, covering all MMLU sub-tasks).
tasks = ["hellaswag", "arc_challenge", "hendrycks", "truthfulqa_mc"]
# Validators whose result dumps (under _results/few-shot/<validator>/) are read.
validators = ["opentensor_foundation"]
def clean_result(result, task):
    """Normalize a raw model response into ``result["cleaned_result"]``.

    Mutates *result* in place and also returns it.

    - MMLU ("hendrycks" in *task*): accepts a short answer such as ``"A"`` /
      ``"B."`` or a longer one starting ``"C. ..."`` and maps A-D to the
      strings "1"-"4"; anything else becomes ``"N/A"``.
    - ``"truthfulqa_mc"``: *result["result"]* is a sequence of per-option
      strings; each is mapped to 0 ("False"), 1 ("True"), or ``"N/A"``.
      "False" is checked first, so a string containing both words maps to 0.
    - Other tasks: keep the leading digit of the response, else ``"N/A"``.
    """
    raw = result["result"]
    if "hendrycks" in task:
        # A bare letter (optionally followed by one char, e.g. "A.") or a
        # longer answer that starts with "X." where X is an uppercase letter.
        looks_like_letter_answer = (
            (raw != "" and len(raw) <= 2 and raw[0].isupper())
            or (raw != "" and re.match(r"[A-Z]\.", raw[:2]))
        )
        if looks_like_letter_answer:
            # Letters outside A-D are still uppercase but not valid choices.
            result["cleaned_result"] = {"A": "1", "B": "2", "C": "3", "D": "4"}.get(raw[0], "N/A")
        else:
            result["cleaned_result"] = "N/A"
    elif task == "truthfulqa_mc":
        cleaned = []
        for option in raw:
            if "False" in option:
                cleaned.append(0)
            elif "True" in option:
                cleaned.append(1)
            else:
                cleaned.append("N/A")
        result["cleaned_result"] = cleaned
    else:
        # hellaswag / arc_challenge answers are expected to start with a digit.
        if raw != "" and raw[0].isnumeric():
            result["cleaned_result"] = raw[0]
        else:
            result["cleaned_result"] = "N/A"
    return result
def mc2(doc):
    """Return the TruthfulQA MC2 score for one document.

    In ``doc["mc2_targets"]["labels"]`` every entry before the first ``0`` is
    a true answer (``1``). The score is the softmax-normalized probability
    mass the model assigned to the true answers, computed from the per-option
    scores in ``doc["cleaned_result"]``.

    Raises ValueError if the labels contain no ``0``.
    """
    labels = list(doc["mc2_targets"]["labels"])
    boundary = labels.index(0)  # first false answer marks the true/false split
    exp_scores = np.exp(np.asarray(doc["cleaned_result"]))
    true_mass = exp_scores[:boundary].sum()
    return true_mass / exp_scores.sum()
# --- Aggregate per-validator result dumps into leaderboard tables. ---------
final_total_results = []   # one leaderboard row per validator
final_split_results = []   # one row per (validator, task) pair
results_cumulative = []    # every parsed result, for the N/A analysis table

for validator in validators:
    results_dir_file_list = os.listdir(f"_results/few-shot/{validator}")
    number_of_nas, number_of_results, inference_total = 0, 0, 0
    # Collect this validator's per-task rows locally, then extend the global
    # list at the end. (Previously the per-task metric lookups indexed the
    # GLOBAL list, which returned validator #0's rows for every validator.)
    validator_split_results = []
    for task in tasks:
        task_results_files = [name for name in results_dir_file_list if task in name]
        results = []
        for task_results_file in task_results_files:
            results_file_dir = f"_results/few-shot/{validator}/{task_results_file}"
            # `with` closes the file even if json.load raises (the old code
            # leaked every handle).
            with open(results_file_dir) as fh:
                results += json.load(fh)
        results = [clean_result(result, task) if "result" in result else result for result in results]
        results_cumulative += results
        # Totals across all tasks for this validator.
        # NOTE: "N/A" in cleaned_result works for both the string ("N/A" or a
        # digit) and the truthfulqa list-of-labels cases.
        number_of_nas += len([1 for r in results if ("cleaned_result" in r) and ("N/A" in r["cleaned_result"])])
        inference_total += np.array([r["inference_time"] for r in results if "inference_time" in r]).sum()
        number_of_results += len([1 for r in results if "cleaned_result" in r])
        # Per-task stats (percentages rounded to 2 decimals).
        result_coverage = round((sum("result" in r for r in results) / len(results)) * 100, 2)
        # Denominator is all results, answered or not (matches the original
        # computation, which built a full-length boolean list).
        na_coverage = round((len([1 for r in results if ("cleaned_result" in r) and ("N/A" in r["cleaned_result"])]) / len(results)) * 100, 2)
        inference_avg = round(np.array([r["inference_time"] for r in results if "inference_time" in r]).mean(), 2)
        if task == "truthfulqa_mc":
            # Mean MC2 over documents with fully-parsed option labels.
            metric = round(np.array([mc2(r) for r in results if ("cleaned_result" in r) and ("N/A" not in r["cleaned_result"])]).mean() * 100, 2)
        else:
            # Accuracy over answered docs; gold is 0-based, answers 1-based.
            answered = [r for r in results if ("cleaned_result" in r) and (r["cleaned_result"] != "N/A")]
            correct = [r for r in answered if int(r["cleaned_result"]) == int(r["gold"]) + 1]
            metric = round((len(correct) / len(answered)) * 100, 2)
        validator_split_results.append({
            "task": task,
            "coverage_%": result_coverage,
            "na_%": na_coverage,
            "inference_avg": inference_avg,
            "metric": metric,
        })
    print(validator_split_results)
    final_split_results += validator_split_results
    row = {
        "Validator": validator.replace("_", " ").capitalize(),
        "N/A %": round((number_of_nas / number_of_results) * 100, 2),
        "Avg Inference (s)": round(inference_total / number_of_results, 2),
        "Average ⬆️": 0,
        "ARC (25-shot) ⬆️": validator_split_results[tasks.index("arc_challenge")]["metric"],
        "HellaSwag (10-shot) ⬆️": validator_split_results[tasks.index("hellaswag")]["metric"],
        "MMLU (5-shot) ⬆️": validator_split_results[tasks.index("hendrycks")]["metric"],
        "TruthfulQA (0-shot) ⬆️": validator_split_results[tasks.index("truthfulqa_mc")]["metric"],
    }
    # Average this validator's own metrics. (Previously this read
    # final_total_results[0], i.e. always validator #0's numbers.)
    row["Average ⬆️"] = np.array([
        row["ARC (25-shot) ⬆️"],
        row["HellaSwag (10-shot) ⬆️"],
        row["TruthfulQA (0-shot) ⬆️"],
        row["MMLU (5-shot) ⬆️"],
    ]).mean()
    final_total_results.append(row)

# Top-10 raw responses that could not be parsed, counted by occurrence.
df = pd.DataFrame(results_cumulative)
df = (
    df[df["cleaned_result"] == "N/A"]
    .groupby("result", as_index=False)
    .count()
    .sort_values(by=["id"], ascending=False)
    .head(10)[["result", "id"]]
    .rename(columns={"result": "Result", "id": "ID"})
)
# --- Gradio UI: leaderboard table plus the N/A analysis table. -------------
demo = gr.Blocks()
with demo:
    with gr.Row():
        title = gr.Markdown(value=f"""# <p style="text-align: center;"> Bittensor LMEH Leaderboard</p>""")
    with gr.Row():
        # Main leaderboard: one row per validator.
        table_1 = gr.Dataframe(pd.DataFrame(final_total_results))
    with gr.Row():
        title = gr.Markdown(value=f"""# <p style="text-align: center;"> Analysis Of Top 10 N/A Responses</p>""")
    with gr.Row():
        # Top-10 unparseable raw responses by frequency.
        table_2 = gr.Dataframe(df)
    # with gr.Row(visible = False):
    # table_2 = gr.Dataframe(pd.DataFrame(final_split_results))
# NOTE(review): queue(concurrency_count=...) and launch(enable_queue=...) are
# Gradio 3.x APIs that were removed in Gradio 4 — confirm the pinned version.
# server_name="0.0.0.0" binds all interfaces (required on HF Spaces).
demo.queue(concurrency_count = 5)
demo.launch(enable_queue=True, debug=True, server_name="0.0.0.0", server_port=7860)