Spaces:
Running
Running
| import streamlit as st | |
| from app.draw_diagram import * | |
def dashboard():
    """Render the SeaEval landing page.

    Emits, top to bottom: the title and GitHub reference links, a news entry,
    a description of the benchmark with two illustrations, and the citation
    for the SeaEval paper.  Takes no arguments and returns nothing; everything
    is written through Streamlit calls.
    """
    with st.container():
        st.title("SeaEval")

        # NOTE(review): both reference-style links below have empty link text.
        # Presumably badge images belong inside the brackets -- confirm against
        # the deployed page before changing them.
        st.markdown(
            "\n"
            "[gh]: https://github.com/SeaEval/SeaEval\n"
            "[][gh]\n"
            "[][gh]\n"
        )

        st.markdown("#### News")
        st.markdown(
            "Nov, 2024: Update layout and support comparison between models "
            "with similar model sizes."
        )
        st.divider()

        seaeval_url = "https://seaeval.github.io/"
        st.markdown(f"#### What is [SeaEval]({seaeval_url})?")

    with st.container():
        # Three columns are created only to place the image in the middle one.
        # NOTE(review): assumes only the image lives in the centre column and
        # the text below spans the full width -- confirm the intended layout.
        _, cent_co, _ = st.columns(3)
        with cent_co:
            st.image("./style/seaeval_overall.png", width=500)

        st.markdown("\n")

        st.markdown(
            "##### A benchmark for multilingual, multicultural foundation "
            "model evaluation consisting of >30 datasets, and we keep "
            "expanding it over time."
        )

        st.markdown(
            ":star: How models understand and reason with natural language?\n"
            ":balloon: Languages: English, Chinese, Malay, Spanish, "
            "Indonesian, Vietnamese, Filipino.\n"
        )

        st.markdown(
            ":star: How models comprehend cultural practices, nuances and values?\n"
            ":balloon: 4 new datasets on Cultural Understanding.\n"
        )

        st.markdown(
            ":star: How models perform across languages in terms of consistency?\n"
            ":balloon: 2 new datasets with curated metrics for "
            "Cross-Lingual Consistency.\n"
        )

    with st.container():
        _, cent_co, _ = st.columns(3)
        with cent_co:
            st.image("./style/consistency.png", width=500)

        st.markdown("##### Evaluation with enhanced cross-lingual capabilities.")

        st.markdown(
            ":star: How models perform according to different (paraphrased) "
            "instructions?\n"
            ":balloon: Each dataset is equipped with 5 different prompts to "
            "avoid randomness introduced by instructions,\n"
            "which is non-negligible.\n"
        )

        st.markdown(
            ":star: Multilingual accuracy and performance consistency across "
            "languages.\n"
            ":balloon: If you can answer the question in your native language, "
            "can you answer the same question\n"
            "correctly in your second/third language?\n"
        )

    st.divider()

    with st.container():
        st.markdown("##### Citations")

        # NOTE(review): the BibTeX entry is passed as ordinary Markdown text
        # (no fence or indentation) -- confirm whether a code block was
        # intended so the entry keeps its line breaks when rendered.
        st.markdown(
            "\n"
            ":round_pushpin: SeaEval Paper \n\n"
            "@article{SeaEval,\n"
            "title={SeaEval for Multilingual Foundation Models: From "
            "Cross-Lingual Alignment to Cultural Reasoning},\n"
            "author={Wang, Bin and Liu, Zhengyuan and Huang, Xin and Jiao, "
            "Fangkai and Ding, Yang and Aw, Ai Ti and Chen, Nancy F.},\n"
            "journal={NAACL},\n"
            "year={2024}\n"
            "}\n"
        )
def cross_lingual_consistency():
    """Render the "Cross-Lingual Consistency" task page.

    Shows four select boxes (prompting setup, dataset, model size and chart
    sort key) and forwards the selection to ``draw`` under the
    ``'cross_lingual'`` task id.  ``draw`` is not defined in this block; it is
    presumably supplied by the ``app.draw_diagram`` star import.
    """
    st.title("Task: Cross-Lingual Consistency")

    # Display label -> identifier handed to ``draw``.  The select-box options
    # are taken from these keys, so a selected label always has a mapping.
    shot_ids = {
        'Zero Shot': 'zero_shot',
        'Few Shot': 'few_shot',
    }
    dataset_ids = {
        'Cross-MMLU': 'cross_mmlu',
        'Cross-XQUAD': 'cross_xquad',
        'Cross-LogiQA': 'cross_logiqa',
    }

    # Five equal-width columns; the fourth one is left unused.
    left, center, middle, _, right = st.columns([0.2, 0.2, 0.2, 0.2, 0.2])
    with left:
        shot_label = st.selectbox('Zero or Few Shot', list(shot_ids))
    with center:
        dataset_label = st.selectbox('Dataset', list(dataset_ids))
    with middle:
        model_size_range = st.selectbox('Model Size', ['All', '<10B', '10B-30B', '>30B'])
    with right:
        sort = st.selectbox(
            'Sort (For Chart)',
            ['Accuracy', 'Cross-Lingual Consistency', 'AC3',
             'English', 'Chinese', 'Spanish', 'Vietnamese'],
        )

    # The sort direction is fixed; no widget exposes it.
    sortby = 'Ascending'

    draw('cross_lingual', shot_ids[shot_label], dataset_ids[dataset_label],
         sort, sortby, model_size_range)
def cultural_reasoning():
    """Render the "Cultural Reasoning" task page.

    Shows select boxes for prompting setup, dataset and model size, then
    forwards the selection to ``draw`` under the ``'cultural_reasoning'`` task
    id with ``'Accuracy'`` as the sort key.  ``draw`` is not defined in this
    block; it is presumably supplied by the ``app.draw_diagram`` star import.
    """
    st.title("Task: Cultural Reasoning")

    # Display label -> identifier handed to ``draw``.  Insertion order is the
    # order in which the options are offered; the first entry is the default.
    shot_ids = {
        'Zero Shot': 'zero_shot',
        'Few Shot': 'few_shot',
    }
    dataset_ids = {
        'SG EVAL V2 MCQ': 'sg_eval_v2_mcq',
        'SG EVAL V2 Open Ended': 'sg_eval_v2_open',
        'SG EVAL': 'sg_eval',
        'SG EVAL V1 Cleaned': 'sg_eval_v1_cleaned',
        'CN EVAL': 'cn_eval',
        'PH EVAL': 'ph_eval',
        'US EVAL': 'us_eval',
    }

    # Five equal-width columns; only the first three hold widgets.
    left, center, middle, *_ = st.columns([0.2, 0.2, 0.2, 0.2, 0.2])
    with left:
        shot_label = st.selectbox('Zero or Few Shot', list(shot_ids))
    with center:
        dataset_label = st.selectbox('Dataset', list(dataset_ids))
    with middle:
        model_size_range = st.selectbox('Model Size', ['All', '<10B', '10B-30B', '>30B'])

    # The sort direction is fixed; no widget exposes it.
    sortby = 'Ascending'

    draw('cultural_reasoning', shot_ids[shot_label], dataset_ids[dataset_label],
         'Accuracy', sortby, model_size_range)
def general_reasoning():
    """Render the "General Reasoning" task page.

    Shows select boxes for prompting setup, dataset and model size, then
    forwards the selection to ``draw`` under the ``'general_reasoning'`` task
    id with ``'Accuracy'`` as the sort key.  ``draw`` is not defined in this
    block; it is presumably supplied by the ``app.draw_diagram`` star import.
    """
    st.title("Task: General Reasoning")

    # Display label -> identifier handed to ``draw``.  Insertion order is the
    # order in which the options are offered; the first entry is the default.
    shot_ids = {
        'Zero Shot': 'zero_shot',
        'Few Shot': 'few_shot',
    }
    dataset_ids = {
        'MMLU': 'mmlu',
        'CMMLU': 'cmmlu',
        'IndoMMLU': 'indommlu',
        'C Eval': 'c_eval',
        'ZBench': 'zbench',
    }

    # Five equal-width columns; only the first three hold widgets.
    left, center, middle, *_ = st.columns([0.2, 0.2, 0.2, 0.2, 0.2])
    with left:
        shot_label = st.selectbox('Zero or Few Shot', list(shot_ids))
    with center:
        dataset_label = st.selectbox('Dataset', list(dataset_ids))
    with middle:
        model_size_range = st.selectbox('Model Size', ['All', '<10B', '10B-30B', '>30B'])

    # The sort direction is fixed; no widget exposes it.
    sortby = 'Ascending'

    draw('general_reasoning', shot_ids[shot_label], dataset_ids[dataset_label],
         'Accuracy', sortby, model_size_range)
def flores():
    """Render the "FLORES-Translation" task page.

    Shows select boxes for prompting setup, translation direction and model
    size, then forwards the selection to ``draw`` under the
    ``'flores_translation'`` task id with ``'BLEU'`` as the sort key.  ``draw``
    is not defined in this block; it is presumably supplied by the
    ``app.draw_diagram`` star import.
    """
    st.title("Task: FLORES-Translation")

    # Display label -> identifier handed to ``draw``.  The select-box options
    # are taken from these keys, so a selected label always has a mapping.
    shot_ids = {
        'Zero Shot': 'zero_shot',
        'Few Shot': 'few_shot',
    }
    dataset_ids = {
        'Indonesian to English': 'ind2eng',
        'Vietnamese to English': 'vie2eng',
        'Chinese to English': 'zho2eng',
        'Malay to English': 'zsm2eng',
    }

    # Five equal-width columns; only the first three hold widgets.
    left, center, middle, *_ = st.columns([0.2, 0.2, 0.2, 0.2, 0.2])
    with left:
        shot_label = st.selectbox('Zero or Few Shot', list(shot_ids))
    with center:
        dataset_label = st.selectbox('Dataset', list(dataset_ids))
    with middle:
        model_size_range = st.selectbox('Model Size', ['All', '<10B', '10B-30B', '>30B'])

    # The sort direction is fixed; no widget exposes it.
    sortby = 'Ascending'

    draw('flores_translation', shot_ids[shot_label], dataset_ids[dataset_label],
         'BLEU', sortby, model_size_range)
def emotion():
    """Render the "Emotion" task page.

    Shows select boxes for prompting setup, dataset and model size, then
    forwards the selection to ``draw`` under the ``'emotion'`` task id with
    ``'Accuracy'`` as the sort key.  ``draw`` is not defined in this block; it
    is presumably supplied by the ``app.draw_diagram`` star import.
    """
    st.title("Task: Emotion")

    # Display label -> identifier handed to ``draw``.  The select-box options
    # are taken from these keys, so a selected label always has a mapping.
    shot_ids = {
        'Zero Shot': 'zero_shot',
        'Few Shot': 'few_shot',
    }
    dataset_ids = {
        'Indonesian Emotion Classification': 'ind_emotion',
        'SST2': 'sst2',
    }

    # Five equal-width columns; only the first three hold widgets.
    left, center, middle, *_ = st.columns([0.2, 0.2, 0.2, 0.2, 0.2])
    with left:
        shot_label = st.selectbox('Zero or Few Shot', list(shot_ids))
    with center:
        dataset_label = st.selectbox('Dataset', list(dataset_ids))
    with middle:
        model_size_range = st.selectbox('Model Size', ['All', '<10B', '10B-30B', '>30B'])

    # The sort direction is fixed; no widget exposes it.
    sortby = 'Ascending'

    draw('emotion', shot_ids[shot_label], dataset_ids[dataset_label],
         'Accuracy', sortby, model_size_range)
def dialogue():
    """Render the "Dialogue" task page.

    Shows select boxes for prompting setup, dataset, sort metric and model
    size, then forwards the selection to ``draw`` under the ``'dialogue'`` task
    id.  The sort metrics offered depend on the chosen dataset: only
    ``'Accuracy'`` for DREAM, the ROUGE variants and their average otherwise.
    ``draw`` is not defined in this block; it is presumably supplied by the
    ``app.draw_diagram`` star import.
    """
    st.title("Task: Dialogue")

    # Display label -> identifier handed to ``draw``.  The select-box options
    # are taken from these keys, so a selected label always has a mapping.
    shot_ids = {
        'Zero Shot': 'zero_shot',
        'Few Shot': 'few_shot',
    }
    dataset_ids = {
        'DREAM': 'dream',
        'SAMSum': 'samsum',
        'DialogSum': 'dialogsum',
    }

    # Five equal-width columns; the fourth one is left unused.
    left, center, middle, _, right = st.columns([0.2, 0.2, 0.2, 0.2, 0.2])
    with left:
        shot_label = st.selectbox('Zero or Few Shot', list(shot_ids))
    with center:
        dataset_label = st.selectbox('Dataset', list(dataset_ids))
    with right:
        # The metric list is chosen from the dataset label selected above.
        if dataset_label == 'DREAM':
            sort_options = ['Accuracy']
        else:
            sort_options = ['Average', 'ROUGE-1', 'ROUGE-2', 'ROUGE-L']
        sort = st.selectbox('Sort', sort_options)
    with middle:
        model_size_range = st.selectbox('Model Size', ['All', '<10B', '10B-30B', '>30B'])

    # The sort direction is fixed; no widget exposes it.
    sortby = 'Ascending'

    draw('dialogue', shot_ids[shot_label], dataset_ids[dataset_label],
         sort, sortby, model_size_range)
def fundamental_nlp_tasks():
    """Render the "Fundamental NLP Tasks" task page.

    Shows select boxes for prompting setup, dataset and model size, then
    forwards the selection to ``draw`` under the ``'fundamental_nlp_tasks'``
    task id with ``'Accuracy'`` as the sort key.  ``draw`` is not defined in
    this block; it is presumably supplied by the ``app.draw_diagram`` star
    import.
    """
    st.title("Task: Fundamental NLP Tasks")

    # Display label -> identifier handed to ``draw``.  The select-box options
    # are taken from these keys, so a selected label always has a mapping.
    shot_ids = {
        'Zero Shot': 'zero_shot',
        'Few Shot': 'few_shot',
    }
    dataset_ids = {
        'OCNLI': 'ocnli',
        'C3': 'c3',
        'COLA': 'cola',
        'QQP': 'qqp',
        'MNLI': 'mnli',
        'QNLI': 'qnli',
        'WNLI': 'wnli',
        'RTE': 'rte',
        'MRPC': 'mrpc',
    }

    # Five equal-width columns; only the first three hold widgets.
    left, center, middle, *_ = st.columns([0.2, 0.2, 0.2, 0.2, 0.2])
    with left:
        shot_label = st.selectbox('Zero or Few Shot', list(shot_ids))
    with center:
        dataset_label = st.selectbox('Dataset', list(dataset_ids))
    with middle:
        model_size_range = st.selectbox('Model Size', ['All', '<10B', '10B-30B', '>30B'])

    # The sort direction is fixed; no widget exposes it.
    sortby = 'Ascending'

    draw('fundamental_nlp_tasks', shot_ids[shot_label], dataset_ids[dataset_label],
         'Accuracy', sortby, model_size_range)