File size: 4,408 Bytes
7feac49
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
# Copyright (c) Alibaba, Inc. and its affiliates.
from typing import Type

import gradio as gr

from swift.ui.base import BaseUI
from swift.utils import get_logger

logger = get_logger()


class Eval(BaseUI):

    group = 'llm_eval'

    locale_dict = {
        'eval_backend': {
            'label': {
                'zh': '评测后端',
                'en': 'Eval backend'
            },
            'info': {
                'zh': '选择评测后端',
                'en': 'Select eval backend'
            }
        },
        'eval_dataset': {
            'label': {
                'zh': '评测数据集',
                'en': 'Evaluation dataset'
            },
            'info': {
                'zh': '选择评测数据集,支持多选 (先选择评测后端)',
                'en': 'Select eval dataset, multiple datasets supported (select eval backend first)'
            }
        },
        'eval_limit': {
            'label': {
                'zh': '评测数据个数',
                'en': 'Eval numbers for each dataset'
            },
            'info': {
                'zh': '每个评测集的取样数',
                'en': 'Number of rows sampled from each dataset'
            }
        },
        'eval_output_dir': {
            'label': {
                'zh': '评测输出目录',
                'en': 'Eval output dir'
            },
            'info': {
                'zh': '评测结果的输出目录',
                'en': 'The dir to save the eval results'
            }
        },
        'custom_eval_config': {
            'label': {
                'zh': '自定义数据集评测配置',
                'en': 'Custom eval config'
            },
            'info': {
                'zh': '可以使用该配置评测自己的数据集,详见github文档的评测部分',
                'en': 'Use this config to eval your own datasets, check the docs in github for details'
            }
        },
        'eval_url': {
            'label': {
                'zh': '评测链接',
                'en': 'The eval url'
            },
            'info': {
                'zh':
                'OpenAI样式的评测链接(如:http://localhost:8080/v1/chat/completions),用于评测接口(模型类型输入为实际模型类型)',
                'en':
                'The OpenAI style link(like: http://localhost:8080/v1/chat/completions) for '
                'evaluation(Input actual model type into model_type)'
            }
        },
        'api_key': {
            'label': {
                'zh': '接口token',
                'en': 'The url token'
            },
            'info': {
                'zh': 'eval_url的token',
                'en': 'The token used with eval_url'
            }
        },
        'infer_backend': {
            'label': {
                'zh': '推理框架',
                'en': 'Infer backend'
            },
        }
    }

    @classmethod
    def do_build_ui(cls, base_tab: Type['BaseUI']):
        try:
            from swift.llm.argument.eval_args import EvalArguments
            eval_dataset_dict = EvalArguments.list_eval_dataset()
            default_backend = EvalArguments.eval_backend
        except Exception as e:
            logger.warn(e)
            eval_dataset_dict = {}
            default_backend = None

        with gr.Row():
            gr.Dropdown(elem_id='eval_backend', choices=list(eval_dataset_dict.keys()), value=default_backend, scale=20)
            gr.Dropdown(
                elem_id='eval_dataset',
                is_list=True,
                choices=eval_dataset_dict.get(default_backend, []),
                multiselect=True,
                allow_custom_value=True,
                scale=20)
            gr.Textbox(elem_id='eval_limit', scale=20)
            gr.Dropdown(elem_id='infer_backend', scale=20)
        with gr.Row():
            gr.Textbox(elem_id='custom_eval_config', scale=20)
            gr.Textbox(elem_id='eval_output_dir', scale=20)
            gr.Textbox(elem_id='eval_url', scale=20)
            gr.Textbox(elem_id='api_key', scale=20)

        def update_eval_dataset(backend):
            return gr.update(choices=eval_dataset_dict[backend])

        cls.element('eval_backend').change(update_eval_dataset, [cls.element('eval_backend')],
                                           [cls.element('eval_dataset')])