File size: 6,117 Bytes
3d3d712
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
import json
import os
import sys
import warnings
from typing import Any, Callable, List, Optional, Tuple

sys.path.append(os.path.join(os.path.dirname(__file__), ".."))

warnings.filterwarnings("ignore")

import pandas as pd
import yaml
from evaluator import Evaluator, ScoringPoint

from taskweaver.app.app import TaskWeaverApp


def format_output(response_obj: Any) -> str:
    """Serialize a response object to a JSON string via its ``to_dict`` method.

    Args:
        response_obj: Any object exposing a ``to_dict`` method.

    Returns:
        The JSON-encoded string of ``response_obj.to_dict()``.

    Raises:
        AssertionError: If ``response_obj`` has no ``to_dict`` method.
    """
    assert hasattr(response_obj, "to_dict"), "to_dict method is not found"
    return json.dumps(response_obj.to_dict())


def auto_evaluate_for_taskweaver(
    eval_case_file_path: str,
    interrupt_threshold: Optional[float] = None,
    event_handler: Optional[Callable[[str, str], None]] = None,
) -> List[Tuple[int, float, float]]:
    """Run one evaluation case file against TaskWeaver and score each round.

    Args:
        eval_case_file_path: Path to a YAML case file containing ``app_dir``,
            an optional ``config_var`` mapping, and a list of ``eval_query``
            rounds (each with ``user_query`` and optional ``scoring_points``
            and ``post_index``).
        interrupt_threshold: If set and > 0, stop the conversation as soon as
            a round's normalized score falls below this value.
        event_handler: Optional callback ``(event_type, message)`` forwarded
            to the session; defaults to printing each event.

    Returns:
        A list of ``(round_index, score, normalized_score)`` tuples, one per
        round that had scoring points.

    Raises:
        ValueError: If a round's ``post_index`` is neither an int nor ``None``.
    """
    with open(eval_case_file_path, "r") as f:
        eval_meta_data = yaml.safe_load(f)

    app_dir = eval_meta_data["app_dir"]
    config_var = eval_meta_data.get("config_var", None)

    app = TaskWeaverApp(app_dir=app_dir, config=config_var)
    session = app.get_session()

    taskweaver_evaluator = Evaluator()

    score_list: List[Tuple[int, float, float]] = []
    for idx, eval_query in enumerate(eval_meta_data["eval_query"]):
        user_query = eval_query["user_query"]
        print(f"Round-{idx} user query:\n", user_query)

        response_round = session.send_message(
            user_query,
            event_handler=event_handler if event_handler is not None else lambda x, y: print(f"{x}:\n{y}"),
        )

        post_index = eval_query.get("post_index", None)
        scoring_point_data = eval_query.get("scoring_points", None)
        if scoring_point_data is None:
            # Rounds without scoring points still advance the conversation
            # but contribute nothing to the score list.
            print("No scoring points are provided. Skip evaluation for this round.")
            continue
        scoring_points = [ScoringPoint(**sp) for sp in scoring_point_data]

        # post_index selects a single post from the round; None scores the
        # whole round response.
        if isinstance(post_index, int):
            response = format_output(response_round.post_list[post_index])
        elif post_index is None:
            response = format_output(response_round)
        else:
            raise ValueError("Invalid post_index")
        print("Taskweaver response:\n", response)
        score, normalized_score = taskweaver_evaluator.evaluate(user_query, response, scoring_points)
        score_list.append((idx, score, normalized_score))
        if interrupt_threshold is not None and interrupt_threshold > 0:
            if normalized_score < interrupt_threshold:
                print(
                    f"Interrupted conversation testing "
                    f"because the normalized score is lower than the threshold {interrupt_threshold}.",
                )
                break

    return score_list


def batch_auto_evaluate_for_taskweaver(
    result_file_path: str,
    eval_case_dir: str,
    flush_result_file: bool = False,
    interrupt_threshold: Optional[float] = None,
) -> None:
    """Evaluate every case file in a directory and record scores to a CSV.

    Cases already present in the result file are skipped unless
    ``flush_result_file`` is set. The CSV is rewritten after each case so a
    crash loses at most the case in progress.

    Args:
        result_file_path: Path to the CSV of results; created if missing.
        eval_case_dir: Directory whose files are each treated as one
            evaluation case (YAML, consumed by ``auto_evaluate_for_taskweaver``).
        flush_result_file: If True, discard previously stored results and
            re-evaluate every case.
        interrupt_threshold: Forwarded to ``auto_evaluate_for_taskweaver``.
    """
    if not os.path.exists(result_file_path):
        df = pd.DataFrame(columns=["case_file", "round", "score", "normalized_score"])
        df.to_csv(result_file_path, index=False)

    results = pd.read_csv(result_file_path)
    if flush_result_file:
        # Re-evaluating everything: also drop the stored rows, otherwise each
        # re-evaluated case would appear twice in the result file.
        results = results.iloc[0:0]
        evaluated_case_files = []
    else:
        evaluated_case_files = results["case_file"].tolist()
    print(f"Evaluated case files: {evaluated_case_files}")
    eval_config_files = os.listdir(eval_case_dir)
    print(f"Eval config files in case dir: {eval_config_files}")

    for eval_config_file in eval_config_files:
        if eval_config_file in evaluated_case_files:
            print(f"Skip {eval_config_file} because it has been evaluated.")
            continue
        print("------------Start evaluating------------", eval_config_file)
        eval_case_file_path = os.path.join(eval_case_dir, eval_config_file)
        score_list = auto_evaluate_for_taskweaver(
            eval_case_file_path,
            interrupt_threshold=interrupt_threshold,
        )
        for idx, score, normalized_score in score_list:
            print(f"Round-{idx} score: {score}, normalized score: {normalized_score}")
            new_res_row = pd.DataFrame(
                {
                    "case_file": eval_config_file,
                    "round": idx,
                    "score": score,
                    "normalized_score": normalized_score,
                },
                index=[0],
            )
            results = pd.concat([results, new_res_row], ignore_index=True)

        print("------------Finished evaluating------------", eval_config_file)

        # Checkpoint after every case file.
        results.to_csv(result_file_path, index=False)


if __name__ == "__main__":
    import argparse

    parser = argparse.ArgumentParser(description="Taskweaver auto evaluation script")
    parser.add_argument(
        "-m",
        "--mode",
        choices=["single", "batch"],
        required=True,
        help="Evaluation mode, single for evaluating a single case, " "batch for evaluating a batch of cases",
    )
    parser.add_argument(
        "-f",
        "--file",
        type=str,
        required=True,
        help="Path to the evaluation case file or directory containing evaluation case files",
    )
    parser.add_argument(
        "-r",
        "--result",
        type=str,
        default="sample_case_results.csv",
        help="Path to the result file for batch evaluation mode",
    )
    parser.add_argument(
        "-t",
        "--threshold",
        type=float,
        default=None,
        help="Interrupt threshold for multi-round chat",
    )
    parser.add_argument(
        "-flush",
        "--flush",
        action="store_true",
        help="Flush the result file",
    )

    args = parser.parse_args()

    if args.mode == "single":
        # Pass the CLI threshold through; previously it was parsed but
        # silently ignored (hard-coded to None).
        score_list = auto_evaluate_for_taskweaver(args.file, interrupt_threshold=args.threshold)
        for idx, score, normalized_score in score_list:
            print(f"Round-{idx} score: {score}, normalized score: {normalized_score}")
    elif args.mode == "batch":
        batch_auto_evaluate_for_taskweaver(
            args.result,
            args.file,
            flush_result_file=args.flush,
            interrupt_threshold=args.threshold,
        )