HoneyTian commited on
Commit
a984ba9
·
1 Parent(s): a0ec039
Files changed (37) hide show
  1. data/dataset/agent-bigseller-id-60-choice.jsonl +0 -0
  2. data/dataset/agent-lingoace-zh-375-v2-choice.jsonl +3 -0
  3. data/dataset/agent-nxcloud-zh-375-v2-choice.jsonl +3 -0
  4. data/eval_data/aliyun_choice/aliyun/qwen-plus-2025-12-01/shenzhen_sase/aliyun_api_key/20251208_120245/agent-lingoace-zh-400-choice.jsonl +3 -0
  5. data/eval_data/aliyun_choice/aliyun/qwen3-max-2025-09-23/shenzhen_sase/aliyun_api_key/20251208_105202/agent-lingoace-zh-400-choice.jsonl +3 -0
  6. data/eval_data/aliyun_choice/aliyun/qwen3-max-2025-09-23/shenzhen_sase/aliyun_api_key/20251208_133147/agent-lingoace-zh-400-choice.jsonl +3 -0
  7. data/eval_data/aliyun_choice/aliyun/qwen3-max-preview/shenzhen_sase/aliyun_api_key/20251208_113004/agent-lingoace-zh-400-choice.jsonl +3 -0
  8. data/eval_data/aliyun_nxcloud_v2_choice/aliyun/qwen3-max-2025-09-23/shenzhen_sase/aliyun_api_key/20251208_102934/agent-nxcloud-zh-375-v2-choice.jsonl +3 -0
  9. data/eval_data/aliyun_nxcloud_v2_choice/aliyun/qwen3-max-preview/shenzhen_sase/aliyun_api_key/20251208_110422/agent-nxcloud-zh-375-v2-choice.jsonl +3 -0
  10. data/eval_data/aws/aws/amazon.nova-lite-v1-0/shenzhen_sase/aws_us_east/20250916_133511/agent-bingoplus-ph-90-choice.jsonl +3 -0
  11. data/eval_data/aws/aws/amazon.nova-lite-v1-0/shenzhen_sase/aws_us_east/20250916_154723/agent-lingoace-zh-400-choice.jsonl +3 -0
  12. data/eval_data/aws/aws/amazon.nova-micro-v1-0/shenzhen_sase/aws_us_east/20250916_140957/agent-bingoplus-ph-90-choice.jsonl +3 -0
  13. data/eval_data/aws/aws/amazon.nova-micro-v1-0/shenzhen_sase/aws_us_east/20250916_170731/agent-lingoace-zh-400-choice.jsonl +3 -0
  14. data/eval_data/aws/aws/amazon.nova-pro-v1-0/shenzhen_sase/aws_us_east/20250916_114857/agent-bingoplus-ph-90-choice.jsonl +3 -0
  15. data/eval_data/aws/aws/amazon.nova-pro-v1-0/shenzhen_sase/aws_us_east/20250916_142846/agent-lingoace-zh-400-choice.jsonl +3 -0
  16. data/eval_data/google_anthropic/anthropic/claude-3-5-haiku@20241022/shenzhen_sase/google_nxcloud_312303/20250910_100415/agent-lingoace-zh-400-choice.jsonl +3 -0
  17. data/eval_data/google_anthropic/anthropic/claude-3-5-sonnet-v2@20241022/shenzhen_sase/google_nxcloud_312303/20250910_100113/agent-lingoace-zh-400-choice.jsonl +0 -0
  18. data/eval_data/google_anthropic/anthropic/claude-3-5-sonnet@20240620/shenzhen_sase/google_nxcloud_312303/20250910_100441/agent-lingoace-zh-400-choice.jsonl +0 -0
  19. data/eval_data/google_anthropic/anthropic/claude-3-7-sonnet@20250219/shenzhen_sase/google_nxcloud_312303/20250910_100042/agent-lingoace-zh-400-choice.jsonl +3 -0
  20. data/eval_data/google_anthropic/anthropic/claude-3-haiku@20240307/shenzhen_sase/google_nxcloud_312303/20250910_100501/agent-lingoace-zh-400-choice.jsonl +0 -0
  21. data/eval_data/google_anthropic/anthropic/claude-3-opus@20240229/shenzhen_sase/google_nxcloud_312303/20250910_100451/agent-lingoace-zh-400-choice.jsonl +0 -0
  22. data/eval_data/google_anthropic/anthropic/claude-opus-4-1@20250805/shenzhen_sase/google_nxcloud_312303/20250910_095955/agent-lingoace-zh-400-choice.jsonl +3 -0
  23. examples/ali_communication/make_dataset.py +85 -0
  24. examples/kms/get_aliyun_dev_apikey.py +41 -0
  25. examples/make_dataset/make_choice.py +70 -0
  26. examples/make_dataset/make_choice_lingoace_v2.py +141 -0
  27. examples/make_raw_dataset/step_1_make_hk_dataset_by_log.py +2 -1
  28. examples/make_raw_dataset/step_3_filter_by_keywords.py +5 -0
  29. examples/make_raw_dataset/step_6_filter_by_choice.py +1 -1
  30. llm_eval_script/aliyun_choice.py +173 -0
  31. llm_eval_script/aliyun_nxcloud_v2_choice.py +233 -0
  32. llm_eval_script/aws.py +227 -0
  33. llm_eval_script/google_anthropic.py +10 -1
  34. main.py +3 -0
  35. requirements.txt +4 -0
  36. toolbox/aliyun_kms/__init__.py +6 -0
  37. toolbox/aliyun_kms/aliyun_kms.py +122 -0
data/dataset/agent-bigseller-id-60-choice.jsonl ADDED
File without changes
data/dataset/agent-lingoace-zh-375-v2-choice.jsonl ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:3a036052d750daf27450c520c1f5c7257077783a31d3a3fb43bf0e228ab22e80
3
+ size 1239647
data/dataset/agent-nxcloud-zh-375-v2-choice.jsonl ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:4099cf306583c063b4bf69a485187ade46f45410843561154a3d8d50001b0bd3
3
+ size 1238502
data/eval_data/aliyun_choice/aliyun/qwen-plus-2025-12-01/shenzhen_sase/aliyun_api_key/20251208_120245/agent-lingoace-zh-400-choice.jsonl ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:bda5b0d743a600262716a089ec6eba8b21d6eb4bca2286443948f86035954f9b
3
+ size 1233921
data/eval_data/aliyun_choice/aliyun/qwen3-max-2025-09-23/shenzhen_sase/aliyun_api_key/20251208_105202/agent-lingoace-zh-400-choice.jsonl ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:e02b6dbf7cde1d9ee61289bb43af9082d695b38c966474e3b4e1015c54bdd7b2
3
+ size 1211356
data/eval_data/aliyun_choice/aliyun/qwen3-max-2025-09-23/shenzhen_sase/aliyun_api_key/20251208_133147/agent-lingoace-zh-400-choice.jsonl ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:b39f27725a32fc91d5cdfa594ea2e67d2f6d1f533b7b6718faab62b93fa21e34
3
+ size 1233777
data/eval_data/aliyun_choice/aliyun/qwen3-max-preview/shenzhen_sase/aliyun_api_key/20251208_113004/agent-lingoace-zh-400-choice.jsonl ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:31ec4900d031c7f0fa507b38eb226d0e3c255e4e921669d4776d93685d78f0cc
3
+ size 1211172
data/eval_data/aliyun_nxcloud_v2_choice/aliyun/qwen3-max-2025-09-23/shenzhen_sase/aliyun_api_key/20251208_102934/agent-nxcloud-zh-375-v2-choice.jsonl ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:e83e4cdcecfdd3b3444e5f2852d2f6dbd6db34ecd4e1a96a4ff1355185986869
3
+ size 1081363
data/eval_data/aliyun_nxcloud_v2_choice/aliyun/qwen3-max-preview/shenzhen_sase/aliyun_api_key/20251208_110422/agent-nxcloud-zh-375-v2-choice.jsonl ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:f99aead6576f9f2409be441d891112c91d9b064a601523e434c7e852ed614a4a
3
+ size 1081322
data/eval_data/aws/aws/amazon.nova-lite-v1-0/shenzhen_sase/aws_us_east/20250916_133511/agent-bingoplus-ph-90-choice.jsonl ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:b7dbdf8e6f5ebc2cc0fd3f32f297efb627c559e12656f603327ddf78f09a0c01
3
+ size 258621
data/eval_data/aws/aws/amazon.nova-lite-v1-0/shenzhen_sase/aws_us_east/20250916_154723/agent-lingoace-zh-400-choice.jsonl ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:3699747d0b0888affdf832c462ae58b9aa06d080729ba3ba174125f873cd412f
3
+ size 1204448
data/eval_data/aws/aws/amazon.nova-micro-v1-0/shenzhen_sase/aws_us_east/20250916_140957/agent-bingoplus-ph-90-choice.jsonl ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:5dadd78180802b49a905a850317eb54f0e183685dfbc6d4eee8e5ab5a7e50677
3
+ size 258563
data/eval_data/aws/aws/amazon.nova-micro-v1-0/shenzhen_sase/aws_us_east/20250916_170731/agent-lingoace-zh-400-choice.jsonl ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:c54b42d8739211d1aa7da75041c3a754f4c88ca4b2f55e26908adb3a31f48565
3
+ size 1205834
data/eval_data/aws/aws/amazon.nova-pro-v1-0/shenzhen_sase/aws_us_east/20250916_114857/agent-bingoplus-ph-90-choice.jsonl ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:951c1df603efe55563fbe2a824233805192adbcf1592b41b9b975070b406356b
3
+ size 258625
data/eval_data/aws/aws/amazon.nova-pro-v1-0/shenzhen_sase/aws_us_east/20250916_142846/agent-lingoace-zh-400-choice.jsonl ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:ed19fe093a4d6457297bef741fdf283313c462544ed014fe40e58c6331e5700e
3
+ size 1204189
data/eval_data/google_anthropic/anthropic/claude-3-5-haiku@20241022/shenzhen_sase/google_nxcloud_312303/20250910_100415/agent-lingoace-zh-400-choice.jsonl ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:9ed17f71bbde50f38be38a32f3caea001df9f0e2e22f2f189541f5ae1dbaa4b0
3
+ size 9134
data/eval_data/google_anthropic/anthropic/claude-3-5-sonnet-v2@20241022/shenzhen_sase/google_nxcloud_312303/20250910_100113/agent-lingoace-zh-400-choice.jsonl ADDED
File without changes
data/eval_data/google_anthropic/anthropic/claude-3-5-sonnet@20240620/shenzhen_sase/google_nxcloud_312303/20250910_100441/agent-lingoace-zh-400-choice.jsonl ADDED
File without changes
data/eval_data/google_anthropic/anthropic/claude-3-7-sonnet@20250219/shenzhen_sase/google_nxcloud_312303/20250910_100042/agent-lingoace-zh-400-choice.jsonl ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:017dd25dc264762cc26eb201149b9bc7a18f81381174b92b0d619665655cb93e
3
+ size 26705
data/eval_data/google_anthropic/anthropic/claude-3-haiku@20240307/shenzhen_sase/google_nxcloud_312303/20250910_100501/agent-lingoace-zh-400-choice.jsonl ADDED
File without changes
data/eval_data/google_anthropic/anthropic/claude-3-opus@20240229/shenzhen_sase/google_nxcloud_312303/20250910_100451/agent-lingoace-zh-400-choice.jsonl ADDED
File without changes
data/eval_data/google_anthropic/anthropic/claude-opus-4-1@20250805/shenzhen_sase/google_nxcloud_312303/20250910_095955/agent-lingoace-zh-400-choice.jsonl ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:a079d7731f824df3d2bd5e758d58c31b41b6412c5253cb4e8f21493724a85571
3
+ size 14137
examples/ali_communication/make_dataset.py ADDED
@@ -0,0 +1,85 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #!/usr/bin/python3
2
+ # -*- coding: utf-8 -*-
3
+ import argparse
4
+ import json
5
+
6
+ import pandas as pd
7
+
8
+ from project_settings import environment, project_path
9
+
10
+
11
+ def get_args():
12
+ parser = argparse.ArgumentParser()
13
+ parser.add_argument(
14
+ "--filename",
15
+ default="evaluation_results_max7.xlsx",
16
+ type=str
17
+ )
18
+ parser.add_argument(
19
+ "--dataset",
20
+ default=(project_path / "data/dataset/agent-lingoace-zh-400-choice.jsonl").as_posix(),
21
+ type=str
22
+ )
23
+ args = parser.parse_args()
24
+ return args
25
+
26
+
27
+ def main():
28
+ args = get_args()
29
+
30
+ dataset = dict()
31
+ with open(args.dataset, "r", encoding="utf-8") as f:
32
+ for row in f:
33
+ row = json.loads(row)
34
+ idx = row["idx"]
35
+ prompt = row["prompt"]
36
+ response = row["response"]
37
+ dataset[idx] = row
38
+
39
+ result = list()
40
+ df = pd.read_excel(args.filename)
41
+ for i, row in df.iterrows():
42
+ # print(row)
43
+ idx = row["idx"]
44
+ conversation = row["conversation"]
45
+ expected = row["expected"]
46
+ actual_label = row["actual_label"]
47
+ actual_reason = row["actual_reason"]
48
+ correct = row["correct"]
49
+ note = row["note"]
50
+
51
+ if correct is False:
52
+ print(idx)
53
+ print(conversation)
54
+ print(expected, actual_label)
55
+ print(actual_reason)
56
+ print(note)
57
+ print("+" * 150)
58
+
59
+ dataset_ = dataset[idx]
60
+ prompt = dataset_["prompt"]
61
+ response = dataset_["response"]
62
+ print(prompt)
63
+ print(response)
64
+ print("-" * 150)
65
+
66
+ result.append({
67
+ "idx": idx,
68
+ "conversation": conversation,
69
+ "expected": expected,
70
+ "actual_label": actual_label,
71
+ "actual_reason": actual_reason,
72
+ "note": note,
73
+ "prompt": prompt,
74
+ "response": response,
75
+ "op": None,
76
+ "remark": None,
77
+
78
+ })
79
+ result = pd.DataFrame(result)
80
+ result.to_excel("result.xlsx", index=False)
81
+ return
82
+
83
+
84
+ if __name__ == "__main__":
85
+ main()
examples/kms/get_aliyun_dev_apikey.py ADDED
@@ -0,0 +1,41 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #!/usr/bin/python3
2
+ # -*- coding: utf-8 -*-
3
+ import argparse
4
+
5
+ from project_settings import environment, project_path
6
+ from toolbox.aliyun_kms.aliyun_kms import AliyunKMS
7
+
8
+
9
+ def get_args():
10
+ parser = argparse.ArgumentParser()
11
+ parser.add_argument(
12
+ "--secret_name",
13
+ default="aliyun-chn-llm-dev",
14
+ type=str
15
+ )
16
+ args = parser.parse_args()
17
+ return args
18
+
19
+
20
+
21
+ def main():
22
+ args = get_args()
23
+
24
+ access_key_id = environment.get("ALIBABA_CLOUD_ACCESS_KEY_ID")
25
+ access_key_secret = environment.get("ALIBABA_CLOUD_ACCESS_KEY_SECRET")
26
+
27
+ kms_manager = AliyunKMS(
28
+ access_key_id=access_key_id,
29
+ access_key_secret=access_key_secret,
30
+ )
31
+
32
+ js = kms_manager.get_secret_value(args.secret_name)
33
+ secret_data = js["body"]["SecretData"]
34
+
35
+ # sk-6728fced6fd848149ebbb7c3899cc043
36
+ print(secret_data)
37
+ return
38
+
39
+
40
+ if __name__ == "__main__":
41
+ main()
examples/make_dataset/make_choice.py ADDED
@@ -0,0 +1,70 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #!/usr/bin/python3
2
+ # -*- coding: utf-8 -*-
3
+ import argparse
4
+ import json
5
+ import os
6
+ from pathlib import Path
7
+ import sys
8
+ import time
9
+
10
+ pwd = os.path.abspath(os.path.dirname(__file__))
11
+ sys.path.append(os.path.join(pwd, "../../"))
12
+
13
+ from project_settings import environment, project_path
14
+
15
+
16
+ def get_args():
17
+ parser = argparse.ArgumentParser()
18
+ parser.add_argument(
19
+ "--raw_dataset",
20
+ default=(project_path / "data/raw_dataset/agent-bigseller-id-60-choice").as_posix(),
21
+ type=str
22
+ )
23
+ parser.add_argument(
24
+ "--dataset",
25
+ default=(project_path / "data/dataset/agent-bigseller-id-60-choice.jsonl").as_posix(),
26
+ type=str
27
+ )
28
+ args = parser.parse_args()
29
+ return args
30
+
31
+
32
+ def main():
33
+ args = get_args()
34
+
35
+ raw_dataset = Path(args.raw_dataset)
36
+ dataset = Path(args.dataset)
37
+ dataset.parent.mkdir(parents=True, exist_ok=True)
38
+
39
+ with open(dataset.as_posix(), "w", encoding="utf-8") as fout:
40
+ for sample_dir in raw_dataset.glob("*"):
41
+ idx = sample_dir.parts[-1]
42
+ system_prompt_file = sample_dir / "system_prompt.txt"
43
+ user_prompt_file = sample_dir / "user_prompt.txt"
44
+ response_file = sample_dir / "response.txt"
45
+
46
+ with open(system_prompt_file.as_posix(), "r", encoding="utf-8") as f:
47
+ system_prompt = f.read()
48
+ with open(user_prompt_file.as_posix(), "r", encoding="utf-8") as f:
49
+ user_prompt = f.read()
50
+ with open(response_file.as_posix(), "r", encoding="utf-8") as f:
51
+ response = f.read()
52
+
53
+ prompt = f"""{system_prompt}\n\n{user_prompt}""".strip()
54
+
55
+ print(f"{prompt}\n\n{response}")
56
+ print("-" * 150)
57
+
58
+ row_ = {
59
+ "idx": idx,
60
+ "prompt": prompt,
61
+ "response": response,
62
+ }
63
+ row_ = json.dumps(row_, ensure_ascii=False)
64
+ fout.write(f"{row_}\n")
65
+
66
+ return
67
+
68
+
69
+ if __name__ == "__main__":
70
+ main()
examples/make_dataset/make_choice_lingoace_v2.py ADDED
@@ -0,0 +1,141 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #!/usr/bin/python3
2
+ # -*- coding: utf-8 -*-
3
+ import argparse
4
+ import json
5
+ import os
6
+ from pathlib import Path
7
+ import re
8
+ import sys
9
+ import time
10
+
11
+ pwd = os.path.abspath(os.path.dirname(__file__))
12
+ sys.path.append(os.path.join(pwd, "../../"))
13
+
14
+ from project_settings import environment, project_path
15
+
16
+
17
+ def get_args():
18
+ parser = argparse.ArgumentParser()
19
+ parser.add_argument(
20
+ "--raw_dataset",
21
+ default=(project_path / "data/raw_dataset/finished/agent-lingoace-zh-375-choice-v2").as_posix(),
22
+ type=str
23
+ )
24
+ parser.add_argument(
25
+ "--dataset",
26
+ default=(project_path / "data/dataset/agent-lingoace-zh-375-choice-v2.jsonl").as_posix(),
27
+ type=str
28
+ )
29
+ args = parser.parse_args()
30
+ return args
31
+
32
+
33
+ def main():
34
+ args = get_args()
35
+
36
+ raw_dataset = Path(args.raw_dataset)
37
+ dataset = Path(args.dataset)
38
+ dataset.parent.mkdir(parents=True, exist_ok=True)
39
+
40
+ with open(dataset.as_posix(), "w", encoding="utf-8") as fout:
41
+ for sample_dir in raw_dataset.glob("*"):
42
+ idx = sample_dir.parts[-1]
43
+ system_prompt_file = sample_dir / "system_prompt.txt"
44
+ user_prompt_file = sample_dir / "user_prompt.txt"
45
+ response_file = sample_dir / "response.txt"
46
+
47
+ with open(system_prompt_file.as_posix(), "r", encoding="utf-8") as f:
48
+ system_prompt = f.read()
49
+ with open(user_prompt_file.as_posix(), "r", encoding="utf-8") as f:
50
+ user_prompt = f.read()
51
+ with open(response_file.as_posix(), "r", encoding="utf-8") as f:
52
+ response = f.read()
53
+
54
+ # conversation
55
+ pattern = r"\*Conversation starts\*(.*)\*Conversation ends\*"
56
+ match = re.search(pattern, user_prompt, flags=re.DOTALL)
57
+ if match is None:
58
+ raise AssertionError
59
+ conversation = match.group(1)
60
+ pattern = r'(client:|customer service:)([^\n]*)'
61
+ matches = re.findall(pattern, conversation)
62
+ conversation_ = list()
63
+ for speaker, content in matches:
64
+ if speaker == "customer service:":
65
+ speaker = "assistant"
66
+ elif speaker == "client:":
67
+ speaker = "user"
68
+ else:
69
+ raise AssertionError(speaker)
70
+ conversation_.append({
71
+ "role": speaker,
72
+ "content": content,
73
+ })
74
+
75
+ # examples
76
+ pattern = r"\*Conversation ends\*(.*)\*\*Output\*\*"
77
+ match = re.search(pattern, user_prompt, flags=re.DOTALL)
78
+ if match is not None:
79
+ examples = match.group(0)
80
+ else:
81
+ examples = ""
82
+
83
+ examples_ = list()
84
+ pattern = re.compile(r'(?m)^\[(用户|你)\]:\s*"([^"]*)"\s*$|^输出:\s*(\S+)\s*$|^解释:\s*(.+)\s*$')
85
+ example_conversation_ = list()
86
+ outputs = dict()
87
+ for m in pattern.finditer(examples):
88
+ speaker, content, out, explanation = m.group(1), m.group(2), m.group(3), m.group(4)
89
+ if speaker:
90
+ if speaker == "你":
91
+ # speaker = "customer service"
92
+ speaker = "assistant"
93
+ elif speaker == "用户":
94
+ # speaker = "client"
95
+ speaker = "user"
96
+ else:
97
+ raise AssertionError
98
+ conversation_turn = {"role": speaker, "content": content}
99
+ example_conversation_.append(conversation_turn)
100
+ elif out:
101
+ outputs["output"] = out
102
+ elif explanation:
103
+ outputs["explanation"] = explanation
104
+ examples_.append({
105
+ "conversation": example_conversation_,
106
+ "outputs": outputs,
107
+ })
108
+ example_conversation_ = list()
109
+ outputs = dict()
110
+
111
+ splits = user_prompt.split("**Output**")
112
+ choice = splits[1].strip()
113
+ pattern = r'If (.*?)output ([A-F])'
114
+ matches = re.findall(pattern, choice, re.DOTALL)
115
+ choices_ = list()
116
+ for condition, output_letter in matches:
117
+ condition_ = f"If {condition[:-2]}"
118
+ choice_letter = output_letter
119
+ row = {
120
+ "condition": condition_,
121
+ "choice_letter": choice_letter,
122
+ }
123
+ choices_.append(row)
124
+
125
+ row = {
126
+ "idx": idx,
127
+ "system_prompt": system_prompt,
128
+ "conversation": conversation_,
129
+ "examples": examples_,
130
+ "choices": choices_,
131
+ "response": response,
132
+ }
133
+ row = json.dumps(row, ensure_ascii=False)
134
+ fout.write(f"{row}\n")
135
+ fout.flush()
136
+
137
+ return
138
+
139
+
140
+ if __name__ == "__main__":
141
+ main()
examples/make_raw_dataset/step_1_make_hk_dataset_by_log.py CHANGED
@@ -133,7 +133,8 @@ def main():
133
  row = extract(row)
134
  except Exception as e:
135
  print(row)
136
- raise e
 
137
  call_id = row["call_id"]
138
  system_prompt = row.get("system_prompt")
139
  conversation = row.get("conversation")
 
133
  row = extract(row)
134
  except Exception as e:
135
  print(row)
136
+ # raise e
137
+ continue
138
  call_id = row["call_id"]
139
  system_prompt = row.get("system_prompt")
140
  conversation = row.get("conversation")
examples/make_raw_dataset/step_3_filter_by_keywords.py CHANGED
@@ -69,6 +69,11 @@ def main():
69
  (["作为VIP客户"], "vip"),
70
  (["FedEx"], "fedex"),
71
  (["Chinese laser cutting"], "laser"),
 
 
 
 
 
72
  ]
73
 
74
  flag = False
 
69
  (["作为VIP客户"], "vip"),
70
  (["FedEx"], "fedex"),
71
  (["Chinese laser cutting"], "laser"),
72
+ (["Bigseller"], "bigseller"),
73
+ (["BigSeller"], "bigseller"),
74
+ (["ERP"], "bigseller"),
75
+ (["product"], "promote"),
76
+ (["川芎红花苗灸液"], "promote"),
77
  ]
78
 
79
  flag = False
examples/make_raw_dataset/step_6_filter_by_choice.py CHANGED
@@ -12,7 +12,7 @@ def get_args():
12
  parser = argparse.ArgumentParser()
13
  parser.add_argument(
14
  "--data_dir",
15
- default=(project_path / "data/llm-log-hk/extract-dataset/choice-nxpay").as_posix(),
16
  type=str
17
  )
18
  args = parser.parse_args()
 
12
  parser = argparse.ArgumentParser()
13
  parser.add_argument(
14
  "--data_dir",
15
+ default=(project_path / "data/llm-log-hk/extract-dataset/choice-promote").as_posix(),
16
  type=str
17
  )
18
  args = parser.parse_args()
llm_eval_script/aliyun_choice.py ADDED
@@ -0,0 +1,173 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #!/usr/bin/python3
2
+ # -*- coding: utf-8 -*-
3
+ import argparse
4
+ from datetime import datetime
5
+ import json
6
+ import os
7
+ from pathlib import Path
8
+ import sys
9
+ import time
10
+ from zoneinfo import ZoneInfo # Python 3.9+ 自带,无需安装
11
+
12
+ pwd = os.path.abspath(os.path.dirname(__file__))
13
+ sys.path.append(os.path.join(pwd, "../"))
14
+
15
+ from openai import OpenAI
16
+
17
+ from project_settings import environment, project_path
18
+
19
+
20
+ def get_args():
21
+ parser = argparse.ArgumentParser()
22
+ parser.add_argument(
23
+ "--model_name",
24
+ default="qwen3-max-2025-09-23",
25
+ # default="qwen3-max-preview",
26
+ # default="qwen-plus-2025-12-01",
27
+ type=str
28
+ )
29
+ parser.add_argument(
30
+ "--eval_dataset_name",
31
+ default="agent-lingoace-zh-400-choice.jsonl",
32
+ type=str
33
+ )
34
+ parser.add_argument(
35
+ "--eval_dataset_dir",
36
+ default=(project_path / "data/dataset").as_posix(),
37
+ type=str
38
+ )
39
+ parser.add_argument(
40
+ "--eval_data_dir",
41
+ default=(project_path / "data/eval_data").as_posix(),
42
+ type=str
43
+ )
44
+ parser.add_argument(
45
+ "--client",
46
+ default="shenzhen_sase",
47
+ type=str
48
+ )
49
+ parser.add_argument(
50
+ "--service",
51
+ default="aliyun_api_key",
52
+ type=str
53
+ )
54
+ parser.add_argument(
55
+ "--create_time_str",
56
+ default="null",
57
+ # default="20250812_092418",
58
+ type=str
59
+ )
60
+ parser.add_argument(
61
+ "--interval",
62
+ default=1,
63
+ type=int
64
+ )
65
+ args = parser.parse_args()
66
+ return args
67
+
68
+
69
+ def main():
70
+ args = get_args()
71
+
72
+ eval_dataset_dir = Path(args.eval_dataset_dir)
73
+ eval_dataset_dir.mkdir(parents=True, exist_ok=True)
74
+ eval_data_dir = Path(args.eval_data_dir)
75
+ eval_data_dir.mkdir(parents=True, exist_ok=True)
76
+
77
+ if args.create_time_str == "null":
78
+ tz = ZoneInfo("Asia/Shanghai")
79
+ now = datetime.now(tz)
80
+ create_time_str = now.strftime("%Y%m%d_%H%M%S")
81
+ # create_time_str = "20250724_090615"
82
+ else:
83
+ create_time_str = args.create_time_str
84
+
85
+ eval_dataset = eval_dataset_dir / args.eval_dataset_name
86
+
87
+ model_name_ = args.model_name.replace("/", "#")
88
+ output_file = eval_data_dir / f"aliyun_choice/aliyun/{model_name_}/{args.client}/{args.service}/{create_time_str}/{args.eval_dataset_name}"
89
+ output_file.parent.mkdir(parents=True, exist_ok=True)
90
+
91
+ api_key = environment.get(args.service, dtype=str)
92
+ client = OpenAI(
93
+ base_url="https://dashscope.aliyuncs.com/compatible-mode/v1",
94
+ # Read your Ark API Key from the environment variable.
95
+ api_key=api_key
96
+ )
97
+
98
+ total = 0
99
+ total_correct = 0
100
+
101
+ # finished
102
+ finished_idx_set = set()
103
+ if os.path.exists(output_file.as_posix()):
104
+ with open(output_file.as_posix(), "r", encoding="utf-8") as f:
105
+ for row in f:
106
+ row = json.loads(row)
107
+ idx = row["idx"]
108
+ total = row["total"]
109
+ total_correct = row["total_correct"]
110
+ finished_idx_set.add(idx)
111
+ print(f"finished count: {len(finished_idx_set)}")
112
+
113
+ with open(eval_dataset.as_posix(), "r", encoding="utf-8") as fin, open(output_file.as_posix(), "a+", encoding="utf-8") as fout:
114
+ for row in fin:
115
+ row = json.loads(row)
116
+ idx = row["idx"]
117
+ prompt = row["prompt"]
118
+ response = row["response"]
119
+
120
+ if idx in finished_idx_set:
121
+ continue
122
+ finished_idx_set.add(idx)
123
+
124
+ try:
125
+ time.sleep(args.interval)
126
+ print(f"sleep: {args.interval}")
127
+ time_begin = time.time()
128
+ completion = client.chat.completions.create(
129
+ model=args.model_name,
130
+ messages=[
131
+ {"role": "user", "content": prompt},
132
+ ],
133
+ # 由于 enable_thinking 非 OpenAI 标准参数,需要通过 extra_body 传入
134
+ extra_body={"enable_thinking": False},
135
+ stream=False,
136
+ )
137
+ time_cost = time.time() - time_begin
138
+ print(f"time_cost: {time_cost}")
139
+ except Exception as e:
140
+ print(f"request failed, error type: {type(e)}, error text: {str(e)}")
141
+ continue
142
+
143
+ # print(f"completion: {completion}")
144
+ prediction = completion.choices[0].message.content
145
+ rid = completion.id
146
+
147
+ correct = 1 if prediction == response else 0
148
+
149
+ total += 1
150
+ total_correct += correct
151
+ score = total_correct / total
152
+
153
+ row_ = {
154
+ "idx": idx,
155
+ "rid": rid,
156
+ "prompt": prompt,
157
+ "response": response,
158
+ "prediction": prediction,
159
+ "correct": correct,
160
+ "total": total,
161
+ "total_correct": total_correct,
162
+ "score": score,
163
+ "time_cost": time_cost,
164
+ }
165
+ row_ = json.dumps(row_, ensure_ascii=False)
166
+ fout.write(f"{row_}\n")
167
+ fout.flush()
168
+
169
+ return
170
+
171
+
172
+ if __name__ == "__main__":
173
+ main()
llm_eval_script/aliyun_nxcloud_v2_choice.py ADDED
@@ -0,0 +1,233 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #!/usr/bin/python3
2
+ # -*- coding: utf-8 -*-
3
+ """
4
+ https://help.aliyun.com/zh/model-studio/qwen-api-reference
5
+ https://help.aliyun.com/zh/model-studio/models
6
+ https://help.aliyun.com/zh/model-studio/models?spm=a2c4g.11186623.0.i4#d4ccf72f23jh9
7
+
8
+ https://help.aliyun.com/zh/model-studio/text-generation?spm=a2c4g.11186623.0.0.6b772e068nnT1J#24e54b27d4agt
9
+
10
+ Deep-Thinking
11
+ https://help.aliyun.com/zh/model-studio/deep-thinking?spm=a2c4g.11186623.0.0.56076f58IJd4mP
12
+
13
+ """
14
+ import argparse
15
+ from datetime import datetime
16
+ import json
17
+ import os
18
+ from pathlib import Path
19
+ import sys
20
+ import time
21
+ from zoneinfo import ZoneInfo # Python 3.9+ 自带,无需安装
22
+
23
+ pwd = os.path.abspath(os.path.dirname(__file__))
24
+ sys.path.append(os.path.join(pwd, "../"))
25
+
26
+ from openai import OpenAI
27
+
28
+ from project_settings import environment, project_path
29
+
30
+
31
+ def get_args():
32
+ parser = argparse.ArgumentParser()
33
+ parser.add_argument(
34
+ "--model_name",
35
+ # default="qwen3-max-2025-09-23",
36
+ default="qwen3-max-preview",
37
+ # default="qwen-plus-2025-12-01",
38
+ type=str
39
+ )
40
+ parser.add_argument(
41
+ "--eval_dataset_name",
42
+ default="agent-nxcloud-zh-375-choice-v2.jsonl",
43
+ type=str
44
+ )
45
+ parser.add_argument(
46
+ "--eval_dataset_dir",
47
+ default=(project_path / "data/dataset").as_posix(),
48
+ type=str
49
+ )
50
+ parser.add_argument(
51
+ "--eval_data_dir",
52
+ default=(project_path / "data/eval_data").as_posix(),
53
+ type=str
54
+ )
55
+ parser.add_argument(
56
+ "--client",
57
+ default="shenzhen_sase",
58
+ type=str
59
+ )
60
+ parser.add_argument(
61
+ "--service",
62
+ default="aliyun_api_key",
63
+ type=str
64
+ )
65
+ parser.add_argument(
66
+ "--create_time_str",
67
+ default="null",
68
+ # default="20250812_092418",
69
+ type=str
70
+ )
71
+ parser.add_argument(
72
+ "--interval",
73
+ default=1,
74
+ type=int
75
+ )
76
+ args = parser.parse_args()
77
+ return args
78
+
79
+
80
+ def conversation_to_str(conversation: list):
81
+ conversation_str = ""
82
+ for turn in conversation:
83
+ role = turn["role"]
84
+ content = turn["content"]
85
+ row_ = f"{role}: {content}\n"
86
+ conversation_str += row_
87
+
88
+ return conversation_str
89
+
90
+
91
+ def main():
92
+ args = get_args()
93
+
94
+ eval_dataset_dir = Path(args.eval_dataset_dir)
95
+ eval_dataset_dir.mkdir(parents=True, exist_ok=True)
96
+ eval_data_dir = Path(args.eval_data_dir)
97
+ eval_data_dir.mkdir(parents=True, exist_ok=True)
98
+
99
+ if args.create_time_str == "null":
100
+ tz = ZoneInfo("Asia/Shanghai")
101
+ now = datetime.now(tz)
102
+ create_time_str = now.strftime("%Y%m%d_%H%M%S")
103
+ # create_time_str = "20250724_090615"
104
+ else:
105
+ create_time_str = args.create_time_str
106
+
107
+ eval_dataset = eval_dataset_dir / args.eval_dataset_name
108
+
109
+ model_name_ = args.model_name.replace("/", "#")
110
+ output_file = eval_data_dir / f"aliyun_nxcloud_v2_choice/aliyun/{model_name_}/{args.client}/{args.service}/{create_time_str}/{args.eval_dataset_name}"
111
+ output_file.parent.mkdir(parents=True, exist_ok=True)
112
+
113
+ api_key = environment.get(args.service, dtype=str)
114
+ client = OpenAI(
115
+ base_url="https://dashscope.aliyuncs.com/compatible-mode/v1",
116
+ # Read your Ark API Key from the environment variable.
117
+ api_key=api_key
118
+ )
119
+
120
+ total = 0
121
+ total_correct = 0
122
+
123
+ # finished
124
+ finished_idx_set = set()
125
+ if os.path.exists(output_file.as_posix()):
126
+ with open(output_file.as_posix(), "r", encoding="utf-8") as f:
127
+ for row in f:
128
+ row = json.loads(row)
129
+ idx = row["idx"]
130
+ total = row["total"]
131
+ total_correct = row["total_correct"]
132
+ finished_idx_set.add(idx)
133
+ print(f"finished count: {len(finished_idx_set)}")
134
+
135
+ with open(eval_dataset.as_posix(), "r", encoding="utf-8") as fin, open(output_file.as_posix(), "a+", encoding="utf-8") as fout:
136
+ for row in fin:
137
+ row = json.loads(row)
138
+ idx = row["idx"]
139
+ system_prompt = row["system_prompt"]
140
+ conversation = row["conversation"]
141
+ examples = row["examples"]
142
+ choices = row["choices"]
143
+ response = row["response"]
144
+
145
+ if idx in finished_idx_set:
146
+ continue
147
+
148
+ # conversation
149
+ conversation_str = conversation_to_str(conversation)
150
+
151
+ examples_str = ""
152
+ for example in examples:
153
+ conversation_ = example["conversation"]
154
+ outputs = example["outputs"]
155
+ output = outputs["output"]
156
+ explanation = outputs["explanation"]
157
+
158
+ examples_str += conversation_to_str(conversation_)
159
+ examples_str += f"Output: {output}\n"
160
+ examples_str += f"Explanation: {explanation}\n\n"
161
+
162
+ # print(examples_str)
163
+
164
+ choices_str = ""
165
+ for choice in choices:
166
+ condition = choice["condition"]
167
+ choice_letter = choice["choice_letter"]
168
+
169
+ row_ = f"{condition}, output: {choice_letter}\n"
170
+ choices_str += row_
171
+ choices_str += "\nRemember to output ONLY the corresponding letter.\nYour output is:"
172
+
173
+ # prompt = f"{system_prompt}\n\n**Output**\n{choices_}\n**Examples**\n{examples_}"
174
+ prompt1 = f"{system_prompt}\n\n**Examples**\n{examples_str}"
175
+ prompt2 = f"**Conversation**\n{conversation_str}\n\n**Output**\n{choices_str}"
176
+ print(prompt1)
177
+ print(prompt2)
178
+
179
+ messages = list()
180
+ messages.append(
181
+ {"role": "system", "content": prompt1},
182
+ )
183
+ messages.append(
184
+ {"role": "user", "content": prompt2},
185
+ )
186
+ print(f"messages: {json.dumps(messages, ensure_ascii=False, indent=4)}")
187
+
188
+ try:
189
+ time.sleep(args.interval)
190
+ print(f"sleep: {args.interval}")
191
+ time_begin = time.time()
192
+ completion = client.chat.completions.create(
193
+ model=args.model_name,
194
+ messages=messages,
195
+ # 由于 enable_thinking 非 OpenAI 标准参数,需要通过 extra_body 传入
196
+ extra_body={"enable_thinking": False},
197
+ stream=False,
198
+ )
199
+ time_cost = time.time() - time_begin
200
+ print(f"time_cost: {time_cost}")
201
+ except Exception as e:
202
+ print(f"request failed, error type: {type(e)}, error text: {str(e)}")
203
+ continue
204
+
205
+ # print(f"completion: {completion}")
206
+ prediction = completion.choices[0].message.content
207
+
208
+ correct = 1 if prediction == response else 0
209
+
210
+ total += 1
211
+ total_correct += correct
212
+ score = total_correct / total
213
+
214
+ row_ = {
215
+ "idx": idx,
216
+ "messages": messages,
217
+ "response": response,
218
+ "prediction": prediction,
219
+ "correct": correct,
220
+ "total": total,
221
+ "total_correct": total_correct,
222
+ "score": score,
223
+ "time_cost": time_cost,
224
+ }
225
+ row_ = json.dumps(row_, ensure_ascii=False)
226
+ fout.write(f"{row_}\n")
227
+ fout.flush()
228
+
229
+ return
230
+
231
+
232
+ if __name__ == "__main__":
233
+ main()
llm_eval_script/aws.py ADDED
@@ -0,0 +1,227 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #!/usr/bin/python3
2
+ # -*- coding: utf-8 -*-
3
+ """
4
+ https://docs.aws.amazon.com/bedrock/latest/userguide/models-supported.html
5
+
6
+ https://docs.aws.amazon.com/nova/latest/userguide/using-invoke-api.html?utm_source=chatgpt.com
7
+ """
8
+ import argparse
9
+ from datetime import datetime
10
+ import json
11
+ import os
12
+ from pathlib import Path
13
+ import sys
14
+ import time
15
+ from zoneinfo import ZoneInfo # Python 3.9+ 自带,无需安装
16
+
17
+ pwd = os.path.abspath(os.path.dirname(__file__))
18
+ sys.path.append(os.path.join(pwd, "../"))
19
+
20
+ import boto3
21
+
22
+ from project_settings import environment, project_path
23
+
24
+
25
def get_args():
    """Build and parse the CLI arguments for the AWS Bedrock eval script.

    Typical usage::

        python3 aws.py --model_name amazon.nova-micro-v1:0 \\
            --eval_dataset_name agent-lingoace-zh-400-choice.jsonl \\
            --client "us_west(47.88.76.239)" \\
            --create_time_str 20250723-interval-10 \\
            --interval 10

    Returns:
        argparse.Namespace with model/dataset/output-path/throttling settings.
    """
    parser = argparse.ArgumentParser()

    # (flag, default, type) triples; behaviorally identical to one
    # parser.add_argument(...) call per entry.
    # Other model ids that have been evaluated with this script:
    #   ai21.jamba-1-5-large-v1:0 / ai21.jamba-1-5-mini-v1:0
    #   amazon.nova-canvas-v1:0 / amazon.nova-premier-v1:0
    #   amazon.nova-pro-v1:0 / amazon.nova-lite-v1:0
    #   amazon.nova-reel-v1:0 / amazon.nova-reel-v1:1 / amazon.nova-sonic-v1:0
    # Other datasets:
    #   agent-bingoplus-ph-90-choice.jsonl / arc-easy-1000-choice.jsonl
    argument_specs = (
        ("--model_name", "amazon.nova-micro-v1:0", str),
        ("--eval_dataset_name", "agent-lingoace-zh-400-choice.jsonl", str),
        ("--eval_dataset_dir", (project_path / "data/dataset").as_posix(), str),
        ("--eval_data_dir", (project_path / "data/eval_data").as_posix(), str),
        ("--client", "shenzhen_sase", str),
        ("--service", "aws_us_east", str),
        ("--create_time_str", "null", str),
        ("--interval", 10, int),
    )
    for flag, default_value, value_type in argument_specs:
        parser.add_argument(flag, default=default_value, type=value_type)

    return parser.parse_args()
96
+
97
+
98
def main():
    """Evaluate an AWS Bedrock model on a JSONL multiple-choice dataset.

    Reads `{idx, prompt, response}` rows from the eval dataset, sends each
    prompt to Bedrock via ``invoke_model`` and scores exact-match accuracy
    between the model output and the reference ``response``. Results are
    appended to a per-run JSONL file; an interrupted run can be resumed
    because already-written ``idx`` values are skipped.
    """
    args = get_args()

    # Credentials for the chosen service are stored as a JSON blob in the
    # project's environment store.
    service = environment.get(key=args.service, dtype=json.loads)
    aws_access_key_id = service["AWS_ACCESS_KEY_ID"]
    aws_secret_access_key = service["AWS_SECRET_ACCESS_KEY"]
    aws_default_region = service["AWS_DEFAULT_REGION"]

    # Export so boto3's default credential chain picks them up.
    os.environ["AWS_ACCESS_KEY_ID"] = aws_access_key_id
    os.environ["AWS_SECRET_ACCESS_KEY"] = aws_secret_access_key
    os.environ["AWS_DEFAULT_REGION"] = aws_default_region

    client = boto3.client(
        service_name="bedrock-runtime",
        region_name=aws_default_region
    )

    eval_dataset_dir = Path(args.eval_dataset_dir)
    eval_dataset_dir.mkdir(parents=True, exist_ok=True)
    eval_data_dir = Path(args.eval_data_dir)
    eval_data_dir.mkdir(parents=True, exist_ok=True)

    # "null" means "start a fresh run": timestamp (Asia/Shanghai) becomes part
    # of the output path. Passing an existing create_time_str resumes that run.
    if args.create_time_str == "null":
        tz = ZoneInfo("Asia/Shanghai")
        now = datetime.now(tz)
        create_time_str = now.strftime("%Y%m%d_%H%M%S")
    else:
        create_time_str = args.create_time_str

    eval_dataset = eval_dataset_dir / args.eval_dataset_name

    # Sanitize the model id so it is safe to use as a directory name.
    model_name_ = args.model_name
    model_name_ = model_name_.replace("/", "#")
    model_name_ = model_name_.replace(":", "-")

    output_file = eval_data_dir / f"aws/aws/{model_name_}/{args.client}/{args.service}/{create_time_str}/{args.eval_dataset_name}"
    output_file.parent.mkdir(parents=True, exist_ok=True)

    total = 0
    total_correct = 0

    # Resume support: re-read any existing output file to recover the set of
    # finished idx values and the running totals (the last row read wins,
    # which matches the final line of an append-only JSONL file).
    finished_idx_set = set()
    if os.path.exists(output_file.as_posix()):
        with open(output_file.as_posix(), "r", encoding="utf-8") as f:
            for row in f:
                row = json.loads(row)
                idx = row["idx"]
                total = row["total"]
                total_correct = row["total_correct"]
                finished_idx_set.add(idx)
    print(f"finished count: {len(finished_idx_set)}")

    # Output is opened in append mode so resumed runs extend the same file.
    with open(eval_dataset.as_posix(), "r", encoding="utf-8") as fin, open(output_file.as_posix(), "a+", encoding="utf-8") as fout:
        for row in fin:
            row = json.loads(row)
            idx = row["idx"]
            prompt = row["prompt"]
            response = row["response"]

            if idx in finished_idx_set:
                continue
            # Also guards against duplicate idx values inside the dataset.
            finished_idx_set.add(idx)

            # Amazon Nova invoke-API request body (schemaVersion messages-v1).
            body = {
                "schemaVersion": "messages-v1",
                "messages": [
                    {
                        "role": "user",
                        "content": [{"text": prompt}]
                    }
                ],
                "inferenceConfig": {
                    # maxTokens=1: the eval presumably expects a single choice
                    # letter as the whole completion — confirm against the
                    # dataset's prompt template.
                    "maxTokens": 1,
                    "temperature": 0.5,
                    "topP": 0.95,
                    # optional: topK could be supplied via an extra field
                }
            }

            try:
                # Fixed sleep before every request to stay under rate limits.
                time.sleep(args.interval)
                print(f"sleep: {args.interval}")
                time_begin = time.time()
                llm_response = client.invoke_model(
                    modelId=args.model_name,
                    body=json.dumps(body),
                    contentType="application/json"
                )

                llm_response = json.loads(llm_response["body"].read())
                time_cost = time.time() - time_begin
                print(f"time_cost: {time_cost}")

            except Exception as e:
                # Best-effort: log and skip this row; it stays unfinished and
                # will be retried on the next (resumed) run.
                print(f"request failed, error type: {type(e)}, error text: {str(e)}")
                continue

            llm_response = llm_response["output"]["message"]
            prediction = llm_response["content"][0]["text"]

            # Exact string match against the reference answer.
            correct = 1 if prediction == response else 0

            total += 1
            total_correct += correct
            score = total_correct / total

            # Each output row carries the running totals so a resume can
            # recover state from the last line alone.
            row_ = {
                "idx": idx,
                "prompt": prompt,
                "response": response,
                "prediction": prediction,
                "correct": correct,
                "total": total,
                "total_correct": total_correct,
                "score": score,
                "time_cost": time_cost,
            }
            row_ = json.dumps(row_, ensure_ascii=False)
            fout.write(f"{row_}\n")
            fout.flush()

    return


if __name__ == "__main__":
    main()
llm_eval_script/google_anthropic.py CHANGED
@@ -27,8 +27,17 @@ def get_args():
27
  parser = argparse.ArgumentParser()
28
  parser.add_argument(
29
  "--model_name",
30
- default="claude-opus-4@20250514",
 
31
  # default="claude-sonnet-4@20250514",
 
 
 
 
 
 
 
 
32
  type=str
33
  )
34
  parser.add_argument(
 
27
  parser = argparse.ArgumentParser()
28
  parser.add_argument(
29
  "--model_name",
30
+ # default="claude-opus-4-1@20250805",
31
+ # default="claude-opus-4@20250514",
32
  # default="claude-sonnet-4@20250514",
33
+ # default="claude-3-7-sonnet@20250219",
34
+ # default="claude-3-5-haiku@20241022",
35
+
36
+ # default="claude-3-5-sonnet-v2@20241022",
37
+ # default="claude-3-opus@20240229",
38
+
39
+ # default="claude-3-5-sonnet@20240620",
40
+ default="claude-3-haiku@20240307",
41
  type=str
42
  )
43
  parser.add_argument(
main.py CHANGED
@@ -146,6 +146,7 @@ def load_board():
146
  if total == 0:
147
  continue
148
  score = np.mean(score_list)
 
149
  time_cost_mean = np.mean(time_cost_list)
150
  time_cost_var = np.var(time_cost_list)
151
 
@@ -158,6 +159,7 @@ def load_board():
158
  "model_name": model_name,
159
  "dataset": dataset,
160
  "score": round(score, 4),
 
161
  "time_cost(mean)": round(time_cost_mean, 4),
162
  "time_cost(var)": round(time_cost_var, 4),
163
  "time_cost(75%)": round(time_cost_p75, 4),
@@ -238,6 +240,7 @@ def when_click_view_chat_button(filename: str):
238
 
239
  board_columns_choices = [
240
  "company", "model_name", "dataset", "score",
 
241
  "time_cost(mean)",
242
  "time_cost(var)",
243
  "time_cost(75%)", "time_cost(95%)", "time_cost(99%)",
 
146
  if total == 0:
147
  continue
148
  score = np.mean(score_list)
149
+ time_cost_min = np.min(time_cost_list)
150
  time_cost_mean = np.mean(time_cost_list)
151
  time_cost_var = np.var(time_cost_list)
152
 
 
159
  "model_name": model_name,
160
  "dataset": dataset,
161
  "score": round(score, 4),
162
+ "time_cost(min)": round(time_cost_min, 4),
163
  "time_cost(mean)": round(time_cost_mean, 4),
164
  "time_cost(var)": round(time_cost_var, 4),
165
  "time_cost(75%)": round(time_cost_p75, 4),
 
240
 
241
  board_columns_choices = [
242
  "company", "model_name", "dataset", "score",
243
+ "time_cost(min)",
244
  "time_cost(mean)",
245
  "time_cost(var)",
246
  "time_cost(75%)", "time_cost(95%)", "time_cost(99%)",
requirements.txt CHANGED
@@ -9,3 +9,7 @@ smithy-aws-core>=0.0.1
9
  aws_sdk_bedrock_runtime
10
  boto3
11
  anthropic
 
 
 
 
 
9
  aws_sdk_bedrock_runtime
10
  boto3
11
  anthropic
12
+ alibabacloud_kms20160120
13
+ alibabacloud_credentials
14
+ alibabacloud_tea_openapi
15
+ alibabacloud_tea_util
toolbox/aliyun_kms/__init__.py ADDED
@@ -0,0 +1,6 @@
 
 
 
 
 
 
 
1
#!/usr/bin/python3
# -*- coding: utf-8 -*-
"""Package marker for ``toolbox.aliyun_kms``."""


if __name__ == "__main__":
    pass
toolbox/aliyun_kms/aliyun_kms.py ADDED
@@ -0,0 +1,122 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #!/usr/bin/python3
2
+ # -*- coding: utf-8 -*-
3
+ import argparse
4
+ import json
5
+ import os
6
+
7
+ from alibabacloud_kms20160120.client import Client as Kms20160120Client
8
+ from alibabacloud_credentials.client import Client as CredentialClient
9
+ from alibabacloud_kms20160120.models import GetSecretValueResponse
10
+ from alibabacloud_tea_openapi import models as open_api_models
11
+ from alibabacloud_kms20160120 import models as kms_20160120_models
12
+ from alibabacloud_tea_util import models as util_models
13
+ from alibabacloud_tea_util.client import Client as UtilClient
14
+ from alibabacloud_credentials.models import Config
15
+
16
+
17
class AliyunKMS(object):
    """
    Thin wrapper around the Alibaba Cloud KMS (API version 2016-01-20)
    secrets endpoints: create a secret and read a secret value, with both
    sync and async read variants.

    https://help.aliyun.com/zh/sdk/developer-reference/v2-manage-python-access-credentials
    """
    def __init__(self,
                 access_key_id: str = None,
                 access_key_secret: str = None,
                 endpoint: str = "kms.ap-southeast-1.aliyuncs.com",
                 ):
        # Static access-key credentials; if None, behavior depends on the
        # credential client's own fallback chain — confirm before relying on it.
        self.access_key_id = access_key_id
        self.access_key_secret = access_key_secret
        self.endpoint = endpoint

        # One shared KMS client per wrapper instance.
        self.client = self.get_client()

    def get_client(self):
        """Build a KMS client authenticated with the stored access-key pair."""
        credential = CredentialClient(
            config=Config(
                type="access_key",
                access_key_id=self.access_key_id,
                access_key_secret=self.access_key_secret,
            )
        )
        config = open_api_models.Config(
            credential=credential
        )
        # For the list of regional endpoints see https://api.aliyun.com/product/Kms
        config.endpoint = self.endpoint

        client = Kms20160120Client(config)
        return client

    def create_secret(self, secret_name: str, secret_data: str, version_id: str):
        """
        Create a secret with an initial version.

        https://next.api.aliyun.com/api/Kms/2016-01-20/CreateSecret
        """
        create_secret_request = kms_20160120_models.CreateSecretRequest(
            secret_name=secret_name,
            secret_data=secret_data,
            version_id=version_id,
        )
        runtime = util_models.RuntimeOptions()

        # NOTE(review): returns the raw SDK response object, unlike
        # get_secret_value which returns a plain dict via to_map().
        result = self.client.create_secret_with_options(create_secret_request, runtime)
        return result

    def get_secret_value(self, secret_name: str, version_id: str = None):
        """
        Fetch a secret value (latest version when version_id is None) and
        return the response as a plain dict.

        https://next.api.aliyun.com/api/Kms/2016-01-20/GetSecretValue
        """
        get_secret_value_request = kms_20160120_models.GetSecretValueRequest(
            secret_name=secret_name,
            version_id=version_id,
        )
        runtime = util_models.RuntimeOptions()

        response: GetSecretValueResponse = self.client.get_secret_value_with_options(get_secret_value_request, runtime)
        js = response.to_map()
        return js

    async def async_get_secret_value(self, secret_name: str, version_id: str = None):
        """
        Async variant of :meth:`get_secret_value`.

        https://next.api.aliyun.com/api/Kms/2016-01-20/GetSecretValue
        """
        get_secret_value_request = kms_20160120_models.GetSecretValueRequest(
            secret_name=secret_name,
            version_id=version_id,
        )
        runtime = util_models.RuntimeOptions()

        response: GetSecretValueResponse = await self.client.get_secret_value_with_options_async(get_secret_value_request, runtime)
        js = response.to_map()
        return js
90
+
91
+
92
def main():
    """Smoke-test the AliyunKMS wrapper by fetching one secret.

    Loads the Alibaba Cloud access-key pair from the local ``settings``
    environment store, builds an :class:`AliyunKMS` client and prints the
    response of a ``GetSecretValue`` call for a known secret name.
    """
    from settings import environment

    access_key_id = environment.get(key="ALIBABA_CLOUD_ACCESS_KEY_ID", dtype=str)
    access_key_secret = environment.get(key="ALIBABA_CLOUD_ACCESS_KEY_SECRET", dtype=str)
    # Security: never echo the raw secret to stdout/logs; print a masked
    # preview so the operator can still confirm which credential is in use.
    print(f"access_key_id: {access_key_id}")
    masked_secret = f"{access_key_secret[:4]}****" if access_key_secret else "<empty>"
    print(f"access_key_secret: {masked_secret}")

    manager = AliyunKMS(
        access_key_id=access_key_id,
        access_key_secret=access_key_secret,
    )

    # version_id omitted -> the latest secret version is returned.
    result = manager.get_secret_value(
        secret_name="aliyun-nxai123-oss-dev",
    )
    print(result)
    return


if __name__ == "__main__":
    main()