Spaces:
Running
Running
update
Browse files- data/dataset/agent-bigseller-id-60-choice.jsonl +0 -0
- data/dataset/agent-lingoace-zh-375-v2-choice.jsonl +3 -0
- data/dataset/agent-nxcloud-zh-375-v2-choice.jsonl +3 -0
- data/eval_data/aliyun_choice/aliyun/qwen-plus-2025-12-01/shenzhen_sase/aliyun_api_key/20251208_120245/agent-lingoace-zh-400-choice.jsonl +3 -0
- data/eval_data/aliyun_choice/aliyun/qwen3-max-2025-09-23/shenzhen_sase/aliyun_api_key/20251208_105202/agent-lingoace-zh-400-choice.jsonl +3 -0
- data/eval_data/aliyun_choice/aliyun/qwen3-max-2025-09-23/shenzhen_sase/aliyun_api_key/20251208_133147/agent-lingoace-zh-400-choice.jsonl +3 -0
- data/eval_data/aliyun_choice/aliyun/qwen3-max-preview/shenzhen_sase/aliyun_api_key/20251208_113004/agent-lingoace-zh-400-choice.jsonl +3 -0
- data/eval_data/aliyun_nxcloud_v2_choice/aliyun/qwen3-max-2025-09-23/shenzhen_sase/aliyun_api_key/20251208_102934/agent-nxcloud-zh-375-v2-choice.jsonl +3 -0
- data/eval_data/aliyun_nxcloud_v2_choice/aliyun/qwen3-max-preview/shenzhen_sase/aliyun_api_key/20251208_110422/agent-nxcloud-zh-375-v2-choice.jsonl +3 -0
- data/eval_data/aws/aws/amazon.nova-lite-v1-0/shenzhen_sase/aws_us_east/20250916_133511/agent-bingoplus-ph-90-choice.jsonl +3 -0
- data/eval_data/aws/aws/amazon.nova-lite-v1-0/shenzhen_sase/aws_us_east/20250916_154723/agent-lingoace-zh-400-choice.jsonl +3 -0
- data/eval_data/aws/aws/amazon.nova-micro-v1-0/shenzhen_sase/aws_us_east/20250916_140957/agent-bingoplus-ph-90-choice.jsonl +3 -0
- data/eval_data/aws/aws/amazon.nova-micro-v1-0/shenzhen_sase/aws_us_east/20250916_170731/agent-lingoace-zh-400-choice.jsonl +3 -0
- data/eval_data/aws/aws/amazon.nova-pro-v1-0/shenzhen_sase/aws_us_east/20250916_114857/agent-bingoplus-ph-90-choice.jsonl +3 -0
- data/eval_data/aws/aws/amazon.nova-pro-v1-0/shenzhen_sase/aws_us_east/20250916_142846/agent-lingoace-zh-400-choice.jsonl +3 -0
- data/eval_data/google_anthropic/anthropic/claude-3-5-haiku@20241022/shenzhen_sase/google_nxcloud_312303/20250910_100415/agent-lingoace-zh-400-choice.jsonl +3 -0
- data/eval_data/google_anthropic/anthropic/claude-3-5-sonnet-v2@20241022/shenzhen_sase/google_nxcloud_312303/20250910_100113/agent-lingoace-zh-400-choice.jsonl +0 -0
- data/eval_data/google_anthropic/anthropic/claude-3-5-sonnet@20240620/shenzhen_sase/google_nxcloud_312303/20250910_100441/agent-lingoace-zh-400-choice.jsonl +0 -0
- data/eval_data/google_anthropic/anthropic/claude-3-7-sonnet@20250219/shenzhen_sase/google_nxcloud_312303/20250910_100042/agent-lingoace-zh-400-choice.jsonl +3 -0
- data/eval_data/google_anthropic/anthropic/claude-3-haiku@20240307/shenzhen_sase/google_nxcloud_312303/20250910_100501/agent-lingoace-zh-400-choice.jsonl +0 -0
- data/eval_data/google_anthropic/anthropic/claude-3-opus@20240229/shenzhen_sase/google_nxcloud_312303/20250910_100451/agent-lingoace-zh-400-choice.jsonl +0 -0
- data/eval_data/google_anthropic/anthropic/claude-opus-4-1@20250805/shenzhen_sase/google_nxcloud_312303/20250910_095955/agent-lingoace-zh-400-choice.jsonl +3 -0
- examples/ali_communication/make_dataset.py +85 -0
- examples/kms/get_aliyun_dev_apikey.py +41 -0
- examples/make_dataset/make_choice.py +70 -0
- examples/make_dataset/make_choice_lingoace_v2.py +141 -0
- examples/make_raw_dataset/step_1_make_hk_dataset_by_log.py +2 -1
- examples/make_raw_dataset/step_3_filter_by_keywords.py +5 -0
- examples/make_raw_dataset/step_6_filter_by_choice.py +1 -1
- llm_eval_script/aliyun_choice.py +173 -0
- llm_eval_script/aliyun_nxcloud_v2_choice.py +233 -0
- llm_eval_script/aws.py +227 -0
- llm_eval_script/google_anthropic.py +10 -1
- main.py +3 -0
- requirements.txt +4 -0
- toolbox/aliyun_kms/__init__.py +6 -0
- toolbox/aliyun_kms/aliyun_kms.py +122 -0
data/dataset/agent-bigseller-id-60-choice.jsonl
ADDED
|
File without changes
|
data/dataset/agent-lingoace-zh-375-v2-choice.jsonl
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:3a036052d750daf27450c520c1f5c7257077783a31d3a3fb43bf0e228ab22e80
|
| 3 |
+
size 1239647
|
data/dataset/agent-nxcloud-zh-375-v2-choice.jsonl
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:4099cf306583c063b4bf69a485187ade46f45410843561154a3d8d50001b0bd3
|
| 3 |
+
size 1238502
|
data/eval_data/aliyun_choice/aliyun/qwen-plus-2025-12-01/shenzhen_sase/aliyun_api_key/20251208_120245/agent-lingoace-zh-400-choice.jsonl
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:bda5b0d743a600262716a089ec6eba8b21d6eb4bca2286443948f86035954f9b
|
| 3 |
+
size 1233921
|
data/eval_data/aliyun_choice/aliyun/qwen3-max-2025-09-23/shenzhen_sase/aliyun_api_key/20251208_105202/agent-lingoace-zh-400-choice.jsonl
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:e02b6dbf7cde1d9ee61289bb43af9082d695b38c966474e3b4e1015c54bdd7b2
|
| 3 |
+
size 1211356
|
data/eval_data/aliyun_choice/aliyun/qwen3-max-2025-09-23/shenzhen_sase/aliyun_api_key/20251208_133147/agent-lingoace-zh-400-choice.jsonl
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:b39f27725a32fc91d5cdfa594ea2e67d2f6d1f533b7b6718faab62b93fa21e34
|
| 3 |
+
size 1233777
|
data/eval_data/aliyun_choice/aliyun/qwen3-max-preview/shenzhen_sase/aliyun_api_key/20251208_113004/agent-lingoace-zh-400-choice.jsonl
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:31ec4900d031c7f0fa507b38eb226d0e3c255e4e921669d4776d93685d78f0cc
|
| 3 |
+
size 1211172
|
data/eval_data/aliyun_nxcloud_v2_choice/aliyun/qwen3-max-2025-09-23/shenzhen_sase/aliyun_api_key/20251208_102934/agent-nxcloud-zh-375-v2-choice.jsonl
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:e83e4cdcecfdd3b3444e5f2852d2f6dbd6db34ecd4e1a96a4ff1355185986869
|
| 3 |
+
size 1081363
|
data/eval_data/aliyun_nxcloud_v2_choice/aliyun/qwen3-max-preview/shenzhen_sase/aliyun_api_key/20251208_110422/agent-nxcloud-zh-375-v2-choice.jsonl
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:f99aead6576f9f2409be441d891112c91d9b064a601523e434c7e852ed614a4a
|
| 3 |
+
size 1081322
|
data/eval_data/aws/aws/amazon.nova-lite-v1-0/shenzhen_sase/aws_us_east/20250916_133511/agent-bingoplus-ph-90-choice.jsonl
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:b7dbdf8e6f5ebc2cc0fd3f32f297efb627c559e12656f603327ddf78f09a0c01
|
| 3 |
+
size 258621
|
data/eval_data/aws/aws/amazon.nova-lite-v1-0/shenzhen_sase/aws_us_east/20250916_154723/agent-lingoace-zh-400-choice.jsonl
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:3699747d0b0888affdf832c462ae58b9aa06d080729ba3ba174125f873cd412f
|
| 3 |
+
size 1204448
|
data/eval_data/aws/aws/amazon.nova-micro-v1-0/shenzhen_sase/aws_us_east/20250916_140957/agent-bingoplus-ph-90-choice.jsonl
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:5dadd78180802b49a905a850317eb54f0e183685dfbc6d4eee8e5ab5a7e50677
|
| 3 |
+
size 258563
|
data/eval_data/aws/aws/amazon.nova-micro-v1-0/shenzhen_sase/aws_us_east/20250916_170731/agent-lingoace-zh-400-choice.jsonl
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:c54b42d8739211d1aa7da75041c3a754f4c88ca4b2f55e26908adb3a31f48565
|
| 3 |
+
size 1205834
|
data/eval_data/aws/aws/amazon.nova-pro-v1-0/shenzhen_sase/aws_us_east/20250916_114857/agent-bingoplus-ph-90-choice.jsonl
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:951c1df603efe55563fbe2a824233805192adbcf1592b41b9b975070b406356b
|
| 3 |
+
size 258625
|
data/eval_data/aws/aws/amazon.nova-pro-v1-0/shenzhen_sase/aws_us_east/20250916_142846/agent-lingoace-zh-400-choice.jsonl
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:ed19fe093a4d6457297bef741fdf283313c462544ed014fe40e58c6331e5700e
|
| 3 |
+
size 1204189
|
data/eval_data/google_anthropic/anthropic/claude-3-5-haiku@20241022/shenzhen_sase/google_nxcloud_312303/20250910_100415/agent-lingoace-zh-400-choice.jsonl
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:9ed17f71bbde50f38be38a32f3caea001df9f0e2e22f2f189541f5ae1dbaa4b0
|
| 3 |
+
size 9134
|
data/eval_data/google_anthropic/anthropic/claude-3-5-sonnet-v2@20241022/shenzhen_sase/google_nxcloud_312303/20250910_100113/agent-lingoace-zh-400-choice.jsonl
ADDED
|
File without changes
|
data/eval_data/google_anthropic/anthropic/claude-3-5-sonnet@20240620/shenzhen_sase/google_nxcloud_312303/20250910_100441/agent-lingoace-zh-400-choice.jsonl
ADDED
|
File without changes
|
data/eval_data/google_anthropic/anthropic/claude-3-7-sonnet@20250219/shenzhen_sase/google_nxcloud_312303/20250910_100042/agent-lingoace-zh-400-choice.jsonl
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:017dd25dc264762cc26eb201149b9bc7a18f81381174b92b0d619665655cb93e
|
| 3 |
+
size 26705
|
data/eval_data/google_anthropic/anthropic/claude-3-haiku@20240307/shenzhen_sase/google_nxcloud_312303/20250910_100501/agent-lingoace-zh-400-choice.jsonl
ADDED
|
File without changes
|
data/eval_data/google_anthropic/anthropic/claude-3-opus@20240229/shenzhen_sase/google_nxcloud_312303/20250910_100451/agent-lingoace-zh-400-choice.jsonl
ADDED
|
File without changes
|
data/eval_data/google_anthropic/anthropic/claude-opus-4-1@20250805/shenzhen_sase/google_nxcloud_312303/20250910_095955/agent-lingoace-zh-400-choice.jsonl
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:a079d7731f824df3d2bd5e758d58c31b41b6412c5253cb4e8f21493724a85571
|
| 3 |
+
size 14137
|
examples/ali_communication/make_dataset.py
ADDED
|
@@ -0,0 +1,85 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
#!/usr/bin/python3
|
| 2 |
+
# -*- coding: utf-8 -*-
|
| 3 |
+
import argparse
|
| 4 |
+
import json
|
| 5 |
+
|
| 6 |
+
import pandas as pd
|
| 7 |
+
|
| 8 |
+
from project_settings import environment, project_path
|
| 9 |
+
|
| 10 |
+
|
| 11 |
+
def get_args():
|
| 12 |
+
parser = argparse.ArgumentParser()
|
| 13 |
+
parser.add_argument(
|
| 14 |
+
"--filename",
|
| 15 |
+
default="evaluation_results_max7.xlsx",
|
| 16 |
+
type=str
|
| 17 |
+
)
|
| 18 |
+
parser.add_argument(
|
| 19 |
+
"--dataset",
|
| 20 |
+
default=(project_path / "data/dataset/agent-lingoace-zh-400-choice.jsonl").as_posix(),
|
| 21 |
+
type=str
|
| 22 |
+
)
|
| 23 |
+
args = parser.parse_args()
|
| 24 |
+
return args
|
| 25 |
+
|
| 26 |
+
|
| 27 |
+
def main():
|
| 28 |
+
args = get_args()
|
| 29 |
+
|
| 30 |
+
dataset = dict()
|
| 31 |
+
with open(args.dataset, "r", encoding="utf-8") as f:
|
| 32 |
+
for row in f:
|
| 33 |
+
row = json.loads(row)
|
| 34 |
+
idx = row["idx"]
|
| 35 |
+
prompt = row["prompt"]
|
| 36 |
+
response = row["response"]
|
| 37 |
+
dataset[idx] = row
|
| 38 |
+
|
| 39 |
+
result = list()
|
| 40 |
+
df = pd.read_excel(args.filename)
|
| 41 |
+
for i, row in df.iterrows():
|
| 42 |
+
# print(row)
|
| 43 |
+
idx = row["idx"]
|
| 44 |
+
conversation = row["conversation"]
|
| 45 |
+
expected = row["expected"]
|
| 46 |
+
actual_label = row["actual_label"]
|
| 47 |
+
actual_reason = row["actual_reason"]
|
| 48 |
+
correct = row["correct"]
|
| 49 |
+
note = row["note"]
|
| 50 |
+
|
| 51 |
+
if correct is False:
|
| 52 |
+
print(idx)
|
| 53 |
+
print(conversation)
|
| 54 |
+
print(expected, actual_label)
|
| 55 |
+
print(actual_reason)
|
| 56 |
+
print(note)
|
| 57 |
+
print("+" * 150)
|
| 58 |
+
|
| 59 |
+
dataset_ = dataset[idx]
|
| 60 |
+
prompt = dataset_["prompt"]
|
| 61 |
+
response = dataset_["response"]
|
| 62 |
+
print(prompt)
|
| 63 |
+
print(response)
|
| 64 |
+
print("-" * 150)
|
| 65 |
+
|
| 66 |
+
result.append({
|
| 67 |
+
"idx": idx,
|
| 68 |
+
"conversation": conversation,
|
| 69 |
+
"expected": expected,
|
| 70 |
+
"actual_label": actual_label,
|
| 71 |
+
"actual_reason": actual_reason,
|
| 72 |
+
"note": note,
|
| 73 |
+
"prompt": prompt,
|
| 74 |
+
"response": response,
|
| 75 |
+
"op": None,
|
| 76 |
+
"remark": None,
|
| 77 |
+
|
| 78 |
+
})
|
| 79 |
+
result = pd.DataFrame(result)
|
| 80 |
+
result.to_excel("result.xlsx", index=False)
|
| 81 |
+
return
|
| 82 |
+
|
| 83 |
+
|
| 84 |
+
if __name__ == "__main__":
|
| 85 |
+
main()
|
examples/kms/get_aliyun_dev_apikey.py
ADDED
|
@@ -0,0 +1,41 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
#!/usr/bin/python3
|
| 2 |
+
# -*- coding: utf-8 -*-
|
| 3 |
+
import argparse
|
| 4 |
+
|
| 5 |
+
from project_settings import environment, project_path
|
| 6 |
+
from toolbox.aliyun_kms.aliyun_kms import AliyunKMS
|
| 7 |
+
|
| 8 |
+
|
| 9 |
+
def get_args():
|
| 10 |
+
parser = argparse.ArgumentParser()
|
| 11 |
+
parser.add_argument(
|
| 12 |
+
"--secret_name",
|
| 13 |
+
default="aliyun-chn-llm-dev",
|
| 14 |
+
type=str
|
| 15 |
+
)
|
| 16 |
+
args = parser.parse_args()
|
| 17 |
+
return args
|
| 18 |
+
|
| 19 |
+
|
| 20 |
+
|
| 21 |
+
def main():
|
| 22 |
+
args = get_args()
|
| 23 |
+
|
| 24 |
+
access_key_id = environment.get("ALIBABA_CLOUD_ACCESS_KEY_ID")
|
| 25 |
+
access_key_secret = environment.get("ALIBABA_CLOUD_ACCESS_KEY_SECRET")
|
| 26 |
+
|
| 27 |
+
kms_manager = AliyunKMS(
|
| 28 |
+
access_key_id=access_key_id,
|
| 29 |
+
access_key_secret=access_key_secret,
|
| 30 |
+
)
|
| 31 |
+
|
| 32 |
+
js = kms_manager.get_secret_value(args.secret_name)
|
| 33 |
+
secret_data = js["body"]["SecretData"]
|
| 34 |
+
|
| 35 |
+
# sk-6728fced6fd848149ebbb7c3899cc043
|
| 36 |
+
print(secret_data)
|
| 37 |
+
return
|
| 38 |
+
|
| 39 |
+
|
| 40 |
+
if __name__ == "__main__":
|
| 41 |
+
main()
|
examples/make_dataset/make_choice.py
ADDED
|
@@ -0,0 +1,70 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
#!/usr/bin/python3
|
| 2 |
+
# -*- coding: utf-8 -*-
|
| 3 |
+
import argparse
|
| 4 |
+
import json
|
| 5 |
+
import os
|
| 6 |
+
from pathlib import Path
|
| 7 |
+
import sys
|
| 8 |
+
import time
|
| 9 |
+
|
| 10 |
+
pwd = os.path.abspath(os.path.dirname(__file__))
|
| 11 |
+
sys.path.append(os.path.join(pwd, "../../"))
|
| 12 |
+
|
| 13 |
+
from project_settings import environment, project_path
|
| 14 |
+
|
| 15 |
+
|
| 16 |
+
def get_args():
|
| 17 |
+
parser = argparse.ArgumentParser()
|
| 18 |
+
parser.add_argument(
|
| 19 |
+
"--raw_dataset",
|
| 20 |
+
default=(project_path / "data/raw_dataset/agent-bigseller-id-60-choice").as_posix(),
|
| 21 |
+
type=str
|
| 22 |
+
)
|
| 23 |
+
parser.add_argument(
|
| 24 |
+
"--dataset",
|
| 25 |
+
default=(project_path / "data/dataset/agent-bigseller-id-60-choice.jsonl").as_posix(),
|
| 26 |
+
type=str
|
| 27 |
+
)
|
| 28 |
+
args = parser.parse_args()
|
| 29 |
+
return args
|
| 30 |
+
|
| 31 |
+
|
| 32 |
+
def main():
|
| 33 |
+
args = get_args()
|
| 34 |
+
|
| 35 |
+
raw_dataset = Path(args.raw_dataset)
|
| 36 |
+
dataset = Path(args.dataset)
|
| 37 |
+
dataset.parent.mkdir(parents=True, exist_ok=True)
|
| 38 |
+
|
| 39 |
+
with open(dataset.as_posix(), "w", encoding="utf-8") as fout:
|
| 40 |
+
for sample_dir in raw_dataset.glob("*"):
|
| 41 |
+
idx = sample_dir.parts[-1]
|
| 42 |
+
system_prompt_file = sample_dir / "system_prompt.txt"
|
| 43 |
+
user_prompt_file = sample_dir / "user_prompt.txt"
|
| 44 |
+
response_file = sample_dir / "response.txt"
|
| 45 |
+
|
| 46 |
+
with open(system_prompt_file.as_posix(), "r", encoding="utf-8") as f:
|
| 47 |
+
system_prompt = f.read()
|
| 48 |
+
with open(user_prompt_file.as_posix(), "r", encoding="utf-8") as f:
|
| 49 |
+
user_prompt = f.read()
|
| 50 |
+
with open(response_file.as_posix(), "r", encoding="utf-8") as f:
|
| 51 |
+
response = f.read()
|
| 52 |
+
|
| 53 |
+
prompt = f"""{system_prompt}\n\n{user_prompt}""".strip()
|
| 54 |
+
|
| 55 |
+
print(f"{prompt}\n\n{response}")
|
| 56 |
+
print("-" * 150)
|
| 57 |
+
|
| 58 |
+
row_ = {
|
| 59 |
+
"idx": idx,
|
| 60 |
+
"prompt": prompt,
|
| 61 |
+
"response": response,
|
| 62 |
+
}
|
| 63 |
+
row_ = json.dumps(row_, ensure_ascii=False)
|
| 64 |
+
fout.write(f"{row_}\n")
|
| 65 |
+
|
| 66 |
+
return
|
| 67 |
+
|
| 68 |
+
|
| 69 |
+
if __name__ == "__main__":
|
| 70 |
+
main()
|
examples/make_dataset/make_choice_lingoace_v2.py
ADDED
|
@@ -0,0 +1,141 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
#!/usr/bin/python3
|
| 2 |
+
# -*- coding: utf-8 -*-
|
| 3 |
+
import argparse
|
| 4 |
+
import json
|
| 5 |
+
import os
|
| 6 |
+
from pathlib import Path
|
| 7 |
+
import re
|
| 8 |
+
import sys
|
| 9 |
+
import time
|
| 10 |
+
|
| 11 |
+
pwd = os.path.abspath(os.path.dirname(__file__))
|
| 12 |
+
sys.path.append(os.path.join(pwd, "../../"))
|
| 13 |
+
|
| 14 |
+
from project_settings import environment, project_path
|
| 15 |
+
|
| 16 |
+
|
| 17 |
+
def get_args():
|
| 18 |
+
parser = argparse.ArgumentParser()
|
| 19 |
+
parser.add_argument(
|
| 20 |
+
"--raw_dataset",
|
| 21 |
+
default=(project_path / "data/raw_dataset/finished/agent-lingoace-zh-375-choice-v2").as_posix(),
|
| 22 |
+
type=str
|
| 23 |
+
)
|
| 24 |
+
parser.add_argument(
|
| 25 |
+
"--dataset",
|
| 26 |
+
default=(project_path / "data/dataset/agent-lingoace-zh-375-choice-v2.jsonl").as_posix(),
|
| 27 |
+
type=str
|
| 28 |
+
)
|
| 29 |
+
args = parser.parse_args()
|
| 30 |
+
return args
|
| 31 |
+
|
| 32 |
+
|
| 33 |
+
def main():
|
| 34 |
+
args = get_args()
|
| 35 |
+
|
| 36 |
+
raw_dataset = Path(args.raw_dataset)
|
| 37 |
+
dataset = Path(args.dataset)
|
| 38 |
+
dataset.parent.mkdir(parents=True, exist_ok=True)
|
| 39 |
+
|
| 40 |
+
with open(dataset.as_posix(), "w", encoding="utf-8") as fout:
|
| 41 |
+
for sample_dir in raw_dataset.glob("*"):
|
| 42 |
+
idx = sample_dir.parts[-1]
|
| 43 |
+
system_prompt_file = sample_dir / "system_prompt.txt"
|
| 44 |
+
user_prompt_file = sample_dir / "user_prompt.txt"
|
| 45 |
+
response_file = sample_dir / "response.txt"
|
| 46 |
+
|
| 47 |
+
with open(system_prompt_file.as_posix(), "r", encoding="utf-8") as f:
|
| 48 |
+
system_prompt = f.read()
|
| 49 |
+
with open(user_prompt_file.as_posix(), "r", encoding="utf-8") as f:
|
| 50 |
+
user_prompt = f.read()
|
| 51 |
+
with open(response_file.as_posix(), "r", encoding="utf-8") as f:
|
| 52 |
+
response = f.read()
|
| 53 |
+
|
| 54 |
+
# conversation
|
| 55 |
+
pattern = r"\*Conversation starts\*(.*)\*Conversation ends\*"
|
| 56 |
+
match = re.search(pattern, user_prompt, flags=re.DOTALL)
|
| 57 |
+
if match is None:
|
| 58 |
+
raise AssertionError
|
| 59 |
+
conversation = match.group(1)
|
| 60 |
+
pattern = r'(client:|customer service:)([^\n]*)'
|
| 61 |
+
matches = re.findall(pattern, conversation)
|
| 62 |
+
conversation_ = list()
|
| 63 |
+
for speaker, content in matches:
|
| 64 |
+
if speaker == "customer service:":
|
| 65 |
+
speaker = "assistant"
|
| 66 |
+
elif speaker == "client:":
|
| 67 |
+
speaker = "user"
|
| 68 |
+
else:
|
| 69 |
+
raise AssertionError(speaker)
|
| 70 |
+
conversation_.append({
|
| 71 |
+
"role": speaker,
|
| 72 |
+
"content": content,
|
| 73 |
+
})
|
| 74 |
+
|
| 75 |
+
# examples
|
| 76 |
+
pattern = r"\*Conversation ends\*(.*)\*\*Output\*\*"
|
| 77 |
+
match = re.search(pattern, user_prompt, flags=re.DOTALL)
|
| 78 |
+
if match is not None:
|
| 79 |
+
examples = match.group(0)
|
| 80 |
+
else:
|
| 81 |
+
examples = ""
|
| 82 |
+
|
| 83 |
+
examples_ = list()
|
| 84 |
+
pattern = re.compile(r'(?m)^\[(用户|你)\]:\s*"([^"]*)"\s*$|^输出:\s*(\S+)\s*$|^解释:\s*(.+)\s*$')
|
| 85 |
+
example_conversation_ = list()
|
| 86 |
+
outputs = dict()
|
| 87 |
+
for m in pattern.finditer(examples):
|
| 88 |
+
speaker, content, out, explanation = m.group(1), m.group(2), m.group(3), m.group(4)
|
| 89 |
+
if speaker:
|
| 90 |
+
if speaker == "你":
|
| 91 |
+
# speaker = "customer service"
|
| 92 |
+
speaker = "assistant"
|
| 93 |
+
elif speaker == "用户":
|
| 94 |
+
# speaker = "client"
|
| 95 |
+
speaker = "user"
|
| 96 |
+
else:
|
| 97 |
+
raise AssertionError
|
| 98 |
+
conversation_turn = {"role": speaker, "content": content}
|
| 99 |
+
example_conversation_.append(conversation_turn)
|
| 100 |
+
elif out:
|
| 101 |
+
outputs["output"] = out
|
| 102 |
+
elif explanation:
|
| 103 |
+
outputs["explanation"] = explanation
|
| 104 |
+
examples_.append({
|
| 105 |
+
"conversation": example_conversation_,
|
| 106 |
+
"outputs": outputs,
|
| 107 |
+
})
|
| 108 |
+
example_conversation_ = list()
|
| 109 |
+
outputs = dict()
|
| 110 |
+
|
| 111 |
+
splits = user_prompt.split("**Output**")
|
| 112 |
+
choice = splits[1].strip()
|
| 113 |
+
pattern = r'If (.*?)output ([A-F])'
|
| 114 |
+
matches = re.findall(pattern, choice, re.DOTALL)
|
| 115 |
+
choices_ = list()
|
| 116 |
+
for condition, output_letter in matches:
|
| 117 |
+
condition_ = f"If {condition[:-2]}"
|
| 118 |
+
choice_letter = output_letter
|
| 119 |
+
row = {
|
| 120 |
+
"condition": condition_,
|
| 121 |
+
"choice_letter": choice_letter,
|
| 122 |
+
}
|
| 123 |
+
choices_.append(row)
|
| 124 |
+
|
| 125 |
+
row = {
|
| 126 |
+
"idx": idx,
|
| 127 |
+
"system_prompt": system_prompt,
|
| 128 |
+
"conversation": conversation_,
|
| 129 |
+
"examples": examples_,
|
| 130 |
+
"choices": choices_,
|
| 131 |
+
"response": response,
|
| 132 |
+
}
|
| 133 |
+
row = json.dumps(row, ensure_ascii=False)
|
| 134 |
+
fout.write(f"{row}\n")
|
| 135 |
+
fout.flush()
|
| 136 |
+
|
| 137 |
+
return
|
| 138 |
+
|
| 139 |
+
|
| 140 |
+
if __name__ == "__main__":
|
| 141 |
+
main()
|
examples/make_raw_dataset/step_1_make_hk_dataset_by_log.py
CHANGED
|
@@ -133,7 +133,8 @@ def main():
|
|
| 133 |
row = extract(row)
|
| 134 |
except Exception as e:
|
| 135 |
print(row)
|
| 136 |
-
raise e
|
|
|
|
| 137 |
call_id = row["call_id"]
|
| 138 |
system_prompt = row.get("system_prompt")
|
| 139 |
conversation = row.get("conversation")
|
|
|
|
| 133 |
row = extract(row)
|
| 134 |
except Exception as e:
|
| 135 |
print(row)
|
| 136 |
+
# raise e
|
| 137 |
+
continue
|
| 138 |
call_id = row["call_id"]
|
| 139 |
system_prompt = row.get("system_prompt")
|
| 140 |
conversation = row.get("conversation")
|
examples/make_raw_dataset/step_3_filter_by_keywords.py
CHANGED
|
@@ -69,6 +69,11 @@ def main():
|
|
| 69 |
(["作为VIP客户"], "vip"),
|
| 70 |
(["FedEx"], "fedex"),
|
| 71 |
(["Chinese laser cutting"], "laser"),
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 72 |
]
|
| 73 |
|
| 74 |
flag = False
|
|
|
|
| 69 |
(["作为VIP客户"], "vip"),
|
| 70 |
(["FedEx"], "fedex"),
|
| 71 |
(["Chinese laser cutting"], "laser"),
|
| 72 |
+
(["Bigseller"], "bigseller"),
|
| 73 |
+
(["BigSeller"], "bigseller"),
|
| 74 |
+
(["ERP"], "bigseller"),
|
| 75 |
+
(["product"], "promote"),
|
| 76 |
+
(["川芎红花苗灸液"], "promote"),
|
| 77 |
]
|
| 78 |
|
| 79 |
flag = False
|
examples/make_raw_dataset/step_6_filter_by_choice.py
CHANGED
|
@@ -12,7 +12,7 @@ def get_args():
|
|
| 12 |
parser = argparse.ArgumentParser()
|
| 13 |
parser.add_argument(
|
| 14 |
"--data_dir",
|
| 15 |
-
default=(project_path / "data/llm-log-hk/extract-dataset/choice-
|
| 16 |
type=str
|
| 17 |
)
|
| 18 |
args = parser.parse_args()
|
|
|
|
| 12 |
parser = argparse.ArgumentParser()
|
| 13 |
parser.add_argument(
|
| 14 |
"--data_dir",
|
| 15 |
+
default=(project_path / "data/llm-log-hk/extract-dataset/choice-promote").as_posix(),
|
| 16 |
type=str
|
| 17 |
)
|
| 18 |
args = parser.parse_args()
|
llm_eval_script/aliyun_choice.py
ADDED
|
@@ -0,0 +1,173 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
#!/usr/bin/python3
|
| 2 |
+
# -*- coding: utf-8 -*-
|
| 3 |
+
import argparse
|
| 4 |
+
from datetime import datetime
|
| 5 |
+
import json
|
| 6 |
+
import os
|
| 7 |
+
from pathlib import Path
|
| 8 |
+
import sys
|
| 9 |
+
import time
|
| 10 |
+
from zoneinfo import ZoneInfo # Python 3.9+ 自带,无需安装
|
| 11 |
+
|
| 12 |
+
pwd = os.path.abspath(os.path.dirname(__file__))
|
| 13 |
+
sys.path.append(os.path.join(pwd, "../"))
|
| 14 |
+
|
| 15 |
+
from openai import OpenAI
|
| 16 |
+
|
| 17 |
+
from project_settings import environment, project_path
|
| 18 |
+
|
| 19 |
+
|
| 20 |
+
def get_args():
|
| 21 |
+
parser = argparse.ArgumentParser()
|
| 22 |
+
parser.add_argument(
|
| 23 |
+
"--model_name",
|
| 24 |
+
default="qwen3-max-2025-09-23",
|
| 25 |
+
# default="qwen3-max-preview",
|
| 26 |
+
# default="qwen-plus-2025-12-01",
|
| 27 |
+
type=str
|
| 28 |
+
)
|
| 29 |
+
parser.add_argument(
|
| 30 |
+
"--eval_dataset_name",
|
| 31 |
+
default="agent-lingoace-zh-400-choice.jsonl",
|
| 32 |
+
type=str
|
| 33 |
+
)
|
| 34 |
+
parser.add_argument(
|
| 35 |
+
"--eval_dataset_dir",
|
| 36 |
+
default=(project_path / "data/dataset").as_posix(),
|
| 37 |
+
type=str
|
| 38 |
+
)
|
| 39 |
+
parser.add_argument(
|
| 40 |
+
"--eval_data_dir",
|
| 41 |
+
default=(project_path / "data/eval_data").as_posix(),
|
| 42 |
+
type=str
|
| 43 |
+
)
|
| 44 |
+
parser.add_argument(
|
| 45 |
+
"--client",
|
| 46 |
+
default="shenzhen_sase",
|
| 47 |
+
type=str
|
| 48 |
+
)
|
| 49 |
+
parser.add_argument(
|
| 50 |
+
"--service",
|
| 51 |
+
default="aliyun_api_key",
|
| 52 |
+
type=str
|
| 53 |
+
)
|
| 54 |
+
parser.add_argument(
|
| 55 |
+
"--create_time_str",
|
| 56 |
+
default="null",
|
| 57 |
+
# default="20250812_092418",
|
| 58 |
+
type=str
|
| 59 |
+
)
|
| 60 |
+
parser.add_argument(
|
| 61 |
+
"--interval",
|
| 62 |
+
default=1,
|
| 63 |
+
type=int
|
| 64 |
+
)
|
| 65 |
+
args = parser.parse_args()
|
| 66 |
+
return args
|
| 67 |
+
|
| 68 |
+
|
| 69 |
+
def main():
|
| 70 |
+
args = get_args()
|
| 71 |
+
|
| 72 |
+
eval_dataset_dir = Path(args.eval_dataset_dir)
|
| 73 |
+
eval_dataset_dir.mkdir(parents=True, exist_ok=True)
|
| 74 |
+
eval_data_dir = Path(args.eval_data_dir)
|
| 75 |
+
eval_data_dir.mkdir(parents=True, exist_ok=True)
|
| 76 |
+
|
| 77 |
+
if args.create_time_str == "null":
|
| 78 |
+
tz = ZoneInfo("Asia/Shanghai")
|
| 79 |
+
now = datetime.now(tz)
|
| 80 |
+
create_time_str = now.strftime("%Y%m%d_%H%M%S")
|
| 81 |
+
# create_time_str = "20250724_090615"
|
| 82 |
+
else:
|
| 83 |
+
create_time_str = args.create_time_str
|
| 84 |
+
|
| 85 |
+
eval_dataset = eval_dataset_dir / args.eval_dataset_name
|
| 86 |
+
|
| 87 |
+
model_name_ = args.model_name.replace("/", "#")
|
| 88 |
+
output_file = eval_data_dir / f"aliyun_choice/aliyun/{model_name_}/{args.client}/{args.service}/{create_time_str}/{args.eval_dataset_name}"
|
| 89 |
+
output_file.parent.mkdir(parents=True, exist_ok=True)
|
| 90 |
+
|
| 91 |
+
api_key = environment.get(args.service, dtype=str)
|
| 92 |
+
client = OpenAI(
|
| 93 |
+
base_url="https://dashscope.aliyuncs.com/compatible-mode/v1",
|
| 94 |
+
# Read your Ark API Key from the environment variable.
|
| 95 |
+
api_key=api_key
|
| 96 |
+
)
|
| 97 |
+
|
| 98 |
+
total = 0
|
| 99 |
+
total_correct = 0
|
| 100 |
+
|
| 101 |
+
# finished
|
| 102 |
+
finished_idx_set = set()
|
| 103 |
+
if os.path.exists(output_file.as_posix()):
|
| 104 |
+
with open(output_file.as_posix(), "r", encoding="utf-8") as f:
|
| 105 |
+
for row in f:
|
| 106 |
+
row = json.loads(row)
|
| 107 |
+
idx = row["idx"]
|
| 108 |
+
total = row["total"]
|
| 109 |
+
total_correct = row["total_correct"]
|
| 110 |
+
finished_idx_set.add(idx)
|
| 111 |
+
print(f"finished count: {len(finished_idx_set)}")
|
| 112 |
+
|
| 113 |
+
with open(eval_dataset.as_posix(), "r", encoding="utf-8") as fin, open(output_file.as_posix(), "a+", encoding="utf-8") as fout:
|
| 114 |
+
for row in fin:
|
| 115 |
+
row = json.loads(row)
|
| 116 |
+
idx = row["idx"]
|
| 117 |
+
prompt = row["prompt"]
|
| 118 |
+
response = row["response"]
|
| 119 |
+
|
| 120 |
+
if idx in finished_idx_set:
|
| 121 |
+
continue
|
| 122 |
+
finished_idx_set.add(idx)
|
| 123 |
+
|
| 124 |
+
try:
|
| 125 |
+
time.sleep(args.interval)
|
| 126 |
+
print(f"sleep: {args.interval}")
|
| 127 |
+
time_begin = time.time()
|
| 128 |
+
completion = client.chat.completions.create(
|
| 129 |
+
model=args.model_name,
|
| 130 |
+
messages=[
|
| 131 |
+
{"role": "user", "content": prompt},
|
| 132 |
+
],
|
| 133 |
+
# 由于 enable_thinking 非 OpenAI 标准参数,需要通过 extra_body 传入
|
| 134 |
+
extra_body={"enable_thinking": False},
|
| 135 |
+
stream=False,
|
| 136 |
+
)
|
| 137 |
+
time_cost = time.time() - time_begin
|
| 138 |
+
print(f"time_cost: {time_cost}")
|
| 139 |
+
except Exception as e:
|
| 140 |
+
print(f"request failed, error type: {type(e)}, error text: {str(e)}")
|
| 141 |
+
continue
|
| 142 |
+
|
| 143 |
+
# print(f"completion: {completion}")
|
| 144 |
+
prediction = completion.choices[0].message.content
|
| 145 |
+
rid = completion.id
|
| 146 |
+
|
| 147 |
+
correct = 1 if prediction == response else 0
|
| 148 |
+
|
| 149 |
+
total += 1
|
| 150 |
+
total_correct += correct
|
| 151 |
+
score = total_correct / total
|
| 152 |
+
|
| 153 |
+
row_ = {
|
| 154 |
+
"idx": idx,
|
| 155 |
+
"rid": rid,
|
| 156 |
+
"prompt": prompt,
|
| 157 |
+
"response": response,
|
| 158 |
+
"prediction": prediction,
|
| 159 |
+
"correct": correct,
|
| 160 |
+
"total": total,
|
| 161 |
+
"total_correct": total_correct,
|
| 162 |
+
"score": score,
|
| 163 |
+
"time_cost": time_cost,
|
| 164 |
+
}
|
| 165 |
+
row_ = json.dumps(row_, ensure_ascii=False)
|
| 166 |
+
fout.write(f"{row_}\n")
|
| 167 |
+
fout.flush()
|
| 168 |
+
|
| 169 |
+
return
|
| 170 |
+
|
| 171 |
+
|
| 172 |
+
if __name__ == "__main__":
|
| 173 |
+
main()
|
llm_eval_script/aliyun_nxcloud_v2_choice.py
ADDED
|
@@ -0,0 +1,233 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
#!/usr/bin/python3
|
| 2 |
+
# -*- coding: utf-8 -*-
|
| 3 |
+
"""
|
| 4 |
+
https://help.aliyun.com/zh/model-studio/qwen-api-reference
|
| 5 |
+
https://help.aliyun.com/zh/model-studio/models
|
| 6 |
+
https://help.aliyun.com/zh/model-studio/models?spm=a2c4g.11186623.0.i4#d4ccf72f23jh9
|
| 7 |
+
|
| 8 |
+
https://help.aliyun.com/zh/model-studio/text-generation?spm=a2c4g.11186623.0.0.6b772e068nnT1J#24e54b27d4agt
|
| 9 |
+
|
| 10 |
+
Deep-Thinking
|
| 11 |
+
https://help.aliyun.com/zh/model-studio/deep-thinking?spm=a2c4g.11186623.0.0.56076f58IJd4mP
|
| 12 |
+
|
| 13 |
+
"""
|
| 14 |
+
import argparse
|
| 15 |
+
from datetime import datetime
|
| 16 |
+
import json
|
| 17 |
+
import os
|
| 18 |
+
from pathlib import Path
|
| 19 |
+
import sys
|
| 20 |
+
import time
|
| 21 |
+
from zoneinfo import ZoneInfo # Python 3.9+ 自带,无需安装
|
| 22 |
+
|
| 23 |
+
pwd = os.path.abspath(os.path.dirname(__file__))
|
| 24 |
+
sys.path.append(os.path.join(pwd, "../"))
|
| 25 |
+
|
| 26 |
+
from openai import OpenAI
|
| 27 |
+
|
| 28 |
+
from project_settings import environment, project_path
|
| 29 |
+
|
| 30 |
+
|
| 31 |
+
def get_args():
    """Collect CLI options for the Aliyun DashScope choice-eval run.

    Every option has a default, so the script can be launched with no
    arguments at all.
    """
    parser = argparse.ArgumentParser()

    # Model to query. Alternatives tried before:
    #   qwen3-max-2025-09-23, qwen-plus-2025-12-01
    parser.add_argument("--model_name", default="qwen3-max-preview", type=str)

    # Dataset file name, resolved inside --eval_dataset_dir.
    parser.add_argument("--eval_dataset_name", default="agent-nxcloud-zh-375-choice-v2.jsonl", type=str)

    parser.add_argument("--eval_dataset_dir", default=(project_path / "data/dataset").as_posix(), type=str)
    parser.add_argument("--eval_data_dir", default=(project_path / "data/eval_data").as_posix(), type=str)

    # Label of the network location this run is executed from (part of the output path).
    parser.add_argument("--client", default="shenzhen_sase", type=str)

    # Environment key under which the API credential is stored.
    parser.add_argument("--service", default="aliyun_api_key", type=str)

    # "null" means auto-generate a timestamp; pass a previous run's value
    # (e.g. "20250812_092418") to resume that run.
    parser.add_argument("--create_time_str", default="null", type=str)

    # Seconds to sleep before each request (simple rate limiting).
    parser.add_argument("--interval", default=1, type=int)

    return parser.parse_args()
|
| 78 |
+
|
| 79 |
+
|
| 80 |
+
def conversation_to_str(conversation: list) -> str:
    """Render a chat conversation as a plain-text transcript.

    Each turn is a dict with "role" and "content" keys and is emitted as a
    ``"{role}: {content}\n"`` line, in order.

    :param conversation: list of ``{"role": ..., "content": ...}`` dicts.
    :return: the concatenated transcript; "" for an empty conversation.
    :raises KeyError: if a turn lacks a "role" or "content" key (same as
        the previous implementation).
    """
    # "".join builds the transcript in one pass instead of repeated string
    # concatenation, which is quadratic in the worst case.
    return "".join(
        f"{turn['role']}: {turn['content']}\n"
        for turn in conversation
    )
|
| 89 |
+
|
| 90 |
+
|
| 91 |
+
def main():
    """Run a choice-style eval against an Aliyun DashScope chat model.

    For each dataset row, builds a system prompt (instructions + few-shot
    examples) and a user prompt (conversation + choice list), queries the
    model, scores the reply by exact string match against the expected
    letter, and appends one JSONL result row to the output file.

    The run is resumable: rows whose "idx" already appears in the output
    file are skipped, and the running totals are restored from the last
    written rows.
    """
    args = get_args()

    eval_dataset_dir = Path(args.eval_dataset_dir)
    eval_dataset_dir.mkdir(parents=True, exist_ok=True)
    eval_data_dir = Path(args.eval_data_dir)
    eval_data_dir.mkdir(parents=True, exist_ok=True)

    # "null" means: stamp this run with the current Shanghai-time string.
    # Passing an explicit value re-opens a previous run's output file.
    if args.create_time_str == "null":
        tz = ZoneInfo("Asia/Shanghai")
        now = datetime.now(tz)
        create_time_str = now.strftime("%Y%m%d_%H%M%S")
        # create_time_str = "20250724_090615"
    else:
        create_time_str = args.create_time_str

    eval_dataset = eval_dataset_dir / args.eval_dataset_name

    # Model names may contain "/", which would add a path segment.
    model_name_ = args.model_name.replace("/", "#")
    output_file = eval_data_dir / f"aliyun_nxcloud_v2_choice/aliyun/{model_name_}/{args.client}/{args.service}/{create_time_str}/{args.eval_dataset_name}"
    output_file.parent.mkdir(parents=True, exist_ok=True)

    # args.service doubles as the environment key holding the API key.
    api_key = environment.get(args.service, dtype=str)
    client = OpenAI(
        base_url="https://dashscope.aliyuncs.com/compatible-mode/v1",
        # Read your Ark API Key from the environment variable.
        api_key=api_key
    )

    total = 0
    total_correct = 0

    # Resume support: collect already-scored idx values and restore the
    # running totals from the last row of a previous (interrupted) run.
    finished_idx_set = set()
    if os.path.exists(output_file.as_posix()):
        with open(output_file.as_posix(), "r", encoding="utf-8") as f:
            for row in f:
                row = json.loads(row)
                idx = row["idx"]
                total = row["total"]
                total_correct = row["total_correct"]
                finished_idx_set.add(idx)
    print(f"finished count: {len(finished_idx_set)}")

    with open(eval_dataset.as_posix(), "r", encoding="utf-8") as fin, open(output_file.as_posix(), "a+", encoding="utf-8") as fout:
        for row in fin:
            row = json.loads(row)
            idx = row["idx"]
            system_prompt = row["system_prompt"]
            conversation = row["conversation"]
            examples = row["examples"]
            choices = row["choices"]
            response = row["response"]

            if idx in finished_idx_set:
                continue

            # conversation
            conversation_str = conversation_to_str(conversation)

            # Few-shot examples: conversation transcript + expected output
            # + explanation, separated by blank lines.
            examples_str = ""
            for example in examples:
                conversation_ = example["conversation"]
                outputs = example["outputs"]
                output = outputs["output"]
                explanation = outputs["explanation"]

                examples_str += conversation_to_str(conversation_)
                examples_str += f"Output: {output}\n"
                examples_str += f"Explanation: {explanation}\n\n"

            # print(examples_str)

            # Answer options: one "<condition>, output: <letter>" line each.
            choices_str = ""
            for choice in choices:
                condition = choice["condition"]
                choice_letter = choice["choice_letter"]

                row_ = f"{condition}, output: {choice_letter}\n"
                choices_str += row_
            choices_str += "\nRemember to output ONLY the corresponding letter.\nYour output is:"

            # prompt = f"{system_prompt}\n\n**Output**\n{choices_}\n**Examples**\n{examples_}"
            # Instructions + examples go into the system message; the
            # sample's conversation + choices go into the user message.
            prompt1 = f"{system_prompt}\n\n**Examples**\n{examples_str}"
            prompt2 = f"**Conversation**\n{conversation_str}\n\n**Output**\n{choices_str}"
            print(prompt1)
            print(prompt2)

            messages = list()
            messages.append(
                {"role": "system", "content": prompt1},
            )
            messages.append(
                {"role": "user", "content": prompt2},
            )
            print(f"messages: {json.dumps(messages, ensure_ascii=False, indent=4)}")

            try:
                time.sleep(args.interval)
                print(f"sleep: {args.interval}")
                time_begin = time.time()
                completion = client.chat.completions.create(
                    model=args.model_name,
                    messages=messages,
                    # enable_thinking is not a standard OpenAI parameter,
                    # so it must be passed through extra_body.
                    extra_body={"enable_thinking": False},
                    stream=False,
                )
                time_cost = time.time() - time_begin
                print(f"time_cost: {time_cost}")
            except Exception as e:
                # Best-effort: a failed request is skipped (not retried)
                # and left for a future resumed run.
                print(f"request failed, error type: {type(e)}, error text: {str(e)}")
                continue

            # print(f"completion: {completion}")
            prediction = completion.choices[0].message.content

            # Exact string match against the expected choice letter.
            correct = 1 if prediction == response else 0

            total += 1
            total_correct += correct
            score = total_correct / total

            row_ = {
                "idx": idx,
                "messages": messages,
                "response": response,
                "prediction": prediction,
                "correct": correct,
                "total": total,
                "total_correct": total_correct,
                "score": score,
                "time_cost": time_cost,
            }
            row_ = json.dumps(row_, ensure_ascii=False)
            fout.write(f"{row_}\n")
            # Flush per row so an interrupted run can be resumed safely.
            fout.flush()

    return
|
| 230 |
+
|
| 231 |
+
|
| 232 |
+
if __name__ == "__main__":
|
| 233 |
+
main()
|
llm_eval_script/aws.py
ADDED
|
@@ -0,0 +1,227 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
#!/usr/bin/python3
|
| 2 |
+
# -*- coding: utf-8 -*-
|
| 3 |
+
"""
|
| 4 |
+
https://docs.aws.amazon.com/bedrock/latest/userguide/models-supported.html
|
| 5 |
+
|
| 6 |
+
https://docs.aws.amazon.com/nova/latest/userguide/using-invoke-api.html?utm_source=chatgpt.com
|
| 7 |
+
"""
|
| 8 |
+
import argparse
|
| 9 |
+
from datetime import datetime
|
| 10 |
+
import json
|
| 11 |
+
import os
|
| 12 |
+
from pathlib import Path
|
| 13 |
+
import sys
|
| 14 |
+
import time
|
| 15 |
+
from zoneinfo import ZoneInfo # Python 3.9+ 自带,无需安装
|
| 16 |
+
|
| 17 |
+
pwd = os.path.abspath(os.path.dirname(__file__))
|
| 18 |
+
sys.path.append(os.path.join(pwd, "../"))
|
| 19 |
+
|
| 20 |
+
import boto3
|
| 21 |
+
|
| 22 |
+
from project_settings import environment, project_path
|
| 23 |
+
|
| 24 |
+
|
| 25 |
+
def get_args():
    """Collect CLI options for the AWS Bedrock choice-eval run.

    Example invocations:

    python3 aws_claude.py --model_name anthropic.claude-instant-v1 \
        --eval_dataset_name agent-lingoace-zh-400-choice.jsonl \
        --client "us_west(47.88.76.239)" \
        --create_time_str 20250723-interval-10 \
        --interval 10

    python3 aws_claude.py --model_name anthropic.claude-v2 \
        --eval_dataset_name agent-lingoace-zh-400-choice.jsonl \
        --client "us_west(47.88.76.239)" \
        --create_time_str 20250723-interval-10 \
        --interval 10
    """
    parser = argparse.ArgumentParser()

    # Bedrock model id. Other ids previously tried:
    #   ai21.jamba-1-5-large-v1:0, ai21.jamba-1-5-mini-v1:0,
    #   amazon.nova-canvas-v1:0, amazon.nova-premier-v1:0,
    #   amazon.nova-pro-v1:0, amazon.nova-lite-v1:0,
    #   amazon.nova-reel-v1:0, amazon.nova-reel-v1:1, amazon.nova-sonic-v1:0
    parser.add_argument("--model_name", default="amazon.nova-micro-v1:0", type=str)

    # Dataset file name, resolved inside --eval_dataset_dir. Alternatives:
    #   agent-bingoplus-ph-90-choice.jsonl, arc-easy-1000-choice.jsonl
    parser.add_argument("--eval_dataset_name", default="agent-lingoace-zh-400-choice.jsonl", type=str)

    parser.add_argument("--eval_dataset_dir", default=(project_path / "data/dataset").as_posix(), type=str)
    parser.add_argument("--eval_data_dir", default=(project_path / "data/eval_data").as_posix(), type=str)

    # Label of the network location this run is executed from (part of the output path).
    parser.add_argument("--client", default="shenzhen_sase", type=str)

    # Environment key under which the AWS credential JSON is stored.
    parser.add_argument("--service", default="aws_us_east", type=str)

    # "null" means auto-generate a timestamp; pass a previous run's value to resume it.
    parser.add_argument("--create_time_str", default="null", type=str)

    # Seconds to sleep before each request (simple rate limiting).
    parser.add_argument("--interval", default=10, type=int)

    return parser.parse_args()
|
| 96 |
+
|
| 97 |
+
|
| 98 |
+
def main():
    """Run a choice-style eval against an AWS Bedrock (Nova) model.

    For each dataset row, sends the prompt through the Bedrock
    InvokeModel API (messages-v1 schema), scores the reply by exact
    string match against the expected answer, and appends one JSONL
    result row to the output file.  Rows whose "idx" already appears in
    the output file are skipped, so a run can be resumed.
    """
    args = get_args()

    # The service entry is a JSON blob holding the AWS credential triple.
    service = environment.get(key=args.service, dtype=json.loads)
    aws_access_key_id = service["AWS_ACCESS_KEY_ID"]
    aws_secret_access_key = service["AWS_SECRET_ACCESS_KEY"]
    aws_default_region = service["AWS_DEFAULT_REGION"]

    # boto3 picks the credentials up from the process environment.
    os.environ["AWS_ACCESS_KEY_ID"] = aws_access_key_id
    os.environ["AWS_SECRET_ACCESS_KEY"] = aws_secret_access_key
    os.environ["AWS_DEFAULT_REGION"] = aws_default_region

    client = boto3.client(
        service_name="bedrock-runtime",
        region_name=aws_default_region
    )

    eval_dataset_dir = Path(args.eval_dataset_dir)
    eval_dataset_dir.mkdir(parents=True, exist_ok=True)
    eval_data_dir = Path(args.eval_data_dir)
    eval_data_dir.mkdir(parents=True, exist_ok=True)

    # "null" means: stamp this run with the current Shanghai-time string.
    # Passing an explicit value re-opens a previous run's output file.
    if args.create_time_str == "null":
        tz = ZoneInfo("Asia/Shanghai")
        now = datetime.now(tz)
        create_time_str = now.strftime("%Y%m%d_%H%M%S")
        # create_time_str = "20250722_173400"
    else:
        create_time_str = args.create_time_str

    eval_dataset = eval_dataset_dir / args.eval_dataset_name

    # Model ids may contain "/" and ":" that are unsafe in the output path.
    model_name_ = args.model_name
    model_name_ = model_name_.replace("/", "#")
    model_name_ = model_name_.replace(":", "-")

    output_file = eval_data_dir / f"aws/aws/{model_name_}/{args.client}/{args.service}/{create_time_str}/{args.eval_dataset_name}"
    output_file.parent.mkdir(parents=True, exist_ok=True)

    total = 0
    total_correct = 0

    # Resume support: collect already-scored idx values and restore the
    # running totals from the last row of a previous (interrupted) run.
    finished_idx_set = set()
    if os.path.exists(output_file.as_posix()):
        with open(output_file.as_posix(), "r", encoding="utf-8") as f:
            for row in f:
                row = json.loads(row)
                idx = row["idx"]
                total = row["total"]
                total_correct = row["total_correct"]
                finished_idx_set.add(idx)
    print(f"finished count: {len(finished_idx_set)}")

    with open(eval_dataset.as_posix(), "r", encoding="utf-8") as fin, open(output_file.as_posix(), "a+", encoding="utf-8") as fout:
        for row in fin:
            row = json.loads(row)
            idx = row["idx"]
            prompt = row["prompt"]
            response = row["response"]

            if idx in finished_idx_set:
                continue
            finished_idx_set.add(idx)

            # Bedrock Nova "messages-v1" request body.
            # maxTokens is 1 — presumably because the expected answer is a
            # single choice letter; TODO confirm against the dataset.
            body = {
                "schemaVersion": "messages-v1",
                "messages": [
                    {
                        "role": "user",
                        "content": [{"text": prompt}]
                    }
                ],
                "inferenceConfig": {
                    "maxTokens": 1,
                    "temperature": 0.5,
                    "topP": 0.95,
                    # optional: topK can be supplied via an extra field
                }
            }

            try:
                # client.converse()
                time.sleep(args.interval)
                print(f"sleep: {args.interval}")
                time_begin = time.time()
                llm_response = client.invoke_model(
                    modelId=args.model_name,
                    body=json.dumps(body),
                    contentType="application/json"
                )

                # The response body is a stream; read and decode the JSON.
                llm_response = json.loads(llm_response["body"].read())
                # print(result['content'][0]['text'])
                time_cost = time.time() - time_begin
                print(f"time_cost: {time_cost}")

            except Exception as e:
                # Best-effort: a failed request is skipped (not retried)
                # and left for a future resumed run.
                print(f"request failed, error type: {type(e)}, error text: {str(e)}")
                continue

            llm_response = llm_response["output"]["message"]
            prediction = llm_response["content"][0]["text"]

            # Exact string match against the expected answer.
            correct = 1 if prediction == response else 0

            total += 1
            total_correct += correct
            score = total_correct / total

            row_ = {
                "idx": idx,
                "prompt": prompt,
                "response": response,
                "prediction": prediction,
                "correct": correct,
                "total": total,
                "total_correct": total_correct,
                "score": score,
                "time_cost": time_cost,
            }
            row_ = json.dumps(row_, ensure_ascii=False)
            fout.write(f"{row_}\n")
            # Flush per row so an interrupted run can be resumed safely.
            fout.flush()

    return
|
| 224 |
+
|
| 225 |
+
|
| 226 |
+
if __name__ == "__main__":
|
| 227 |
+
main()
|
llm_eval_script/google_anthropic.py
CHANGED
|
@@ -27,8 +27,17 @@ def get_args():
|
|
| 27 |
parser = argparse.ArgumentParser()
|
| 28 |
parser.add_argument(
|
| 29 |
"--model_name",
|
| 30 |
-
default="claude-opus-4@
|
|
|
|
| 31 |
# default="claude-sonnet-4@20250514",
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 32 |
type=str
|
| 33 |
)
|
| 34 |
parser.add_argument(
|
|
|
|
| 27 |
parser = argparse.ArgumentParser()
|
| 28 |
parser.add_argument(
|
| 29 |
"--model_name",
|
| 30 |
+
# default="claude-opus-4-1@20250805",
|
| 31 |
+
# default="claude-opus-4@20250514",
|
| 32 |
# default="claude-sonnet-4@20250514",
|
| 33 |
+
# default="claude-3-7-sonnet@20250219",
|
| 34 |
+
# default="claude-3-5-haiku@20241022",
|
| 35 |
+
|
| 36 |
+
# default="claude-3-5-sonnet-v2@20241022",
|
| 37 |
+
# default="claude-3-opus@20240229",
|
| 38 |
+
|
| 39 |
+
# default="claude-3-5-sonnet@20240620",
|
| 40 |
+
default="claude-3-haiku@20240307",
|
| 41 |
type=str
|
| 42 |
)
|
| 43 |
parser.add_argument(
|
main.py
CHANGED
|
@@ -146,6 +146,7 @@ def load_board():
|
|
| 146 |
if total == 0:
|
| 147 |
continue
|
| 148 |
score = np.mean(score_list)
|
|
|
|
| 149 |
time_cost_mean = np.mean(time_cost_list)
|
| 150 |
time_cost_var = np.var(time_cost_list)
|
| 151 |
|
|
@@ -158,6 +159,7 @@ def load_board():
|
|
| 158 |
"model_name": model_name,
|
| 159 |
"dataset": dataset,
|
| 160 |
"score": round(score, 4),
|
|
|
|
| 161 |
"time_cost(mean)": round(time_cost_mean, 4),
|
| 162 |
"time_cost(var)": round(time_cost_var, 4),
|
| 163 |
"time_cost(75%)": round(time_cost_p75, 4),
|
|
@@ -238,6 +240,7 @@ def when_click_view_chat_button(filename: str):
|
|
| 238 |
|
| 239 |
board_columns_choices = [
|
| 240 |
"company", "model_name", "dataset", "score",
|
|
|
|
| 241 |
"time_cost(mean)",
|
| 242 |
"time_cost(var)",
|
| 243 |
"time_cost(75%)", "time_cost(95%)", "time_cost(99%)",
|
|
|
|
| 146 |
if total == 0:
|
| 147 |
continue
|
| 148 |
score = np.mean(score_list)
|
| 149 |
+
time_cost_min = np.min(time_cost_list)
|
| 150 |
time_cost_mean = np.mean(time_cost_list)
|
| 151 |
time_cost_var = np.var(time_cost_list)
|
| 152 |
|
|
|
|
| 159 |
"model_name": model_name,
|
| 160 |
"dataset": dataset,
|
| 161 |
"score": round(score, 4),
|
| 162 |
+
"time_cost(min)": round(time_cost_min, 4),
|
| 163 |
"time_cost(mean)": round(time_cost_mean, 4),
|
| 164 |
"time_cost(var)": round(time_cost_var, 4),
|
| 165 |
"time_cost(75%)": round(time_cost_p75, 4),
|
|
|
|
| 240 |
|
| 241 |
board_columns_choices = [
|
| 242 |
"company", "model_name", "dataset", "score",
|
| 243 |
+
"time_cost(min)",
|
| 244 |
"time_cost(mean)",
|
| 245 |
"time_cost(var)",
|
| 246 |
"time_cost(75%)", "time_cost(95%)", "time_cost(99%)",
|
requirements.txt
CHANGED
|
@@ -9,3 +9,7 @@ smithy-aws-core>=0.0.1
|
|
| 9 |
aws_sdk_bedrock_runtime
|
| 10 |
boto3
|
| 11 |
anthropic
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 9 |
aws_sdk_bedrock_runtime
|
| 10 |
boto3
|
| 11 |
anthropic
|
| 12 |
+
alibabacloud_kms20160120
|
| 13 |
+
alibabacloud_credentials
|
| 14 |
+
alibabacloud_tea_openapi
|
| 15 |
+
alibabacloud_tea_util
|
toolbox/aliyun_kms/__init__.py
ADDED
|
@@ -0,0 +1,6 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
#!/usr/bin/python3
|
| 2 |
+
# -*- coding: utf-8 -*-
|
| 3 |
+
|
| 4 |
+
|
| 5 |
+
if __name__ == "__main__":
|
| 6 |
+
pass
|
toolbox/aliyun_kms/aliyun_kms.py
ADDED
|
@@ -0,0 +1,122 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
#!/usr/bin/python3
|
| 2 |
+
# -*- coding: utf-8 -*-
|
| 3 |
+
import argparse
|
| 4 |
+
import json
|
| 5 |
+
import os
|
| 6 |
+
|
| 7 |
+
from alibabacloud_kms20160120.client import Client as Kms20160120Client
|
| 8 |
+
from alibabacloud_credentials.client import Client as CredentialClient
|
| 9 |
+
from alibabacloud_kms20160120.models import GetSecretValueResponse
|
| 10 |
+
from alibabacloud_tea_openapi import models as open_api_models
|
| 11 |
+
from alibabacloud_kms20160120 import models as kms_20160120_models
|
| 12 |
+
from alibabacloud_tea_util import models as util_models
|
| 13 |
+
from alibabacloud_tea_util.client import Client as UtilClient
|
| 14 |
+
from alibabacloud_credentials.models import Config
|
| 15 |
+
|
| 16 |
+
|
| 17 |
+
class AliyunKMS(object):
    """Thin wrapper around the Aliyun KMS (2016-01-20) secrets API.

    https://help.aliyun.com/zh/sdk/developer-reference/v2-manage-python-access-credentials
    """
    def __init__(self,
                 access_key_id: str = None,
                 access_key_secret: str = None,
                 endpoint: str = "kms.ap-southeast-1.aliyuncs.com",
                 ):
        # Access-key credentials used for the "access_key" credential type.
        self.access_key_id = access_key_id
        self.access_key_secret = access_key_secret
        # Regional KMS endpoint; defaults to ap-southeast-1.
        self.endpoint = endpoint

        # Client is created eagerly so a misconfiguration fails fast.
        self.client = self.get_client()

    def get_client(self):
        """Build a KMS client authenticated with the stored access-key pair."""
        credential = CredentialClient(
            config=Config(
                type="access_key",
                access_key_id=self.access_key_id,
                access_key_secret=self.access_key_secret,
            )
        )
        config = open_api_models.Config(
            credential=credential
        )
        # Endpoint reference: https://api.aliyun.com/product/Kms
        config.endpoint = self.endpoint

        client = Kms20160120Client(config)
        return client

    def create_secret(self, secret_name: str, secret_data: str, version_id: str):
        """Create a secret with an initial version.

        https://next.api.aliyun.com/api/Kms/2016-01-20/CreateSecret
        """
        create_secret_request = kms_20160120_models.CreateSecretRequest(
            secret_name=secret_name,
            secret_data=secret_data,
            version_id=version_id,
        )
        runtime = util_models.RuntimeOptions()

        result = self.client.create_secret_with_options(create_secret_request, runtime)
        return result

    def get_secret_value(self, secret_name: str, version_id: str = None):
        """Fetch a secret's value; returns the response as a plain dict.

        When version_id is None the service decides which version to
        return (presumably the current one — TODO confirm with the API docs).

        https://next.api.aliyun.com/api/Kms/2016-01-20/GetSecretValue
        """
        get_secret_value_request = kms_20160120_models.GetSecretValueRequest(
            secret_name=secret_name,
            version_id=version_id,
        )
        runtime = util_models.RuntimeOptions()

        response: GetSecretValueResponse = self.client.get_secret_value_with_options(get_secret_value_request, runtime)
        # to_map() converts the SDK response object into a plain dict.
        js = response.to_map()
        return js

    async def async_get_secret_value(self, secret_name: str, version_id: str = None):
        """Async variant of get_secret_value; same request and return shape.

        https://next.api.aliyun.com/api/Kms/2016-01-20/GetSecretValue
        """
        get_secret_value_request = kms_20160120_models.GetSecretValueRequest(
            secret_name=secret_name,
            version_id=version_id,
        )
        runtime = util_models.RuntimeOptions()

        response: GetSecretValueResponse = await self.client.get_secret_value_with_options_async(get_secret_value_request, runtime)
        # to_map() converts the SDK response object into a plain dict.
        js = response.to_map()
        return js
|
| 90 |
+
|
| 91 |
+
|
| 92 |
+
def main():
    """Smoke-test: fetch a known secret using credentials from the environment."""
    from settings import environment

    access_key_id = environment.get(key="ALIBABA_CLOUD_ACCESS_KEY_ID", dtype=str)
    access_key_secret = environment.get(key="ALIBABA_CLOUD_ACCESS_KEY_SECRET", dtype=str)
    # NOTE(review): these prints expose credentials on stdout — acceptable
    # only for local debugging; consider removing.
    print(f"access_key_id: {access_key_id}")
    print(f"access_key_secret: {access_key_secret}")

    # os.environ["ALIBABA_CLOUD_ACCESS_KEY_ID"] = access_key_id
    # os.environ["ALIBABA_CLOUD_ACCESS_KEY_SECRET"] = access_key_secret

    manager = AliyunKMS(
        access_key_id=access_key_id,
        access_key_secret=access_key_secret,
    )

    # result = manager.get_secret_value(
    #     secret_name="azure-east-asia-asr-dev",
    #     version_id="v1",
    # )
    # print(result)
    # version_id omitted: the service picks the version to return.
    result = manager.get_secret_value(
        secret_name="aliyun-nxai123-oss-dev",
        # version_id="d5b82ac1ee63d748b25bf7be6c75695e",
    )
    print(result)
    return
|
| 119 |
+
|
| 120 |
+
|
| 121 |
+
if __name__ == "__main__":
|
| 122 |
+
main()
|