File size: 5,108 Bytes
fcdea4e
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
d5a16ff
fcdea4e
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
b3a554a
 
fcdea4e
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
edac724
fcdea4e
 
 
 
 
 
 
 
 
 
 
d5a16ff
fcdea4e
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
d5a16ff
fcdea4e
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
#!/usr/bin/python3
# -*- coding: utf-8 -*-
import argparse
import json
import os
from pathlib import Path
import platform

# Route unsloth's model downloads through ModelScope.
# NOTE(review): must be set before `from unsloth import ...` below, since the
# library reads it at import time — keep this line above the unsloth import.
os.environ["UNSLOTH_USE_MODELSCOPE"] = "1"

# Resolve the project root: on Windows/macOS (presumably dev machines) it comes
# from the local project_settings module; elsewhere (presumably the Linux
# training server) fall back to three directories above the working directory.
if platform.system() in ("Windows", "Darwin"):
    from project_settings import project_path
else:
    project_path = os.path.abspath("../../../")
    project_path = Path(project_path)

from datasets import load_dataset
from unsloth import FastLanguageModel
from transformers import TextStreamer
from tqdm import tqdm


def get_args():
    """Parse command-line arguments for the LoRA-adapter evaluation run.

    Returns:
        argparse.Namespace: parsed arguments. Defaults match the previous
        behavior of this script; only the declared ``type`` of the numeric
        options has been corrected (see notes below).
    """
    parser = argparse.ArgumentParser()
    parser.add_argument(
        "--model_name",
        default="unsloth/Qwen3-8B-unsloth-bnb-4bit",
        type=str,
    )
    parser.add_argument(
        "--lora_adapter_path",
        default=(project_path / "trained_models" / "Qwen3-8B-sft-lora-adapter-unsloth").as_posix(),
        type=str,
    )

    parser.add_argument(
        "--dataset_path",
        default="miyuki2026/tutorials",
        type=str,
    )
    parser.add_argument("--dataset_name", default=None, type=str)
    parser.add_argument("--dataset_split", default=None, type=str)
    parser.add_argument(
        "--dataset_cache_dir",
        default=(project_path / "hub_datasets").as_posix(),
        type=str,
    )
    # NOTE(review): kept as a string flag — main() only truth-tests it, and the
    # default of None stays falsy. Any non-empty value enables streaming.
    parser.add_argument("--dataset_streaming", default=None, type=str)
    # Bug fix: these two were declared type=str although their defaults are
    # ints and they are consumed numerically (dataset.take(...) and
    # train_test_split(test_size=...)). A value passed on the command line
    # previously arrived as a str and broke those calls.
    parser.add_argument("--valid_dataset_size", default=1000, type=int)
    parser.add_argument("--shuffle_buffer_size", default=5000, type=int)

    parser.add_argument(
        "--max_new_tokens",
        default=1024,  # 8192, 128
        type=int, help="最大生成长度(注意:并非模型实际长文本能力)"
    )
    parser.add_argument("--top_p", default=0.85, type=float, help="nucleus采样阈值(0-1)")
    parser.add_argument("--temperature", default=0.85, type=float, help="生成温度,控制随机性(0-1,越大越随机)")

    # Bug fix: was type=str; this value is intended for num_proc, which must
    # be an int (or None). Default is None on Windows (no forking), otherwise
    # half the available cores.
    parser.add_argument(
        "--num_workers",
        default=None if platform.system() == "Windows" else os.cpu_count() // 2,
        type=int,
    )
    parser.add_argument("--output_file", default="evaluation.jsonl", type=str)

    args = parser.parse_args()
    return args


def main():
    """Evaluate a LoRA-adapted Qwen3 model on the validation split.

    Loads the 4-bit base model plus a LoRA adapter, takes a validation subset
    of the configured dataset, generates a response for each conversation
    prompt, and writes JSONL rows of {prompt, response, response_} — reference
    response vs. model-generated response_ — to ``--output_file``.
    """
    args = get_args()

    # Ensure the output directory exists before opening the file for writing.
    output_file = Path(args.output_file)
    output_file.parent.mkdir(parents=True, exist_ok=True)

    # 1. Load the quantized base model and its tokenizer.
    model, tokenizer = FastLanguageModel.from_pretrained(
        model_name=args.model_name,
        max_seq_length=2048,  # context window used here (model supports longer)
        device_map="auto",
        dtype=None,  # let unsloth pick the best precision automatically
        load_in_4bit=True,  # 4-bit quantization to reduce GPU memory
    )

    # 2. Inject the trained LoRA adapter into the base model.
    model.load_adapter(args.lora_adapter_path)

    # Enable unsloth's inference acceleration and switch to eval mode.
    FastLanguageModel.for_inference(model)
    model.eval()

    # NOTE(review): if --dataset_split is passed, load_dataset returns a single
    # Dataset (not a DatasetDict) and the ["train"] lookup below would fail —
    # this path assumes dataset_split stays None. TODO confirm.
    dataset_dict = load_dataset(
        path=args.dataset_path,
        name=args.dataset_name,
        data_dir="keywords",
        # data_dir="psychology",
        split=args.dataset_split,
        cache_dir=args.dataset_cache_dir,
        # num_proc=args.num_workers if not args.dataset_streaming else None,
        streaming=args.dataset_streaming,
    )
    dataset = dataset_dict["train"]
    print(dataset)

    # Carve out the validation subset; the training-split lines are kept
    # commented out because this script only evaluates.
    if args.dataset_streaming:
        valid_dataset = dataset.take(args.valid_dataset_size)
        # train_dataset = dataset.skip(args.valid_dataset_size)
        # train_dataset = train_dataset.shuffle(buffer_size=args.shuffle_buffer_size, seed=None)
    else:
        dataset = dataset.train_test_split(test_size=args.valid_dataset_size, seed=None)
        # train_dataset = dataset["train"]
        valid_dataset = dataset["test"]

    with open(output_file.as_posix(), "w", encoding="utf-8") as f:
        for example in tqdm(valid_dataset):
            # Assumes each example["conversation"] is a chat-template message
            # list of {"role": ..., "content": ...} dicts — TODO confirm
            # against the dataset schema. The last message is the reference
            # answer; everything before it is the prompt.
            conversation = example["conversation"]
            prompt = conversation[:-1]
            response = conversation[-1]["content"]

            # 3. Render the prompt with the chat template.
            format_messages = tokenizer.apply_chat_template(
                prompt,
                tokenize=False,  # return a string; True would return tensors
                add_generation_prompt=True,  # True for inference; off during training
            )

            # 4. Tokenize the rendered prompt and move it to the model device.
            inputs = tokenizer(format_messages, return_tensors="pt").to(model.device)

            # 5. Generate with sampling controlled by the CLI knobs.
            generated_ids = model.generate(
                **inputs,
                max_new_tokens=args.max_new_tokens, do_sample=True,
                pad_token_id=tokenizer.pad_token_id, eos_token_id=tokenizer.eos_token_id,
                top_p=args.top_p, temperature=args.temperature, repetition_penalty=1.0,
            )

            # Decode only the newly generated tokens (strip the prompt prefix),
            # then drop any "<think>...</think>" reasoning segment Qwen3 emits.
            response_: str = tokenizer.decode(generated_ids[0][len(inputs["input_ids"][0]):], skip_special_tokens=True)
            response_ = response_.split("</think>")[-1].strip()

            # One JSONL row per example: prompt, reference, and generation.
            row = {
                "prompt": prompt,
                "response": response,
                "response_": response_,
            }
            row = json.dumps(row, ensure_ascii=False)
            f.write(f"{row}\n")
    return


# Script entry point: run the evaluation when executed directly.
if __name__ == "__main__":
    main()