miyuki2026 committed on
Commit
fcdea4e
·
1 Parent(s): 9994f24
examples/tutorials/lora_unsloth/step_2_train_model.py CHANGED
@@ -41,6 +41,9 @@ def get_args():
41
  ),
42
  parser.add_argument("--dataset_streaming", default=None, type=str),
43
 
 
 
 
44
  parser.add_argument(
45
  "--num_workers",
46
  default=None if platform.system() == "Windows" else os.cpu_count() // 2,
@@ -97,8 +100,16 @@ def main():
97
  # num_proc=args.num_workers if not args.dataset_streaming else None,
98
  streaming=args.dataset_streaming,
99
  )
100
- print(dataset_dict)
101
- train_dataset = dataset_dict["train"]
 
 
 
 
 
 
 
 
102
 
103
  train_dataset = train_dataset.map(
104
  format_func,
 
41
  ),
42
  parser.add_argument("--dataset_streaming", default=None, type=str),
43
 
44
+ parser.add_argument("--valid_dataset_size", default=None, type=str),
45
+ parser.add_argument("--shuffle_buffer_size", default=None, type=str),
46
+
47
  parser.add_argument(
48
  "--num_workers",
49
  default=None if platform.system() == "Windows" else os.cpu_count() // 2,
 
100
  # num_proc=args.num_workers if not args.dataset_streaming else None,
101
  streaming=args.dataset_streaming,
102
  )
103
+ dataset = dataset_dict["train"]
104
+
105
+ if args.dataset_streaming:
106
+ valid_dataset = dataset.take(args.valid_dataset_size)
107
+ train_dataset = dataset.skip(args.valid_dataset_size)
108
+ train_dataset = train_dataset.shuffle(buffer_size=args.shuffle_buffer_size, seed=None)
109
+ else:
110
+ dataset = dataset.train_test_split(test_size=args.valid_dataset_size, seed=None)
111
+ train_dataset = dataset["train"]
112
+ valid_dataset = dataset["test"]
113
 
114
  train_dataset = train_dataset.map(
115
  format_func,
examples/tutorials/lora_unsloth/step_4_evaluation.py ADDED
@@ -0,0 +1,149 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #!/usr/bin/python3
2
+ # -*- coding: utf-8 -*-
3
+ import argparse
4
+ import json
5
+ import os
6
+ from pathlib import Path
7
+ import platform
8
+
9
+ os.environ["UNSLOTH_USE_MODELSCOPE"] = "1"
10
+
11
+ if platform.system() in ("Windows", "Darwin"):
12
+ from project_settings import project_path
13
+ else:
14
+ project_path = os.path.abspath("../../../")
15
+ project_path = Path(project_path)
16
+
17
+ from datasets import load_dataset
18
+ from unsloth import FastLanguageModel
19
+ from transformers import TextStreamer
20
+
21
+
22
def _parse_bool_flag(value):
    """Convert command-line text such as ``true`` or ``0`` into a bool.

    With ``type=str`` argparse keeps the raw text, so ``--dataset_streaming false``
    would yield the truthy string ``"false"`` and silently enable streaming.

    Raises:
        argparse.ArgumentTypeError: if the text is not a recognised boolean.
    """
    normalized = value.strip().lower()
    if normalized in ("1", "true", "t", "yes", "y", "on"):
        return True
    if normalized in ("0", "false", "f", "no", "n", "off"):
        return False
    raise argparse.ArgumentTypeError(f"expected a boolean value, got {value!r}")


def _parse_split_size(value):
    """Parse a validation-set size given as an int count or a float fraction.

    Raises:
        argparse.ArgumentTypeError: if the text is neither an int nor a float.
    """
    try:
        return int(value)
    except ValueError:
        pass
    try:
        return float(value)
    except ValueError:
        raise argparse.ArgumentTypeError(
            f"expected an integer count or a float fraction, got {value!r}"
        ) from None


def get_args():
    """Parse the command-line options of the evaluation script.

    Returns:
        argparse.Namespace with the model, dataset, generation and output
        settings. Numeric and boolean options are converted to real
        ``int``/``float``/``bool`` values; options that are not supplied keep
        their ``None`` defaults.
    """
    parser = argparse.ArgumentParser()

    # Model to evaluate and the LoRA adapter that is loaded on top of it.
    parser.add_argument(
        "--model_name",
        default="unsloth/Qwen3-8B-unsloth-bnb-4bit",
        type=str,
    )
    parser.add_argument(
        "--lora_adapter_path",
        default=(project_path / "trained_models" / "Qwen3-8B-sft-lora-adapter-unsloth").as_posix(),
        type=str,
    )

    # Dataset location and how the validation subset is carved out of it.
    parser.add_argument(
        "--dataset_path",
        default="miyuki2026/tutorials",
        type=str,
    )
    parser.add_argument("--dataset_name", default=None, type=str)
    parser.add_argument("--dataset_split", default=None, type=str)
    parser.add_argument(
        "--dataset_cache_dir",
        default=(project_path / "hub_datasets").as_posix(),
        type=str,
    )
    parser.add_argument("--dataset_streaming", default=None, type=_parse_bool_flag)
    # An int is a number of examples; a float is a fraction of the dataset
    # (fractions are only meaningful for the non-streaming train/test split).
    parser.add_argument("--valid_dataset_size", default=None, type=_parse_split_size)
    parser.add_argument("--shuffle_buffer_size", default=None, type=int)

    # Sampling settings forwarded to model.generate().
    parser.add_argument(
        "--max_new_tokens",
        default=1024,  # 8192, 128
        type=int, help="最大生成长度(注意:并非模型实际长文本能力)"
    )
    parser.add_argument("--top_p", default=0.85, type=float, help="nucleus采样阈值(0-1)")
    parser.add_argument("--temperature", default=0.85, type=float, help="生成温度,控制随机性(0-1,越大越随机)")

    # os.cpu_count() may return None, and halving a single core gives 0;
    # fall back to at least one worker in both cases.
    cpu_total = os.cpu_count() or 1
    default_workers = None if platform.system() == "Windows" else max(1, cpu_total // 2)
    parser.add_argument("--num_workers", default=default_workers, type=int)

    parser.add_argument("--output_file", default="evaluation.jsonl", type=str)

    return parser.parse_args()
68
+
69
+
70
def _strip_reasoning(text):
    """Return *text* without any reasoning section that precedes the answer.

    Everything up to and including the last closing reasoning tag is dropped.
    ``</thinking>`` is the tag this script originally split on; ``</think>``
    is also handled because that is the closing tag Qwen3 models emit.
    """
    for closing_tag in ("</think>", "</thinking>"):
        text = text.rpartition(closing_tag)[2]
    return text.strip()


def main():
    """Generate a response for each validation example and save them as JSONL.

    Loads the base model, attaches the LoRA adapter, selects a validation
    subset of the dataset and, for every example, samples a completion for the
    conversation minus its last turn. Each output line holds the prompt
    messages (``prompt``), the reference answer taken from the last turn
    (``response``) and the generated answer (``response_``).
    """
    args = get_args()

    output_file = Path(args.output_file)
    output_file.parent.mkdir(parents=True, exist_ok=True)

    # 1. Load the base model and its tokenizer.
    model, tokenizer = FastLanguageModel.from_pretrained(
        model_name=args.model_name,
        max_seq_length=2048,
        device_map="auto",
        dtype=None,  # let the library pick the precision
        load_in_4bit=True,  # load 4-bit quantized weights
    )

    # 2. Attach the LoRA adapter.
    model.load_adapter(args.lora_adapter_path)

    # Switch unsloth to its inference mode.
    FastLanguageModel.for_inference(model)
    model.eval()

    # 3. Load the dataset (data_dir="psychology" is the other subset used by this tutorial).
    loaded = load_dataset(
        path=args.dataset_path,
        name=args.dataset_name,
        data_dir="keywords",
        split=args.dataset_split,
        cache_dir=args.dataset_cache_dir,
        streaming=args.dataset_streaming,
    )
    # With an explicit split, load_dataset returns that split directly instead
    # of a mapping of split names, so only index by "train" for a mapping.
    dataset = loaded["train"] if isinstance(loaded, dict) else loaded

    if args.dataset_streaming:
        # A streamed dataset cannot be split at random; use its first examples.
        valid_dataset = dataset.take(args.valid_dataset_size)
    else:
        # NOTE(review): seed=None draws a new random split on every run, so this
        # subset is not guaranteed to match the one held out during training.
        valid_dataset = dataset.train_test_split(
            test_size=args.valid_dataset_size, seed=None
        )["test"]

    with open(output_file.as_posix(), "w", encoding="utf-8") as f:
        for example in valid_dataset:
            conversation = example["conversation"]
            prompt = conversation[:-1]
            response = conversation[-1]["content"]

            # Render the prompt as text and append the assistant header so the
            # model continues with its own answer.
            format_messages = tokenizer.apply_chat_template(
                prompt,
                tokenize=False,
                add_generation_prompt=True,
            )

            # 4. Tokenize the rendered prompt.
            inputs = tokenizer(format_messages, return_tensors="pt").to(model.device)
            prompt_length = len(inputs["input_ids"][0])

            # 5. Sample a completion.
            generated_ids = model.generate(
                **inputs,
                max_new_tokens=args.max_new_tokens, do_sample=True,
                pad_token_id=tokenizer.pad_token_id, eos_token_id=tokenizer.eos_token_id,
                top_p=args.top_p, temperature=args.temperature, repetition_penalty=1.0,
            )

            # Decode only the newly generated tokens, not the echoed prompt.
            completion = tokenizer.decode(generated_ids[0][prompt_length:], skip_special_tokens=True)
            response_ = _strip_reasoning(completion)

            row = {
                "prompt": prompt,
                "response": response,
                "response_": response_,
            }
            f.write(f"{json.dumps(row, ensure_ascii=False)}\n")
            # Generation is slow; flush so finished rows survive an interruption.
            f.flush()
146
+
147
+
148
+ if __name__ == "__main__":
149
+ main()