BubbleQ committed
Commit f0e52a3 · verified · 1 parent: 78ab107

Update README.md

Files changed (1)
  1. README.md +27 -28
README.md CHANGED
@@ -132,6 +132,7 @@ Note:
 | | AlignBench v1.1 | 6.8 | 5.99 | 6.95 | 6.3 | 6.33 |
 | | LiveBench 1125 | 48.7 | 25.5 | 52.1 | 43.1 | 40 |
 
+
 ## 3. Quick start
 
 ### Inference with huggingface
@@ -144,10 +145,10 @@ You can now inference in Transformers starting from version `4.56.0`.
 import torch
 from transformers import AutoTokenizer, AutoModelForCausalLM
 
-model_name = "/path/to/Klear-Base"
-tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True)
+model_path = "/path/to/Klear-Base"
+tokenizer = AutoTokenizer.from_pretrained(model_path, trust_remote_code=True)
 
-model = AutoModelForCausalLM.from_pretrained(model_name, device_map="auto", dtype=torch.bfloat16, trust_remote_code=True)
+model = AutoModelForCausalLM.from_pretrained(model_path, device_map="auto", dtype=torch.bfloat16, trust_remote_code=True)
 
 text = "世界上最大的湖是"
 inputs = tokenizer(text, return_tensors="pt")
@@ -162,10 +163,10 @@ print(result)
 import torch
 from transformers import AutoTokenizer, AutoModelForCausalLM, GenerationConfig
 
-model_name = "/path/to/Klear-Inst."
-tokenizer = AutoTokenizer.from_pretrained(model_name)
+model_path = "/path/to/Klear-Inst."
+tokenizer = AutoTokenizer.from_pretrained(model_path)
 
-model = AutoModelForCausalLM.from_pretrained(model_name, device_map="auto", torch_dtype=torch.bfloat16)
+model = AutoModelForCausalLM.from_pretrained(model_path, device_map="auto", torch_dtype=torch.bfloat16)
 
 messages = [
     {"role": "user", "content": "帮我用 python 写一个计算器的代码吧。"}
@@ -193,36 +194,34 @@ An OpenAI-compatible API will be available at `http://localhost:8000/v1`.
 Or you can refer to the following Python script for offline inference
 ```python
 from vllm import LLM, SamplingParams
+from transformers import AutoTokenizer
+
+model_path = "/path/to/Klear-Inst."
+tokenizer = AutoTokenizer.from_pretrained(model_path, trust_remote_code=True)
 
-model_path = "/path/to/Klear"
 llm = LLM(
     model=model_path,
     trust_remote_code=True,
-    num_speculative_tokens=1,
-    disable_log_stats=False
+    tensor_parallel_size=torch.cuda.device_count(),
+    gpu_memory_utilization=0.7
 )
-sampling_params = SamplingParams(temperature=0.2)
-
-conversation = [
-    {
-        "role": "system",
-        "content": ""
-    },
-    {
-        "role": "user",
-        "content": "Please help me write a snake game code.",
-    },
+messages = [
+    {"role": "user", "content": "请介绍一下快手"}
 ]
 
-outputs = llm.chat(conversation,
-                   sampling_params=sampling_params,
-                   use_tqdm=False)
+prompt = tokenizer.apply_chat_template(
+    messages,
+    tokenize=False,
+    add_generation_prompt=True
+)
+
+sampling_params = SamplingParams(
+    temperature=0.6, top_p=0.8, max_tokens=512
+)
+
+outputs = llm.generate([prompt], sampling_params)
 
-for idx, output in enumerate(outputs):
-    prompt = output.prompt
-    generated_text = output.outputs[0].text
-    print(f"==== Response #{idx} ====")
-    print(f"Prompt: {prompt}, Generated text: {generated_text}")
+print(outputs[0].outputs[0].text)
 
 ```
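The Transformers hunks above stop at model loading (and, for the instruct model, at the `messages` list); the generation call itself sits outside the changed region, with only `print(result)` visible in a hunk header. A minimal sketch of that remaining step for the instruct model, assuming the standard chat-template flow; `max_new_tokens=512` is an illustrative choice, not a value taken from the README:

```python
# Continues from the instruct-model snippet: `model`, `tokenizer`, and `messages`
# are assumed to be defined as in the diff above.
input_ids = tokenizer.apply_chat_template(
    messages, add_generation_prompt=True, return_tensors="pt"
).to(model.device)

output_ids = model.generate(input_ids, max_new_tokens=512)

# Decode only the newly generated tokens, matching the README's `print(result)`.
result = tokenizer.decode(output_ids[0][input_ids.shape[-1]:], skip_special_tokens=True)
print(result)
```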
 
 
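The updated offline vLLM snippet passes `tensor_parallel_size=torch.cuda.device_count()` but, as committed, never imports `torch`. A self-contained variant of the same script with that import added (the model path is the README's placeholder):

```python
import torch
from vllm import LLM, SamplingParams
from transformers import AutoTokenizer

model_path = "/path/to/Klear-Inst."  # placeholder path from the README
tokenizer = AutoTokenizer.from_pretrained(model_path, trust_remote_code=True)

llm = LLM(
    model=model_path,
    trust_remote_code=True,
    tensor_parallel_size=torch.cuda.device_count(),  # one tensor-parallel rank per visible GPU
    gpu_memory_utilization=0.7,
)

messages = [
    {"role": "user", "content": "请介绍一下快手"}  # "Please introduce Kuaishou"
]

# Render the chat template to a plain prompt string, then generate offline.
prompt = tokenizer.apply_chat_template(
    messages, tokenize=False, add_generation_prompt=True
)
sampling_params = SamplingParams(temperature=0.6, top_p=0.8, max_tokens=512)
outputs = llm.generate([prompt], sampling_params)

print(outputs[0].outputs[0].text)
```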
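The last hunk's header notes that an OpenAI-compatible API will be available at `http://localhost:8000/v1`; the serve command itself is outside the changed region. One way to query that endpoint, sketched with the `openai` client; the model name must match whatever path or ID the server was launched with (the README's placeholder is reused here), and the API key is a dummy value since vLLM does not require one by default:

```python
from openai import OpenAI

# Assumed endpoint from the README; adjust the model name to match the served model.
client = OpenAI(base_url="http://localhost:8000/v1", api_key="EMPTY")

response = client.chat.completions.create(
    model="/path/to/Klear-Inst.",
    messages=[{"role": "user", "content": "请介绍一下快手"}],  # "Please introduce Kuaishou"
    temperature=0.6,
    top_p=0.8,
    max_tokens=512,
)
print(response.choices[0].message.content)
```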