## Inference
We have already fine-tuned a checkpoint by following the `self-cognition-sft.ipynb` tutorial; here we use `PtEngine` to run inference with it.

In [6]:
# import some libraries
import os
os.environ['CUDA_VISIBLE_DEVICES'] = '0'

from swift.llm import InferEngine, InferRequest, PtEngine, RequestConfig, get_template

In [7]:
# Hyperparameters for inference.
# `checkpoint-xxx` looks like a placeholder: point this at the checkpoint directory
# produced by training. It is passed to `PtEngine` as an adapter further down.
last_model_checkpoint = 'output/checkpoint-xxx'

# Model settings.
model_id_or_path = 'Qwen/Qwen2.5-3B-Instruct' # model_id or model_path
# System prompt handed to `get_template` as `default_system`.
system = 'You are a helpful assistant.'
# NOTE(review): not referenced by any of the cells shown here; confirm whether it is still needed.
infer_backend = 'pt'

# Generation settings, forwarded to `RequestConfig` by the inference helpers.
# Passed as `max_tokens`.
max_new_tokens = 512
# NOTE(review): presumably 0 selects deterministic (greedy) decoding; confirm against the engine docs.
temperature = 0
# Chooses which helper runs the queries: True -> `infer_stream`, False -> `infer`.
stream = True

In [None]:
# Build the engine from the base model and attach the fine-tuned checkpoint as an adapter
# (this is where the LoRA weights are loaded).
engine = PtEngine(model_id_or_path, adapters=[last_model_checkpoint])
# Create the template from the template entry in the model's metadata and the engine's tokenizer,
# with `system` as the default system prompt.
template = get_template(engine.model_meta.template, engine.tokenizer, default_system=system)
# Install it as the engine's default template here; alternatively, a template can be
# passed in when calling `engine.infer`.
engine.default_template = template

In [11]:
# Sample prompts to run through the model; each one is sent as a single user message.
# The first and last ask about the model's identity, the middle one is a general question.
query_list = [
 'who are you?',
 "What should I do if I can't sleep at night?",
 '你是谁训练的?',
]

def infer_stream(engine: InferEngine, infer_request: InferRequest):
    """Run one request in streaming mode and echo the reply as it is generated.

    Prints the content of the request's first message as the query, then writes
    each streamed delta to stdout without a newline, and finishes with a single
    line break. Generation limits come from the module-level `max_new_tokens`
    and `temperature` settings.

    Args:
        engine: Engine whose `infer` method is called with a one-element batch.
        infer_request: The request to run; `messages[0]['content']` is shown as the query.
    """
    streaming_config = RequestConfig(max_tokens=max_new_tokens, temperature=temperature, stream=True)
    streams = engine.infer([infer_request], streaming_config)
    query = infer_request.messages[0]['content']
    print(f'query: {query}\nresponse: ', end='')
    # Only one request was submitted, so only the first stream is consumed.
    for chunk in streams[0]:
        # The stream may yield None entries; those carry nothing to print.
        if chunk is not None:
            print(chunk.choices[0].delta.content, end='', flush=True)
    print()

def infer(engine: InferEngine, infer_request: InferRequest):
    """Run one request without streaming and print the query and the full reply.

    Generation limits come from the module-level `max_new_tokens` and
    `temperature` settings.

    Args:
        engine: Engine whose `infer` method is called with a one-element batch.
        infer_request: The request to run; `messages[0]['content']` is shown as the query.
    """
    config = RequestConfig(max_tokens=max_new_tokens, temperature=temperature)
    responses = engine.infer([infer_request], config)
    query = infer_request.messages[0]['content']
    # One request in, so the answer is the first choice of the first response.
    first_choice = responses[0].choices[0]
    response = first_choice.message.content
    print(f'query: {query}')
    print(f'response: {response}')

# Pick the helper that matches the `stream` setting.
if stream:
    infer_func = infer_stream
else:
    infer_func = infer

# Send each sample query as a single user message, with a separator line after every reply.
for query in query_list:
    infer_func(
        engine,
        InferRequest(messages=[{'role': 'user', 'content': query}]),
    )
    print('-' * 50)

query: who are you?
response: I am an artificial intelligence language model named Xiao Huang, developed by ModelScope. I can answer various questions and engage in conversation with humans. If you have any questions or need help, feel free to ask me at any time.
--------------------------------------------------
query: What should I do if I can't sleep at night?
response: If you're having trouble sleeping, there are several things you can try:

1. Establish a regular sleep schedule: Try to go to bed and wake up at the same time every day, even on weekends.

2. Create a relaxing bedtime routine: Engage in calming activities before bed, such as reading a book or taking a warm bath.

3. Make your bedroom conducive to sleep: Keep your bedroom cool, dark, and quiet. Invest in comfortable bedding and pillows.

4. Avoid stimulating activities before bed: Avoid using electronic devices, watching TV, or engaging in mentally stimulating activities before bed.

5. Exercise regularly: Regular phy