import torch
from transformers import AutoConfig, AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig
from peft import PeftConfig, PeftModel
from llama_condense_monkey_patch import replace_llama_with_condense

# QLoRA adapter fine-tuned for customer support, and the LongChat base model it was trained on.
peft_model_id = "mingkuan/longchat-7b-qlora-customer-support"
base_model_id = "lmsys/longchat-7b-16k"

# LongChat extends LLaMA's context window via RoPE position interpolation;
# the monkey patch applies the checkpoint's configured condense ratio
# before the model weights are loaded.
config = AutoConfig.from_pretrained(base_model_id)
replace_llama_with_condense(config.rope_condense_ratio)
tokenizer = AutoTokenizer.from_pretrained(base_model_id, use_fast=False)

# Load the base model in 4-bit NF4 with double quantization (QLoRA-style),
# computing in bfloat16 and letting accelerate place layers automatically.
# load_in_4bit is set once in the BitsAndBytesConfig; passing it again to
# from_pretrained (as in the original) is redundant.
nf4_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_use_double_quant=True,
    bnb_4bit_compute_dtype=torch.bfloat16,
)
model = AutoModelForCausalLM.from_pretrained(
    base_model_id,
    return_dict=True,
    trust_remote_code=True,
    quantization_config=nf4_config,
    torch_dtype=torch.float16,
    device_map="auto",
)

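# --- Optional sanity check (an addition, not in the original snippet):
# get_memory_footprint() reports how much memory the loaded weights occupy,
# which confirms the 4-bit load took effect. A 7B model in NF4 should come
# in at roughly 4 GB, versus ~13 GB for a plain float16 load.
print(f"Model memory footprint: {model.get_memory_footprint() / 1e9:.2f} GB")
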
# Attach the fine-tuned QLoRA adapter on top of the quantized base model
# and switch to inference mode.
model = PeftModel.from_pretrained(model, peft_model_id)
model.eval()

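# --- Optional sanity check (an addition; uses the PeftConfig import above):
# verify that the adapter was trained against the same base checkpoint we
# just loaded, since a mismatched base silently degrades adapter quality.
peft_config = PeftConfig.from_pretrained(peft_model_id)
assert peft_config.base_model_name_or_path == base_model_id, (
    f"Adapter base {peft_config.base_model_name_or_path} != {base_model_id}"
)
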
def generate_prompt(query):
    return f"""Below is an instruction that describes a task. Write a response that appropriately completes the request.

### Instruction:
You are a customer support assistant that can extract user request intent and category, and then provide appropriate answers. If the user input is related to customer support domain, please try to generate a json string that contains extracted category and intent, and the proper response.
If user input is unrelated to customer support domain, please try to answer it in natural language.

Example run:
Input: Would it be possible to cancel the order I made?
Output: "Category": "ORDER", "Intent": "cancel_order", "Answer": "Sure, I definitely can help you with that. Can you provide me your order number for the cancelation?"

### Input:
{query}

"""

def getLLMResponse(prompt):
    device = "cuda"
    input_ids = tokenizer(prompt, return_tensors='pt').input_ids.to(device)
    # temperature only takes effect when sampling is enabled.
    output = model.generate(inputs=input_ids, do_sample=True, temperature=0.5, max_new_tokens=256)
    # Decode only the newly generated tokens. Slicing the decoded string by
    # the prompt's character length (as in the original) is unreliable,
    # because re-decoding does not always reproduce the prompt exactly.
    response = tokenizer.decode(output[0][input_ids.shape[-1]:], skip_special_tokens=True)
    return response

query = 'help me to setup a new shipping address?'
response = getLLMResponse(generate_prompt(query))
print(f'\nUserInput:{query}\n\nLLM:\n{response}\n\n')
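
# --- Optional helper (a sketch, not part of the original code): the prompt
# asks the model to emit '"Category": ..., "Intent": ..., "Answer": ...'
# for in-domain queries, so wrapping that fragment in braces usually yields
# a valid JSON object. Falls back to the raw text for out-of-domain answers.
import json

def parse_support_response(response: str):
    text = response.strip()
    candidate = text if text.startswith('{') else '{' + text + '}'
    try:
        return json.loads(candidate)
    except json.JSONDecodeError:
        return {"Answer": text}

print(parse_support_response(response))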