Update app.py
Browse files
app.py
CHANGED
|
@@ -17,8 +17,10 @@ HF_TOKEN = os.environ.get("HF_TOKEN", None)
|
|
| 17 |
zero = torch.Tensor([0]).cuda()
|
| 18 |
print(zero.device) # <-- 'cpu' 🤔
|
| 19 |
|
| 20 |
-
model_id = "meta-llama/Meta-Llama-3-8B-Instruct"
|
| 21 |
-
peft_model_id = "Imran1/Llama3.1_8b_Qlora_bnk"
|
|
|
|
|
|
|
| 22 |
#attn_implementation="flash_attention_2",
|
| 23 |
model = AutoModelForCausalLM.from_pretrained(model_id, attn_implementation="sdpa", torch_dtype= torch.bfloat16)
|
| 24 |
model.load_adapter(peft_model_id)
|
|
@@ -32,10 +34,10 @@ if tokenizer.pad_token_id is None:
|
|
| 32 |
tokenizer.pad_token_id = tokenizer.eos_token_id
|
| 33 |
|
| 34 |
# Define terminators
|
| 35 |
-
terminators = [
|
| 36 |
-
|
| 37 |
-
|
| 38 |
-
]
|
| 39 |
|
| 40 |
generation_params = {
|
| 41 |
'max_new_tokens': 2000,
|
|
@@ -44,8 +46,8 @@ generation_params = {
|
|
| 44 |
'temperature': 0.7,
|
| 45 |
'top_p': 0.9,
|
| 46 |
# 'top_k': 50,
|
| 47 |
-
'pad_token_id': tokenizer.pad_token_id,
|
| 48 |
-
'eos_token_id': terminators,
|
| 49 |
}
|
| 50 |
|
| 51 |
|
|
@@ -89,9 +91,10 @@ def inference(query):
|
|
| 89 |
tokenized_chat = tokenizer.apply_chat_template(messages, tokenize=True, add_generation_prompt=True, return_tensors="pt").to("cuda")
|
| 90 |
outputs = model.generate(tokenized_chat, **generation_params)
|
| 91 |
decoded_outputs = tokenizer.batch_decode(outputs, skip_special_tokens=False)
|
| 92 |
-
assistant_response = decoded_outputs[0].split("<|start_header_id|>assistant<|end_header_id|>")[-1].strip()
|
| 93 |
-
|
| 94 |
-
|
|
|
|
| 95 |
# outputs = model.generate(tokenized_chat, **generation_params, streamer=streamer)
|
| 96 |
# return outputs
|
| 97 |
|
|
|
|
| 17 |
zero = torch.Tensor([0]).cuda()
|
| 18 |
print(zero.device) # <-- 'cpu' 🤔
|
| 19 |
|
| 20 |
+
# model_id = "meta-llama/Meta-Llama-3-8B-Instruct"
|
| 21 |
+
# peft_model_id = "Imran1/Llama3.1_8b_Qlora_bnk"
|
| 22 |
+
model_id = "Qwen/Qwen2.5-14B-Instruct"
|
| 23 |
+
peft_model_id = "Imran1/Qwen2.5-14b-bnk-lora-11"
|
| 24 |
#attn_implementation="flash_attention_2",
|
| 25 |
model = AutoModelForCausalLM.from_pretrained(model_id, attn_implementation="sdpa", torch_dtype= torch.bfloat16)
|
| 26 |
model.load_adapter(peft_model_id)
|
|
|
|
| 34 |
tokenizer.pad_token_id = tokenizer.eos_token_id
|
| 35 |
|
| 36 |
# Define terminators
|
| 37 |
+
# terminators = [
|
| 38 |
+
# tokenizer.eos_token_id,
|
| 39 |
+
# tokenizer.convert_tokens_to_ids("<|eot_id|>")
|
| 40 |
+
# ]
|
| 41 |
|
| 42 |
generation_params = {
|
| 43 |
'max_new_tokens': 2000,
|
|
|
|
| 46 |
'temperature': 0.7,
|
| 47 |
'top_p': 0.9,
|
| 48 |
# 'top_k': 50,
|
| 49 |
+
# 'pad_token_id': tokenizer.pad_token_id,
|
| 50 |
+
# 'eos_token_id': terminators,
|
| 51 |
}
|
| 52 |
|
| 53 |
|
|
|
|
| 91 |
tokenized_chat = tokenizer.apply_chat_template(messages, tokenize=True, add_generation_prompt=True, return_tensors="pt").to("cuda")
|
| 92 |
outputs = model.generate(tokenized_chat, **generation_params)
|
| 93 |
decoded_outputs = tokenizer.batch_decode(outputs, skip_special_tokens=False)
|
| 94 |
+
# assistant_response = decoded_outputs[0].split("<|start_header_id|>assistant<|end_header_id|>")[-1].strip()
|
| 95 |
+
response = outputs[0][tokenized_chat.shape[-1]:]
|
| 96 |
+
response = tokenizer.decode(response, skip_special_tokens=True)
|
| 97 |
+
return response
|
| 98 |
# outputs = model.generate(tokenized_chat, **generation_params, streamer=streamer)
|
| 99 |
# return outputs
|
| 100 |
|