import os

import torch
import transformers


hf_token = os.getenv("HUGGINGFACE_HUB_TOKEN")
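# HUGGINGFACE_HUB_TOKEN must be set in the environment before launch so gated
# or private model repos can be downloaded.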


# Populated by load_model().
model = None
tokenizer = None
generator = None


def load_model(model_name, eight_bit=0, device_map="auto"):
    global model, tokenizer, generator

    print(f"Loading {model_name}...")

    # "zero" spreads the weights across GPUs while keeping GPU 0 mostly free.
    if device_map == "zero":
        device_map = "balanced_low_0"

    gpu_count = torch.cuda.device_count()
    print("gpu_count", gpu_count)

    tokenizer = transformers.LlamaTokenizer.from_pretrained(model_name, token=hf_token)
    model = transformers.LlamaForCausalLM.from_pretrained(
        model_name,
        torch_dtype=torch.float16,
        low_cpu_mem_usage=True,
        load_in_8bit=bool(eight_bit),  # 8-bit loading requires bitsandbytes
        device_map=device_map,
        cache_dir="cache",
        token=hf_token,
    )

    generator = model.generate


load_model("Muhammadidrees/JayConverstionalModel")
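# If GPU memory is tight, the same model could be loaded in 8-bit instead
# (this assumes the bitsandbytes package is installed):
# load_model("Muhammadidrees/JayConverstionalModel", eight_bit=1)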


# Running transcript of the conversation, one turn per entry.
history = []


def go():
    invitation = "Assistant: "
    human_invitation = "Human: "

    msg = input(human_invitation)
    print("")

    history.append(human_invitation + msg)

    # Rebuild the full prompt from the whole conversation so far.
    fulltext = "\n\n".join(history) + "\n\n" + invitation

    gen_in = tokenizer(fulltext, return_tensors="pt").input_ids.cuda()
    in_tokens = gen_in.shape[1]

    with torch.no_grad():
        generated_ids = generator(
            gen_in,
            max_new_tokens=200,
            use_cache=True,
            pad_token_id=tokenizer.eos_token_id,
            num_return_sequences=1,
            do_sample=True,
            repetition_penalty=1.1,
            temperature=0.5,
            top_k=50,
            top_p=1.0,
        )
    generated_text = tokenizer.batch_decode(generated_ids, skip_special_tokens=True)[0]

    # Drop the echoed prompt, then cut the reply off at the next "Human:" turn.
    text_without_prompt = generated_text[len(fulltext):]
    response = text_without_prompt.split(human_invitation)[0]
    response = response.strip()

    print(invitation + response)
    print("")

    history.append(invitation + response)


while True:
    go()