import os

import torch
import transformers
from huggingface_hub import login
# 🟢 (Option 2 - recommended for Render/Colab)
# Save your token as an environment variable called HUGGINGFACE_HUB_TOKEN
# Then this will automatically pick it up:
hf_token = os.getenv("HUGGINGFACE_HUB_TOKEN")
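# A minimal sketch (assumption: gated/private repos should resolve): log in up
# front with the standard huggingface_hub login() call. Skipped when the
# variable is unset, since public models need no authentication.
if hf_token:
    login(token=hf_token)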
model = None
tokenizer = None
generator = None
def load_model(model_name, eight_bit=0, device_map="auto"):
    global model, tokenizer, generator
    print("Loading " + model_name + "...")
    if device_map == "zero":
        device_map = "balanced_low_0"

    gpu_count = torch.cuda.device_count()
    print("gpu_count", gpu_count)

    tokenizer = transformers.LlamaTokenizer.from_pretrained(model_name, token=hf_token)
    # Simple single-GPU path: full fp16 weights moved to the default CUDA device.
    # The eight_bit and device_map arguments are accepted but not applied here;
    # see the sketch after this function for a sharded/quantized alternative.
    model = transformers.LlamaForCausalLM.from_pretrained(
        model_name,
        torch_dtype=torch.float16,
        low_cpu_mem_usage=True,
        load_in_8bit=False,
        cache_dir="cache",
        token=hf_token,
    ).cuda()
    generator = model.generate
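# Hedged sketch (an assumption, not exercised by this script): on multi-GPU or
# memory-constrained machines one could honor the eight_bit/device_map arguments
# by letting accelerate shard the model instead of calling .cuda().
# load_model_sharded is a hypothetical alternative, not part of the original flow.
def load_model_sharded(model_name, eight_bit=0, device_map="auto"):
    global model, tokenizer, generator
    tokenizer = transformers.LlamaTokenizer.from_pretrained(model_name, token=hf_token)
    model = transformers.LlamaForCausalLM.from_pretrained(
        model_name,
        torch_dtype=torch.float16,
        device_map=device_map,         # accelerate places layers across devices
        load_in_8bit=bool(eight_bit),  # requires the bitsandbytes package
        low_cpu_mem_usage=True,
        cache_dir="cache",
        token=hf_token,
    )  # no .cuda(): dispatched models are already on their devices
    generator = model.generate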
load_model("Muhammadidrees/JayConverstionalModel")
history = []
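# Hedged sketch: history grows without bound, so a long chat will eventually
# overflow the model's context window (an assumption: a LLaMA-style limit of a
# few thousand tokens). One simple mitigation is dropping the oldest turns
# before building the prompt; trim_history and its max_turns default are
# hypothetical, not part of the original script.
def trim_history(max_turns=20):
    # Keep only the most recent turns (a heuristic cutoff).
    del history[:-max_turns]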
def go():
    invitation = "Assistant: "
    human_invitation = "Human: "

    # Read one user turn.
    msg = input(human_invitation)
    print("")
    history.append(human_invitation + msg)

    fulltext = "\n\n".join(history) + "\n\n" + invitation
    # print('SENDING==========')
    # print(fulltext)
    # print('==========')

    gen_in = tokenizer(fulltext, return_tensors="pt").input_ids.cuda()
    in_tokens = gen_in.shape[1]  # prompt length in tokens (shape is [batch, seq_len])
    with torch.no_grad():
        generated_ids = generator(
            gen_in,
            max_new_tokens=200,
            use_cache=True,
            pad_token_id=tokenizer.eos_token_id,
            num_return_sequences=1,
            do_sample=True,
            repetition_penalty=1.1,  # 1.0 means 'off'; penalize too hard and it stops emitting "Assistant: "
            temperature=0.5,  # default: 1.0
            top_k=50,  # default: 50
            top_p=1.0,  # default: 1.0
        )
    # batch_decode returns one string per sequence; we generate a single sequence.
    generated_text = tokenizer.batch_decode(generated_ids, skip_special_tokens=True)[0]

    text_without_prompt = generated_text[len(fulltext):]
    # Keep only the assistant turn: cut at the next "Human: " if the model kept going.
    response = text_without_prompt.split(human_invitation)[0].strip()

    print(invitation + response)
    print("")
    history.append(invitation + response)
while True:
go()