# MoizChatDoctor / app.py (Hugging Face Space by Muhammadidrees, commit d729467)
import os

import torch
import transformers
from huggingface_hub import login
# 🟢 (Option 2 - recommended for Render/Colab)
# Save your token as an environment variable called HUGGINGFACE_HUB_TOKEN
# Then this will automatically pick it up:
hf_token = os.getenv("HUGGINGFACE_HUB_TOKEN")
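
# Hedged addition: `login` is imported above but was never called in the
# original script. If a token is present, authenticate with the Hub so gated
# or private repositories can be downloaded.
if hf_token:
    login(token=hf_token)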
model = None
tokenizer = None
generator = None

def load_model(model_name, eight_bit=0, device_map="auto"):
    global model, tokenizer, generator
    print("Loading " + model_name + "...")

    if device_map == "zero":
        device_map = "balanced_low_0"

    gpu_count = torch.cuda.device_count()
    print("gpu_count", gpu_count)

    tokenizer = transformers.LlamaTokenizer.from_pretrained(model_name, token=hf_token)
    model = transformers.LlamaForCausalLM.from_pretrained(
        model_name,
        # device_map=device_map,
        torch_dtype=torch.float16,
        low_cpu_mem_usage=True,
        # NOTE: the `eight_bit` argument is currently unused; 8-bit loading
        # stays disabled here.
        # load_in_8bit=eight_bit,
        load_in_8bit=False,
        cache_dir="cache",
        token=hf_token,
    ).cuda()

    generator = model.generate
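
# A minimal, hedged sketch (commented out, not used anywhere): the `eight_bit`
# flag above could be honored via transformers' BitsAndBytesConfig, assuming
# the `bitsandbytes` package is installed on a CUDA machine.
#
# def load_model_8bit(model_name):
#     quant = transformers.BitsAndBytesConfig(load_in_8bit=True)
#     return transformers.LlamaForCausalLM.from_pretrained(
#         model_name,
#         quantization_config=quant,  # preferred over the load_in_8bit kwarg
#         device_map="auto",          # let accelerate place the quantized weights
#         token=hf_token,
#     )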
load_model("Muhammadidrees/JayConverstionalModel")
history = []

def go():
    invitation = "Assistant: "
    human_invitation = "Human: "

    # Read the user's turn and append it to the transcript.
    msg = input(human_invitation)
    print("")
    history.append(human_invitation + msg)

    # The prompt is the whole conversation so far, ending with the assistant
    # prefix so the model continues in the assistant's voice.
    fulltext = "\n\n".join(history) + "\n\n" + invitation
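    # For illustration (hypothetical conversation), after one exchange the
    # prompt looks like:
    #
    #   Human: Hello
    #
    #   Assistant: Hi! How can I help?
    #
    #   Human: <new message>
    #
    #   Assistant: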
    # print("SENDING==========\n" + fulltext + "\n==========")  # uncomment to inspect the outgoing prompt
    gen_in = tokenizer(fulltext, return_tensors="pt").input_ids.cuda()
    in_tokens = gen_in.shape[1]  # prompt length in tokens; len(gen_in) would give the batch size
    with torch.no_grad():
        generated_ids = generator(
            gen_in,
            max_new_tokens=200,
            use_cache=True,
            pad_token_id=tokenizer.eos_token_id,
            num_return_sequences=1,
            do_sample=True,
            repetition_penalty=1.1,  # 1.0 means "off"; a stronger penalty tends to suppress the repeated speaker prefix
            temperature=0.5,  # default: 1.0
            top_k=50,  # default: 50
            top_p=1.0,  # default: 1.0
            early_stopping=True,
        )

    # batch_decode returns one string per sequence; we requested exactly one.
    generated_text = tokenizer.batch_decode(generated_ids, skip_special_tokens=True)[0]
    # Drop the prompt, then cut the reply off at the next "Human: " turn.
    response = generated_text[len(fulltext):]
    response = response.split(human_invitation)[0]
    response = response.strip()  # strip() returns a new string; the original discarded the result

    print(invitation + response)
    print("")
    history.append(invitation + response)

if __name__ == "__main__":
    # Simple REPL: each iteration reads one "Human:" line and prints the reply.
    while True:
        go()