File size: 3,329 Bytes
373f237 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 |
import os, json, itertools, bisect, gc
from transformers import LlamaTokenizer, LlamaForCausalLM
from transformers import AutoModelForCausalLM, AutoTokenizer, AutoConfig
import transformers
import torch
from accelerate import Accelerator
import accelerate
import time
# Shared inference state. All three start empty and are filled in by
# load_model(), which declares them global.
model = tokenizer = generator = None

# Expose only CUDA device 0 to this process.
os.environ.update(CUDA_VISIBLE_DEVICES="0")
def load_model(model_name, eight_bit=0, device_map="auto"):
    """Load the LLaMA tokenizer and model and store them in module globals.

    Sets the module-level ``tokenizer``, ``model`` and ``generator`` (the
    loaded model's bound ``generate`` method). Weights are requested as
    float16 and moved to CUDA when CUDA is available; otherwise float32 on
    the CPU.

    Args:
        model_name: String passed to ``from_pretrained`` for both the
            tokenizer and the model.
        eight_bit: Accepted but not used; ``load_in_8bit`` is fixed to
            ``False`` in the call below.
        device_map: The alias ``"zero"`` is rewritten to
            ``"balanced_low_0"``, but the value is not forwarded to
            ``from_pretrained``; the model is moved to one device with
            ``.to()`` instead.
    """
    global model, tokenizer, generator

    print("Loading "+model_name+"...")

    # Alias translation only; the resulting value is not passed on anywhere.
    device_map = "balanced_low_0" if device_map == "zero" else device_map

    # Decide the target device once and derive everything else from it.
    has_cuda = torch.cuda.is_available()
    target_device = "cuda" if has_cuda else "cpu"
    n_gpus = torch.cuda.device_count() if has_cuda else 0
    print('gpu_count', n_gpus)

    tokenizer = LlamaTokenizer.from_pretrained(model_name)

    load_kwargs = {
        # Half precision on GPU, full precision on CPU.
        "torch_dtype": torch.float16 if has_cuda else torch.float32,
        "low_cpu_mem_usage": True,
        "load_in_8bit": False,
        "cache_dir": "cache",
    }
    model = LlamaForCausalLM.from_pretrained(model_name, **load_kwargs).to(target_device)

    generator = model.generate
# Load the pretrained weights from the local checkpoint directory before the
# conversation starts.
load_model(r"C:\Users\JAY\Downloads\Chatdoc\ChatDoctor\pretrained")

# Greeting shown to the user; it also becomes the first entry of the running
# conversation history that go() extends on every turn.
First_chat = "ChatDoctor: I am ChatDoctor, what medical questions do you have?"
print(First_chat)
history = [First_chat]
def go():
    """Run one chat turn: read a patient message, generate and print a reply.

    Reads one line from stdin, appends it to the module-level ``history``,
    builds a prompt from the instruction header plus the whole history, and
    samples up to 200 new tokens using the module-level ``tokenizer`` and
    ``generator``. The reply is cut at the first ``"Patient: "`` marker (in
    case the model starts writing the next patient turn), stripped of
    surrounding whitespace, printed, and appended to ``history``.

    Returns:
        None.

    Raises:
        EOFError: Propagated from ``input()`` when stdin is exhausted.
    """
    invitation = "ChatDoctor: "
    human_invitation = "Patient: "

    # Read the patient's message and record it in the conversation.
    msg = input(human_invitation)
    print("")
    history.append(human_invitation + msg)

    fulltext = (
        "If you are a doctor, please answer the medical questions based on the patient's description. \n\n"
        + "\n\n".join(history)
        + "\n\n"
        + invitation
    )

    device = "cuda" if torch.cuda.is_available() else "cpu"
    encoded = tokenizer(fulltext, return_tensors="pt")
    gen_in = encoded.input_ids.to(device)
    # Pass the mask explicitly: pad_token_id is set to eos_token_id below, so
    # it should not be left to generate() to infer it from the input ids.
    attention_mask = encoded.attention_mask.to(device)
    # Number of prompt tokens (dim 1; dim 0 is the batch of size one).
    prompt_len = gen_in.shape[1]

    with torch.no_grad():
        generated_ids = generator(
            gen_in,
            attention_mask=attention_mask,
            max_new_tokens=200,
            use_cache=True,
            pad_token_id=tokenizer.eos_token_id,
            num_return_sequences=1,
            do_sample=True,
            repetition_penalty=1.1,  # 1.0 means 'off'
            temperature=0.5,  # default: 1.0
            top_k=50,  # default: 50
            top_p=1.0,  # default: 1.0
            # early_stopping is intentionally not passed: it only affects
            # beam-based generation and this call samples a single sequence.
        )

    # Decode only the newly generated tokens. Slicing by token count does not
    # depend on the decoded text reproducing the prompt character for
    # character, unlike slicing the decoded string by len(fulltext).
    new_token_ids = generated_ids[0, prompt_len:]
    response = tokenizer.decode(new_token_ids, skip_special_tokens=True)

    # Keep only the doctor's turn and drop surrounding whitespace. str.strip()
    # returns a new string, so the result has to be assigned back.
    response = response.split(human_invitation)[0].strip()

    print(invitation + response)
    print("")
    history.append(invitation + response)
# Keep taking chat turns until the user interrupts the session.
try:
    while True:
        go()
except (KeyboardInterrupt, EOFError):
    # Ctrl-C or end of stdin is the only way out of the loop; finish the
    # current line and exit quietly instead of dumping a traceback.
    print("")
|