import os

import torch
from transformers import LlamaTokenizer, LlamaForCausalLM

model = None
tokenizer = None
generator = None

os.environ["CUDA_VISIBLE_DEVICES"] = "0"


def load_model(model_name, eight_bit=0, device_map="auto"):
    global model, tokenizer, generator

    print("Loading " + model_name + "...")

    if device_map == "zero":
        device_map = "balanced_low_0"
    # Note: device_map and eight_bit are currently unused; the model is placed
    # with .to(device) below, and 8-bit loading would conflict with that.

    device = "cuda" if torch.cuda.is_available() else "cpu"
    gpu_count = torch.cuda.device_count() if torch.cuda.is_available() else 0
    print("gpu_count", gpu_count)

    tokenizer = LlamaTokenizer.from_pretrained(model_name)
    model = LlamaForCausalLM.from_pretrained(
        model_name,
        # fp16 on GPU; fp16 is poorly supported on CPU, so fall back to fp32.
        torch_dtype=torch.float16 if device == "cuda" else torch.float32,
        low_cpu_mem_usage=True,
        load_in_8bit=False,
        cache_dir="cache",
    ).to(device)

    generator = model.generate


load_model(r"C:\Users\JAY\Downloads\Chatdoc\ChatDoctor\pretrained")

First_chat = "ChatDoctor: I am ChatDoctor, what medical questions do you have?"
print(First_chat)

history = []
history.append(First_chat)


def go():
    invitation = "ChatDoctor: "
    human_invitation = "Patient: "

    # Read the patient's message from stdin.
    msg = input(human_invitation)
    print("")
    history.append(human_invitation + msg)

    # Rebuild the full prompt from the instruction plus the entire chat
    # history, ending with the doctor's cue so the model continues as
    # ChatDoctor.
    fulltext = (
        "If you are a doctor, please answer the medical questions based on "
        "the patient's description. \n\n"
        + "\n\n".join(history)
        + "\n\n"
        + invitation
    )

    device = "cuda" if torch.cuda.is_available() else "cpu"
    gen_in = tokenizer(fulltext, return_tensors="pt").input_ids.to(device)

    with torch.no_grad():
        generated_ids = generator(
            gen_in,
            max_new_tokens=200,
            use_cache=True,
            pad_token_id=tokenizer.eos_token_id,
            num_return_sequences=1,
            do_sample=True,
            repetition_penalty=1.1,  # 1.0 means 'off'; too strong a penalty suppresses the repeated speaker tags
            temperature=0.5,  # default: 1.0
            top_k=50,  # default: 50
            top_p=1.0,  # default: 1.0
        )
        # batch_decode returns one string per sequence; we generate exactly one.
        generated_text = tokenizer.batch_decode(
            generated_ids, skip_special_tokens=True
        )[0]

    # Keep only the newly generated continuation, and cut it off if the
    # model starts speaking for the patient.
    response = generated_text[len(fulltext):]
    response = response.split(human_invitation)[0].strip()

    print(invitation + response)
    print("")

    history.append(invitation + response)


while True:
    go()