import os

import torch
import transformers

# 🟢 (Option 2 - recommended for Render/Colab)
# Save your token in an environment variable called HUGGINGFACE_HUB_TOKEN
# and it will be picked up automatically (see the usage note at the end
# of this file):
hf_token = os.getenv("HUGGINGFACE_HUB_TOKEN")

model = None
tokenizer = None
generator = None


def load_model(model_name, eight_bit=False, device_map="auto"):
    """Load the tokenizer and the fp16 model onto the GPU."""
    global model, tokenizer, generator
    print(f"Loading {model_name}...")

    if device_map == "zero":
        device_map = "balanced_low_0"

    gpu_count = torch.cuda.device_count()
    print("gpu_count", gpu_count)

    tokenizer = transformers.LlamaTokenizer.from_pretrained(model_name, token=hf_token)
    model = transformers.LlamaForCausalLM.from_pretrained(
        model_name,
        # device_map=device_map,  # or "auto"
        # max_memory={0: "14GB", 1: "14GB", 2: "14GB", 3: "14GB",
        #             4: "14GB", 5: "14GB", 6: "14GB", 7: "14GB"},
        torch_dtype=torch.float16,
        low_cpu_mem_usage=True,
        load_in_8bit=bool(eight_bit),
        cache_dir="cache",
        token=hf_token,
    ).cuda()

    generator = model.generate


load_model("Muhammadidrees/JayConverstionalModel")

history = []


def go():
    """Run one chat turn: read input, generate a reply, append both to history."""
    invitation = "Assistant: "
    human_invitation = "Human: "

    msg = input(human_invitation)
    print("")

    history.append(human_invitation + msg)

    # The whole conversation so far becomes the prompt for the next turn.
    fulltext = "\n\n".join(history) + "\n\n" + invitation

    gen_in = tokenizer(fulltext, return_tensors="pt").input_ids.cuda()
    in_tokens = gen_in.shape[-1]  # number of prompt tokens

    with torch.no_grad():
        generated_ids = generator(
            gen_in,
            max_new_tokens=200,
            use_cache=True,
            pad_token_id=tokenizer.eos_token_id,
            num_return_sequences=1,
            do_sample=True,
            repetition_penalty=1.1,  # 1.0 means 'off'; too strong a penalty stops it emitting speaker tags like "Sphynx:"
            temperature=0.5,  # default: 1.0
            top_k=50,  # default: 50
            top_p=1.0,  # default: 1.0
        )
        # batch_decode returns one string per sequence; we requested a single one.
        generated_text = tokenizer.batch_decode(generated_ids, skip_special_tokens=True)[0]

    # Strip the echoed prompt, then cut at the first "Human: " the model invents.
    text_without_prompt = generated_text[len(fulltext):]
    response = text_without_prompt.split(human_invitation)[0]
    response = response.strip()

    print(invitation + response)
    print("")

    history.append(invitation + response)


if __name__ == "__main__":
    while True:
        go()
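# ---------------------------------------------------------------------------
# Usage sketch (assumptions: a Unix-like shell, a valid Hugging Face access
# token, and that this file is saved as chat.py -- both the token value and
# the filename below are placeholders, not part of the original script):
#
#   export HUGGINGFACE_HUB_TOKEN=hf_your_token_here
#   python chat.py
#
# Note that every turn is appended to `history` and the whole transcript is
# re-tokenized on each call to go(), so a long session will eventually exceed
# the model's context window; restarting the script clears the history.
# ---------------------------------------------------------------------------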