Spaces:

Muhammadidrees
/

MoizChatDoctor

Paused

App Files Files Community

Muhammadidrees committed on Oct 15

Commit

7a855fe

verified ·

1 Parent(s): 2856d25

Upload chat.py

Browse files

Files changed (1) hide show

chat.py +95 -0

chat.py ADDED Viewed

	@@ -0,0 +1,95 @@

+import os, json, itertools, bisect, gc
+from transformers import AutoModelForCausalLM, AutoTokenizer, AutoConfig
+import transformers
+import torch
+from accelerate import Accelerator
+import accelerate
+import time
+model = None  # causal LM instance; assigned by load_model()
+tokenizer = None  # tokenizer matching the model; assigned by load_model()
+generator = None  # bound model.generate; assigned by load_model()
+def load_model(model_name, eight_bit=0, device_map="auto"):
+    global model, tokenizer, generator
+    print("Loading "+model_name+"...")
+    if device_map == "zero":
+        device_map = "balanced_low_0"
+    # config
+    gpu_count = torch.cuda.device_count()
+    print('gpu_count', gpu_count)
+    tokenizer = transformers.LlamaTokenizer.from_pretrained(model_name)
+    model = transformers.LLaMAForCausalLM.from_pretrained(
+        model_name,
+        #device_map=device_map,
+        #device_map="auto",
+        torch_dtype=torch.float16,
+        #max_memory = {0: "14GB", 1: "14GB", 2: "14GB", 3: "14GB",4: "14GB",5: "14GB",6: "14GB",7: "14GB"},
+        #load_in_8bit=eight_bit,
+        low_cpu_mem_usage=True,
+        load_in_8bit=False,
+        cache_dir="cache"
+    ).cuda()
+    generator = model.generate
+load_model("./pretrained")  # runs at import time; loads weights from ./pretrained
+history = []  # conversation turns ("Human: ..." / "Assistant: ...") appended by go()
+def go():
+    invitation = "Assistant: "
+    human_invitation = "Human: "
+    # input
+    msg = input(human_invitation)
+    print("")
+    history.append(human_invitation + msg)
+    fulltext = "\n\n".join(history) + "\n\n" + invitation
+#    print('SENDING==========')
+#    print(fulltext)
+#    print('==========')
+    generated_text = ""
+    gen_in = tokenizer(fulltext, return_tensors="pt").input_ids.cuda()
+    in_tokens = len(gen_in)
+    with torch.no_grad():
+            generated_ids = generator(
+                gen_in,
+                max_new_tokens=200,
+                use_cache=True,
+                pad_token_id=tokenizer.eos_token_id,
+                num_return_sequences=1,
+                do_sample=True,
+                repetition_penalty=1.1, # 1.0 means 'off'. unfortunately if we penalize it it will not output Sphynx:
+                temperature=0.5, # default: 1.0
+                top_k = 50, # default: 50
+                top_p = 1.0, # default: 1.0
+                early_stopping=True,
+            )
+            generated_text = tokenizer.batch_decode(generated_ids, skip_special_tokens=True)[0] # for some reason, batch_decode returns an array of one element?
+            text_without_prompt = generated_text[len(fulltext):]
+    response = text_without_prompt
+    response = response.split(human_invitation)[0]
+    response.strip()
+    print(invitation + response)
+    print("")
+    history.append(invitation + response)
+while True:
+    go()