Ayushnangia committed on
Commit
ddfbae0
·
verified ·
1 Parent(s): 7490532

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +2 -2
app.py CHANGED
@@ -29,7 +29,7 @@ if not torch.cuda.is_available():
29
  if torch.cuda.is_available():
30
  model_id = "mistral-community/Mixtral-8x22B-v0.1-4bit"
31
  model = AutoModelForCausalLM.from_pretrained(model_id,quantization_config = BitsAndBytesConfig(llm_int8_enable_fp32_cpu_offload=True),
32
- device_map="auto",
33
  # torch_dtype=torch.float16,
34
  # load_in_8bit=True,
35
  trust_remote_code=True)
@@ -55,7 +55,7 @@ def generate(
55
  conversation.extend([{"role": "user", "content": user}, {"role": "assistant", "content": assistant}])
56
  conversation.append({"role": "user", "content": message})
57
 
58
- input_ids = tokenizer.apply_chat_template(conversation, return_tensors="pt").to("cuda")
59
  if input_ids.shape[1] > MAX_INPUT_TOKEN_LENGTH:
60
  input_ids = input_ids[:, -MAX_INPUT_TOKEN_LENGTH:]
61
  gr.Warning(f"Trimmed input from conversation as it was longer than {MAX_INPUT_TOKEN_LENGTH} tokens.")
 
29
  if torch.cuda.is_available():
30
  model_id = "mistral-community/Mixtral-8x22B-v0.1-4bit"
31
  model = AutoModelForCausalLM.from_pretrained(model_id,quantization_config = BitsAndBytesConfig(llm_int8_enable_fp32_cpu_offload=True),
32
+ device_map="cuda",
33
  # torch_dtype=torch.float16,
34
  # load_in_8bit=True,
35
  trust_remote_code=True)
 
55
  conversation.extend([{"role": "user", "content": user}, {"role": "assistant", "content": assistant}])
56
  conversation.append({"role": "user", "content": message})
57
 
58
+ input_ids = tokenizer.apply_chat_template(conversation, return_tensors="pt")
59
  if input_ids.shape[1] > MAX_INPUT_TOKEN_LENGTH:
60
  input_ids = input_ids[:, -MAX_INPUT_TOKEN_LENGTH:]
61
  gr.Warning(f"Trimmed input from conversation as it was longer than {MAX_INPUT_TOKEN_LENGTH} tokens.")