llama-3.2-3B-Instruct

Sleeping

dntrplytch commited on Oct 4, 2024

Commit

6fb91aa

verified ·

1 Parent(s): 34d32ce

reverted data type, modified TextIteratorStreamer timeout to 120

Files changed (1) hide show

app.py CHANGED Viewed

@@ -26,7 +26,7 @@ tokenizer = AutoTokenizer.from_pretrained(model_id)
 model = AutoModelForCausalLM.from_pretrained(
     model_id,
     device_map="auto",
-    torch_dtype=torch.float16,
 )
 model.eval()
@@ -57,7 +57,7 @@ def generate(
         gr.Warning(f"Trimmed input from conversation as it was longer than {MAX_INPUT_TOKEN_LENGTH} tokens.")
     input_ids = input_ids.to(model.device)
-    streamer = TextIteratorStreamer(tokenizer, timeout=20.0, skip_prompt=True, skip_special_tokens=True)
     generate_kwargs = dict(
         {"input_ids": input_ids},
         streamer=streamer,

 model = AutoModelForCausalLM.from_pretrained(
     model_id,
     device_map="auto",
+    torch_dtype=torch.bfloat16,
 )
 model.eval()
         gr.Warning(f"Trimmed input from conversation as it was longer than {MAX_INPUT_TOKEN_LENGTH} tokens.")
     input_ids = input_ids.to(model.device)
+    streamer = TextIteratorStreamer(tokenizer, timeout=120.0, skip_prompt=True, skip_special_tokens=True)
     generate_kwargs = dict(
         {"input_ids": input_ids},
         streamer=streamer,