CaptMetal committed · verified
Commit 26f7ee1 · 1 Parent(s): e9572f2

Update app.py

Files changed (1):
  1. app.py +13 -2
app.py CHANGED
@@ -1,15 +1,26 @@
-from transformers import AutoModelForCausalLM, AutoTokenizer
+from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig
 import gradio as gr
 import torch
 
+
+# Set quantization config (4-bit for max speed)
+quant_config = BitsAndBytesConfig(
+    load_in_4bit=True,                      # 4-bit precision
+    bnb_4bit_quant_type="nf4",              # NF4 for better accuracy
+    bnb_4bit_compute_dtype=torch.float16,   # Use float16 for computation
+    device_map="auto"
+)
 # Load Phi-2 (smaller model with high-quality responses)
 model_name = "microsoft/phi-2"
 tokenizer = AutoTokenizer.from_pretrained(model_name)
+
 model = AutoModelForCausalLM.from_pretrained(model_name)
+# Speed up inference with torch.compile
+model = torch.compile(model)  # Compile the model for faster inference
 
 def respond(message, history):
     inputs = tokenizer(message, return_tensors="pt")
-    outputs = model.generate(inputs.input_ids, max_new_tokens=100, temperature=0.7, top_p=0.9)
+    outputs = model.generate(inputs.input_ids, max_new_tokens=50, temperature=0.7, top_p=0.9)
     response = tokenizer.decode(outputs[0], skip_special_tokens=True)
     return response
 
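Two details in the committed version are worth flagging: `quant_config` is built but never passed to `from_pretrained`, so the model still loads in full precision, and `device_map` is an argument of `from_pretrained`, not a `BitsAndBytesConfig` field. Also, `transformers` ignores `temperature` and `top_p` (with a warning) unless `do_sample=True` is set. Below is a minimal corrected sketch, assuming `bitsandbytes` and a CUDA GPU are available; the final `gr.ChatInterface` line is an assumption, since the Gradio launch wiring is not shown in this diff.

```python
import torch
import gradio as gr
from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig

# 4-bit NF4 quantization with float16 compute, as in the commit
quant_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_compute_dtype=torch.float16,
)

model_name = "microsoft/phi-2"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForCausalLM.from_pretrained(
    model_name,
    quantization_config=quant_config,  # must be passed here for 4-bit loading to take effect
    device_map="auto",                 # device_map belongs to from_pretrained
)
# model = torch.compile(model)  # optional, as in the commit; may not help with 4-bit bitsandbytes

def respond(message, history):
    # Move inputs to the model's device, since device_map="auto" may place it on GPU
    inputs = tokenizer(message, return_tensors="pt").to(model.device)
    outputs = model.generate(
        **inputs,
        max_new_tokens=50,
        do_sample=True,    # required for temperature/top_p to have any effect
        temperature=0.7,
        top_p=0.9,
    )
    return tokenizer.decode(outputs[0], skip_special_tokens=True)

# Assumed launch wiring (not shown in the diff): a chat UI around respond()
gr.ChatInterface(respond).launch()
```

Lowering `max_new_tokens` from 100 to 50, as the commit does, bounds per-reply latency; raising it again trades speed for longer responses.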