kdevoe committed · Commit 8912a27 · 1 Parent(s): 6a7d6fa

Adding quantization during app.py loading

Files changed (2):
  1. app.py +24 -11
  2. requirements.txt +0 -4
app.py CHANGED
@@ -1,7 +1,7 @@
 import gradio as gr
 import time
 from transformers import AutoModelForCausalLM, AutoTokenizer
-import torch
+from quanto import quantize, freeze, qint8
 
 model_dir = "tinyllama_model"
 
@@ -10,28 +10,41 @@ model = AutoModelForCausalLM.from_pretrained(model_dir)
 tokenizer = AutoTokenizer.from_pretrained(model_dir)
 
 ################### Modify this to add quantization of the model ##############################
-
+quantized_model = AutoModelForCausalLM.from_pretrained(model_dir)
+quantize(quantized_model, weights=qint8, activations=None)
+freeze(quantized_model)
 
 # Define the inference function
 def generate_text(prompt):
-    start_time = time.time()
+    # Measure time and generate text for the normal model
+    start_time_normal = time.time()
     inputs = tokenizer(prompt, return_tensors='pt')
-    outputs = model.generate(**inputs, max_length=100, num_return_sequences=1)
-    generated_text = tokenizer.decode(outputs[0], skip_special_tokens=True)
-    end_time = time.time()
-    response_time = end_time - start_time
+    outputs_normal = model.generate(**inputs, max_length=100, num_return_sequences=1)
+    generated_text_normal = tokenizer.decode(outputs_normal[0], skip_special_tokens=True)
+    end_time_normal = time.time()
+    response_time_normal = end_time_normal - start_time_normal
+
+    # Measure time and generate text for the quantized model
+    start_time_quantized = time.time()
+    outputs_quantized = quantized_model.generate(**inputs, max_length=100, num_return_sequences=1)
+    generated_text_quantized = tokenizer.decode(outputs_quantized[0], skip_special_tokens=True)
+    end_time_quantized = time.time()
+    response_time_quantized = end_time_quantized - start_time_quantized
 
-    return generated_text, f"{response_time:.2f} seconds"
+    return (generated_text_normal, f"{response_time_normal:.2f} seconds",
+            generated_text_quantized, f"{response_time_quantized:.2f} seconds")
 
 # Create a Gradio interface
 iface = gr.Interface(
     fn=generate_text,
     inputs=gr.Textbox(lines=2, placeholder="Enter your prompt here..."),
     outputs=[
-        gr.Textbox(label="Generated Text"),
-        gr.Textbox(label="Response Time")
+        gr.Textbox(label="Generated Text (Normal Model)"),
+        gr.Textbox(label="Response Time (Normal Model)"),
+        gr.Textbox(label="Generated Text (Quantized Model)"),
+        gr.Textbox(label="Response Time (Quantized Model)")
     ],
-    title="TinyLlama Text Generation"
+    title="TinyLlama Text Generation Comparison"
 )
 
 # Launch the interface
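
For reference, the quantization flow this commit introduces can be exercised outside Gradio. Below is a minimal standalone sketch, assuming the same local "tinyllama_model" checkpoint and the quanto package listed in requirements.txt; the prompt and generation length are illustrative only:

    from transformers import AutoModelForCausalLM, AutoTokenizer
    from quanto import quantize, freeze, qint8

    model_dir = "tinyllama_model"  # same local checkpoint app.py loads

    tokenizer = AutoTokenizer.from_pretrained(model_dir)
    model = AutoModelForCausalLM.from_pretrained(model_dir)

    # Quantize weights to int8 while leaving activations in full precision,
    # mirroring quantize(..., weights=qint8, activations=None) in app.py.
    quantize(model, weights=qint8, activations=None)
    # freeze() materializes the int8 weights, discarding the float originals.
    freeze(model)

    inputs = tokenizer("Tell me a short story.", return_tensors="pt")  # illustrative prompt
    outputs = model.generate(**inputs, max_length=50, num_return_sequences=1)
    print(tokenizer.decode(outputs[0], skip_special_tokens=True))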
requirements.txt CHANGED
@@ -1,9 +1,5 @@
-accelerate
-bitsandbytes
 gradio
 helper
 quanto
-sentencepiece
 torch
-torchinfo
 transformers
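
With the unused packages (accelerate, bitsandbytes, sentencepiece, torchinfo) dropped, a quick import check can confirm the trimmed list still covers everything app.py needs. This smoke test is hypothetical, not part of the commit:

    # Hypothetical smoke test: verify the trimmed requirements still resolve.
    import gradio
    import quanto
    import torch
    import transformers

    print("gradio", gradio.__version__)
    print("torch", torch.__version__)
    print("transformers", transformers.__version__)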