kdevoe committed
Commit 3a01d1a · 1 Parent(s): ec46849

Replacing inference pipeline with manual generation

Files changed (1): app.py (+8, -23)
app.py CHANGED
@@ -1,40 +1,25 @@
 import gradio as gr
 import time
-from transformers import AutoModelForCausalLM, AutoTokenizer, pipeline
+from transformers import AutoModelForCausalLM, AutoTokenizer
 import torch
 
 model_dir = "tinyllama_model"
-#model_id = "TinyLlama/TinyLlama-1.1B-Chat-v1.0"
 
+# Load the quantized model and tokenizer
 model = AutoModelForCausalLM.from_pretrained(model_dir)
 tokenizer = AutoTokenizer.from_pretrained(model_dir)
 
-
-# Load the TinyLlama text generation pipeline
-pipe = pipeline("text-generation", model=model, tokenizer=tokenizer)
-#tokenizer = AutoTokenizer.from_pretrained(model_dir)
-
 # Define the inference function
 def generate_text(prompt):
     start_time = time.time()
-    messages = [
-        {
-            "role": "system",
-            "content": "You are a friendly and helpful chatbot",
-        },
-        {"role": "user", "content": prompt},
-    ]
-    prompt = pipe.tokenizer.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
-    results = pipe(prompt, max_length=100, num_return_sequences=1)
+    inputs = tokenizer(prompt, return_tensors='pt')
+    # Manually move tensors to quantized int8 if necessary
+    inputs = {key: val.to(torch.int8) if val.dtype == torch.float32 else val for key, val in inputs.items()}
+    outputs = model.generate(**inputs, max_length=100, num_return_sequences=1)
+    generated_text = tokenizer.decode(outputs[0], skip_special_tokens=True)
     end_time = time.time()
     response_time = end_time - start_time
 
-    # Remove system message:
-    assistant_prompt = "<|assistant|>"
-    generated_text = results[0]['generated_text']
-    if assistant_prompt in generated_text:
-        generated_text = generated_text.split(assistant_prompt)[-1].strip()
-
     return generated_text, f"{response_time:.2f} seconds"
 
 # Create a Gradio interface
@@ -49,4 +34,4 @@ iface = gr.Interface(
 )
 
 # Launch the interface
-iface.launch()
+iface.launch()
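
A note on the new manual path: the dict comprehension that casts float32 tensors to int8 is effectively a no-op, because tokenizer(prompt, return_tensors='pt') returns integer input_ids and attention_mask tensors (int64), never float32, and casting token IDs to int8 would break the embedding lookup. With a weight-quantized checkpoint the quantization lives inside the model's layers; the inputs stay plain token IDs. The manual version also drops the chat template and system message the pipeline applied, so a chat-tuned TinyLlama now sees a bare prompt, and max_length=100 counts the prompt tokens, unlike max_new_tokens. A minimal sketch of an equivalent manual loop that keeps the template (assuming tinyllama_model is a standard causal-LM checkpoint whose tokenizer ships a chat template):

import time

import torch
from transformers import AutoModelForCausalLM, AutoTokenizer

model_dir = "tinyllama_model"
model = AutoModelForCausalLM.from_pretrained(model_dir)
tokenizer = AutoTokenizer.from_pretrained(model_dir)

def generate_text(prompt):
    start_time = time.time()
    # Rebuild the chat formatting the pipeline used to apply
    messages = [
        {"role": "system", "content": "You are a friendly and helpful chatbot"},
        {"role": "user", "content": prompt},
    ]
    text = tokenizer.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
    inputs = tokenizer(text, return_tensors="pt")  # int64 token IDs; no dtype cast needed
    with torch.no_grad():  # inference only, skip autograd bookkeeping
        outputs = model.generate(**inputs, max_new_tokens=100, num_return_sequences=1)
    # Decode only the newly generated tokens, not the echoed prompt
    new_tokens = outputs[0][inputs["input_ids"].shape[1]:]
    generated_text = tokenizer.decode(new_tokens, skip_special_tokens=True)
    response_time = time.time() - start_time
    return generated_text, f"{response_time:.2f} seconds"

Slicing the output at the prompt length replaces the old string split on "<|assistant|>", and max_new_tokens bounds only the generated continuation rather than prompt plus continuation.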