Art Wielogorski committed on
Commit
1fba743
·
1 Parent(s): 96c736b

load in 8 bit

Browse files
Files changed (1) hide show
  1. app.py +14 -15
app.py CHANGED
@@ -1,30 +1,29 @@
1
  import gradio as gr
2
  from transformers import AutoTokenizer, AutoModelForCausalLM
3
- import transformers
4
  import torch
5
 
6
  model = f"tiiuae/falcon-7b"
7
 
8
  tokenizer = AutoTokenizer.from_pretrained(model, trust_remote_code=True)
9
 
10
- generator = transformers.pipeline(
11
- "text-generation",
12
- model=model,
13
- # offload_folder='/tmp',
14
- tokenizer=tokenizer,
15
  torch_dtype=torch.bfloat16,
16
- trust_remote_code=True,
17
  device_map="auto",
 
18
  )
19
 
20
- def greet(name):
21
- v = generator(
22
- f""" {name}""",
23
- max_length=25,
24
- do_sample=False,
25
- num_return_sequences=1,
26
- eos_token_id=tokenizer.eos_token_id,)
27
- return v[0]['generated_text']
 
 
 
28
 
29
  iface = gr.Interface(fn=greet, inputs="text", outputs="text")
30
  iface.launch()
 
1
  import gradio as gr
2
  from transformers import AutoTokenizer, AutoModelForCausalLM
 
3
  import torch
4
 
5
  model = f"tiiuae/falcon-7b"
6
 
7
  tokenizer = AutoTokenizer.from_pretrained(model, trust_remote_code=True)
8
 
9
+ model = AutoModelForCausalLM.from_pretrained(
10
+ model,
 
 
 
11
  torch_dtype=torch.bfloat16,
 
12
  device_map="auto",
13
+ load_in_8bit=True,
14
  )
15
 
16
+ def greet(prompt):
17
+ inputs = tokenizer(prompt, return_tensors="pt").to("cuda")
18
+ v = model.generate(
19
+ input_ids=inputs["input_ids"],
20
+ attention_mask=inputs["attention_mask"],
21
+ do_sample=True,
22
+ temperature=0.6,
23
+ top_p=0.9,
24
+ max_new_tokens=50,
25
+ )
26
+ return tokenizer.decode(v[0].to("cpu"))
27
 
28
# Minimal Gradio front-end: a free-text prompt in, the model completion out.
iface = gr.Interface(
    fn=greet,
    inputs="text",
    outputs="text",
)
iface.launch()