Danielrahmai1991 committed
Commit a4b7283 · verified · 1 Parent(s): fc3983a

update app.py 1

Files changed (1):
app.py  +57 -1
app.py CHANGED
@@ -1,7 +1,63 @@
 import gradio as gr
 
+from unsloth import FastLanguageModel
+import torch
+
+max_seq_length = 2048  # Choose any; RoPE scaling is supported automatically
+dtype = None  # None for auto detection; float16 for Tesla T4/V100, bfloat16 for Ampere+
+load_in_4bit = True  # Use 4-bit quantization to reduce memory usage; can be False
+
+model, tokenizer = FastLanguageModel.from_pretrained(
+    model_name = "unsloth/Meta-Llama-3.1-8B",
+    max_seq_length = max_seq_length,
+    dtype = dtype,
+    load_in_4bit = load_in_4bit,
+    # token = "hf_...",  # needed for gated models like meta-llama/Llama-2-7b-hf
+)
+
+model = FastLanguageModel.get_peft_model(
+    model,
+    r = 16,  # Choose any number > 0; suggested 8, 16, 32, 64, 128
+    target_modules = ["q_proj", "k_proj", "v_proj", "o_proj",
+                      "gate_proj", "up_proj", "down_proj"],
+    lora_alpha = 16,
+    lora_dropout = 0,  # Supports any value, but 0 is optimized
+    bias = "none",  # Supports any value, but "none" is optimized
+    # "unsloth" uses 30% less VRAM and fits 2x larger batch sizes
+    use_gradient_checkpointing = "unsloth",  # True or "unsloth" for very long context
+    random_state = 3407,
+    use_rslora = False,  # Rank-stabilized LoRA is supported
+    loftq_config = None,  # LoftQ is supported too
+)
+
+alpaca_prompt = """You are a financial expert:
+
+### Instruction:
+{}
+
+### Input:
+{}
+
+### Response:
+{}"""
+
+
 def greet(name):
-    return "Hello " + name + "!!"
+    FastLanguageModel.for_inference(model)  # Enable Unsloth's native 2x faster inference
+    inputs = tokenizer(
+        [
+            alpaca_prompt.format(
+                name,  # instruction
+                "",    # input
+                "",    # output - leave blank for generation
+            )
+        ], return_tensors = "pt").to("cuda")
+
+    outputs = model.generate(**inputs, max_new_tokens = 128, use_cache = True)
+    out_gen = tokenizer.batch_decode(outputs)
+    return out_gen[0]  # batch_decode returns a list; Gradio's text output expects a string
 
 demo = gr.Interface(fn=greet, inputs="text", outputs="text")
 demo.launch()
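
Note: model.generate returns the prompt tokens followed by the completion, so the string returned above still echoes the Alpaca template and the Llama special tokens. A minimal sketch of trimming that echo (not part of this commit; it reuses the inputs, outputs, and tokenizer names from greet above):

    prompt_len = inputs["input_ids"].shape[1]  # number of prompt tokens
    out_gen = tokenizer.batch_decode(
        outputs[:, prompt_len:],     # decode only the newly generated tokens
        skip_special_tokens = True,  # drop <|begin_of_text|>, eos markers, etc.
    )
    return out_gen[0]

Note also that .to("cuda") and Unsloth's 4-bit loading assume the Space is running on an NVIDIA GPU; on CPU-only hardware the app will fail at model load.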