Danielrahmai1991 committed
Commit 157e95f · verified · 1 Parent(s): 1f56a1a

Update app.py

Files changed (1)
  1. app.py +23 -46
app.py CHANGED
@@ -1,62 +1,39 @@
  import gradio as gr
 
- from unsloth import FastLanguageModel
- import torch
- max_seq_length = 2048  # Choose any! We auto support RoPE Scaling internally!
- dtype = None  # None for auto detection. Float16 for Tesla T4, V100, Bfloat16 for Ampere+
- load_in_4bit = True  # Use 4bit quantization to reduce memory usage. Can be False.
+ from langchain_community.llms import LlamaCpp
+ from langchain.prompts import PromptTemplate
+ from langchain.chains import LLMChain
+ from langchain_core.callbacks import StreamingStdOutCallbackHandler
 
 
+ callbacks = [StreamingStdOutCallbackHandler()]
 
- model, tokenizer = FastLanguageModel.from_pretrained(
-     model_name = "unsloth/Meta-Llama-3.1-8B",
-     max_seq_length = max_seq_length,
-     dtype = dtype,
-     load_in_4bit = load_in_4bit,
-     # token = "hf_...", # use one if using gated models like meta-llama/Llama-2-7b-hf
+ llm = LlamaCpp(
+     model_path="/content/drive/MyDrive/models/demo1/unsloth.Q5_K_M.gguf",
+     n_gpu_layers=40,
+     n_batch=512,
+     callbacks=callbacks,
+     verbose=True,
  )
 
- model = FastLanguageModel.get_peft_model(
-     model,
-     r = 16,  # Choose any number > 0! Suggested 8, 16, 32, 64, 128
-     target_modules = ["q_proj", "k_proj", "v_proj", "o_proj",
-                       "gate_proj", "up_proj", "down_proj",],
-     lora_alpha = 16,
-     lora_dropout = 0,  # Supports any, but = 0 is optimized
-     bias = "none",  # Supports any, but = "none" is optimized
-     # [NEW] "unsloth" uses 30% less VRAM, fits 2x larger batch sizes!
-     use_gradient_checkpointing = "unsloth",  # True or "unsloth" for very long context
-     random_state = 3407,
-     use_rslora = False,  # We support rank stabilized LoRA
-     loftq_config = None,  # And LoftQ
- )
-
- alpaca_prompt = """You are the Financial expert:
+ template = """You are the Financial expert:
 
  ### Instruction:
- {}
+ {question}
 
  ### Input:
- {}
+
 
  ### Response:
- {}"""
-
-
- def greet(name):
-     # alpaca_prompt = Copied from above
-     FastLanguageModel.for_inference(model)  # Enable native 2x faster inference
-     inputs = tokenizer(
-         [
-             alpaca_prompt.format(
-                 f"{name}",  # instruction
-                 "",  # input
-                 "",  # output - leave this blank for generation!
-             )
-         ], return_tensors = "pt").to("cuda")
-
-     outputs = model.generate(**inputs, max_new_tokens = 128, use_cache = True)
-     out_gen = tokenizer.batch_decode(outputs)
+ """
+
+ prompt = PromptTemplate(template=template, input_variables=["question"])
+
+ llm_chain_model = LLMChain(prompt=prompt, llm=llm)
+
+
+ def greet(question):
+     out_gen = llm_chain_model.run(question)
      return out_gen
 
  demo = gr.Interface(fn=greet, inputs="text", outputs="text")
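
For context on what changed: the commit drops in-process loading of unsloth/Meta-Llama-3.1-8B (and the PEFT/LoRA wrapping) and instead points llama.cpp, via LangChain's LlamaCpp wrapper, at a pre-quantized Q5_K_M GGUF on Google Drive. The sketch below shows one way such a GGUF could have been exported with Unsloth's save_pretrained_gguf helper; this step is not part of the commit, and the output directory name and quantization method are assumptions inferred from the file name unsloth.Q5_K_M.gguf.

# Hypothetical export step (not in this commit): producing a Q5_K_M GGUF from the
# Unsloth model that the old app.py loaded in-process.
from unsloth import FastLanguageModel

model, tokenizer = FastLanguageModel.from_pretrained(
    model_name="unsloth/Meta-Llama-3.1-8B",
    max_seq_length=2048,
    load_in_4bit=True,
)

# Unsloth can write a llama.cpp-compatible GGUF with a chosen quantization;
# "q5_k_m" matches the file name referenced by the new LlamaCpp model_path.
model.save_pretrained_gguf("demo1", tokenizer, quantization_method="q5_k_m")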
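
Two practical notes on the new version: the diff ends at gr.Interface without a visible demo.launch(), and LLMChain.run is deprecated in recent LangChain releases. Below is a minimal end-to-end sketch of the new inference path under those assumptions; the local GGUF path is a placeholder (not the Drive path from the commit), and the prompt | llm composition plus the explicit launch call are substitutions, not part of the committed code.

import gradio as gr
from langchain_community.llms import LlamaCpp
from langchain.prompts import PromptTemplate

# Placeholder path; the commit points at /content/drive/MyDrive/models/demo1/ instead.
llm = LlamaCpp(model_path="unsloth.Q5_K_M.gguf", n_gpu_layers=40, n_batch=512)

template = """You are the Financial expert:

### Instruction:
{question}

### Input:


### Response:
"""
prompt = PromptTemplate(template=template, input_variables=["question"])

# Equivalent of LLMChain(prompt=prompt, llm=llm).run(question) without the
# deprecated LLMChain wrapper: compose the prompt and LLM, then call .invoke.
chain = prompt | llm

def greet(question):
    return chain.invoke({"question": question})

demo = gr.Interface(fn=greet, inputs="text", outputs="text")
demo.launch()  # the committed diff never shows a launch call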