DebasishDhal99 commited on
Commit
215cefd
·
verified ·
1 Parent(s): 31d2d0d

Fix inference code in readme

Browse files
Files changed (1) hide show
  1. README.md +38 -98
README.md CHANGED
@@ -33,118 +33,58 @@ This repo contains a low-rank adapter for LLaMA-7b fit on the Stanford Alpaca da
33
 
34
  The model can be easily loaded with AutoModelForCausalLM.
35
  ``` python
36
- # import torch
37
- from peft import PeftModel
38
- # from transformers import LlamaTokenizer, LlamaForCausalLM, GenerationConfig
39
-
40
-
41
  import torch
42
- # from peft import PeftModel
43
  import transformers
44
- import gradio as gr
45
-
46
- assert (
47
- "LlamaTokenizer" in transformers._import_structure["models.llama"]
48
- ), "LLaMA is now in HuggingFace's main branch.\nPlease reinstall it: pip uninstall transformers && pip install git+https://github.com/huggingface/transformers.git"
49
  from transformers import LlamaTokenizer, LlamaForCausalLM, GenerationConfig
50
 
51
- tokenizer = LlamaTokenizer.from_pretrained("decapoda-research/llama-7b-hf")
 
52
 
53
- BASE_MODEL = "decapoda-research/llama-7b-hf"
54
- LORA_WEIGHTS = "OdiaGenAI/odiagenAI-model-v1"
55
 
56
- if torch.cuda.is_available():
57
- device = "cuda"
58
- else:
59
- device = "cpu"
 
 
60
 
61
- try:
62
- if torch.backends.mps.is_available():
63
- device = "mps"
64
- except:
65
- pass
 
66
 
67
- if device == "cuda":
68
- model = LlamaForCausalLM.from_pretrained(
69
- BASE_MODEL,
70
- load_in_8bit=False,
71
- torch_dtype=torch.float16,
72
- device_map="auto",
73
- )
74
- model = PeftModel.from_pretrained(
75
- model, LORA_WEIGHTS, torch_dtype=torch.float16, force_download=True
76
- )
77
- elif device == "mps":
78
- model = LlamaForCausalLM.from_pretrained(
79
- BASE_MODEL,
80
- device_map={"": device},
81
- torch_dtype=torch.float16,
82
- )
83
- model = PeftModel.from_pretrained(
84
- model,
85
- LORA_WEIGHTS,
86
- device_map={"": device},
87
- torch_dtype=torch.float16,
88
- )
89
- else:
90
- model = LlamaForCausalLM.from_pretrained(
91
- BASE_MODEL, device_map={"": device}, low_cpu_mem_usage=True
92
- )
93
- model = PeftModel.from_pretrained(
94
- model,
95
- LORA_WEIGHTS,
96
- device_map={"": device},
97
- )
98
 
99
- def generate_prompt(instruction, input=None):
100
- if input:
101
- return f"""### Instruction:\n{instruction}\n\n### Input:\n{input}\n\n### Response:\n"""
102
- else:
103
- return f"""### Instruction:\n{instruction}\n\n### Response:\n"""
104
-
105
- if device != "cpu":
106
- model.half()
107
- model.eval()
108
- if torch.__version__ >= "2":
109
- model = torch.compile(model)
110
-
111
- def evaluate(
112
- instruction,
113
- input=None,
114
  temperature=0.1,
115
  top_p=0.75,
116
  top_k=40,
117
  num_beams=4,
118
- max_new_tokens=128,
119
- **kwargs,
120
- ):
121
- prompt = generate_prompt(instruction, input)
122
- print(prompt)
123
- inputs = tokenizer(prompt, return_tensors="pt")
124
- print(inputs)
125
- input_ids = inputs["input_ids"].to(device)
126
- print(input_ids)
127
- generation_config = GenerationConfig(
128
- temperature=temperature,
129
- top_p=top_p,
130
- top_k=top_k,
131
- num_beams=num_beams,
132
- **kwargs,
133
  )
134
- with torch.no_grad():
135
- generation_output = model.generate(
136
- input_ids=input_ids,
137
- generation_config=generation_config,
138
- return_dict_in_generate=True,
139
- output_scores=True,
140
- max_new_tokens=max_new_tokens,
141
- )
142
- print(generation_output)
143
- s = generation_output.sequences[0]
144
- print(s)
145
- output = tokenizer.decode(s)
146
- print(output)
147
- return output.split("### Response:")[1].strip()
148
 
149
  ```
150
 
 
33
 
34
  The model can be easily loaded with AutoModelForCausalLM.
35
  ``` python
 
 
 
 
 
36
  import torch
37
+ from peft import PeftModel
38
  import transformers
39
+ from transformers import AutoTokenizer, AutoModelForCausalLM, BitsAndBytesConfig
40
+ from peft import PeftModel, PeftConfig
 
 
 
41
  from transformers import LlamaTokenizer, LlamaForCausalLM, GenerationConfig
42
 
43
+ base_model_path = "meta-llama/Llama-2-7b-hf"
44
+ adapter_path = "OdiaGenAI/odiagenAI-model-v1"
45
 
46
+ tokenizer = AutoTokenizer.from_pretrained(base_model_path, trust_remote_code=True)
47
+ tokenizer.pad_token = tokenizer.eos_token
48
 
49
+ bnb_config = BitsAndBytesConfig(
50
+ load_in_4bit=True,
51
+ bnb_4bit_quant_type="nf4",
52
+ bnb_4bit_use_double_quant=True,
53
+ bnb_4bit_compute_dtype=torch.float16,
54
+ )
55
 
56
+ base_model = AutoModelForCausalLM.from_pretrained(
57
+ base_model_path,
58
+ quantization_config=bnb_config,
59
+ device_map="auto",
60
+ trust_remote_code=True
61
+ )
62
 
63
+ model = PeftModel.from_pretrained(base_model, adapter_path)
64
+
65
+ instruction = "ଭାରତ ବିଷୟରେ କିଛି କୁହନ୍ତୁ"
66
+
67
+ device = "cuda" if torch.cuda.is_available() else "cpu"
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
68
 
69
+ inputs = tokenizer(instruction, return_tensors="pt").to(device)
70
+ input_ids = inputs["input_ids"].to(device)
71
+ generation_config = GenerationConfig(
 
 
 
 
 
 
 
 
 
 
 
 
72
  temperature=0.1,
73
  top_p=0.75,
74
  top_k=40,
75
  num_beams=4,
76
+ )
77
+ with torch.no_grad():
78
+ generation_output = model.generate(
79
+ input_ids=input_ids,
80
+ generation_config=generation_config,
81
+ return_dict_in_generate=True,
82
+ output_scores=True,
83
+ max_new_tokens=128,
 
 
 
 
 
 
 
84
  )
85
+ s = generation_output.sequences[0]
86
+ output = tokenizer.decode(s)
87
+ print(output)
 
 
 
 
 
 
 
 
 
 
 
88
 
89
  ```
90