techpro-saida committed
Commit cf3dfe3 · verified · 1 Parent(s): b5f85a3

Update README.md


4-bit QTZ (quantized) inference

Files changed (1)

  1. README.md (+22 −5)
README.md CHANGED
@@ -121,15 +121,32 @@ The model was fine-tuned on a custom dataset (`data.jsonl`) consisting of:
  ## Example Usage

  ```python
- from transformers import AutoModelForCausalLM, AutoTokenizer
+ from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig
+ import torch

- tokenizer = AutoTokenizer.from_pretrained("techpro-saida/msci_software_engineering_slm_v1")
- model = AutoModelForCausalLM.from_pretrained("techpro-saida/msci_software_engineering_slm_v1")
+ model_id = "techpro-saida/msci_software_engineering_slm_v1"
+
+ # 4-bit config for efficient inference
+ bnb_config = BitsAndBytesConfig(
+     load_in_4bit=True,
+     bnb_4bit_compute_dtype=torch.bfloat16,
+     bnb_4bit_quant_type="nf4",
+ )
+
+ tokenizer = AutoTokenizer.from_pretrained(model_id)
+
+ model = AutoModelForCausalLM.from_pretrained(
+     model_id,
+     quantization_config=bnb_config,
+     device_map="auto",  # automatically balances between GPU/CPU
+ )

  prompt = "Explain SOLID principles in OOP?"
- inputs = tokenizer(prompt, return_tensors="pt")
- outputs = model.generate(**inputs, max_new_tokens=100, temperature=0.7)
+ inputs = tokenizer(prompt, return_tensors="pt").to("cuda")
+
+ outputs = model.generate(**inputs, max_new_tokens=100, temperature=0.7, top_p=0.9)
  print(tokenizer.decode(outputs[0], skip_special_tokens=True))
+
  ```

  ## Developer
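
A quick note on running the 4-bit path added above: `load_in_4bit=True` relies on the `bitsandbytes` package, and `device_map="auto"` relies on `accelerate`, so both need to be installed. Also, `transformers` only applies `temperature` and `top_p` when sampling is enabled; with the default greedy decoding they are ignored (with a warning). A minimal sketch of a sampled call, assuming `model`, `tokenizer`, and `inputs` are set up exactly as in the diff above:

```python
# Prerequisites for the 4-bit setup: pip install bitsandbytes accelerate
# Assumes model, tokenizer, and inputs were created as in the diff above.

outputs = model.generate(
    **inputs,
    max_new_tokens=100,
    do_sample=True,   # required for temperature/top_p to take effect
    temperature=0.7,
    top_p=0.9,
)
print(tokenizer.decode(outputs[0], skip_special_tokens=True))
```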