Spaces:
Sleeping
Sleeping
priyansh-saxena1 committed on
Commit ·
03af64f
1
Parent(s): 0bcdd07
chore: add more logging
Browse files
- app/llm.py +23 -4
app/llm.py
CHANGED
|
@@ -156,10 +156,12 @@ class TransformersLLM:
|
|
| 156 |
|
| 157 |
def _load(self):
|
| 158 |
if self.model is None and not self._load_lock:
|
|
|
|
|
|
|
| 159 |
self._load_lock = True
|
| 160 |
from transformers import AutoModelForCausalLM, AutoTokenizer
|
| 161 |
import torch
|
| 162 |
-
print(f"[LLM] Loading
|
| 163 |
self.tokenizer = AutoTokenizer.from_pretrained(self.model_name)
|
| 164 |
# Use float16 — halves memory footprint and is ~2x faster than float32 on CPU
|
| 165 |
dtype = torch.float16
|
|
@@ -170,15 +172,21 @@ class TransformersLLM:
|
|
| 170 |
low_cpu_mem_usage=True,
|
| 171 |
)
|
| 172 |
self.model.eval()
|
| 173 |
-
print("[LLM] Model
|
| 174 |
|
| 175 |
-
def _infer(self, messages: list[dict], max_tokens: int =
|
| 176 |
"""Single shared inference method. Greedy decode for speed."""
|
| 177 |
import torch
|
|
|
|
|
|
|
|
|
|
| 178 |
text = self.tokenizer.apply_chat_template(
|
| 179 |
messages, tokenize=False, add_generation_prompt=True
|
| 180 |
)
|
| 181 |
inputs = self.tokenizer(text, return_tensors="pt")
|
|
|
|
|
|
|
|
|
|
| 182 |
with torch.no_grad():
|
| 183 |
outputs = self.model.generate(
|
| 184 |
**inputs,
|
|
@@ -186,10 +194,17 @@ class TransformersLLM:
|
|
| 186 |
do_sample=False, # Greedy — deterministic and fastest
|
| 187 |
pad_token_id=self.tokenizer.eos_token_id,
|
| 188 |
)
|
|
|
|
|
|
|
|
|
|
| 189 |
response = self.tokenizer.decode(
|
| 190 |
outputs[0][inputs.input_ids.shape[1]:],
|
| 191 |
skip_special_tokens=True,
|
| 192 |
)
|
|
|
|
|
|
|
|
|
|
|
|
|
| 193 |
return response.strip()
|
| 194 |
|
| 195 |
def combined_call(self, transcript: str, current_json: str) -> CombinedOutput:
|
|
@@ -211,7 +226,11 @@ class TransformersLLM:
|
|
| 211 |
{"role": "user", "content": prompt},
|
| 212 |
]
|
| 213 |
|
| 214 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
| 215 |
|
| 216 |
# Parse JSON robustly
|
| 217 |
json_str = raw
|
|
|
|
| 156 |
|
| 157 |
def _load(self):
|
| 158 |
if self.model is None and not self._load_lock:
|
| 159 |
+
import time
|
| 160 |
+
t0 = time.time()
|
| 161 |
self._load_lock = True
|
| 162 |
from transformers import AutoModelForCausalLM, AutoTokenizer
|
| 163 |
import torch
|
| 164 |
+
print(f"[LLM] Loading {self.model_name} into memory. This may take 5-30 secs on CPU...")
|
| 165 |
self.tokenizer = AutoTokenizer.from_pretrained(self.model_name)
|
| 166 |
# Use float16 — halves memory footprint and is ~2x faster than float32 on CPU
|
| 167 |
dtype = torch.float16
|
|
|
|
| 172 |
low_cpu_mem_usage=True,
|
| 173 |
)
|
| 174 |
self.model.eval()
|
| 175 |
+
print(f"[LLM] Model load complete in {time.time() - t0:.1f} seconds.")
|
| 176 |
|
| 177 |
+
def _infer(self, messages: list[dict], max_tokens: int = 200) -> str:
|
| 178 |
"""Single shared inference method. Greedy decode for speed."""
|
| 179 |
import torch
|
| 180 |
+
import time
|
| 181 |
+
|
| 182 |
+
t0 = time.time()
|
| 183 |
text = self.tokenizer.apply_chat_template(
|
| 184 |
messages, tokenize=False, add_generation_prompt=True
|
| 185 |
)
|
| 186 |
inputs = self.tokenizer(text, return_tensors="pt")
|
| 187 |
+
tok_time = time.time() - t0
|
| 188 |
+
|
| 189 |
+
t1 = time.time()
|
| 190 |
with torch.no_grad():
|
| 191 |
outputs = self.model.generate(
|
| 192 |
**inputs,
|
|
|
|
| 194 |
do_sample=False, # Greedy — deterministic and fastest
|
| 195 |
pad_token_id=self.tokenizer.eos_token_id,
|
| 196 |
)
|
| 197 |
+
gen_time = time.time() - t1
|
| 198 |
+
|
| 199 |
+
t2 = time.time()
|
| 200 |
response = self.tokenizer.decode(
|
| 201 |
outputs[0][inputs.input_ids.shape[1]:],
|
| 202 |
skip_special_tokens=True,
|
| 203 |
)
|
| 204 |
+
dec_time = time.time() - t2
|
| 205 |
+
|
| 206 |
+
print(f"[LLM Timing] Tokens generated: {outputs.shape[1] - inputs.input_ids.shape[1]} | "
|
| 207 |
+
f"Tokenize: {tok_time:.3f}s | Infer: {gen_time:.1f}s | Decode: {dec_time:.3f}s")
|
| 208 |
return response.strip()
|
| 209 |
|
| 210 |
def combined_call(self, transcript: str, current_json: str) -> CombinedOutput:
|
|
|
|
| 226 |
{"role": "user", "content": prompt},
|
| 227 |
]
|
| 228 |
|
| 229 |
+
import time
|
| 230 |
+
t_start = time.time()
|
| 231 |
+
print("[LLM] Starting inference call...")
|
| 232 |
+
raw = self._infer(messages, max_tokens=200)
|
| 233 |
+
print(f"[LLM] Inference completed in {time.time() - t_start:.1f} seconds total.")
|
| 234 |
|
| 235 |
# Parse JSON robustly
|
| 236 |
json_str = raw
|