priyansh-saxena1 committed on
Commit
03af64f
·
1 Parent(s): 0bcdd07

chore: add more logging

Browse files
Files changed (1) hide show
  1. app/llm.py +23 -4
app/llm.py CHANGED
@@ -156,10 +156,12 @@ class TransformersLLM:
156
 
157
  def _load(self):
158
  if self.model is None and not self._load_lock:
 
 
159
  self._load_lock = True
160
  from transformers import AutoModelForCausalLM, AutoTokenizer
161
  import torch
162
- print(f"[LLM] Loading model {self.model_name}...")
163
  self.tokenizer = AutoTokenizer.from_pretrained(self.model_name)
164
  # Use float16 — halves memory footprint and is ~2x faster than float32 on CPU
165
  dtype = torch.float16
@@ -170,15 +172,21 @@ class TransformersLLM:
170
  low_cpu_mem_usage=True,
171
  )
172
  self.model.eval()
173
- print("[LLM] Model ready.")
174
 
175
- def _infer(self, messages: list[dict], max_tokens: int = 350) -> str:
176
  """Single shared inference method. Greedy decode for speed."""
177
  import torch
 
 
 
178
  text = self.tokenizer.apply_chat_template(
179
  messages, tokenize=False, add_generation_prompt=True
180
  )
181
  inputs = self.tokenizer(text, return_tensors="pt")
 
 
 
182
  with torch.no_grad():
183
  outputs = self.model.generate(
184
  **inputs,
@@ -186,10 +194,17 @@ class TransformersLLM:
186
  do_sample=False, # Greedy — deterministic and fastest
187
  pad_token_id=self.tokenizer.eos_token_id,
188
  )
 
 
 
189
  response = self.tokenizer.decode(
190
  outputs[0][inputs.input_ids.shape[1]:],
191
  skip_special_tokens=True,
192
  )
 
 
 
 
193
  return response.strip()
194
 
195
  def combined_call(self, transcript: str, current_json: str) -> CombinedOutput:
@@ -211,7 +226,11 @@ class TransformersLLM:
211
  {"role": "user", "content": prompt},
212
  ]
213
 
214
- raw = self._infer(messages, max_tokens=350)
 
 
 
 
215
 
216
  # Parse JSON robustly
217
  json_str = raw
 
156
 
157
  def _load(self):
158
  if self.model is None and not self._load_lock:
159
+ import time
160
+ t0 = time.time()
161
  self._load_lock = True
162
  from transformers import AutoModelForCausalLM, AutoTokenizer
163
  import torch
164
+ print(f"[LLM] Loading {self.model_name} into memory. This may take 5-30 secs on CPU...")
165
  self.tokenizer = AutoTokenizer.from_pretrained(self.model_name)
166
  # Use float16 — halves memory footprint and is ~2x faster than float32 on CPU
167
  dtype = torch.float16
 
172
  low_cpu_mem_usage=True,
173
  )
174
  self.model.eval()
175
+ print(f"[LLM] Model load complete in {time.time() - t0:.1f} seconds.")
176
 
177
+ def _infer(self, messages: list[dict], max_tokens: int = 200) -> str:
178
  """Single shared inference method. Greedy decode for speed."""
179
  import torch
180
+ import time
181
+
182
+ t0 = time.time()
183
  text = self.tokenizer.apply_chat_template(
184
  messages, tokenize=False, add_generation_prompt=True
185
  )
186
  inputs = self.tokenizer(text, return_tensors="pt")
187
+ tok_time = time.time() - t0
188
+
189
+ t1 = time.time()
190
  with torch.no_grad():
191
  outputs = self.model.generate(
192
  **inputs,
 
194
  do_sample=False, # Greedy — deterministic and fastest
195
  pad_token_id=self.tokenizer.eos_token_id,
196
  )
197
+ gen_time = time.time() - t1
198
+
199
+ t2 = time.time()
200
  response = self.tokenizer.decode(
201
  outputs[0][inputs.input_ids.shape[1]:],
202
  skip_special_tokens=True,
203
  )
204
+ dec_time = time.time() - t2
205
+
206
+ print(f"[LLM Timing] Tokens generated: {outputs.shape[1] - inputs.input_ids.shape[1]} | "
207
+ f"Tokenize: {tok_time:.3f}s | Infer: {gen_time:.1f}s | Decode: {dec_time:.3f}s")
208
  return response.strip()
209
 
210
  def combined_call(self, transcript: str, current_json: str) -> CombinedOutput:
 
226
  {"role": "user", "content": prompt},
227
  ]
228
 
229
+ import time
230
+ t_start = time.time()
231
+ print("[LLM] Starting inference call...")
232
+ raw = self._infer(messages, max_tokens=200)
233
+ print(f"[LLM] Inference completed in {time.time() - t_start:.1f} seconds total.")
234
 
235
  # Parse JSON robustly
236
  json_str = raw