CrazyMonkey0 committed on
Commit
d5d8ff1
·
1 Parent(s): 137ea47

feat(nlp): optimize NLP model for CPU

Browse files
Files changed (1) hide show
  1. app/routes/nlp.py +10 -4
app/routes/nlp.py CHANGED
@@ -15,9 +15,10 @@ def load_model_nlp():
15
  tokenizer = AutoTokenizer.from_pretrained(model_name)
16
  model = AutoModelForCausalLM.from_pretrained(
17
  model_name,
18
- torch_dtype=torch.float32, # CPU-friendly
 
19
  )
20
- model.to("cpu") # Wymuszenie CPU
21
  model.eval()
22
  return model, tokenizer
23
 
@@ -43,12 +44,17 @@ async def chat(request: Request, message: ChatRequest):
43
  model_inputs = tokenizer([text_input], return_tensors="pt")
44
 
45
  # generate response
46
- with torch.no_grad():
47
  generated_ids = model.generate(
48
  **model_inputs,
49
- max_new_tokens=512,
 
 
 
 
50
  )
51
 
 
52
  # extract only the newly generated tokens
53
  new_tokens = [
54
  out_ids[len(in_ids):] for in_ids, out_ids in zip(
 
15
  tokenizer = AutoTokenizer.from_pretrained(model_name)
16
  model = AutoModelForCausalLM.from_pretrained(
17
  model_name,
18
+ torch_dtype=torch.float32, # CPU-friendly
19
+ low_cpu_mem_usage=True # low memory usage
20
  )
21
+ model.to("cpu")
22
  model.eval()
23
  return model, tokenizer
24
 
 
44
  model_inputs = tokenizer([text_input], return_tensors="pt")
45
 
46
  # generate response
47
+ with torch.inference_mode():
48
  generated_ids = model.generate(
49
  **model_inputs,
50
+ max_new_tokens=128, # CPU + RAM
51
+ do_sample=True,
52
+ temperature=0.7,
53
+ top_p=0.9,
54
+ top_k=50,
55
  )
56
 
57
+
58
  # extract only the newly generated tokens
59
  new_tokens = [
60
  out_ids[len(in_ids):] for in_ids, out_ids in zip(