CrazyMonkey0 committed on
Commit
6151d5f
·
1 Parent(s): 23187e2

refactor(chat): migrate from transformers to llama-cpp-python using Qwen 3B

Browse files
Files changed (1) hide show
  1. app/routes/nlp.py +17 -51
app/routes/nlp.py CHANGED
@@ -1,73 +1,39 @@
1
- from transformers import AutoModelForCausalLM, AutoTokenizer
2
- from pydantic import BaseModel
3
  from fastapi import APIRouter, Request
4
- from .tts import save_audio
5
- import torch
 
6
 
7
- model_name = "Qwen/Qwen2.5-1.5B-Instruct"
8
  router = APIRouter()
9
 
10
  class ChatRequest(BaseModel):
11
  message: str
12
 
13
- # Load NLP model and tokenizer
14
  def load_model_nlp():
15
- tokenizer = AutoTokenizer.from_pretrained(model_name)
16
- model = AutoModelForCausalLM.from_pretrained(
17
- model_name,
18
- torch_dtype=torch.float32, # CPU-friendly
19
- low_cpu_mem_usage=True # low memory usage
20
  )
21
- model.to("cpu")
22
- model.eval()
23
- return model, tokenizer
24
 
25
  @router.post("/chat")
26
  async def chat(request: Request, message: ChatRequest):
27
  text = message.message
28
- model = request.app.state.model_nlp
29
- tokenizer = request.app.state.tokenizer_nlp
30
-
31
- messages = [
32
- {"role": "system", "content": "You are Qwen, created by Alibaba Cloud. You help users learn English."},
33
- {"role": "user", "content": text},
34
- ]
35
-
36
- # apply chat template
37
- text_input = tokenizer.apply_chat_template(
38
- messages,
39
- tokenize=False,
40
- add_generation_prompt=True
41
- )
42
-
43
- # tokenize input
44
- model_inputs = tokenizer([text_input], return_tensors="pt")
45
-
46
- # generate response
47
- with torch.inference_mode():
48
- generated_ids = model.generate(
49
- **model_inputs,
50
- max_new_tokens=128, # CPU + RAM
51
- do_sample=True,
52
- temperature=0.7,
53
- top_p=0.9,
54
- top_k=50,
55
- )
56
 
 
 
57
 
58
- # extract only the newly generated tokens
59
- new_tokens = [
60
- out_ids[len(in_ids):] for in_ids, out_ids in zip(
61
- model_inputs.input_ids, generated_ids
62
- )
63
- ]
64
 
65
- response_text = tokenizer.batch_decode(new_tokens, skip_special_tokens=True)[0]
66
 
67
- # Generate audio response (commented out for testing purposes)
68
  # url_path = save_audio(request, response_text)
69
 
70
  return {
71
  "response": response_text,
72
  "audio": 'url_path' # placeholder
73
- }
 
 
 
1
  from fastapi import APIRouter, Request
2
+ from pydantic import BaseModel
3
+ from llama_cpp import Llama
4
+ # from .tts import save_audio # jeśli używasz TTS
5
 
 
6
  router = APIRouter()
7
 
8
class ChatRequest(BaseModel):
    """Request body for the POST /chat endpoint."""

    # The raw user utterance forwarded to the language model.
    message: str
10
 
11
# Load NLP model
def load_model_nlp():
    """Download (if not cached) and load the GGUF-quantized Qwen chat model.

    Returns the `llama_cpp.Llama` instance. The caller is expected to store
    it on `app.state.model_nlp` at startup so request handlers can reuse it.
    """
    llm = Llama.from_pretrained(
        repo_id="Qwen/Qwen2.5-3B-Instruct-GGUF",
        filename="qwen2.5-3b-instruct-q5_0.gguf",  # q5_0 quant: CPU/RAM friendly
        n_ctx=2048,  # context window in tokens; raise if prompts get truncated
    )
    return llm
 
 
19
 
20
@router.post("/chat")
async def chat(request: Request, message: ChatRequest):
    """Generate a chat reply to the user's message with the shared Llama model.

    Expects `request.app.state.model_nlp` to hold the `llama_cpp.Llama`
    instance produced by `load_model_nlp()` at application startup.
    Returns a dict with the generated text and an audio-URL placeholder.
    """
    text = message.message
    llm = request.app.state.model_nlp

    # Use the chat-completion API so llama.cpp applies the model's own chat
    # template (Qwen ChatML) instead of a hand-rolled "User:/Assistant:"
    # prompt string, which an instruct-tuned model may follow poorly.
    output = llm.create_chat_completion(
        messages=[
            {"role": "system", "content": "You are Qwen, created by Alibaba Cloud. You help users learn English."},
            {"role": "user", "content": text},
        ],
        max_tokens=128,  # keep replies short: CPU + RAM constraints
        temperature=0.7,
        top_p=0.9,
        top_k=50,
    )
    response_text = output["choices"][0]["message"]["content"].strip()

    # Generate audio response (disabled for testing)
    # url_path = save_audio(request, response_text)

    return {
        "response": response_text,
        "audio": 'url_path'  # placeholder until TTS is re-enabled
    }