CrazyMonkey0 committed on
Commit
b2565e9
·
1 Parent(s): 0b8f94b

feat(nlp): add llama.cpp support for Qwen3-8B-Q5_K_M.gguf and download models

Browse files
Files changed (4) hide show
  1. app/main.py +2 -2
  2. app/routes/nlp.py +21 -111
  3. models.sh +9 -0
  4. requirements.txt +2 -0
app/main.py CHANGED
@@ -1,5 +1,5 @@
1
  from fastapi import FastAPI
2
- from app.routes.nlp import load_model_nlp, router as nlp_router
3
  from app.routes.tts import load_model_tts
4
  from app.routes.asr import load_model_asr, router as asr_router
5
  from app.routes.translation import load_model_translation, router as trans_router
@@ -9,7 +9,7 @@ import os
9
  app = FastAPI(debug=False)
10
 
11
  # Load the pre-trained NLP
12
- app.state.model_nlp, app.state.tokenizer_nlp = load_model_nlp()
13
 
14
  # Load the pre-trained Translation
15
  app.state.model_trans, app.state.tokenizer_trans = load_model_translation()
 
1
  from fastapi import FastAPI
2
+ from app.routes.nlp import load_model_lama, router as nlp_router
3
  from app.routes.tts import load_model_tts
4
  from app.routes.asr import load_model_asr, router as asr_router
5
  from app.routes.translation import load_model_translation, router as trans_router
 
9
  app = FastAPI(debug=False)
10
 
11
  # Load the pre-trained NLP
12
+ app.state.model_lama = load_model_lama()
13
 
14
  # Load the pre-trained Translation
15
  app.state.model_trans, app.state.tokenizer_trans = load_model_translation()
app/routes/nlp.py CHANGED
@@ -1,126 +1,36 @@
1
- from transformers import AutoModelForCausalLM, AutoTokenizer
2
- from pydantic import BaseModel
3
  from fastapi import APIRouter, Request
 
 
4
  from .tts import save_audio
5
 
6
-
7
- # Model name for NLP
8
- model_name = "Qwen/Qwen2.5-1.5B-Instruct"
9
  router = APIRouter()
10
 
11
  class ChatRequest(BaseModel):
12
  message: str
13
-
14
- # Load NLP model and tokenizer
15
- def load_model_nlp():
16
- model = AutoModelForCausalLM.from_pretrained(model_name, torch_dtype="auto", device_map="auto")
17
- tokenizer = AutoTokenizer.from_pretrained(model_name)
18
- return model, tokenizer
19
-
20
- # Handle chat requests
21
- @router.post("/chat")
22
- async def chat(request: Request, message: ChatRequest):
23
- message = message.message
24
- # Get the loaded NLP model and tokenizer
25
- model, tokenizer = request.app.state.model_nlp, request.app.state.tokenizer_nlp
26
-
27
- # Prepare the conversation context
28
- messages = [
29
- {"role": "system", "content": """
30
- You are Emma — a friendly, patient, encouraging native speaker of American English and an experienced English teacher. Assume every user is learning English.
31
-
32
- Top priorities (in order):
33
-
34
- First: Reply NATURALLY and CONVERSATIONALLY to the user’s most recent (last) message. The reply should sound like a warm, helpful human: concise (2–4 sentences), encouraging, and easy to understand.
35
-
36
- Second: Immediately after that natural reply, analyze only that same most recent message for language errors and apply the correction rules below. Do not analyze earlier messages.
37
-
38
- What to detect (error categories):
39
-
40
- Grammar (tenses, word order, auxiliary duplication like “what’s is”, subject-verb agreement)
41
-
42
- Vocabulary (word choice, false friends, awkward collocations)
43
-
44
- Spelling
45
-
46
- Punctuation
47
-
48
- Register (formal vs. informal mismatch)
49
-
50
- Typical learner errors (missing articles, capitalization mistakes, double auxiliaries, common typos)
51
-
52
- Correction rules:
53
-
54
- If any errors are found, append exactly one correction block at the end of your reply. If no errors are found, append nothing.
55
-
56
- Corrections must be concise, clear, encouraging, and not overwhelming.
57
 
58
- Explanations must be one sentence and simple.
 
59
 
60
- Provide an example only if helpful, and keep it short (one sentence).
 
 
61
 
62
- If multiple possible fixes exist, show the single most natural and simple correction for the learner (you may include a second only if it’s essential).
 
63
 
64
- Exact correction block format (use this format verbatim):
65
-
66
- CORRECTION:
67
-
68
- Error: [short label — e.g. “Grammar” / “Spelling” / “Vocabulary”]
69
-
70
- Original: “...original text fragment...”
71
-
72
- Correction: “...suggested correction...”
73
-
74
- Explanation: [one-sentence, simple explanation]
75
- (If helpful) Example: “...full correct sentence...”
76
-
77
- Behavior & style constraints:
78
-
79
- Always prioritize the conversational reply above the correction. The correction is an add-on, never the primary content.
80
-
81
- Tone: friendly, supportive, patient, non-judgmental.
82
-
83
- Keep everything short, organized, and easy to scan.
84
-
85
- Never invent facts. If you don’t know something, say “I don’t know” or ask a clarifying question.
86
-
87
- Assume the user is an English learner and tailor explanations accordingly.
88
-
89
- No long grammar essays; keep corrections short and actionable.
90
-
91
- Execution notes for the model (internal-use guidance you should follow):
92
-
93
- Analyze only the last user message text (no earlier context).
94
-
95
- If the last message contains more than one error, include up to two prioritized corrections inside the single correction block (choose the two most important).
96
-
97
- Use natural, learner-friendly wording in explanations.
98
 
99
- Keep the correction block compact and visually distinct from the conversational reply.
 
100
 
101
- Use your prompt-optimization and code-writing strengths to keep instructions minimal but robust — be decisive and pick the clearest fix.
 
 
102
 
103
- Final instruction: Reply to the user’s most recent message now, following these rules exactly.
104
- """},
105
- {"role": "user", "content": message},
106
- ]
107
-
108
- # Tokenize input and generate a response
109
- text = tokenizer.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
110
- model_inputs = tokenizer([text], return_tensors="pt").to(model.device)
111
- generated_ids = model.generate(
112
- **model_inputs,
113
- max_new_tokens=512,
114
- top_p=0.9,
115
- temperature=0.7,
116
- do_sample=True,
117
- pad_token_id=tokenizer.eos_token_id)
118
-
119
- # Decode the response
120
- generated_ids = [output_ids[len(input_ids):] for input_ids, output_ids in zip(model_inputs.input_ids, generated_ids)]
121
- response = tokenizer.batch_decode(generated_ids, skip_special_tokens=True)[0]
122
-
123
- # Save response as audio
124
- url_path = save_audio(request, response)
125
 
126
- return {"response": response, "audio": url_path}
 
 
 
 
1
  from fastapi import APIRouter, Request
2
+ from pydantic import BaseModel
3
+ from llama_cpp import Llama
4
  from .tts import save_audio
5
 
 
 
 
6
  router = APIRouter()
7
 
8
  class ChatRequest(BaseModel):
9
  message: str
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
10
 
11
+ # Model path
12
+ MODEL_PATH = "../models/Qwen3-8B-Q5_K_M.gguf"
13
 
14
+ # Load model function
15
+ def load_model_lama():
16
+ return Llama(model_path=MODEL_PATH, n_ctx=2048, n_threads=8, temperature=0.7, top_p=0.9)
17
 
18
+ # FastAPI startup event (w main.py)
19
+ # app.state.model_lama = load_model_lama()
20
 
21
+ @router.post("/chat")
22
+ async def chat(request: Request, message: ChatRequest):
23
+ prompt = message.message
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
24
 
25
+ # download model from app state
26
+ model = request.app.state.model_lama
27
 
28
+ # generate response
29
+ output = model(prompt, max_tokens=512)
30
+ response = output["choices"][0]["text"]
31
 
32
+ # # Save audio and get URL path
33
+ # url_path = save_audio(request, response)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
34
 
35
+ # return {"response": response, "audio": url_path}
36
+ return {"response": response, "audio": "TTS disabled"}
models.sh ADDED
@@ -0,0 +1,9 @@
 
 
 
 
 
 
 
 
 
 
1
+ #!/bin/bash
2
+
3
+ mkdir -p /app/models/
4
+
5
+ echo "Downloading Qwen3-8B-GGUF model..."
6
+ wget -c https://huggingface.co/Qwen/Qwen3-8B-GGUF/resolve/main/Qwen3-8B-Q5_K_M.gguf?download=true \
7
+ -O /app/models/Qwen3-8B-Q5_K_M.gguf
8
+
9
+ echo "All models downloaded!"
requirements.txt CHANGED
@@ -27,6 +27,7 @@ cymem==2.0.11
27
  datasets==3.4.0
28
  decorator==5.2.1
29
  dill==0.3.8
 
30
  Distance==0.1.3
31
  dlinfo==2.0.0
32
  dnspython==2.7.0
@@ -71,6 +72,7 @@ language_data==1.3.0
71
  lazy_loader==0.4
72
  libclang==18.1.1
73
  librosa==0.11.0
 
74
  llvmlite==0.44.0
75
  loguru==0.7.3
76
  marisa-trie==1.2.1
 
27
  datasets==3.4.0
28
  decorator==5.2.1
29
  dill==0.3.8
30
+ diskcache==5.6.3
31
  Distance==0.1.3
32
  dlinfo==2.0.0
33
  dnspython==2.7.0
 
72
  lazy_loader==0.4
73
  libclang==18.1.1
74
  librosa==0.11.0
75
+ llama_cpp_python==0.3.16
76
  llvmlite==0.44.0
77
  loguru==0.7.3
78
  marisa-trie==1.2.1