CrazyMonkey0 committed on
Commit
b2565e9
·
1 Parent(s): 0b8f94b

feat(nlp): add llama.cpp support for Qwen3-8B-Q5_K_M.gguf and download models

Browse files
Files changed (4) hide show
  1. app/main.py +2 -2
  2. app/routes/nlp.py +21 -111
  3. models.sh +9 -0
  4. requirements.txt +2 -0
app/main.py CHANGED
@@ -1,5 +1,5 @@
1
  from fastapi import FastAPI
2
- from app.routes.nlp import load_model_nlp, router as nlp_router
3
  from app.routes.tts import load_model_tts
4
  from app.routes.asr import load_model_asr, router as asr_router
5
  from app.routes.translation import load_model_translation, router as trans_router
@@ -9,7 +9,7 @@ import os
9
  app = FastAPI(debug=False)
10
 
11
  # Load the pre-trained NLP
12
- app.state.model_nlp, app.state.tokenizer_nlp = load_model_nlp()
13
 
14
  # Load the pre-trained Translation
15
  app.state.model_trans, app.state.tokenizer_trans = load_model_translation()
 
1
  from fastapi import FastAPI
2
+ from app.routes.nlp import load_model_lama, router as nlp_router
3
  from app.routes.tts import load_model_tts
4
  from app.routes.asr import load_model_asr, router as asr_router
5
  from app.routes.translation import load_model_translation, router as trans_router
 
9
  app = FastAPI(debug=False)
10
 
11
  # Load the pre-trained NLP
12
+ app.state.model_lama = load_model_lama()
13
 
14
  # Load the pre-trained Translation
15
  app.state.model_trans, app.state.tokenizer_trans = load_model_translation()
app/routes/nlp.py CHANGED
@@ -1,126 +1,36 @@
1
- from transformers import AutoModelForCausalLM, AutoTokenizer
2
- from pydantic import BaseModel
3
  from fastapi import APIRouter, Request
 
 
4
  from .tts import save_audio
5
 
6
-
7
- # Model name for NLP
8
- model_name = "Qwen/Qwen2.5-1.5B-Instruct"
9
  router = APIRouter()
10
 
11
  class ChatRequest(BaseModel):
12
  message: str
13
-
14
- # Load NLP model and tokenizer
15
- def load_model_nlp():
16
- model = AutoModelForCausalLM.from_pretrained(model_name, torch_dtype="auto", device_map="auto")
17
- tokenizer = AutoTokenizer.from_pretrained(model_name)
18
- return model, tokenizer
19
-
20
- # Handle chat requests
21
- @router.post("/chat")
22
- async def chat(request: Request, message: ChatRequest):
23
- message = message.message
24
- # Get the loaded NLP model and tokenizer
25
- model, tokenizer = request.app.state.model_nlp, request.app.state.tokenizer_nlp
26
-
27
- # Prepare the conversation context
28
- messages = [
29
- {"role": "system", "content": """
30
- You are Emma — a friendly, patient, encouraging native speaker of American English and an experienced English teacher. Assume every user is learning English.
31
-
32
- Top priorities (in order):
33
-
34
- First: Reply NATURALLY and CONVERSATIONALLY to the user’s most recent (last) message. The reply should sound like a warm, helpful human: concise (2–4 sentences), encouraging, and easy to understand.
35
-
36
- Second: Immediately after that natural reply, analyze only that same most recent message for language errors and apply the correction rules below. Do not analyze earlier messages.
37
-
38
- What to detect (error categories):
39
-
40
- Grammar (tenses, word order, auxiliary duplication like “what’s is”, subject-verb agreement)
41
-
42
- Vocabulary (word choice, false friends, awkward collocations)
43
-
44
- Spelling
45
-
46
- Punctuation
47
-
48
- Register (formal vs. informal mismatch)
49
-
50
- Typical learner errors (missing articles, capitalization mistakes, double auxiliaries, common typos)
51
-
52
- Correction rules:
53
-
54
- If any errors are found, append exactly one correction block at the end of your reply. If no errors are found, append nothing.
55
-
56
- Corrections must be concise, clear, encouraging, and not overwhelming.
57
 
58
- Explanations must be one sentence and simple.
 
59
 
60
- Provide an example only if helpful, and keep it short (one sentence).
 
 
61
 
62
- If multiple possible fixes exist, show the single most natural and simple correction for the learner (you may include a second only if it’s essential).
 
63
 
64
- Exact correction block format (use this format verbatim):
65
-
66
- CORRECTION:
67
-
68
- Error: [short label — e.g. “Grammar” / “Spelling” / “Vocabulary”]
69
-
70
- Original: “...original text fragment...”
71
-
72
- Correction: “...suggested correction...”
73
-
74
- Explanation: [one-sentence, simple explanation]
75
- (If helpful) Example: “...full correct sentence...”
76
-
77
- Behavior & style constraints:
78
-
79
- Always prioritize the conversational reply above the correction. The correction is an add-on, never the primary content.
80
-
81
- Tone: friendly, supportive, patient, non-judgmental.
82
-
83
- Keep everything short, organized, and easy to scan.
84
-
85
- Never invent facts. If you don’t know something, say “I don’t know” or ask a clarifying question.
86
-
87
- Assume the user is an English learner and tailor explanations accordingly.
88
-
89
- No long grammar essays; keep corrections short and actionable.
90
-
91
- Execution notes for the model (internal-use guidance you should follow):
92
-
93
- Analyze only the last user message text (no earlier context).
94
-
95
- If the last message contains more than one error, include up to two prioritized corrections inside the single correction block (choose the two most important).
96
-
97
- Use natural, learner-friendly wording in explanations.
98
 
99
- Keep the correction block compact and visually distinct from the conversational reply.
 
100
 
101
- Use your prompt-optimization and code-writing strengths to keep instructions minimal but robust — be decisive and pick the clearest fix.
 
 
102
 
103
- Final instruction: Reply to the user’s most recent message now, following these rules exactly.
104
- """},
105
- {"role": "user", "content": message},
106
- ]
107
-
108
- # Tokenize input and generate a response
109
- text = tokenizer.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
110
- model_inputs = tokenizer([text], return_tensors="pt").to(model.device)
111
- generated_ids = model.generate(
112
- **model_inputs,
113
- max_new_tokens=512,
114
- top_p=0.9,
115
- temperature=0.7,
116
- do_sample=True,
117
- pad_token_id=tokenizer.eos_token_id)
118
-
119
- # Decode the response
120
- generated_ids = [output_ids[len(input_ids):] for input_ids, output_ids in zip(model_inputs.input_ids, generated_ids)]
121
- response = tokenizer.batch_decode(generated_ids, skip_special_tokens=True)[0]
122
-
123
- # Save response as audio
124
- url_path = save_audio(request, response)
125
 
126
- return {"response": response, "audio": url_path}
 
 
 
 
1
  from fastapi import APIRouter, Request
2
+ from pydantic import BaseModel
3
+ from llama_cpp import Llama
4
  from .tts import save_audio
5
 
 
 
 
6
  router = APIRouter()
7
 
8
  class ChatRequest(BaseModel):
9
  message: str
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
10
 
11
+ # Model path
12
+ MODEL_PATH = "../models/Qwen3-8B-Q5_K_M.gguf"
13
 
14
+ # Load model function
15
+ def load_model_lama():
16
+ return Llama(model_path=MODEL_PATH, n_ctx=2048, n_threads=8, temperature=0.7, top_p=0.9)
17
 
18
+ # FastAPI startup event (w main.py)
19
+ # app.state.model_lama = load_model_lama()
20
 
21
+ @router.post("/chat")
22
+ async def chat(request: Request, message: ChatRequest):
23
+ prompt = message.message
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
24
 
25
+ # download model from app state
26
+ model = request.app.state.model_lama
27
 
28
+ # generate response
29
+ output = model(prompt, max_tokens=512)
30
+ response = output["choices"][0]["text"]
31
 
32
+ # # Save audio and get URL path
33
+ # url_path = save_audio(request, response)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
34
 
35
+ # return {"response": response, "audio": url_path}
36
+ return {"response": response, "audio": "TTS disabled"}
models.sh ADDED
@@ -0,0 +1,9 @@
 
 
 
 
 
 
 
 
 
 
1
+ #!/bin/bash
2
+
3
+ mkdir -p /app/models/
4
+
5
+ echo "Downloading Qwen3-8B-GGUF model..."
6
+ wget -c https://huggingface.co/Qwen/Qwen3-8B-GGUF/resolve/main/Qwen3-8B-Q5_K_M.gguf?download=true \
7
+ -O /app/models/Qwen3-8B-Q5_K_M.gguf
8
+
9
+ echo "All models downloaded!"
requirements.txt CHANGED
@@ -27,6 +27,7 @@ cymem==2.0.11
27
  datasets==3.4.0
28
  decorator==5.2.1
29
  dill==0.3.8
 
30
  Distance==0.1.3
31
  dlinfo==2.0.0
32
  dnspython==2.7.0
@@ -71,6 +72,7 @@ language_data==1.3.0
71
  lazy_loader==0.4
72
  libclang==18.1.1
73
  librosa==0.11.0
 
74
  llvmlite==0.44.0
75
  loguru==0.7.3
76
  marisa-trie==1.2.1
 
27
  datasets==3.4.0
28
  decorator==5.2.1
29
  dill==0.3.8
30
+ diskcache==5.6.3
31
  Distance==0.1.3
32
  dlinfo==2.0.0
33
  dnspython==2.7.0
 
72
  lazy_loader==0.4
73
  libclang==18.1.1
74
  librosa==0.11.0
75
+ llama_cpp_python==0.3.16
76
  llvmlite==0.44.0
77
  loguru==0.7.3
78
  marisa-trie==1.2.1