CrazyMonkey0 committed on
Commit
fe8b413
·
1 Parent(s): ad5570a

test(models): downloading models from transformers

Browse files
Files changed (3) hide show
  1. Dockerfile +5 -22
  2. app/main.py +2 -2
  3. app/routes/nlp.py +29 -27
Dockerfile CHANGED
@@ -1,38 +1,21 @@
1
- # Use full Python 3.12 image for maximum compatibility
2
  FROM python:3.12
3
 
4
  # Set working directory
5
  WORKDIR /app
6
 
7
- # Install system dependencies needed for llama-cpp-python and general Python packages
8
- RUN apt-get update && apt-get install -y \
9
- wget \
10
- curl \
11
- git \
12
- build-essential \
13
- cmake \
14
- && rm -rf /var/lib/apt/lists/*
15
-
16
  # Upgrade pip
17
  RUN pip install --upgrade pip
18
 
19
- # Install prebuilt llama-cpp-python (CPU) and Hugging Face hub for from_pretrained()
20
- RUN pip install --no-cache-dir \
21
- llama-cpp-python --extra-index-url https://abetlen.github.io/llama-cpp-python/whl/cpu \
22
- huggingface-hub
23
-
24
- # Copy requirements and install other dependencies
25
  COPY ./requirements.txt /app/requirements.txt
26
  RUN pip install --no-cache-dir -r requirements.txt
27
 
28
- # Copy the application code
29
  COPY . /app
30
 
31
- # Ensure models folder exists (optional, can store HF cache here)
32
- RUN mkdir -p /app/models
33
-
34
  # Expose FastAPI port
35
  EXPOSE 7860
36
 
37
- # Use Gunicorn with Uvicorn workers for production
38
- CMD ["gunicorn", "app.main:app", "-k", "uvicorn.workers.UvicornWorker", "--bind", "0.0.0.0:7860", "--workers", "2"]
 
1
# Official CPython 3.12 base image
FROM python:3.12

# All subsequent commands run relative to /app
WORKDIR /app

# Make sure pip itself is current before installing anything
RUN pip install --upgrade pip

# Install Python dependencies first so Docker can cache this layer
# independently of application-code changes
COPY ./requirements.txt /app/requirements.txt
RUN pip install --no-cache-dir -r requirements.txt

# Bring in the application source
COPY . /app

# FastAPI (served by Gunicorn) listens on this port
EXPOSE 7860

# Serve with Gunicorn managing a single Uvicorn worker
CMD ["gunicorn", "app.main:app", "-k", "uvicorn.workers.UvicornWorker", "--bind", "0.0.0.0:7860", "--workers", "1"]
app/main.py CHANGED
@@ -1,5 +1,5 @@
1
  from fastapi import FastAPI
2
- from app.routes.nlp import load_model_lama, router as nlp_router
3
  from app.routes.tts import load_model_tts
4
  from app.routes.asr import load_model_asr, router as asr_router
5
  from app.routes.translation import load_model_translation, router as trans_router
@@ -12,7 +12,7 @@ app = FastAPI(debug=False)
12
  async def startup_event():
13
  print("[INFO] Loading all models...")
14
  try:
15
- app.state.model_lama = load_model_lama()
16
  app.state.model_trans, app.state.tokenizer_trans = load_model_translation()
17
  app.state.model_tts = load_model_tts()
18
  app.state.processor_asr, app.state.model_asr = load_model_asr()
 
1
  from fastapi import FastAPI
2
+ from app.routes.nlp import load_model_nlp, router as nlp_router
3
  from app.routes.tts import load_model_tts
4
  from app.routes.asr import load_model_asr, router as asr_router
5
  from app.routes.translation import load_model_translation, router as trans_router
 
12
  async def startup_event():
13
  print("[INFO] Loading all models...")
14
  try:
15
+ app.state.model_nlp, app.state.tokenizer_nlp = load_model_nlp()
16
  app.state.model_trans, app.state.tokenizer_trans = load_model_translation()
17
  app.state.model_tts = load_model_tts()
18
  app.state.processor_asr, app.state.model_asr = load_model_asr()
app/routes/nlp.py CHANGED
@@ -1,41 +1,43 @@
1
  from fastapi import APIRouter, Request
 
2
  from pydantic import BaseModel
3
- from llama_cpp import Llama
4
  from .tts import save_audio
5
- import os
6
 
7
  router = APIRouter()
 
8
 
9
  class ChatRequest(BaseModel):
10
  message: str
11
 
12
- # Load model function
13
- def load_model_lama():
14
- return Llama.from_pretrained(
15
- repo_id="Qwen/Qwen3-8B-GGUF",
16
- filename="Qwen3-8B-Q4_K_M.gguf",
17
- n_ctx=2048,
18
- n_threads=8,
19
- temperature=0.7,
20
- top_p=0.9,
21
- )
22
-
23
- # FastAPI startup event (w main.py)
24
- # app.state.model_lama = load_model_lama()
25
 
 
26
  @router.post("/chat")
27
  async def chat(request: Request, message: ChatRequest):
28
- prompt = message.message
29
-
30
- # download model from app state
31
- model = request.app.state.model_lama
32
-
33
- # generate response
34
- output = model(prompt, max_tokens=512)
35
- response = output["choices"][0]["text"]
36
-
37
- # # Save audio and get URL path
 
 
 
 
 
 
 
 
 
 
38
  # url_path = save_audio(request, response)
39
 
40
- # return {"response": response, "audio": url_path}
41
- return {"response": response, "audio": "TTS disabled"}
 
1
  from fastapi import APIRouter, Request
2
+ from transformers import AutoModelForCausalLM, AutoTokenizer
3
  from pydantic import BaseModel
 
4
  from .tts import save_audio
5
+
6
 
7
router = APIRouter()
# Hugging Face repo id of the chat model; loaded once at startup via load_model_nlp()
model_name = "Qwen/Qwen2.5-1.5B-Instruct"
9
 
10
class ChatRequest(BaseModel):
    """Body schema for POST /chat: a single user message string."""

    message: str
12
 
13
def load_model_nlp():
    """Load the chat model and its tokenizer from the Hugging Face hub.

    Returns:
        tuple: (model, tokenizer) for ``model_name``, with the model pinned
        to CPU and the dtype selected automatically by transformers.
    """
    tokenizer = AutoTokenizer.from_pretrained(model_name)
    model = AutoModelForCausalLM.from_pretrained(
        model_name,
        torch_dtype="auto",
        device_map="cpu",
    )
    return model, tokenizer
 
 
 
 
 
 
 
 
 
17
 
18
# Handle chat requests
@router.post("/chat")
async def chat(request: Request, message: ChatRequest):
    """Generate a chat completion for the user's message.

    Reads the model/tokenizer stored on ``app.state`` at startup, builds a
    chat-template prompt with a fixed system message, and returns the
    decoded model output. Audio generation is currently disabled.
    """
    import torch  # local import: only needed to disable autograd during generation

    # Fix: don't shadow the ChatRequest parameter with its own field
    user_message = message.message

    # Get the loaded NLP model and tokenizer (set in main.py's startup event)
    model = request.app.state.model_nlp
    tokenizer = request.app.state.tokenizer_nlp

    # Prepare the conversation context
    messages = [
        {"role": "system", "content": "You are Qwen, created by Alibaba Cloud. You are a helpful assistant for learning English."},
        {"role": "user", "content": user_message},
    ]

    # Tokenize input and generate a response. inference_mode avoids building
    # autograd state — lower memory and faster on CPU-only inference.
    text = tokenizer.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
    model_inputs = tokenizer([text], return_tensors="pt").to(model.device)
    with torch.inference_mode():
        generated_ids = model.generate(**model_inputs, max_new_tokens=512)

    # Decode only the newly generated tokens (strip the echoed prompt)
    generated_ids = [
        output_ids[len(input_ids):]
        for input_ids, output_ids in zip(model_inputs.input_ids, generated_ids)
    ]
    response = tokenizer.batch_decode(generated_ids, skip_special_tokens=True)[0]

    # Save response as audio
    # url_path = save_audio(request, response)

    return {"response": response, "audio": "url_path"}  # Return the response and audio URL for tests