Optimize startup with reduced threads and timing logs
app.py CHANGED
@@ -1,5 +1,6 @@
 import os
 import logging
+import time
 from fastapi import FastAPI, HTTPException
 from pydantic import BaseModel
 from transformers import AutoTokenizer, AutoModelForSeq2SeqLM
@@ -37,6 +38,8 @@ async def health_check():
 # Async function to load models
 async def load_models():
     global t5_tokenizer, t5_model, mistral, models_loaded
+    start_time = time.time()
+    logger.info(f"Starting model loading at {start_time}")
     try:
         # Load T5 model from local cache
         T5_MODEL_PATH = os.path.join(CACHE_DIR, "models--MGZON--mgzon-flan-t5-base/snapshots")
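One note on the timing additions: `time.time()` returns wall-clock epoch seconds, so the "Starting model loading at ..." log prints a raw timestamp rather than a readable time, and the value can jump if the system clock is adjusted. A minimal sketch of the more conventional pattern for measuring durations, assuming nothing beyond the standard library:

```python
import time

# time.perf_counter() is a monotonic clock intended for interval timing;
# unlike time.time(), it cannot go backwards mid-measurement.
start = time.perf_counter()
# ... load tokenizer / models here ...
elapsed = time.perf_counter() - start
print(f"loaded in {elapsed:.2f}s")
```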
@@ -45,13 +48,13 @@ async def load_models():
             T5_MODEL_PATH,
             local_files_only=True
         )
-        logger.info(f"Successfully loaded tokenizer for MGZON/mgzon-flan-t5-base")
+        logger.info(f"Successfully loaded tokenizer for MGZON/mgzon-flan-t5-base in {time.time() - start_time} seconds")
         logger.info(f"Loading model for MGZON/mgzon-flan-t5-base from {T5_MODEL_PATH}")
         t5_model = AutoModelForSeq2SeqLM.from_pretrained(
             T5_MODEL_PATH,
             local_files_only=True
         )
-        logger.info(f"Successfully loaded model for MGZON/mgzon-flan-t5-base")
+        logger.info(f"Successfully loaded model for MGZON/mgzon-flan-t5-base in {time.time() - start_time} seconds")
 
         # Load Mistral GGUF model
         gguf_path = os.path.abspath("models/mistral-7b-instruct-v0.1.Q4_K_M.gguf")
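Worth noting: every stage log above subtracts the same `start_time`, so the tokenizer and model messages report cumulative time since loading began, not the duration of each stage. If per-stage numbers are wanted, a hedged sketch of a checkpoint helper (hypothetical, not part of this commit) could look like:

```python
import time
import logging

logger = logging.getLogger(__name__)
_last = time.perf_counter()

def checkpoint(label: str) -> None:
    """Log the time since the previous checkpoint, then reset it."""
    global _last
    now = time.perf_counter()
    logger.info(f"{label} took {now - _last:.2f}s")
    _last = now
```

Calling `checkpoint("tokenizer")` after each load step would then log stage durations directly.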
@@ -66,14 +69,18 @@ async def load_models():
         mistral = Llama(
             model_path=gguf_path,
             n_ctx=2048,
-            n_threads=
-
+            n_threads=4,  # reduced the thread count to lower the load
+            n_batch=512,
+            verbose=True
         )
-        logger.info(f"Successfully loaded Mistral model from {gguf_path}")
+        logger.info(f"Successfully loaded Mistral model from {gguf_path} in {time.time() - start_time} seconds")
         models_loaded = True
     except Exception as e:
         logger.error(f"Failed to load models: {str(e)}")
         raise RuntimeError(f"Failed to load models: {str(e)}")
+    finally:
+        end_time = time.time()
+        logger.info(f"Model loading completed in {end_time - start_time} seconds")
 
 # Run model loading in the background
 @app.on_event("startup")
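The fixed `n_threads=4` is the headline change. If the host has fewer than four cores, a cap tied to `os.cpu_count()` degrades more gracefully; a sketch under that assumption (the derived value is illustrative, not from the commit):

```python
import os
from llama_cpp import Llama

# Cap at the commit's value of 4, but never above the available cores.
n_threads = max(1, min(4, os.cpu_count() or 1))

mistral = Llama(
    model_path="models/mistral-7b-instruct-v0.1.Q4_K_M.gguf",
    n_ctx=2048,
    n_threads=n_threads,
    n_batch=512,   # prompt-processing batch size, as in the commit
    verbose=True,  # llama.cpp prints load and timing details to stderr
)
```

The `@app.on_event("startup")` hook in the surrounding context presumably schedules `load_models()` as a background task (its body is outside this diff), which is why the code tracks a `models_loaded` flag instead of blocking startup.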
@@ -108,7 +115,7 @@ async def ask(req: AskRequest):
     else:
         # Mistral model
         logger.info("Using Mistral-7B-GGUF model")
-        out = mistral(prompt=q, max_tokens=req.max_new_tokens)
+        out = mistral(prompt=q, max_tokens=req.max_new_tokens, temperature=0.7)
         answer = out["choices"][0]["text"].strip()
         model_name = "Mistral-7B-GGUF"
         logger.info(f"Response generated by {model_name}: {answer}")
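The handler change pins `temperature=0.7` instead of relying on the library default. For reference, the call indexes into the completion dict that llama-cpp-python returns; a sketch of that shape (field values illustrative):

```python
out = mistral(prompt="Hello", max_tokens=16, temperature=0.7)
# out resembles:
# {
#     "choices": [{"text": " ...", "index": 0, "finish_reason": "length"}],
#     "usage": {"prompt_tokens": 2, "completion_tokens": 16, "total_tokens": 18},
#     ...
# }
answer = out["choices"][0]["text"].strip()
```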