Spaces: Sleeping
Optimize Hugging Face Space: add eager model loading, reduce max tokens, fix stop tokens, limit CPU threads
Browse files
- Dockerfile +2 -1
- config.py +1 -1
- llm/inference.py +2 -2
- llm/model_loader.py +2 -1
- main.py +33 -1
Dockerfile
CHANGED
|
@@ -6,7 +6,8 @@ ENV PYTHONUNBUFFERED=1 \
|
|
| 6 |
PYTHONDONTWRITEBYTECODE=1 \
|
| 7 |
PORT=7860 \
|
| 8 |
HOME=/home/user \
|
| 9 |
-
USE_OLLAMA=false
|
|
|
|
| 10 |
|
| 11 |
# Install system dependencies
|
| 12 |
RUN apt-get update && apt-get install -y --no-install-recommends \
|
|
|
|
| 6 |
PYTHONDONTWRITEBYTECODE=1 \
|
| 7 |
PORT=7860 \
|
| 8 |
HOME=/home/user \
|
| 9 |
+
USE_OLLAMA=false \
|
| 10 |
+
LLAMA_THREADS=2
|
| 11 |
|
| 12 |
# Install system dependencies
|
| 13 |
RUN apt-get update && apt-get install -y --no-install-recommends \
|
config.py
CHANGED
|
@@ -5,6 +5,6 @@ VECTOR_DB_PATH = "vector_store/faiss_index"
|
|
| 5 |
EMBEDDING_MODEL = "sentence-transformers/all-MiniLM-L6-v2"
|
| 6 |
CHUNK_SIZE = 500
|
| 7 |
CHUNK_OVERLAP = 50
|
| 8 |
-
MAX_TOKENS =
|
| 9 |
TEMPERATURE = 0.7
|
| 10 |
USE_OLLAMA = os.getenv("USE_OLLAMA", "True").lower() == "true"
|
|
|
|
| 5 |
EMBEDDING_MODEL = "sentence-transformers/all-MiniLM-L6-v2"
|
| 6 |
CHUNK_SIZE = 500
|
| 7 |
CHUNK_OVERLAP = 50
|
| 8 |
+
MAX_TOKENS = 150
|
| 9 |
TEMPERATURE = 0.7
|
| 10 |
USE_OLLAMA = os.getenv("USE_OLLAMA", "True").lower() == "true"
|
llm/inference.py
CHANGED
|
@@ -117,7 +117,7 @@ def _generate_response_ollama(prompt: str) -> str:
|
|
| 117 |
"options": {
|
| 118 |
"num_predict": MAX_TOKENS,
|
| 119 |
"temperature": TEMPERATURE,
|
| 120 |
-
"stop": ["Question:", "<|im_end|>", "<|im_start|>"]
|
| 121 |
}
|
| 122 |
}
|
| 123 |
|
|
@@ -144,7 +144,7 @@ def generate_response(prompt: str) -> str:
|
|
| 144 |
prompt,
|
| 145 |
max_tokens=MAX_TOKENS,
|
| 146 |
temperature=TEMPERATURE,
|
| 147 |
-
stop=["Question:", "<|im_end|>", "<|im_start|>"]
|
| 148 |
)
|
| 149 |
text = output["choices"][0]["text"]
|
| 150 |
return text.strip()
|
|
|
|
| 117 |
"options": {
|
| 118 |
"num_predict": MAX_TOKENS,
|
| 119 |
"temperature": TEMPERATURE,
|
| 120 |
+
"stop": ["Question:", "<|im_end|>", "<|im_start|>", "<|endoftext|>", "<|end_of_text|>"]
|
| 121 |
}
|
| 122 |
}
|
| 123 |
|
|
|
|
| 144 |
prompt,
|
| 145 |
max_tokens=MAX_TOKENS,
|
| 146 |
temperature=TEMPERATURE,
|
| 147 |
+
stop=["Question:", "<|im_end|>", "<|im_start|>", "<|endoftext|>", "<|end_of_text|>"]
|
| 148 |
)
|
| 149 |
text = output["choices"][0]["text"]
|
| 150 |
return text.strip()
|
llm/model_loader.py
CHANGED
|
@@ -36,7 +36,8 @@ def get_llm() -> Llama:
|
|
| 36 |
)
|
| 37 |
try:
|
| 38 |
cpu_count = os.cpu_count()
|
| 39 |
-
|
|
|
|
| 40 |
_llm_instance = Llama(
|
| 41 |
model_path=MODEL_PATH,
|
| 42 |
n_ctx=4096,
|
|
|
|
| 36 |
)
|
| 37 |
try:
|
| 38 |
cpu_count = os.cpu_count()
|
| 39 |
+
default_threads = max(1, min(4, cpu_count if cpu_count else 2))
|
| 40 |
+
threads = int(os.getenv("LLAMA_THREADS", str(default_threads)))
|
| 41 |
_llm_instance = Llama(
|
| 42 |
model_path=MODEL_PATH,
|
| 43 |
n_ctx=4096,
|
main.py
CHANGED
|
@@ -1,4 +1,5 @@
|
|
| 1 |
import traceback
|
|
|
|
| 2 |
|
| 3 |
from fastapi import FastAPI, Request
|
| 4 |
from fastapi.responses import JSONResponse
|
|
@@ -8,8 +9,39 @@ from routes.upload import router as upload_router
|
|
| 8 |
from routes.health import router as health_router
|
| 9 |
|
| 10 |
from utils.logger import logger
|
|
|
|
|
|
|
|
|
|
| 11 |
|
| 12 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 13 |
|
| 14 |
app.include_router(chat_router)
|
| 15 |
app.include_router(upload_router)
|
|
|
|
| 1 |
import traceback
|
| 2 |
+
from contextlib import asynccontextmanager
|
| 3 |
|
| 4 |
from fastapi import FastAPI, Request
|
| 5 |
from fastapi.responses import JSONResponse
|
|
|
|
| 9 |
from routes.health import router as health_router
|
| 10 |
|
| 11 |
from utils.logger import logger
|
| 12 |
+
from llm.model_loader import get_llm
|
| 13 |
+
from embeddings.embedding_model import get_embedding_model
|
| 14 |
+
from embeddings.vector_store import load_vector_store
|
| 15 |
|
| 16 |
+
|
| 17 |
+
@asynccontextmanager
|
| 18 |
+
async def lifespan(app: FastAPI):
|
| 19 |
+
# Eagerly load models on startup
|
| 20 |
+
logger.info("Eagerly loading LLM model on startup...")
|
| 21 |
+
try:
|
| 22 |
+
get_llm()
|
| 23 |
+
logger.info("LLM model loaded successfully!")
|
| 24 |
+
except Exception as e:
|
| 25 |
+
logger.error(f"Error loading LLM model on startup: {e}")
|
| 26 |
+
|
| 27 |
+
logger.info("Eagerly loading embedding model on startup...")
|
| 28 |
+
try:
|
| 29 |
+
get_embedding_model()
|
| 30 |
+
logger.info("Embedding model loaded successfully!")
|
| 31 |
+
except Exception as e:
|
| 32 |
+
logger.error(f"Error loading embedding model on startup: {e}")
|
| 33 |
+
|
| 34 |
+
logger.info("Eagerly loading vector store on startup...")
|
| 35 |
+
try:
|
| 36 |
+
load_vector_store()
|
| 37 |
+
logger.info("Vector store loaded successfully!")
|
| 38 |
+
except Exception as e:
|
| 39 |
+
logger.error(f"Error loading vector store on startup: {e}")
|
| 40 |
+
|
| 41 |
+
yield
|
| 42 |
+
|
| 43 |
+
|
| 44 |
+
app = FastAPI(title="AI Assistant", lifespan=lifespan)
|
| 45 |
|
| 46 |
app.include_router(chat_router)
|
| 47 |
app.include_router(upload_router)
|