khubchand committed on
Commit
09bc714
·
1 Parent(s): b597dd6

Optimize Hugging Face Space: add eager model loading, reduce max tokens, fix stop tokens, limit CPU threads

Browse files
Files changed (5) hide show
  1. Dockerfile +2 -1
  2. config.py +1 -1
  3. llm/inference.py +2 -2
  4. llm/model_loader.py +2 -1
  5. main.py +33 -1
Dockerfile CHANGED
@@ -6,7 +6,8 @@ ENV PYTHONUNBUFFERED=1 \
6
  PYTHONDONTWRITEBYTECODE=1 \
7
  PORT=7860 \
8
  HOME=/home/user \
9
- USE_OLLAMA=false
 
10
 
11
  # Install system dependencies
12
  RUN apt-get update && apt-get install -y --no-install-recommends \
 
6
  PYTHONDONTWRITEBYTECODE=1 \
7
  PORT=7860 \
8
  HOME=/home/user \
9
+ USE_OLLAMA=false \
10
+ LLAMA_THREADS=2
11
 
12
  # Install system dependencies
13
  RUN apt-get update && apt-get install -y --no-install-recommends \
config.py CHANGED
@@ -5,6 +5,6 @@ VECTOR_DB_PATH = "vector_store/faiss_index"
5
  EMBEDDING_MODEL = "sentence-transformers/all-MiniLM-L6-v2"
6
  CHUNK_SIZE = 500
7
  CHUNK_OVERLAP = 50
8
- MAX_TOKENS = 512
9
  TEMPERATURE = 0.7
10
  USE_OLLAMA = os.getenv("USE_OLLAMA", "True").lower() == "true"
 
5
  EMBEDDING_MODEL = "sentence-transformers/all-MiniLM-L6-v2"
6
  CHUNK_SIZE = 500
7
  CHUNK_OVERLAP = 50
8
+ MAX_TOKENS = 150
9
  TEMPERATURE = 0.7
10
  USE_OLLAMA = os.getenv("USE_OLLAMA", "True").lower() == "true"
llm/inference.py CHANGED
@@ -117,7 +117,7 @@ def _generate_response_ollama(prompt: str) -> str:
117
  "options": {
118
  "num_predict": MAX_TOKENS,
119
  "temperature": TEMPERATURE,
120
- "stop": ["Question:", "<|im_end|>", "<|im_start|>"]
121
  }
122
  }
123
 
@@ -144,7 +144,7 @@ def generate_response(prompt: str) -> str:
144
  prompt,
145
  max_tokens=MAX_TOKENS,
146
  temperature=TEMPERATURE,
147
- stop=["Question:", "<|im_end|>", "<|im_start|>"]
148
  )
149
  text = output["choices"][0]["text"]
150
  return text.strip()
 
117
  "options": {
118
  "num_predict": MAX_TOKENS,
119
  "temperature": TEMPERATURE,
120
+ "stop": ["Question:", "<|im_end|>", "<|im_start|>", "<|endoftext|>", "<|end_of_text|>"]
121
  }
122
  }
123
 
 
144
  prompt,
145
  max_tokens=MAX_TOKENS,
146
  temperature=TEMPERATURE,
147
+ stop=["Question:", "<|im_end|>", "<|im_start|>", "<|endoftext|>", "<|end_of_text|>"]
148
  )
149
  text = output["choices"][0]["text"]
150
  return text.strip()
llm/model_loader.py CHANGED
@@ -36,7 +36,8 @@ def get_llm() -> Llama:
36
  )
37
  try:
38
  cpu_count = os.cpu_count()
39
- threads = max(1, min(4, cpu_count if cpu_count else 2))
 
40
  _llm_instance = Llama(
41
  model_path=MODEL_PATH,
42
  n_ctx=4096,
 
36
  )
37
  try:
38
  cpu_count = os.cpu_count()
39
+ default_threads = max(1, min(4, cpu_count if cpu_count else 2))
40
+ threads = int(os.getenv("LLAMA_THREADS", str(default_threads)))
41
  _llm_instance = Llama(
42
  model_path=MODEL_PATH,
43
  n_ctx=4096,
main.py CHANGED
@@ -1,4 +1,5 @@
1
  import traceback
 
2
 
3
  from fastapi import FastAPI, Request
4
  from fastapi.responses import JSONResponse
@@ -8,8 +9,39 @@ from routes.upload import router as upload_router
8
  from routes.health import router as health_router
9
 
10
  from utils.logger import logger
 
 
 
11
 
12
- app = FastAPI(title="AI Assistant")
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
13
 
14
  app.include_router(chat_router)
15
  app.include_router(upload_router)
 
1
  import traceback
2
+ from contextlib import asynccontextmanager
3
 
4
  from fastapi import FastAPI, Request
5
  from fastapi.responses import JSONResponse
 
9
  from routes.health import router as health_router
10
 
11
  from utils.logger import logger
12
+ from llm.model_loader import get_llm
13
+ from embeddings.embedding_model import get_embedding_model
14
+ from embeddings.vector_store import load_vector_store
15
 
16
+
17
+ @asynccontextmanager
18
+ async def lifespan(app: FastAPI):
19
+ # Eagerly load models on startup
20
+ logger.info("Eagerly loading LLM model on startup...")
21
+ try:
22
+ get_llm()
23
+ logger.info("LLM model loaded successfully!")
24
+ except Exception as e:
25
+ logger.error(f"Error loading LLM model on startup: {e}")
26
+
27
+ logger.info("Eagerly loading embedding model on startup...")
28
+ try:
29
+ get_embedding_model()
30
+ logger.info("Embedding model loaded successfully!")
31
+ except Exception as e:
32
+ logger.error(f"Error loading embedding model on startup: {e}")
33
+
34
+ logger.info("Eagerly loading vector store on startup...")
35
+ try:
36
+ load_vector_store()
37
+ logger.info("Vector store loaded successfully!")
38
+ except Exception as e:
39
+ logger.error(f"Error loading vector store on startup: {e}")
40
+
41
+ yield
42
+
43
+
44
+ app = FastAPI(title="AI Assistant", lifespan=lifespan)
45
 
46
  app.include_router(chat_router)
47
  app.include_router(upload_router)