khubchand committed on
Commit
d641e1d
·
1 Parent(s): 717df55

Optimize startup speed and query latency

Browse files
config.py CHANGED
@@ -4,4 +4,5 @@ EMBEDDING_MODEL = "sentence-transformers/all-MiniLM-L6-v2"
4
  CHUNK_SIZE = 500
5
  CHUNK_OVERLAP = 50
6
  MAX_TOKENS = 512
7
- TEMPERATURE = 0.7
 
 
4
  CHUNK_SIZE = 500
5
  CHUNK_OVERLAP = 50
6
  MAX_TOKENS = 512
7
+ TEMPERATURE = 0.7
8
+ USE_OLLAMA = True
documents/6th cse sepm QB.xlsx DELETED
Binary file (16 kB)
 
embeddings/embedding_model.py CHANGED
@@ -2,6 +2,30 @@ from langchain_huggingface import HuggingFaceEmbeddings
2
  from config import EMBEDDING_MODEL
3
 
4
 
5
- embedding_model = HuggingFaceEmbeddings(
6
- model_name=EMBEDDING_MODEL
7
- )
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
2
  from config import EMBEDDING_MODEL
3
 
4
 
5
+ _embedding_model_instance = None
6
+
7
+ def get_embedding_model() -> HuggingFaceEmbeddings:
8
+ global _embedding_model_instance
9
+ if _embedding_model_instance is None:
10
+ _embedding_model_instance = HuggingFaceEmbeddings(
11
+ model_name=EMBEDDING_MODEL,
12
+ model_kwargs={"local_files_only": True}
13
+ )
14
+ return _embedding_model_instance
15
+
16
+ from langchain_core.embeddings import Embeddings
17
+
18
+ class LazyEmbeddingModel(Embeddings):
19
+ def __getattr__(self, name):
20
+ return getattr(get_embedding_model(), name)
21
+
22
+ def embed_documents(self, texts, *args, **kwargs):
23
+ return get_embedding_model().embed_documents(texts, *args, **kwargs)
24
+
25
+ def embed_query(self, text, *args, **kwargs):
26
+ return get_embedding_model().embed_query(text, *args, **kwargs)
27
+
28
+ def __call__(self, text, *args, **kwargs):
29
+ return get_embedding_model().embed_query(text, *args, **kwargs)
30
+
31
+ embedding_model = LazyEmbeddingModel()
llm/inference.py CHANGED
@@ -4,16 +4,22 @@ import time
4
  import requests
5
  from fastapi import HTTPException
6
  from llm.model_loader import get_llm
7
- from config import MAX_TOKENS, TEMPERATURE
8
 
9
  OLLAMA_API_URL = "http://localhost:11434"
10
  OLLAMA_MODEL_NAME = "qwen-local"
11
 
12
 
 
 
 
13
  def _ensure_ollama_ready():
14
  """
15
  Ensure the Ollama server is running and the custom model is registered.
16
  """
 
 
 
17
  # 1. Check if Ollama server is running
18
  server_running = False
19
  try:
@@ -94,6 +100,7 @@ def _ensure_ollama_ready():
94
  raise RuntimeError(f"Ollama create failed: {res.stderr}")
95
  except Exception as e:
96
  raise RuntimeError(f"Failed to register model in Ollama: {str(e)}")
 
97
 
98
 
99
  def _generate_response_ollama(prompt: str) -> str:
@@ -110,7 +117,7 @@ def _generate_response_ollama(prompt: str) -> str:
110
  "options": {
111
  "num_predict": MAX_TOKENS,
112
  "temperature": TEMPERATURE,
113
- "stop": ["Question:"]
114
  }
115
  }
116
 
@@ -120,9 +127,15 @@ def _generate_response_ollama(prompt: str) -> str:
120
 
121
 
122
  def generate_response(prompt: str) -> str:
123
- use_ollama_fallback = False
124
-
125
- # Try using llama-cpp-python first
 
 
 
 
 
 
126
  try:
127
  llm = get_llm()
128
 
@@ -131,7 +144,7 @@ def generate_response(prompt: str) -> str:
131
  prompt,
132
  max_tokens=MAX_TOKENS,
133
  temperature=TEMPERATURE,
134
- stop=["Question:"]
135
  )
136
  text = output["choices"][0]["text"]
137
  return text.strip()
@@ -150,21 +163,35 @@ def generate_response(prompt: str) -> str:
150
 
151
  # OSError: WinError 0xc000001d / illegal instruction -> trigger Ollama fallback
152
  print(f"\n [WARNING] llama-cpp-python failed due to hardware compatibility issue: {e}")
153
- print(" --> Falling back to Ollama local inference...\n")
154
- use_ollama_fallback = True
 
 
 
 
 
 
 
 
 
 
 
 
155
 
156
  except Exception as e:
157
  # Catch any other initialization or execution errors
158
  print(f"\n [WARNING] llama-cpp-python failed: {e}")
159
- print(" --> Falling back to Ollama local inference...\n")
160
- use_ollama_fallback = True
161
-
162
- # Ollama Fallback Path
163
- if use_ollama_fallback:
164
- try:
165
- return _generate_response_ollama(prompt)
166
- except Exception as e:
 
 
167
  raise HTTPException(
168
  status_code=500,
169
- detail=f"Both llama-cpp-python and Ollama fallback failed. Ollama error: {str(e)}"
170
  )
 
4
  import requests
5
  from fastapi import HTTPException
6
  from llm.model_loader import get_llm
7
+ from config import MAX_TOKENS, TEMPERATURE, USE_OLLAMA
8
 
9
  OLLAMA_API_URL = "http://localhost:11434"
10
  OLLAMA_MODEL_NAME = "qwen-local"
11
 
12
 
13
+ _ollama_ready = False
14
+
15
+
16
  def _ensure_ollama_ready():
17
  """
18
  Ensure the Ollama server is running and the custom model is registered.
19
  """
20
+ global _ollama_ready
21
+ if _ollama_ready:
22
+ return
23
  # 1. Check if Ollama server is running
24
  server_running = False
25
  try:
 
100
  raise RuntimeError(f"Ollama create failed: {res.stderr}")
101
  except Exception as e:
102
  raise RuntimeError(f"Failed to register model in Ollama: {str(e)}")
103
+ _ollama_ready = True
104
 
105
 
106
  def _generate_response_ollama(prompt: str) -> str:
 
117
  "options": {
118
  "num_predict": MAX_TOKENS,
119
  "temperature": TEMPERATURE,
120
+ "stop": ["Question:", "<|im_end|>", "<|im_start|>"]
121
  }
122
  }
123
 
 
127
 
128
 
129
  def generate_response(prompt: str) -> str:
130
+ # 1. If USE_OLLAMA is True, prioritize Ollama
131
+ if USE_OLLAMA:
132
+ try:
133
+ return _generate_response_ollama(prompt)
134
+ except Exception as e:
135
+ print(f"\n [WARNING] Ollama inference failed: {e}")
136
+ print(" --> Falling back to llama-cpp-python...\n")
137
+
138
+ # 2. Try using llama-cpp-python
139
  try:
140
  llm = get_llm()
141
 
 
144
  prompt,
145
  max_tokens=MAX_TOKENS,
146
  temperature=TEMPERATURE,
147
+ stop=["Question:", "<|im_end|>", "<|im_start|>"]
148
  )
149
  text = output["choices"][0]["text"]
150
  return text.strip()
 
163
 
164
  # OSError: WinError 0xc000001d / illegal instruction -> trigger Ollama fallback
165
  print(f"\n [WARNING] llama-cpp-python failed due to hardware compatibility issue: {e}")
166
+ if not USE_OLLAMA:
167
+ print(" --> Falling back to Ollama local inference...\n")
168
+ try:
169
+ return _generate_response_ollama(prompt)
170
+ except Exception as ex:
171
+ raise HTTPException(
172
+ status_code=500,
173
+ detail=f"Both llama-cpp-python and Ollama fallback failed. Ollama error: {str(ex)}"
174
+ )
175
+ else:
176
+ raise HTTPException(
177
+ status_code=500,
178
+ detail=f"llama-cpp-python failed and Ollama was already tried. llama-cpp error: {str(e)}"
179
+ )
180
 
181
  except Exception as e:
182
  # Catch any other initialization or execution errors
183
  print(f"\n [WARNING] llama-cpp-python failed: {e}")
184
+ if not USE_OLLAMA:
185
+ print(" --> Falling back to Ollama local inference...\n")
186
+ try:
187
+ return _generate_response_ollama(prompt)
188
+ except Exception as ex:
189
+ raise HTTPException(
190
+ status_code=500,
191
+ detail=f"Both llama-cpp-python and Ollama fallback failed. Ollama error: {str(ex)}"
192
+ )
193
+ else:
194
  raise HTTPException(
195
  status_code=500,
196
+ detail=f"llama-cpp-python failed and Ollama was already tried. llama-cpp error: {str(e)}"
197
  )
rag/prompt_builder.py CHANGED
@@ -1,5 +1,5 @@
1
  SYSTEM_PROMPT = """You are a helpful AI assistant.
2
- Answer only from provided context.
3
  If answer is not available, say you don't know."""
4
 
5
 
 
1
  SYSTEM_PROMPT = """You are a helpful AI assistant.
2
+ Answer only from provided context. Keep your answers brief, direct, and under 3 sentences.
3
  If answer is not available, say you don't know."""
4
 
5