Spaces:

khubchand
/

ai-assistant-engine

Sleeping

App Files Community

khubchand committed 17 days ago

Commit

d641e1d

1 Parent(s): 717df55

Optimize startup speed and query latency

Browse files

Files changed (5) hide show

config.py +2 -1
documents/6th cse sepm QB.xlsx +0 -0
embeddings/embedding_model.py +27 -3
llm/inference.py +44 -17
rag/prompt_builder.py +1 -1

config.py CHANGED Viewed

@@ -4,4 +4,5 @@ EMBEDDING_MODEL = "sentence-transformers/all-MiniLM-L6-v2"
 CHUNK_SIZE = 500
 CHUNK_OVERLAP = 50
 MAX_TOKENS = 512
-TEMPERATURE = 0.7

 CHUNK_SIZE = 500
 CHUNK_OVERLAP = 50
 MAX_TOKENS = 512
+TEMPERATURE = 0.7
+USE_OLLAMA = True

documents/6th cse sepm QB.xlsx DELETED Viewed

Binary file (16 kB)

embeddings/embedding_model.py CHANGED Viewed

@@ -2,6 +2,30 @@ from langchain_huggingface import HuggingFaceEmbeddings
 from config import EMBEDDING_MODEL
-embedding_model = HuggingFaceEmbeddings(
-    model_name=EMBEDDING_MODEL
-)

 from config import EMBEDDING_MODEL
+_embedding_model_instance = None
+def get_embedding_model() -> HuggingFaceEmbeddings:
+    global _embedding_model_instance
+    if _embedding_model_instance is None:
+        _embedding_model_instance = HuggingFaceEmbeddings(
+            model_name=EMBEDDING_MODEL,
+            model_kwargs={"local_files_only": True}
+        )
+    return _embedding_model_instance
+from langchain_core.embeddings import Embeddings
+class LazyEmbeddingModel(Embeddings):
+    def __getattr__(self, name):
+        return getattr(get_embedding_model(), name)
+    def embed_documents(self, texts, *args, **kwargs):
+        return get_embedding_model().embed_documents(texts, *args, **kwargs)
+    def embed_query(self, text, *args, **kwargs):
+        return get_embedding_model().embed_query(text, *args, **kwargs)
+    def __call__(self, text, *args, **kwargs):
+        return get_embedding_model().embed_query(text, *args, **kwargs)
+embedding_model = LazyEmbeddingModel()

llm/inference.py CHANGED Viewed

@@ -4,16 +4,22 @@ import time
 import requests
 from fastapi import HTTPException
 from llm.model_loader import get_llm
-from config import MAX_TOKENS, TEMPERATURE
 OLLAMA_API_URL = "http://localhost:11434"
 OLLAMA_MODEL_NAME = "qwen-local"
 def _ensure_ollama_ready():
     """
     Ensure the Ollama server is running and the custom model is registered.
     """
     # 1. Check if Ollama server is running
     server_running = False
     try:
@@ -94,6 +100,7 @@ def _ensure_ollama_ready():
                 raise RuntimeError(f"Ollama create failed: {res.stderr}")
         except Exception as e:
             raise RuntimeError(f"Failed to register model in Ollama: {str(e)}")
 def _generate_response_ollama(prompt: str) -> str:
@@ -110,7 +117,7 @@ def _generate_response_ollama(prompt: str) -> str:
         "options": {
             "num_predict": MAX_TOKENS,
             "temperature": TEMPERATURE,
-            "stop": ["Question:"]
         }
     }
@@ -120,9 +127,15 @@ def _generate_response_ollama(prompt: str) -> str:
 def generate_response(prompt: str) -> str:
-    use_ollama_fallback = False
-    # Try using llama-cpp-python first
     try:
         llm = get_llm()
@@ -131,7 +144,7 @@ def generate_response(prompt: str) -> str:
             prompt,
             max_tokens=MAX_TOKENS,
             temperature=TEMPERATURE,
-            stop=["Question:"]
         )
         text = output["choices"][0]["text"]
         return text.strip()
@@ -150,21 +163,35 @@ def generate_response(prompt: str) -> str:
         # OSError: WinError 0xc000001d / illegal instruction -> trigger Ollama fallback
         print(f"\n  [WARNING] llama-cpp-python failed due to hardware compatibility issue: {e}")
-        print("  --> Falling back to Ollama local inference...\n")
-        use_ollama_fallback = True
     except Exception as e:
         # Catch any other initialization or execution errors
         print(f"\n  [WARNING] llama-cpp-python failed: {e}")
-        print("  --> Falling back to Ollama local inference...\n")
-        use_ollama_fallback = True
-    # Ollama Fallback Path
-    if use_ollama_fallback:
-        try:
-            return _generate_response_ollama(prompt)
-        except Exception as e:
             raise HTTPException(
                 status_code=500,
-                detail=f"Both llama-cpp-python and Ollama fallback failed. Ollama error: {str(e)}"
             )

 import requests
 from fastapi import HTTPException
 from llm.model_loader import get_llm
+from config import MAX_TOKENS, TEMPERATURE, USE_OLLAMA
 OLLAMA_API_URL = "http://localhost:11434"
 OLLAMA_MODEL_NAME = "qwen-local"
+_ollama_ready = False
 def _ensure_ollama_ready():
     """
     Ensure the Ollama server is running and the custom model is registered.
     """
+    global _ollama_ready
+    if _ollama_ready:
+        return
     # 1. Check if Ollama server is running
     server_running = False
     try:
                 raise RuntimeError(f"Ollama create failed: {res.stderr}")
         except Exception as e:
             raise RuntimeError(f"Failed to register model in Ollama: {str(e)}")
+    _ollama_ready = True
 def _generate_response_ollama(prompt: str) -> str:
         "options": {
             "num_predict": MAX_TOKENS,
             "temperature": TEMPERATURE,
+            "stop": ["Question:", "<|im_end|>", "<|im_start|>"]
         }
     }
 def generate_response(prompt: str) -> str:
+    # 1. If USE_OLLAMA is True, prioritize Ollama
+    if USE_OLLAMA:
+        try:
+            return _generate_response_ollama(prompt)
+        except Exception as e:
+            print(f"\n  [WARNING] Ollama inference failed: {e}")
+            print("  --> Falling back to llama-cpp-python...\n")
+    # 2. Try using llama-cpp-python
     try:
         llm = get_llm()
             prompt,
             max_tokens=MAX_TOKENS,
             temperature=TEMPERATURE,
+            stop=["Question:", "<|im_end|>", "<|im_start|>"]
         )
         text = output["choices"][0]["text"]
         return text.strip()
         # OSError: WinError 0xc000001d / illegal instruction -> trigger Ollama fallback
         print(f"\n  [WARNING] llama-cpp-python failed due to hardware compatibility issue: {e}")
+        if not USE_OLLAMA:
+            print("  --> Falling back to Ollama local inference...\n")
+            try:
+                return _generate_response_ollama(prompt)
+            except Exception as ex:
+                raise HTTPException(
+                    status_code=500,
+                    detail=f"Both llama-cpp-python and Ollama fallback failed. Ollama error: {str(ex)}"
+                )
+        else:
+            raise HTTPException(
+                status_code=500,
+                detail=f"llama-cpp-python failed and Ollama was already tried. llama-cpp error: {str(e)}"
+            )
     except Exception as e:
         # Catch any other initialization or execution errors
         print(f"\n  [WARNING] llama-cpp-python failed: {e}")
+        if not USE_OLLAMA:
+            print("  --> Falling back to Ollama local inference...\n")
+            try:
+                return _generate_response_ollama(prompt)
+            except Exception as ex:
+                raise HTTPException(
+                    status_code=500,
+                    detail=f"Both llama-cpp-python and Ollama fallback failed. Ollama error: {str(ex)}"
+                )
+        else:
             raise HTTPException(
                 status_code=500,
+                detail=f"llama-cpp-python failed and Ollama was already tried. llama-cpp error: {str(e)}"
             )

rag/prompt_builder.py CHANGED Viewed

@@ -1,5 +1,5 @@
 SYSTEM_PROMPT = """You are a helpful AI assistant.
-Answer only from provided context.
 If answer is not available, say you don't know."""

 SYSTEM_PROMPT = """You are a helpful AI assistant.
+Answer only from provided context. Keep your answers brief, direct, and under 3 sentences.
 If answer is not available, say you don't know."""