Anurag Shirke committed on
Commit
43fe2fe
·
1 Parent(s): 56d1864

Raise the container memory limit, which was hindering LLM performance

Browse files
Dockerfile CHANGED
@@ -11,6 +11,9 @@ WORKDIR /app
11
  # Copy the requirements file into the container
12
  COPY requirements.txt .
13
 
 
 
 
14
  # Install any needed packages specified in requirements.txt
15
  RUN pip install --no-cache-dir -r requirements.txt
16
 
 
11
  # Copy the requirements file into the container
12
  COPY requirements.txt .
13
 
14
+ # Set a higher timeout for pip installations
15
+ ENV PIP_DEFAULT_TIMEOUT=1000
16
+
17
  # Install any needed packages specified in requirements.txt
18
  RUN pip install --no-cache-dir -r requirements.txt
19
 
docker-compose.yml CHANGED
@@ -15,7 +15,7 @@ services:
15
  environment:
16
  - QDRANT_HOST=qdrant
17
  - OLLAMA_HOST=ollama
18
- entrypoint: ["/app/scripts/wait-for-qdrant.sh", "qdrant:6333", "--", "uvicorn", "src.main:app", "--host", "0.0.0.0", "--port", "8000"]
19
 
20
  qdrant:
21
  image: qdrant/qdrant:latest
@@ -33,6 +33,7 @@ services:
33
  volumes:
34
  - ./scripts:/app
35
  - ollama_data:/root/.ollama
 
36
 
37
  volumes:
38
  qdrant_data:
 
15
  environment:
16
  - QDRANT_HOST=qdrant
17
  - OLLAMA_HOST=ollama
18
+ entrypoint: ["/app/scripts/wait-for-qdrant.sh", "qdrant:6333", "uvicorn", "src.main:app", "--host", "0.0.0.0", "--port", "8000"]
19
 
20
  qdrant:
21
  image: qdrant/qdrant:latest
 
33
  volumes:
34
  - ./scripts:/app
35
  - ollama_data:/root/.ollama
36
+ mem_limit: 6.5g
37
 
38
  volumes:
39
  qdrant_data:
scripts/ollama_entrypoint.sh CHANGED
@@ -18,7 +18,7 @@ done
18
 
19
  # Pull the model
20
  echo "Ollama server started. Pulling llama3 model..."
21
- ollama pull llama3
22
 
23
  # Wait for the background process to exit
24
  wait $pid
 
18
 
19
  # Pull the model
20
  echo "Ollama server started. Pulling llama3 model..."
21
+ ollama pull phi3
22
 
23
  # Wait for the background process to exit
24
  wait $pid
scripts/wait-for-qdrant.sh CHANGED
@@ -14,4 +14,4 @@ until curl -s -f "$host/healthz" > /dev/null; do
14
  done
15
 
16
  >&2 echo "Qdrant is up - executing command"
17
- exec $cmd
 
14
  done
15
 
16
  >&2 echo "Qdrant is up - executing command"
17
+ exec "$@"
src/__pycache__/main.cpython-311.pyc CHANGED
Binary files a/src/__pycache__/main.cpython-311.pyc and b/src/__pycache__/main.cpython-311.pyc differ
 
src/core/__pycache__/llm.cpython-311.pyc CHANGED
Binary files a/src/core/__pycache__/llm.cpython-311.pyc and b/src/core/__pycache__/llm.cpython-311.pyc differ
 
src/core/__pycache__/vector_store.cpython-311.pyc CHANGED
Binary files a/src/core/__pycache__/vector_store.cpython-311.pyc and b/src/core/__pycache__/vector_store.cpython-311.pyc differ
 
src/core/llm.py CHANGED
@@ -12,7 +12,7 @@ def get_ollama_client():
12
 
13
  def format_prompt(query: str, context: list[dict]) -> str:
14
  """Formats the prompt for the LLM with the retrieved context."""
15
- context_str = "\n".join([item['payload']['text'] for item in context])
16
  prompt = f"""**Instruction**:
17
  Answer the user's query based *only* on the provided context.
18
  If the context does not contain the answer, state that you cannot answer the question with the given information.
 
12
 
13
  def format_prompt(query: str, context: list[dict]) -> str:
14
  """Formats the prompt for the LLM with the retrieved context."""
15
+ context_str = "\n".join([item.payload.get('text') for item in context])
16
  prompt = f"""**Instruction**:
17
  Answer the user's query based *only* on the provided context.
18
  If the context does not contain the answer, state that you cannot answer the question with the given information.
src/core/vector_store.py CHANGED
@@ -29,7 +29,7 @@ def upsert_vectors(client: QdrantClient, collection_name: str, vectors, payloads
29
  client.upsert(
30
  collection_name=collection_name,
31
  points=models.Batch(
32
- ids=None, # Let Qdrant assign IDs
33
  vectors=vectors,
34
  payloads=payloads
35
  ),
 
29
  client.upsert(
30
  collection_name=collection_name,
31
  points=models.Batch(
32
+ ids=list(range(len(vectors))), # Generate sequential integer IDs
33
  vectors=vectors,
34
  payloads=payloads
35
  ),
src/main.py CHANGED
@@ -11,7 +11,7 @@ app = FastAPI()
11
  # --- Constants ---
12
  UPLOADS_DIR = "uploads"
13
  QDRANT_COLLECTION_NAME = "knowledge_base"
14
- OLLAMA_MODEL = "llama3"
15
 
16
  # --- Application Startup ---
17
  # Create uploads directory if it doesn't exist
@@ -87,11 +87,17 @@ def query_knowledge_base(request: QueryRequest):
87
  # 4. Generate a response from the LLM
88
  answer = generate_response(ollama_client, OLLAMA_MODEL, prompt)
89
 
 
 
 
 
 
 
90
  # 5. Extract source documents for citation
91
  source_documents = [
92
  {
93
- "source": result.payload["source"],
94
- "text": result.payload["text"],
95
  "score": result.score
96
  }
97
  for result in search_results
 
11
  # --- Constants ---
12
  UPLOADS_DIR = "uploads"
13
  QDRANT_COLLECTION_NAME = "knowledge_base"
14
+ OLLAMA_MODEL = "tinyllama"
15
 
16
  # --- Application Startup ---
17
  # Create uploads directory if it doesn't exist
 
87
  # 4. Generate a response from the LLM
88
  answer = generate_response(ollama_client, OLLAMA_MODEL, prompt)
89
 
90
+ # Debugging: Print search_results structure
91
+ print(f"Type of search_results: {type(search_results)}")
92
+ if search_results:
93
+ print(f"Type of first element in search_results: {type(search_results[0])}")
94
+ print(f"Content of first element in search_results: {search_results[0]}")
95
+
96
  # 5. Extract source documents for citation
97
  source_documents = [
98
  {
99
+ "source": result.payload.get("source") if result.payload else "Unknown",
100
+ "text": result.payload.get("text") if result.payload else "N/A",
101
  "score": result.score
102
  }
103
  for result in search_results