Commit ·
1d9404d
1
Parent(s): 813ae62
Added .docx and .txt support
Browse files
- Dockerfile +1 -0
- docker-compose.yml +8 -2
- pull_model.sh +0 -22
- requirements.txt +1 -0
- scripts/ollama_entrypoint.sh +2 -2
- scripts/wait-for-qdrant.sh +1 -1
- src/__pycache__/main.cpython-311.pyc +0 -0
- src/__pycache__/main.cpython-312.pyc +0 -0
- src/core/__pycache__/llm.cpython-311.pyc +0 -0
- src/core/__pycache__/models.cpython-311.pyc +0 -0
- src/core/__pycache__/processing.cpython-311.pyc +0 -0
- src/core/__pycache__/vector_store.cpython-311.pyc +0 -0
- src/core/processing.py +27 -1
- src/main.py +7 -6
Dockerfile
CHANGED
|
@@ -19,6 +19,7 @@ RUN pip install --no-cache-dir -r requirements.txt
|
|
| 19 |
|
| 20 |
# Copy the application code into the container
|
| 21 |
COPY ./src /app/src
|
|
|
|
| 22 |
|
| 23 |
# Expose port 8000 to allow communication to the Uvicorn server
|
| 24 |
EXPOSE 8000
|
|
|
|
| 19 |
|
| 20 |
# Copy the application code into the container
|
| 21 |
COPY ./src /app/src
|
| 22 |
+
COPY ./scripts /app/scripts
|
| 23 |
|
| 24 |
# Expose port 8000 to allow communication to the Uvicorn server
|
| 25 |
EXPOSE 8000
|
docker-compose.yml
CHANGED
|
@@ -1,4 +1,3 @@
|
|
| 1 |
-
|
| 2 |
version: '3.8'
|
| 3 |
|
| 4 |
services:
|
|
@@ -34,7 +33,14 @@ services:
|
|
| 34 |
- ./scripts:/app
|
| 35 |
- ollama_data:/root/.ollama
|
| 36 |
mem_limit: 6.5g
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 37 |
|
| 38 |
volumes:
|
| 39 |
qdrant_data:
|
| 40 |
-
ollama_data:
|
|
|
|
|
|
|
| 1 |
version: '3.8'
|
| 2 |
|
| 3 |
services:
|
|
|
|
| 33 |
- ./scripts:/app
|
| 34 |
- ollama_data:/root/.ollama
|
| 35 |
mem_limit: 6.5g
|
| 36 |
+
deploy:
|
| 37 |
+
resources:
|
| 38 |
+
reservations:
|
| 39 |
+
devices:
|
| 40 |
+
- driver: nvidia
|
| 41 |
+
count: 1
|
| 42 |
+
capabilities: [gpu]
|
| 43 |
|
| 44 |
volumes:
|
| 45 |
qdrant_data:
|
| 46 |
+
ollama_data:
|
pull_model.sh
DELETED
|
@@ -1,22 +0,0 @@
|
|
| 1 |
-
#!/bin/bash
|
| 2 |
-
|
| 3 |
-
# This script automates pulling a model into the Ollama container.
|
| 4 |
-
|
| 5 |
-
# The name of the model to pull
|
| 6 |
-
MODEL_NAME="llama3"
|
| 7 |
-
|
| 8 |
-
# The name of the ollama service in docker-compose.yml
|
| 9 |
-
OLLAMA_SERVICE_NAME="ollama"
|
| 10 |
-
|
| 11 |
-
# Check if the container is running
|
| 12 |
-
if ! docker-compose ps -q $OLLAMA_SERVICE_NAME > /dev/null 2>&1; then
|
| 13 |
-
echo "Ollama container is not running. Please start it with 'docker-compose up -d'"
|
| 14 |
-
exit 1
|
| 15 |
-
fi
|
| 16 |
-
|
| 17 |
-
echo "Pulling the $MODEL_NAME model into the Ollama container..."
|
| 18 |
-
echo "This may take a while depending on your internet connection."
|
| 19 |
-
|
| 20 |
-
docker-compose exec $OLLAMA_SERVICE_NAME ollama pull $MODEL_NAME
|
| 21 |
-
|
| 22 |
-
echo "Model pull complete."
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
requirements.txt
CHANGED
|
@@ -15,3 +15,4 @@ python-jose[cryptography]
|
|
| 15 |
pydantic-settings
|
| 16 |
sqlalchemy
|
| 17 |
psycopg2-binary
|
|
|
|
|
|
| 15 |
pydantic-settings
|
| 16 |
sqlalchemy
|
| 17 |
psycopg2-binary
|
| 18 |
+
python-docx
|
scripts/ollama_entrypoint.sh
CHANGED
|
@@ -18,7 +18,7 @@ done
|
|
| 18 |
|
| 19 |
# Pull the model
|
| 20 |
echo "Ollama server started. Pulling llama3 model..."
|
| 21 |
-
ollama pull
|
| 22 |
|
| 23 |
# Wait for the background process to exit
|
| 24 |
-
wait $pid
|
|
|
|
| 18 |
|
| 19 |
# Pull the model
|
| 20 |
echo "Ollama server started. Pulling llama3 model..."
|
| 21 |
+
ollama pull llama3
|
| 22 |
|
| 23 |
# Wait for the background process to exit
|
| 24 |
+
wait $pid
|
scripts/wait-for-qdrant.sh
CHANGED
|
@@ -14,4 +14,4 @@ until curl -s -f "$host/healthz" > /dev/null; do
|
|
| 14 |
done
|
| 15 |
|
| 16 |
>&2 echo "Qdrant is up - executing command"
|
| 17 |
-
exec "$@"
|
|
|
|
| 14 |
done
|
| 15 |
|
| 16 |
>&2 echo "Qdrant is up - executing command"
|
| 17 |
+
exec "$@"
|
src/__pycache__/main.cpython-311.pyc
CHANGED
|
Binary files a/src/__pycache__/main.cpython-311.pyc and b/src/__pycache__/main.cpython-311.pyc differ
|
|
|
src/__pycache__/main.cpython-312.pyc
DELETED
|
Binary file (448 Bytes)
|
|
|
src/core/__pycache__/llm.cpython-311.pyc
CHANGED
|
Binary files a/src/core/__pycache__/llm.cpython-311.pyc and b/src/core/__pycache__/llm.cpython-311.pyc differ
|
|
|
src/core/__pycache__/models.cpython-311.pyc
CHANGED
|
Binary files a/src/core/__pycache__/models.cpython-311.pyc and b/src/core/__pycache__/models.cpython-311.pyc differ
|
|
|
src/core/__pycache__/processing.cpython-311.pyc
CHANGED
|
Binary files a/src/core/__pycache__/processing.cpython-311.pyc and b/src/core/__pycache__/processing.cpython-311.pyc differ
|
|
|
src/core/__pycache__/vector_store.cpython-311.pyc
CHANGED
|
Binary files a/src/core/__pycache__/vector_store.cpython-311.pyc and b/src/core/__pycache__/vector_store.cpython-311.pyc differ
|
|
|
src/core/processing.py
CHANGED
|
@@ -1,6 +1,7 @@
|
|
| 1 |
import fitz # PyMuPDF
|
| 2 |
from langchain.text_splitter import RecursiveCharacterTextSplitter
|
| 3 |
from sentence_transformers import SentenceTransformer
|
|
|
|
| 4 |
|
| 5 |
def parse_pdf(file_path: str) -> str:
|
| 6 |
"""Extracts text from a PDF file."""
|
|
@@ -11,6 +12,31 @@ def parse_pdf(file_path: str) -> str:
|
|
| 11 |
doc.close()
|
| 12 |
return text
|
| 13 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 14 |
def chunk_text(text: str) -> list[str]:
|
| 15 |
"""Splits text into smaller chunks."""
|
| 16 |
text_splitter = RecursiveCharacterTextSplitter(
|
|
@@ -22,4 +48,4 @@ def chunk_text(text: str) -> list[str]:
|
|
| 22 |
|
| 23 |
def get_embedding_model(model_name: str = 'all-MiniLM-L6-v2'):
    """Load and return the sentence-transformers embedding model.

    Args:
        model_name: Hugging Face model identifier; defaults to the
            lightweight 'all-MiniLM-L6-v2' encoder.
    """
    model = SentenceTransformer(model_name)
    return model
|
|
|
|
| 1 |
import fitz # PyMuPDF
|
| 2 |
from langchain.text_splitter import RecursiveCharacterTextSplitter
|
| 3 |
from sentence_transformers import SentenceTransformer
|
| 4 |
+
import docx # Added for docx parsing
|
| 5 |
|
| 6 |
def parse_pdf(file_path: str) -> str:
|
| 7 |
"""Extracts text from a PDF file."""
|
|
|
|
| 12 |
doc.close()
|
| 13 |
return text
|
| 14 |
|
| 15 |
+
def parse_txt(file_path: str) -> str:
    """Extracts text from a TXT file.

    The file is read as UTF-8 text; a UnicodeDecodeError propagates to
    the caller if the file is not valid UTF-8.
    """
    with open(file_path, 'r', encoding='utf-8') as handle:
        return handle.read()
|
| 20 |
+
|
| 21 |
+
def parse_docx(file_path: str) -> str:
    """Extracts text from a DOCX file.

    Joins every paragraph's text with newlines, matching the flat-text
    output shape of the other parsers.
    """
    document = docx.Document(file_path)
    # One paragraph per line; empty paragraphs contribute empty lines.
    return '\n'.join(paragraph.text for paragraph in document.paragraphs)
|
| 28 |
+
|
| 29 |
+
def parse_document(file_path: str, file_extension: str) -> str:
    """Dispatches to the correct parser based on file extension.

    Args:
        file_path: Path of the uploaded file on disk.
        file_extension: Lowercased extension including the dot
            (e.g. ".pdf").

    Returns:
        The extracted plain text.

    Raises:
        ValueError: If the extension is not one of .pdf, .txt, .docx.
    """
    if file_extension == ".pdf":
        return parse_pdf(file_path)
    if file_extension == ".txt":
        return parse_txt(file_path)
    if file_extension == ".docx":
        return parse_docx(file_path)
    raise ValueError(f"Unsupported file type: {file_extension}")
|
| 39 |
+
|
| 40 |
def chunk_text(text: str) -> list[str]:
|
| 41 |
"""Splits text into smaller chunks."""
|
| 42 |
text_splitter = RecursiveCharacterTextSplitter(
|
|
|
|
| 48 |
|
| 49 |
def get_embedding_model(model_name: str = 'all-MiniLM-L6-v2'):
|
| 50 |
"""Loads the sentence-transformer model."""
|
| 51 |
+
return SentenceTransformer(model_name)
|
src/main.py
CHANGED
|
@@ -1,7 +1,7 @@
|
|
| 1 |
from fastapi import FastAPI, UploadFile, File, HTTPException
|
| 2 |
import shutil
|
| 3 |
import os
|
| 4 |
-
from .core.processing import
|
| 5 |
from .core.vector_store import get_qdrant_client, create_collection_if_not_exists, upsert_vectors, search_vectors
|
| 6 |
from .core.llm import get_ollama_client, format_prompt, generate_response
|
| 7 |
from .core.models import QueryRequest, QueryResponse
|
|
@@ -11,7 +11,7 @@ app = FastAPI()
|
|
| 11 |
# --- Constants ---
|
| 12 |
UPLOADS_DIR = "uploads"
|
| 13 |
QDRANT_COLLECTION_NAME = "knowledge_base"
|
| 14 |
-
OLLAMA_MODEL = "
|
| 15 |
|
| 16 |
# --- Application Startup ---
|
| 17 |
# Create uploads directory if it doesn't exist
|
|
@@ -32,8 +32,9 @@ create_collection_if_not_exists(qdrant_client, QDRANT_COLLECTION_NAME, embedding
|
|
| 32 |
# --- API Endpoints ---
|
| 33 |
@app.post("/upload")
|
| 34 |
def upload_file(file: UploadFile = File(...)):
|
| 35 |
-
|
| 36 |
-
|
|
|
|
| 37 |
|
| 38 |
file_path = os.path.join(UPLOADS_DIR, file.filename)
|
| 39 |
|
|
@@ -44,7 +45,7 @@ def upload_file(file: UploadFile = File(...)):
|
|
| 44 |
raise HTTPException(status_code=500, detail=f"Error saving file: {e}")
|
| 45 |
|
| 46 |
try:
|
| 47 |
-
text =
|
| 48 |
if not text.strip():
|
| 49 |
raise HTTPException(status_code=400, detail="Could not extract text from the PDF.")
|
| 50 |
|
|
@@ -110,4 +111,4 @@ def query_knowledge_base(request: QueryRequest):
|
|
| 110 |
|
| 111 |
@app.get("/health")
|
| 112 |
def health_check():
|
| 113 |
-
return {"status": "ok"}
|
|
|
|
| 1 |
from fastapi import FastAPI, UploadFile, File, HTTPException
|
| 2 |
import shutil
|
| 3 |
import os
|
| 4 |
+
from .core.processing import parse_document, chunk_text, get_embedding_model
|
| 5 |
from .core.vector_store import get_qdrant_client, create_collection_if_not_exists, upsert_vectors, search_vectors
|
| 6 |
from .core.llm import get_ollama_client, format_prompt, generate_response
|
| 7 |
from .core.models import QueryRequest, QueryResponse
|
|
|
|
| 11 |
# --- Constants ---
|
| 12 |
UPLOADS_DIR = "uploads"
|
| 13 |
QDRANT_COLLECTION_NAME = "knowledge_base"
|
| 14 |
+
OLLAMA_MODEL = "llama3"
|
| 15 |
|
| 16 |
# --- Application Startup ---
|
| 17 |
# Create uploads directory if it doesn't exist
|
|
|
|
| 32 |
# --- API Endpoints ---
|
| 33 |
@app.post("/upload")
|
| 34 |
def upload_file(file: UploadFile = File(...)):
|
| 35 |
+
file_extension = os.path.splitext(file.filename)[1].lower()
|
| 36 |
+
if file_extension not in [".pdf", ".txt", ".docx"]:
|
| 37 |
+
raise HTTPException(status_code=400, detail="Invalid file type. Only PDF, TXT, and DOCX files are supported.")
|
| 38 |
|
| 39 |
file_path = os.path.join(UPLOADS_DIR, file.filename)
|
| 40 |
|
|
|
|
| 45 |
raise HTTPException(status_code=500, detail=f"Error saving file: {e}")
|
| 46 |
|
| 47 |
try:
|
| 48 |
+
text = parse_document(file_path, file_extension)
|
| 49 |
if not text.strip():
|
| 50 |
raise HTTPException(status_code=400, detail="Could not extract text from the PDF.")
|
| 51 |
|
|
|
|
| 111 |
|
| 112 |
@app.get("/health")
|
| 113 |
def health_check():
|
| 114 |
+
return {"status": "ok"}
|