Commit ·
1d9404d
1
Parent(s): 813ae62
Added .docx and .txt support
Browse files
- Dockerfile +1 -0
- docker-compose.yml +8 -2
- pull_model.sh +0 -22
- requirements.txt +1 -0
- scripts/ollama_entrypoint.sh +2 -2
- scripts/wait-for-qdrant.sh +1 -1
- src/__pycache__/main.cpython-311.pyc +0 -0
- src/__pycache__/main.cpython-312.pyc +0 -0
- src/core/__pycache__/llm.cpython-311.pyc +0 -0
- src/core/__pycache__/models.cpython-311.pyc +0 -0
- src/core/__pycache__/processing.cpython-311.pyc +0 -0
- src/core/__pycache__/vector_store.cpython-311.pyc +0 -0
- src/core/processing.py +27 -1
- src/main.py +7 -6
Dockerfile
CHANGED
|
@@ -19,6 +19,7 @@ RUN pip install --no-cache-dir -r requirements.txt
|
|
| 19 |
|
| 20 |
# Copy the application code into the container
|
| 21 |
COPY ./src /app/src
|
|
|
|
| 22 |
|
| 23 |
# Expose port 8000 to allow communication to the Uvicorn server
|
| 24 |
EXPOSE 8000
|
|
|
|
| 19 |
|
| 20 |
# Copy the application code into the container
|
| 21 |
COPY ./src /app/src
|
| 22 |
+
COPY ./scripts /app/scripts
|
| 23 |
|
| 24 |
# Expose port 8000 to allow communication to the Uvicorn server
|
| 25 |
EXPOSE 8000
|
docker-compose.yml
CHANGED
|
@@ -1,4 +1,3 @@
|
|
| 1 |
-
|
| 2 |
version: '3.8'
|
| 3 |
|
| 4 |
services:
|
|
@@ -34,7 +33,14 @@ services:
|
|
| 34 |
- ./scripts:/app
|
| 35 |
- ollama_data:/root/.ollama
|
| 36 |
mem_limit: 6.5g
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 37 |
|
| 38 |
volumes:
|
| 39 |
qdrant_data:
|
| 40 |
-
ollama_data:
|
|
|
|
|
|
|
| 1 |
version: '3.8'
|
| 2 |
|
| 3 |
services:
|
|
|
|
| 33 |
- ./scripts:/app
|
| 34 |
- ollama_data:/root/.ollama
|
| 35 |
mem_limit: 6.5g
|
| 36 |
+
deploy:
|
| 37 |
+
resources:
|
| 38 |
+
reservations:
|
| 39 |
+
devices:
|
| 40 |
+
- driver: nvidia
|
| 41 |
+
count: 1
|
| 42 |
+
capabilities: [gpu]
|
| 43 |
|
| 44 |
volumes:
|
| 45 |
qdrant_data:
|
| 46 |
+
ollama_data:
|
pull_model.sh
DELETED
|
@@ -1,22 +0,0 @@
|
|
| 1 |
-
#!/bin/bash
|
| 2 |
-
|
| 3 |
-
# This script automates pulling a model into the Ollama container.
|
| 4 |
-
|
| 5 |
-
# The name of the model to pull
|
| 6 |
-
MODEL_NAME="llama3"
|
| 7 |
-
|
| 8 |
-
# The name of the ollama service in docker-compose.yml
|
| 9 |
-
OLLAMA_SERVICE_NAME="ollama"
|
| 10 |
-
|
| 11 |
-
# Check if the container is running
|
| 12 |
-
if ! docker-compose ps -q $OLLAMA_SERVICE_NAME > /dev/null 2>&1; then
|
| 13 |
-
echo "Ollama container is not running. Please start it with 'docker-compose up -d'"
|
| 14 |
-
exit 1
|
| 15 |
-
fi
|
| 16 |
-
|
| 17 |
-
echo "Pulling the $MODEL_NAME model into the Ollama container..."
|
| 18 |
-
echo "This may take a while depending on your internet connection."
|
| 19 |
-
|
| 20 |
-
docker-compose exec $OLLAMA_SERVICE_NAME ollama pull $MODEL_NAME
|
| 21 |
-
|
| 22 |
-
echo "Model pull complete."
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
requirements.txt
CHANGED
|
@@ -15,3 +15,4 @@ python-jose[cryptography]
|
|
| 15 |
pydantic-settings
|
| 16 |
sqlalchemy
|
| 17 |
psycopg2-binary
|
|
|
|
|
|
| 15 |
pydantic-settings
|
| 16 |
sqlalchemy
|
| 17 |
psycopg2-binary
|
| 18 |
+
python-docx
|
scripts/ollama_entrypoint.sh
CHANGED
|
@@ -18,7 +18,7 @@ done
|
|
| 18 |
|
| 19 |
# Pull the model
|
| 20 |
echo "Ollama server started. Pulling llama3 model..."
|
| 21 |
-
ollama pull
|
| 22 |
|
| 23 |
# Wait for the background process to exit
|
| 24 |
-
wait $pid
|
|
|
|
| 18 |
|
| 19 |
# Pull the model
|
| 20 |
echo "Ollama server started. Pulling llama3 model..."
|
| 21 |
+
ollama pull llama3
|
| 22 |
|
| 23 |
# Wait for the background process to exit
|
| 24 |
+
wait $pid
|
scripts/wait-for-qdrant.sh
CHANGED
|
@@ -14,4 +14,4 @@ until curl -s -f "$host/healthz" > /dev/null; do
|
|
| 14 |
done
|
| 15 |
|
| 16 |
>&2 echo "Qdrant is up - executing command"
|
| 17 |
-
exec "$@"
|
|
|
|
| 14 |
done
|
| 15 |
|
| 16 |
>&2 echo "Qdrant is up - executing command"
|
| 17 |
+
exec "$@"
|
src/__pycache__/main.cpython-311.pyc
CHANGED
|
Binary files a/src/__pycache__/main.cpython-311.pyc and b/src/__pycache__/main.cpython-311.pyc differ
|
|
|
src/__pycache__/main.cpython-312.pyc
DELETED
|
Binary file (448 Bytes)
|
|
|
src/core/__pycache__/llm.cpython-311.pyc
CHANGED
|
Binary files a/src/core/__pycache__/llm.cpython-311.pyc and b/src/core/__pycache__/llm.cpython-311.pyc differ
|
|
|
src/core/__pycache__/models.cpython-311.pyc
CHANGED
|
Binary files a/src/core/__pycache__/models.cpython-311.pyc and b/src/core/__pycache__/models.cpython-311.pyc differ
|
|
|
src/core/__pycache__/processing.cpython-311.pyc
CHANGED
|
Binary files a/src/core/__pycache__/processing.cpython-311.pyc and b/src/core/__pycache__/processing.cpython-311.pyc differ
|
|
|
src/core/__pycache__/vector_store.cpython-311.pyc
CHANGED
|
Binary files a/src/core/__pycache__/vector_store.cpython-311.pyc and b/src/core/__pycache__/vector_store.cpython-311.pyc differ
|
|
|
src/core/processing.py
CHANGED
|
@@ -1,6 +1,7 @@
|
|
| 1 |
import fitz # PyMuPDF
|
| 2 |
from langchain.text_splitter import RecursiveCharacterTextSplitter
|
| 3 |
from sentence_transformers import SentenceTransformer
|
|
|
|
| 4 |
|
| 5 |
def parse_pdf(file_path: str) -> str:
|
| 6 |
"""Extracts text from a PDF file."""
|
|
@@ -11,6 +12,31 @@ def parse_pdf(file_path: str) -> str:
|
|
| 11 |
doc.close()
|
| 12 |
return text
|
| 13 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 14 |
def chunk_text(text: str) -> list[str]:
|
| 15 |
"""Splits text into smaller chunks."""
|
| 16 |
text_splitter = RecursiveCharacterTextSplitter(
|
|
@@ -22,4 +48,4 @@ def chunk_text(text: str) -> list[str]:
|
|
| 22 |
|
| 23 |
def get_embedding_model(model_name: str = 'all-MiniLM-L6-v2'):
    """Load and return the sentence-transformers embedding model.

    Args:
        model_name: Hugging Face model identifier; defaults to the
            lightweight 'all-MiniLM-L6-v2' encoder.
    """
    model = SentenceTransformer(model_name)
    return model
|
|
|
|
| 1 |
import fitz # PyMuPDF
|
| 2 |
from langchain.text_splitter import RecursiveCharacterTextSplitter
|
| 3 |
from sentence_transformers import SentenceTransformer
|
| 4 |
+
import docx # Added for docx parsing
|
| 5 |
|
| 6 |
def parse_pdf(file_path: str) -> str:
|
| 7 |
"""Extracts text from a PDF file."""
|
|
|
|
| 12 |
doc.close()
|
| 13 |
return text
|
| 14 |
|
| 15 |
+
def parse_txt(file_path: str) -> str:
    """Extracts text from a TXT file.

    The file is read as UTF-8 text; a UnicodeDecodeError propagates to
    the caller if the file is not valid UTF-8.
    """
    with open(file_path, 'r', encoding='utf-8') as handle:
        return handle.read()
|
| 20 |
+
|
| 21 |
+
def parse_docx(file_path: str) -> str:
    """Extracts text from a DOCX file.

    Joins every paragraph's text with newlines, matching the flat-text
    output shape of the other parsers.
    """
    document = docx.Document(file_path)
    # One paragraph per line; empty paragraphs contribute empty lines.
    return '\n'.join(paragraph.text for paragraph in document.paragraphs)
|
| 28 |
+
|
| 29 |
+
def parse_document(file_path: str, file_extension: str) -> str:
    """Dispatches to the correct parser based on file extension.

    Args:
        file_path: Path of the uploaded file on disk.
        file_extension: Lowercased extension including the dot
            (e.g. ".pdf").

    Returns:
        The extracted plain text.

    Raises:
        ValueError: If the extension is not one of .pdf, .txt, .docx.
    """
    if file_extension == ".pdf":
        return parse_pdf(file_path)
    if file_extension == ".txt":
        return parse_txt(file_path)
    if file_extension == ".docx":
        return parse_docx(file_path)
    raise ValueError(f"Unsupported file type: {file_extension}")
|
| 39 |
+
|
| 40 |
def chunk_text(text: str) -> list[str]:
|
| 41 |
"""Splits text into smaller chunks."""
|
| 42 |
text_splitter = RecursiveCharacterTextSplitter(
|
|
|
|
| 48 |
|
| 49 |
def get_embedding_model(model_name: str = 'all-MiniLM-L6-v2'):
|
| 50 |
"""Loads the sentence-transformer model."""
|
| 51 |
+
return SentenceTransformer(model_name)
|
src/main.py
CHANGED
|
@@ -1,7 +1,7 @@
|
|
| 1 |
from fastapi import FastAPI, UploadFile, File, HTTPException
|
| 2 |
import shutil
|
| 3 |
import os
|
| 4 |
-
from .core.processing import
|
| 5 |
from .core.vector_store import get_qdrant_client, create_collection_if_not_exists, upsert_vectors, search_vectors
|
| 6 |
from .core.llm import get_ollama_client, format_prompt, generate_response
|
| 7 |
from .core.models import QueryRequest, QueryResponse
|
|
@@ -11,7 +11,7 @@ app = FastAPI()
|
|
| 11 |
# --- Constants ---
|
| 12 |
UPLOADS_DIR = "uploads"
|
| 13 |
QDRANT_COLLECTION_NAME = "knowledge_base"
|
| 14 |
-
OLLAMA_MODEL = "
|
| 15 |
|
| 16 |
# --- Application Startup ---
|
| 17 |
# Create uploads directory if it doesn't exist
|
|
@@ -32,8 +32,9 @@ create_collection_if_not_exists(qdrant_client, QDRANT_COLLECTION_NAME, embedding
|
|
| 32 |
# --- API Endpoints ---
|
| 33 |
@app.post("/upload")
|
| 34 |
def upload_file(file: UploadFile = File(...)):
|
| 35 |
-
|
| 36 |
-
|
|
|
|
| 37 |
|
| 38 |
file_path = os.path.join(UPLOADS_DIR, file.filename)
|
| 39 |
|
|
@@ -44,7 +45,7 @@ def upload_file(file: UploadFile = File(...)):
|
|
| 44 |
raise HTTPException(status_code=500, detail=f"Error saving file: {e}")
|
| 45 |
|
| 46 |
try:
|
| 47 |
-
text =
|
| 48 |
if not text.strip():
|
| 49 |
raise HTTPException(status_code=400, detail="Could not extract text from the PDF.")
|
| 50 |
|
|
@@ -110,4 +111,4 @@ def query_knowledge_base(request: QueryRequest):
|
|
| 110 |
|
| 111 |
@app.get("/health")
|
| 112 |
def health_check():
|
| 113 |
-
return {"status": "ok"}
|
|
|
|
| 1 |
from fastapi import FastAPI, UploadFile, File, HTTPException
|
| 2 |
import shutil
|
| 3 |
import os
|
| 4 |
+
from .core.processing import parse_document, chunk_text, get_embedding_model
|
| 5 |
from .core.vector_store import get_qdrant_client, create_collection_if_not_exists, upsert_vectors, search_vectors
|
| 6 |
from .core.llm import get_ollama_client, format_prompt, generate_response
|
| 7 |
from .core.models import QueryRequest, QueryResponse
|
|
|
|
| 11 |
# --- Constants ---
|
| 12 |
UPLOADS_DIR = "uploads"
|
| 13 |
QDRANT_COLLECTION_NAME = "knowledge_base"
|
| 14 |
+
OLLAMA_MODEL = "llama3"
|
| 15 |
|
| 16 |
# --- Application Startup ---
|
| 17 |
# Create uploads directory if it doesn't exist
|
|
|
|
| 32 |
# --- API Endpoints ---
|
| 33 |
@app.post("/upload")
|
| 34 |
def upload_file(file: UploadFile = File(...)):
|
| 35 |
+
file_extension = os.path.splitext(file.filename)[1].lower()
|
| 36 |
+
if file_extension not in [".pdf", ".txt", ".docx"]:
|
| 37 |
+
raise HTTPException(status_code=400, detail="Invalid file type. Only PDF, TXT, and DOCX files are supported.")
|
| 38 |
|
| 39 |
file_path = os.path.join(UPLOADS_DIR, file.filename)
|
| 40 |
|
|
|
|
| 45 |
raise HTTPException(status_code=500, detail=f"Error saving file: {e}")
|
| 46 |
|
| 47 |
try:
|
| 48 |
+
text = parse_document(file_path, file_extension)
|
| 49 |
if not text.strip():
|
| 50 |
raise HTTPException(status_code=400, detail="Could not extract text from the PDF.")
|
| 51 |
|
|
|
|
| 111 |
|
| 112 |
@app.get("/health")
|
| 113 |
def health_check():
|
| 114 |
+
return {"status": "ok"}
|