AnuragShirke committed on
Commit
1d9404d
·
1 Parent(s): 813ae62

Added .docx and .txt support

Browse files
Dockerfile CHANGED
@@ -19,6 +19,7 @@ RUN pip install --no-cache-dir -r requirements.txt
19
 
20
  # Copy the application code into the container
21
  COPY ./src /app/src
 
22
 
23
  # Expose port 8000 to allow communication to the Uvicorn server
24
  EXPOSE 8000
 
19
 
20
  # Copy the application code into the container
21
  COPY ./src /app/src
22
+ COPY ./scripts /app/scripts
23
 
24
  # Expose port 8000 to allow communication to the Uvicorn server
25
  EXPOSE 8000
docker-compose.yml CHANGED
@@ -1,4 +1,3 @@
1
-
2
  version: '3.8'
3
 
4
  services:
@@ -34,7 +33,14 @@ services:
34
  - ./scripts:/app
35
  - ollama_data:/root/.ollama
36
  mem_limit: 6.5g
 
 
 
 
 
 
 
37
 
38
  volumes:
39
  qdrant_data:
40
- ollama_data:
 
 
1
  version: '3.8'
2
 
3
  services:
 
33
  - ./scripts:/app
34
  - ollama_data:/root/.ollama
35
  mem_limit: 6.5g
36
+ deploy:
37
+ resources:
38
+ reservations:
39
+ devices:
40
+ - driver: nvidia
41
+ count: 1
42
+ capabilities: [gpu]
43
 
44
  volumes:
45
  qdrant_data:
46
+ ollama_data:
pull_model.sh DELETED
@@ -1,22 +0,0 @@
1
- #!/bin/bash
2
-
3
- # This script automates pulling a model into the Ollama container.
4
-
5
- # The name of the model to pull
6
- MODEL_NAME="llama3"
7
-
8
- # The name of the ollama service in docker-compose.yml
9
- OLLAMA_SERVICE_NAME="ollama"
10
-
11
- # Check if the container is running
12
- if ! docker-compose ps -q $OLLAMA_SERVICE_NAME > /dev/null 2>&1; then
13
- echo "Ollama container is not running. Please start it with 'docker-compose up -d'"
14
- exit 1
15
- fi
16
-
17
- echo "Pulling the $MODEL_NAME model into the Ollama container..."
18
- echo "This may take a while depending on your internet connection."
19
-
20
- docker-compose exec $OLLAMA_SERVICE_NAME ollama pull $MODEL_NAME
21
-
22
- echo "Model pull complete."
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
requirements.txt CHANGED
@@ -15,3 +15,4 @@ python-jose[cryptography]
15
  pydantic-settings
16
  sqlalchemy
17
  psycopg2-binary
 
 
15
  pydantic-settings
16
  sqlalchemy
17
  psycopg2-binary
18
+ python-docx
scripts/ollama_entrypoint.sh CHANGED
@@ -18,7 +18,7 @@ done
18
 
19
  # Pull the model
20
  echo "Ollama server started. Pulling llama3 model..."
21
- ollama pull phi3
22
 
23
  # Wait for the background process to exit
24
- wait $pid
 
18
 
19
  # Pull the model
20
  echo "Ollama server started. Pulling llama3 model..."
21
+ ollama pull llama3
22
 
23
  # Wait for the background process to exit
24
+ wait $pid
scripts/wait-for-qdrant.sh CHANGED
@@ -14,4 +14,4 @@ until curl -s -f "$host/healthz" > /dev/null; do
14
  done
15
 
16
  >&2 echo "Qdrant is up - executing command"
17
- exec "$@"
 
14
  done
15
 
16
  >&2 echo "Qdrant is up - executing command"
17
+ exec "$@"
src/__pycache__/main.cpython-311.pyc CHANGED
Binary files a/src/__pycache__/main.cpython-311.pyc and b/src/__pycache__/main.cpython-311.pyc differ
 
src/__pycache__/main.cpython-312.pyc DELETED
Binary file (448 Bytes)
 
src/core/__pycache__/llm.cpython-311.pyc CHANGED
Binary files a/src/core/__pycache__/llm.cpython-311.pyc and b/src/core/__pycache__/llm.cpython-311.pyc differ
 
src/core/__pycache__/models.cpython-311.pyc CHANGED
Binary files a/src/core/__pycache__/models.cpython-311.pyc and b/src/core/__pycache__/models.cpython-311.pyc differ
 
src/core/__pycache__/processing.cpython-311.pyc CHANGED
Binary files a/src/core/__pycache__/processing.cpython-311.pyc and b/src/core/__pycache__/processing.cpython-311.pyc differ
 
src/core/__pycache__/vector_store.cpython-311.pyc CHANGED
Binary files a/src/core/__pycache__/vector_store.cpython-311.pyc and b/src/core/__pycache__/vector_store.cpython-311.pyc differ
 
src/core/processing.py CHANGED
@@ -1,6 +1,7 @@
1
  import fitz # PyMuPDF
2
  from langchain.text_splitter import RecursiveCharacterTextSplitter
3
  from sentence_transformers import SentenceTransformer
 
4
 
5
  def parse_pdf(file_path: str) -> str:
6
  """Extracts text from a PDF file."""
@@ -11,6 +12,31 @@ def parse_pdf(file_path: str) -> str:
11
  doc.close()
12
  return text
13
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
14
  def chunk_text(text: str) -> list[str]:
15
  """Splits text into smaller chunks."""
16
  text_splitter = RecursiveCharacterTextSplitter(
@@ -22,4 +48,4 @@ def chunk_text(text: str) -> list[str]:
22
 
23
  def get_embedding_model(model_name: str = 'all-MiniLM-L6-v2'):
24
  """Loads the sentence-transformer model."""
25
- return SentenceTransformer(model_name)
 
1
  import fitz # PyMuPDF
2
  from langchain.text_splitter import RecursiveCharacterTextSplitter
3
  from sentence_transformers import SentenceTransformer
4
+ import docx # Added for docx parsing
5
 
6
  def parse_pdf(file_path: str) -> str:
7
  """Extracts text from a PDF file."""
 
12
  doc.close()
13
  return text
14
 
15
def parse_txt(file_path: str) -> str:
    """Extracts text from a TXT file.

    The file is decoded as UTF-8. A leading byte-order mark (often written
    by Windows editors) is stripped so it does not leak into the extracted
    text as a stray U+FEFF character.

    Args:
        file_path: Path to the text file on disk.

    Returns:
        The full decoded contents of the file.

    Raises:
        OSError: If the file cannot be opened.
        UnicodeDecodeError: If the file is not valid UTF-8.
    """
    # 'utf-8-sig' decodes BOM-less UTF-8 exactly like 'utf-8'; the only
    # difference is that an initial BOM, when present, is dropped.
    with open(file_path, 'r', encoding='utf-8-sig') as f:
        return f.read()
20
+
21
def parse_docx(file_path: str) -> str:
    """Extracts text from a DOCX file.

    Paragraph text is collected first, in document order. Text held in
    top-level tables is then appended, one line per table row with cells
    separated by tabs, because `document.paragraphs` does not include
    content that lives inside tables.

    Args:
        file_path: Path to the .docx file on disk.

    Returns:
        The extracted text, with paragraphs and table rows joined by
        newlines.
    """
    document = docx.Document(file_path)
    lines = [paragraph.text for paragraph in document.paragraphs]
    # Without this pass a document whose content is mostly tabular would
    # yield little or no text. Tables are appended after the body text
    # rather than interleaved with it.
    for table in document.tables:
        for row in table.rows:
            lines.append('\t'.join(cell.text for cell in row.cells))
    return '\n'.join(lines)
28
+
29
def parse_document(file_path: str, file_extension: str) -> str:
    """Dispatches to the correct parser based on file extension.

    Args:
        file_path: Path to the document on disk.
        file_extension: Extension including the leading dot, e.g. ".pdf".
            Matching ignores letter case and surrounding whitespace.

    Returns:
        The text extracted by the parser that matches the extension.

    Raises:
        ValueError: If the extension is not .pdf, .txt or .docx.
    """
    # Normalise here so callers that pass ".PDF" or " .txt" are not
    # rejected just because they skipped their own lower()/strip().
    normalized = file_extension.strip().lower()
    if normalized == ".pdf":
        return parse_pdf(file_path)
    if normalized == ".txt":
        return parse_txt(file_path)
    if normalized == ".docx":
        return parse_docx(file_path)
    # Report the extension exactly as it was supplied.
    raise ValueError(f"Unsupported file type: {file_extension}")
39
+
40
  def chunk_text(text: str) -> list[str]:
41
  """Splits text into smaller chunks."""
42
  text_splitter = RecursiveCharacterTextSplitter(
 
48
 
49
  def get_embedding_model(model_name: str = 'all-MiniLM-L6-v2'):
50
  """Loads the sentence-transformer model."""
51
+ return SentenceTransformer(model_name)
src/main.py CHANGED
@@ -1,7 +1,7 @@
1
  from fastapi import FastAPI, UploadFile, File, HTTPException
2
  import shutil
3
  import os
4
- from .core.processing import parse_pdf, chunk_text, get_embedding_model
5
  from .core.vector_store import get_qdrant_client, create_collection_if_not_exists, upsert_vectors, search_vectors
6
  from .core.llm import get_ollama_client, format_prompt, generate_response
7
  from .core.models import QueryRequest, QueryResponse
@@ -11,7 +11,7 @@ app = FastAPI()
11
  # --- Constants ---
12
  UPLOADS_DIR = "uploads"
13
  QDRANT_COLLECTION_NAME = "knowledge_base"
14
- OLLAMA_MODEL = "tinyllama"
15
 
16
  # --- Application Startup ---
17
  # Create uploads directory if it doesn't exist
@@ -32,8 +32,9 @@ create_collection_if_not_exists(qdrant_client, QDRANT_COLLECTION_NAME, embedding
32
  # --- API Endpoints ---
33
  @app.post("/upload")
34
  def upload_file(file: UploadFile = File(...)):
35
- if not file.filename.lower().endswith(".pdf"):
36
- raise HTTPException(status_code=400, detail="Invalid file type. Only PDFs are supported.")
 
37
 
38
  file_path = os.path.join(UPLOADS_DIR, file.filename)
39
 
@@ -44,7 +45,7 @@ def upload_file(file: UploadFile = File(...)):
44
  raise HTTPException(status_code=500, detail=f"Error saving file: {e}")
45
 
46
  try:
47
- text = parse_pdf(file_path)
48
  if not text.strip():
49
  raise HTTPException(status_code=400, detail="Could not extract text from the PDF.")
50
 
@@ -110,4 +111,4 @@ def query_knowledge_base(request: QueryRequest):
110
 
111
  @app.get("/health")
112
  def health_check():
113
- return {"status": "ok"}
 
1
  from fastapi import FastAPI, UploadFile, File, HTTPException
2
  import shutil
3
  import os
4
+ from .core.processing import parse_document, chunk_text, get_embedding_model
5
  from .core.vector_store import get_qdrant_client, create_collection_if_not_exists, upsert_vectors, search_vectors
6
  from .core.llm import get_ollama_client, format_prompt, generate_response
7
  from .core.models import QueryRequest, QueryResponse
 
11
  # --- Constants ---
12
  UPLOADS_DIR = "uploads"
13
  QDRANT_COLLECTION_NAME = "knowledge_base"
14
+ OLLAMA_MODEL = "llama3"
15
 
16
  # --- Application Startup ---
17
  # Create uploads directory if it doesn't exist
 
32
  # --- API Endpoints ---
33
  @app.post("/upload")
34
  def upload_file(file: UploadFile = File(...)):
35
+ file_extension = os.path.splitext(file.filename)[1].lower()
36
+ if file_extension not in [".pdf", ".txt", ".docx"]:
37
+ raise HTTPException(status_code=400, detail="Invalid file type. Only PDF, TXT, and DOCX files are supported.")
38
 
39
  file_path = os.path.join(UPLOADS_DIR, file.filename)
40
 
 
45
  raise HTTPException(status_code=500, detail=f"Error saving file: {e}")
46
 
47
  try:
48
+ text = parse_document(file_path, file_extension)
49
  if not text.strip():
50
  raise HTTPException(status_code=400, detail="Could not extract text from the PDF.")
51
 
 
111
 
112
  @app.get("/health")
113
  def health_check():
114
+ return {"status": "ok"}