ai-assistant-engine / embeddings /vector_store.py
khubchand's picture
Initial clean release
0a96660
import os
from langchain_community.vectorstores import FAISS
from langchain_community.document_loaders import PyPDFLoader
from langchain_core.documents import Document
from embeddings.embedding_model import embedding_model
from rag.chunking import split_documents
from config import VECTOR_DB_PATH
# Relative path of the PDF sub-folder. No function visible in this module reads
# it (build_vector_store walks "documents" recursively instead).
# NOTE(review): presumably kept for external importers — confirm before removing.
PDF_FOLDER = "documents/pdfs"
def _ocr_pdf(pdf_path: str) -> list[Document]:
    """
    Fallback OCR using PyMuPDF (fitz) + Tesseract.

    Each page is rendered to a PNG image at 200 DPI by PyMuPDF (no Poppler
    required) and the image is handed to Tesseract. Pages whose OCR output is
    blank are skipped.

    Requires: pip install pymupdf pytesseract pillow
    Tesseract installed: winget install UB-Mannheim.TesseractOCR

    Args:
        pdf_path: Path of the PDF file to OCR.

    Returns:
        One Document per page with non-blank text, carrying metadata
        {"source": pdf_path, "page": <0-based page index>}. Returns [] when an
        OCR package is missing or the PDF cannot be opened. If Tesseract fails
        on a page, processing stops and the pages collected so far are returned.
    """
    try:
        import fitz  # PyMuPDF
        import pytesseract
        from PIL import Image
        import io
    except ImportError as e:
        print(f" [OCR] Missing package: {e}. Run: pip install pymupdf pytesseract pillow")
        return []

    print(f" [OCR] Running Tesseract OCR on: {os.path.basename(pdf_path)}")
    try:
        pdf_doc = fitz.open(pdf_path)
    except Exception as e:
        print(f" [OCR] Could not open PDF with PyMuPDF: {e}")
        return []

    # PDF coordinates are 72 units per inch, so this scale renders at 200 DPI.
    # The matrix is identical for every page, hence built once outside the loop.
    matrix = fitz.Matrix(200 / 72, 200 / 72)
    documents = []
    try:
        for i, page in enumerate(pdf_doc):
            pix = page.get_pixmap(matrix=matrix)
            img = Image.open(io.BytesIO(pix.tobytes("png")))
            try:
                text = pytesseract.image_to_string(img)
            except Exception as e:
                print(f" [OCR] Tesseract failed on page {i+1}: {e}")
                print(" [OCR] Is Tesseract installed? Run: winget install UB-Mannheim.TesseractOCR")
                # A Tesseract failure is most likely an install problem, so
                # retrying on the remaining pages would only repeat the error.
                break

            stripped = text.strip()
            print(f" [OCR] Page {i+1}: extracted {len(stripped)} chars")
            if stripped:
                documents.append(Document(
                    page_content=text,
                    metadata={"source": pdf_path, "page": i},
                ))
    finally:
        # Release the file handle even if rendering a page raises.
        pdf_doc.close()
    return documents
def _load_pdf(pdf_path: str) -> list[Document]:
    """
    Load a PDF using PyPDFLoader, falling back to Tesseract OCR.

    OCR is attempted when PyPDFLoader raises, or when it succeeds but every
    page is blank (typical of a scanned PDF).

    Args:
        pdf_path: Path of the PDF file to load.

    Returns:
        The pages with non-blank text from PyPDFLoader when there are any;
        otherwise whatever _ocr_pdf returns (possibly an empty list).
    """
    filename = os.path.basename(pdf_path)
    try:
        docs = PyPDFLoader(pdf_path).load()
    except Exception as e:
        print(f" [WARNING] PyPDFLoader failed to read '{filename}': {e} — attempting OCR...")
    else:
        print(f"Loaded {len(docs)} page(s) from {filename}")
        # Keep only the pages that carry real text.
        text_docs = [d for d in docs if d.page_content.strip()]
        if text_docs:
            return text_docs
        # Only reported when the loader itself succeeded, so a loader failure
        # above is not followed by a second, misleading warning.
        print(f" [WARNING] No text extracted from '{filename}' — attempting OCR...")

    ocr_docs = _ocr_pdf(pdf_path)
    if not ocr_docs:
        print(
            f" [ERROR] OCR also failed for '{filename}'.\n"
            " Make sure Tesseract is installed and on PATH:\n"
            " winget install UB-Mannheim.TesseractOCR\n"
        )
    return ocr_docs
def _load_docx(docx_path: str) -> list[Document]:
    """
    Load a Word (.docx) file and return a list of Documents.

    Non-blank paragraphs are collected first, then table rows (non-blank cells
    joined with " | "), and everything is joined with newlines into a single
    Document.

    Args:
        docx_path: Path of the .docx file to load.

    Returns:
        A one-element list holding the whole file's text with metadata
        {"source": docx_path}, or [] when the file has no text, cannot be
        read, or the python-docx package is not installed.
    """
    filename = os.path.basename(docx_path)
    try:
        from docx import Document as DocxDocument
    except ImportError as e:
        # Skip the file instead of aborting the whole index build, matching
        # how the OCR fallback treats its optional packages.
        print(f" [ERROR] Missing package: {e}. Run: pip install python-docx")
        return []

    print(f"Loading Word document: {filename}")
    try:
        doc = DocxDocument(docx_path)
        lines = []
        for para in doc.paragraphs:
            para_text = para.text.strip()
            if para_text:
                lines.append(para_text)

        # Also extract text from tables
        for table in doc.tables:
            for row in table.rows:
                cell_texts = [cell.text.strip() for cell in row.cells]
                row_text = [t for t in cell_texts if t]
                if row_text:
                    lines.append(" | ".join(row_text))

        # Every collected line is non-blank, so an empty list means no content.
        if lines:
            return [Document(
                page_content="\n".join(lines),
                metadata={"source": docx_path},
            )]
    except Exception as e:
        print(f" [ERROR] Failed to read Word file '{filename}': {e}")
    return []
def _load_excel(excel_path: str) -> list[Document]:
    """
    Load an Excel (.xlsx, .xls) file using pandas and openpyxl,
    returning a text representation of the tables.

    Each non-empty sheet becomes one Document whose text is a "Sheet: <name>"
    header line followed by the sheet rendered with DataFrame.to_string.

    Args:
        excel_path: Path of the workbook to load.

    Returns:
        One Document per non-empty sheet with metadata
        {"source": excel_path, "sheet": <sheet name>}, or [] when the workbook
        cannot be read or pandas is not installed.
    """
    filename = os.path.basename(excel_path)
    try:
        import pandas as pd
    except ImportError as e:
        # Skip the file instead of aborting the whole index build, matching
        # how the OCR fallback treats its optional packages.
        print(f" [ERROR] Missing package: {e}. Run: pip install pandas openpyxl")
        return []

    print(f"Loading Excel spreadsheet: {filename}")
    try:
        with pd.ExcelFile(excel_path) as xls:
            documents = []
            for sheet_name in xls.sheet_names:
                df = pd.read_excel(xls, sheet_name=sheet_name)
                if df.empty:
                    continue
                # The header line guarantees the text is never blank, so no
                # emptiness check is needed before appending.
                sheet_text = f"Sheet: {sheet_name}\n" + df.to_string(index=False)
                documents.append(Document(
                    page_content=sheet_text,
                    metadata={"source": excel_path, "sheet": sheet_name},
                ))
            return documents
    except Exception as e:
        print(f" [ERROR] Failed to read Excel file '{filename}': {e}")
        return []
def load_any_document(file_path: str) -> list[Document]:
    """
    Dispatch a file to the loader that matches its extension.

    The extension is compared case-insensitively. PDF, Word (.docx) and Excel
    (.xlsx, .xls) files are supported; any other extension yields an empty list.
    """
    _, raw_suffix = os.path.splitext(file_path)
    suffix = raw_suffix.lower()

    if suffix == ".pdf":
        return _load_pdf(file_path)
    if suffix == ".docx":
        return _load_docx(file_path)
    if suffix in (".xlsx", ".xls"):
        return _load_excel(file_path)

    # Unsupported file type: nothing to load.
    return []
def build_vector_store():
    """
    Build a FAISS index from every supported file under documents/ and save it.

    The documents/ folder is created if it does not exist. It is walked
    recursively, each file is passed to load_any_document, the extracted
    documents are chunked with split_documents, and the chunks are embedded
    into a FAISS index that is written to VECTOR_DB_PATH.

    Returns:
        The FAISS vector store, or None when no document text could be
        extracted or chunking produced no chunks.
    """
    documents_folder = "documents"
    # exist_ok avoids the check-then-create race of a separate exists() test.
    os.makedirs(documents_folder, exist_ok=True)

    documents = []
    # Recursively find all supported files in documents/ directory
    for root, dirs, files in os.walk(documents_folder):
        # Sorting in place makes os.walk descend in a stable order, so the
        # same folder contents always produce the same document order.
        dirs.sort()
        for file in sorted(files):
            # Skip temp/lock files (e.g. ~$Doc.docx)
            if file.startswith("~$"):
                continue
            docs = load_any_document(os.path.join(root, file))
            if docs:
                documents.extend(docs)

    if not documents:
        print("No documents found or extracted in documents/ folder.")
        return None

    chunks = split_documents(documents)
    if not chunks:
        print("Chunking produced no results. Check CHUNK_SIZE / CHUNK_OVERLAP in config.py.")
        return None

    print(f"Building FAISS index from {len(chunks)} chunk(s)...")
    vector_db = FAISS.from_documents(chunks, embedding_model)
    vector_db.save_local(VECTOR_DB_PATH)
    print(f"FAISS vector database created and saved to '{VECTOR_DB_PATH}'.")
    return vector_db
def load_vector_store():
    """
    Load the saved FAISS index from VECTOR_DB_PATH, building it if missing.

    Returns:
        The loaded FAISS vector store. When the saved index is absent or
        incomplete, the result of build_vector_store() is returned instead,
        which may be None.
    """
    # FAISS.save_local writes index.faiss plus index.pkl (the docstore), and
    # load_local needs both; a partial save is treated as "no index".
    required_files = ("index.faiss", "index.pkl")
    index_complete = all(
        os.path.exists(os.path.join(VECTOR_DB_PATH, name))
        for name in required_files
    )
    if not index_complete:
        print("FAISS index not found.")
        print("Creating new vector database...")
        return build_vector_store()

    # SECURITY: index.pkl is unpickled here, which can execute arbitrary code.
    # This is acceptable only because the index is produced locally by
    # build_vector_store; never point VECTOR_DB_PATH at untrusted files.
    return FAISS.load_local(
        VECTOR_DB_PATH,
        embedding_model,
        allow_dangerous_deserialization=True,
    )