# NOTE: the three lines below were a copy-paste artifact from a GitHub file
# view (author "shubhamdhamal", commit 7644eac, "Deploy Flask app with
# Docker"); kept here as a comment so the module parses as valid Python.
"""
Vector store implementation for RAG capabilities.
"""
from typing import List, Dict, Any, Optional
import json
import os
from pathlib import Path
from langchain.vectorstores import FAISS
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.document_loaders import DirectoryLoader
class VectorStore:
    """
    Manages a FAISS-backed vector store for RAG capabilities.

    Embeddings are computed with a free HuggingFace sentence-transformers
    model when available, falling back to OpenAI embeddings when an API key
    is supplied. The index is persisted on disk under ``vector_db/``.
    """
    def __init__(self, api_key: Optional[str] = None):
        """
        Initialize the vector store and choose an embedding backend.

        Args:
            api_key: Optional OpenAI API key, used only as a fallback when
                the HuggingFace embeddings package is not installed.

        Raises:
            ValueError: If HuggingFace embeddings are unavailable and no
                OpenAI API key was provided.
        """
        self.api_key = api_key
        # Prefer free sentence-transformers embeddings (no API key needed)
        try:
            from langchain.embeddings import HuggingFaceEmbeddings
            self.embeddings = HuggingFaceEmbeddings(model_name="all-MiniLM-L6-v2")
            print("✅ Using free HuggingFace embeddings")
        except ImportError:
            # Fall back to OpenAI only when the caller supplied a key.
            if api_key:
                from langchain.embeddings import OpenAIEmbeddings
                self.embeddings = OpenAIEmbeddings(api_key=api_key)
                print("✅ Using OpenAI embeddings")
            else:
                raise ValueError("HuggingFace not available and no OpenAI API key provided")
        self.vector_store_path = Path("vector_db")
        self.vector_store_path.mkdir(exist_ok=True)
        # Populated lazily by load_documents() / _create_minimal_vector_store().
        self.vector_store = None

    def load_documents(self, directory: Optional[str] = None) -> None:
        """
        Load documents from a directory and index their embeddings.

        Falls back to a minimal default vector store when no directory is
        given, the directory is missing or empty, or loading fails.

        Args:
            directory: Optional path to a directory containing documents.
        """
        try:
            # If no directory provided, create a minimal vector store
            if directory is None:
                self._create_minimal_vector_store()
                return
            # Check if directory exists
            if not os.path.exists(directory):
                print(f"Warning: Document directory {directory} not found. Creating minimal vector store.")
                self._create_minimal_vector_store()
                return
            # Try to load documents
            loader = DirectoryLoader(directory)
            documents = loader.load()
            if not documents:
                print("Warning: No documents found in directory. Creating minimal vector store.")
                self._create_minimal_vector_store()
                return
            # Chunk documents so each embedding covers a bounded span; the
            # 200-char overlap preserves context across chunk boundaries.
            text_splitter = RecursiveCharacterTextSplitter(
                chunk_size=1000,
                chunk_overlap=200,
            )
            texts = text_splitter.split_documents(documents)
            # Append to an existing on-disk index when present, else build anew.
            if (self.vector_store_path / "index.faiss").exists():
                # NOTE(review): newer langchain releases require
                # allow_dangerous_deserialization=True for load_local —
                # verify against the pinned langchain version.
                self.vector_store = FAISS.load_local(
                    str(self.vector_store_path),
                    self.embeddings
                )
                self.vector_store.add_documents(texts)
            else:
                self.vector_store = FAISS.from_documents(
                    texts,
                    self.embeddings
                )
            self.vector_store.save_local(str(self.vector_store_path))
        except Exception as e:
            # Best-effort: never leave the store uninitialized on failure.
            print(f"Error loading documents: {str(e)}")
            self._create_minimal_vector_store()

    def _create_minimal_vector_store(self) -> None:
        """Create (or load from disk) a minimal vector store with default content."""
        try:
            default_texts = [
                "This is a default document. The vector store was initialized with minimal content.",
                "You can add your own documents to the vector store by placing them in the vector_db/documents directory.",
                "The application will automatically load and index any text files found in that directory."
            ]
            if (self.vector_store_path / "index.faiss").exists():
                self.vector_store = FAISS.load_local(
                    str(self.vector_store_path),
                    self.embeddings
                )
            else:
                self.vector_store = FAISS.from_texts(
                    default_texts,
                    self.embeddings
                )
            self.vector_store.save_local(str(self.vector_store_path))
        except Exception as e:
            print(f"Error creating minimal vector store: {str(e)}")
            # Create an empty FAISS index as a last resort (in-memory only,
            # deliberately not persisted).
            self.vector_store = FAISS.from_texts(
                ["Default document"],
                self.embeddings
            )

    def _keyword_search(self, query: str, documents: List[str], k: int) -> List[Dict[str, Any]]:
        """Case-insensitive substring fallback used when FAISS is unavailable.

        Returns at most ``k`` matches, each with a constant score of 1.0.
        """
        needle = query.lower()
        return [
            {"content": doc, "score": 1.0, "metadata": {}}
            for doc in documents
            if needle in doc.lower()
        ][:k]

    def search(self, query: str, k: int = 4, documents: Optional[List[str]] = None) -> List[Dict[str, Any]]:
        """
        Search for relevant documents based on query.

        Args:
            query: Search query
            k: Number of results to return
            documents: Optional list of documents to search through (fallback)

        Returns:
            List of relevant documents with scores.
            NOTE(review): FAISS scores are distances (lower is better) while
            the fallback path reports a constant 1.0 — callers should not
            compare scores across the two paths.
        """
        # If vector store is not available, fall back to simple text search
        if not self.vector_store:
            if not documents:
                return []
            return self._keyword_search(query, documents, k)
        try:
            results = self.vector_store.similarity_search_with_score(query, k=k)
            return [
                {
                    "content": doc.page_content,
                    "metadata": getattr(doc, 'metadata', {}),
                    # Guard against non-numeric score objects from some backends.
                    "score": float(score) if hasattr(score, '__float__') else 0.0
                }
                for doc, score in results
            ]
        except Exception as e:
            print(f"Error in vector store search: {str(e)}")
            # Degrade gracefully to the substring fallback when possible.
            if documents:
                return self._keyword_search(query, documents, k)
            return []