Spaces:

crackbit
/

ai-learning-path-generator

Sleeping

ai-learning-path-generator / src /data /vector_store.py

“shubhamdhamal”

Deploy Flask app with Docker

7644eac about 1 month ago

6.72 kB

	"""
	Vector store implementation for RAG capabilities.
	"""
	from typing import List, Dict, Any, Optional
	import json
	import os
	from pathlib import Path
	from langchain.vectorstores import FAISS
	from langchain.text_splitter import RecursiveCharacterTextSplitter
	from langchain.document_loaders import DirectoryLoader

	class VectorStore:
	"""
	Manages vector storage for RAG capabilities.
	"""
	def __init__(self, api_key: Optional[str] = None):
	"""
	Initialize the vector store.

	Args:
	api_key: Optional OpenAI API key
	"""
	self.api_key = api_key

	# Use free sentence-transformers embeddings (no API key needed)
	try:
	from langchain.embeddings import HuggingFaceEmbeddings
	self.embeddings = HuggingFaceEmbeddings(model_name="all-MiniLM-L6-v2")
	print("✅ Using free HuggingFace embeddings")
	except ImportError:
	# Fallback to OpenAI if HuggingFace not available
	if api_key:
	from langchain.embeddings import OpenAIEmbeddings
	self.embeddings = OpenAIEmbeddings(api_key=api_key)
	print("✅ Using OpenAI embeddings")
	else:
	raise ValueError("HuggingFace not available and no OpenAI API key provided")

	self.vector_store_path = Path("vector_db")
	self.vector_store_path.mkdir(exist_ok=True)
	self.vector_store = None

	def load_documents(self, directory: str = None) -> None:
	"""
	Load documents from a directory and create embeddings.
	If no directory is provided, creates a minimal default vector store.

	Args:
	directory: Optional path to directory containing documents
	"""
	try:
	# If no directory provided, create a minimal vector store
	if directory is None:
	self._create_minimal_vector_store()
	return

	# Check if directory exists
	if not os.path.exists(directory):
	print(f"Warning: Document directory {directory} not found. Creating minimal vector store.")
	self._create_minimal_vector_store()
	return

	# Try to load documents
	loader = DirectoryLoader(directory)
	documents = loader.load()

	if not documents:
	print("Warning: No documents found in directory. Creating minimal vector store.")
	self._create_minimal_vector_store()
	return

	# Process documents
	text_splitter = RecursiveCharacterTextSplitter(
	chunk_size=1000,
	chunk_overlap=200,
	)

	texts = text_splitter.split_documents(documents)

	# Create or update vector store
	if os.path.exists(self.vector_store_path / "index.faiss"):
	self.vector_store = FAISS.load_local(
	str(self.vector_store_path),
	self.embeddings
	)
	self.vector_store.add_documents(texts)
	else:
	self.vector_store = FAISS.from_documents(
	texts,
	self.embeddings
	)
	self.vector_store.save_local(str(self.vector_store_path))

	except Exception as e:
	print(f"Error loading documents: {str(e)}")
	self._create_minimal_vector_store()

	def _create_minimal_vector_store(self) -> None:
	"""Create a minimal vector store with default content."""
	try:
	default_texts = [
	"This is a default document. The vector store was initialized with minimal content.",
	"You can add your own documents to the vector store by placing them in the vector_db/documents directory.",
	"The application will automatically load and index any text files found in that directory."
	]

	if os.path.exists(self.vector_store_path / "index.faiss"):
	self.vector_store = FAISS.load_local(
	str(self.vector_store_path),
	self.embeddings
	)
	else:
	self.vector_store = FAISS.from_texts(
	default_texts,
	self.embeddings
	)
	self.vector_store.save_local(str(self.vector_store_path))

	except Exception as e:
	print(f"Error creating minimal vector store: {str(e)}")
	# Create an empty FAISS index as a last resort
	self.vector_store = FAISS.from_texts(
	["Default document"],
	self.embeddings
	)

	def search(self, query: str, k: int = 4, documents: List[str] = None) -> List[Dict[str, Any]]:
	"""
	Search for relevant documents based on query.

	Args:
	query: Search query
	k: Number of results to return
	documents: Optional list of documents to search through (fallback)

	Returns:
	List of relevant documents with scores
	"""
	# If vector store is not available, fall back to simple text search
	if not self.vector_store:
	if not documents:
	return []

	# Simple text-based search as fallback
	query = query.lower()
	return [
	{"content": doc, "score": 1.0, "metadata": {}}
	for doc in documents
	if query in doc.lower()
	][:k]

	try:
	results = self.vector_store.similarity_search_with_score(query, k=k)
	formatted_results = []
	for doc, score in results:
	formatted_results.append({
	"content": doc.page_content,
	"metadata": getattr(doc, 'metadata', {}),
	"score": float(score) if hasattr(score, '__float__') else 0.0
	})
	return formatted_results

	except Exception as e:
	print(f"Error in vector store search: {str(e)}")
	# Fall back to simple text search if available
	if documents:
	query = query.lower()
	return [
	{"content": doc, "score": 1.0, "metadata": {}}
	for doc in documents
	if query in doc.lower()
	][:k]
	return []