Spaces:
Sleeping
Sleeping
File size: 6,718 Bytes
7644eac |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 |
"""
Vector store implementation for RAG capabilities.
"""
from typing import List, Dict, Any, Optional
import json
import os
from pathlib import Path
from langchain.vectorstores import FAISS
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.document_loaders import DirectoryLoader
class VectorStore:
    """
    Manages a persistent FAISS vector store for RAG capabilities.

    Embeddings are produced by a local HuggingFace sentence-transformers
    model when available (free, no API key), falling back to OpenAI
    embeddings when an API key is supplied. The index is persisted under
    the ``vector_db/`` directory. All failures during indexing are handled
    best-effort: the store degrades to a minimal default index rather than
    raising at startup.
    """

    def __init__(self, api_key: Optional[str] = None):
        """
        Initialize the vector store and select an embedding backend.

        Args:
            api_key: Optional OpenAI API key, used only when the free
                HuggingFace embeddings are not installed.

        Raises:
            ValueError: If HuggingFace embeddings are unavailable and no
                OpenAI API key was provided.
        """
        self.api_key = api_key
        # Prefer free sentence-transformers embeddings (no API key needed).
        try:
            from langchain.embeddings import HuggingFaceEmbeddings
            self.embeddings = HuggingFaceEmbeddings(model_name="all-MiniLM-L6-v2")
            print("✅ Using free HuggingFace embeddings")
        except ImportError:
            # Fallback to OpenAI if HuggingFace not available
            if api_key:
                from langchain.embeddings import OpenAIEmbeddings
                self.embeddings = OpenAIEmbeddings(api_key=api_key)
                print("✅ Using OpenAI embeddings")
            else:
                raise ValueError("HuggingFace not available and no OpenAI API key provided")
        self.vector_store_path = Path("vector_db")
        # parents=True makes creation robust if the path is ever nested.
        self.vector_store_path.mkdir(parents=True, exist_ok=True)
        # Populated lazily by load_documents() / _create_minimal_vector_store().
        self.vector_store = None

    def load_documents(self, directory: Optional[str] = None) -> None:
        """
        Load documents from a directory and create embeddings.

        If no directory is provided (or it is missing/empty), a minimal
        default vector store is created instead. Any indexing error is
        caught and also falls back to the minimal store.

        Args:
            directory: Optional path to a directory containing documents.
        """
        try:
            # No directory provided: build the minimal default store.
            if directory is None:
                self._create_minimal_vector_store()
                return
            # Missing directory: warn and fall back.
            if not os.path.exists(directory):
                print(f"Warning: Document directory {directory} not found. Creating minimal vector store.")
                self._create_minimal_vector_store()
                return
            # Load raw documents from disk.
            loader = DirectoryLoader(directory)
            documents = loader.load()
            if not documents:
                print("Warning: No documents found in directory. Creating minimal vector store.")
                self._create_minimal_vector_store()
                return
            # Split into overlapping chunks suitable for embedding.
            text_splitter = RecursiveCharacterTextSplitter(
                chunk_size=1000,
                chunk_overlap=200,
            )
            texts = text_splitter.split_documents(documents)
            # Update an existing on-disk index, or build a fresh one.
            if os.path.exists(self.vector_store_path / "index.faiss"):
                # NOTE(review): newer langchain releases require
                # allow_dangerous_deserialization=True for load_local —
                # confirm against the pinned langchain version.
                self.vector_store = FAISS.load_local(
                    str(self.vector_store_path),
                    self.embeddings
                )
                self.vector_store.add_documents(texts)
            else:
                self.vector_store = FAISS.from_documents(
                    texts,
                    self.embeddings
                )
            self.vector_store.save_local(str(self.vector_store_path))
        except Exception as e:
            # Best-effort: indexing failures must not break application startup.
            print(f"Error loading documents: {str(e)}")
            self._create_minimal_vector_store()

    def _create_minimal_vector_store(self) -> None:
        """Create (or reload) a minimal vector store with default content."""
        try:
            default_texts = [
                "This is a default document. The vector store was initialized with minimal content.",
                "You can add your own documents to the vector store by placing them in the vector_db/documents directory.",
                "The application will automatically load and index any text files found in that directory."
            ]
            # Reuse an existing persisted index if one is present; otherwise
            # seed a new index from the default texts.
            if os.path.exists(self.vector_store_path / "index.faiss"):
                self.vector_store = FAISS.load_local(
                    str(self.vector_store_path),
                    self.embeddings
                )
            else:
                self.vector_store = FAISS.from_texts(
                    default_texts,
                    self.embeddings
                )
            self.vector_store.save_local(str(self.vector_store_path))
        except Exception as e:
            print(f"Error creating minimal vector store: {str(e)}")
            # Create an in-memory FAISS index as a last resort (not persisted).
            self.vector_store = FAISS.from_texts(
                ["Default document"],
                self.embeddings
            )

    def _keyword_fallback(self, query: str, documents: List[str], k: int) -> List[Dict[str, Any]]:
        """
        Case-insensitive substring search over raw strings (no embeddings).

        Used whenever the FAISS index is unavailable or errors out. Every
        match gets a fixed score of 1.0 and empty metadata, mirroring the
        shape of vector-search results.
        """
        needle = query.lower()
        return [
            {"content": doc, "score": 1.0, "metadata": {}}
            for doc in documents
            if needle in doc.lower()
        ][:k]

    def search(self, query: str, k: int = 4, documents: Optional[List[str]] = None) -> List[Dict[str, Any]]:
        """
        Search for relevant documents based on query.

        Args:
            query: Search query.
            k: Maximum number of results to return.
            documents: Optional list of raw document strings used as a
                keyword-search fallback when vector search is unavailable.

        Returns:
            List of dicts with "content", "score", and "metadata" keys.
        """
        # No vector store: fall back to simple text search (or nothing).
        if not self.vector_store:
            if not documents:
                return []
            return self._keyword_fallback(query, documents, k)
        try:
            results = self.vector_store.similarity_search_with_score(query, k=k)
            formatted_results = []
            for doc, score in results:
                formatted_results.append({
                    "content": doc.page_content,
                    # metadata may be absent on some document types — default to {}.
                    "metadata": getattr(doc, 'metadata', {}),
                    "score": float(score) if hasattr(score, '__float__') else 0.0
                })
            return formatted_results
        except Exception as e:
            print(f"Error in vector store search: {str(e)}")
            # Fall back to simple text search if raw documents were supplied.
            if documents:
                return self._keyword_fallback(query, documents, k)
            return []
|