# chatbot/app/services/vector_store.py
# Tahasaif3's picture
# Update app/services/vector_store.py
# bbbd8da verified
# ===== 2. UPDATE app/services/vector_store.py =====
from qdrant_client import QdrantClient
from qdrant_client.models import Distance, VectorParams, PointStruct, Filter, FieldCondition, MatchValue
from typing import List, Dict, Any, Optional
import uuid
from app.core.config import settings
class QdrantVectorStore:
    """Wrapper around a single Qdrant collection of document embeddings.

    The collection name is fixed to ``"book_documents"`` and the expected
    vector size is read from ``settings.EMBEDDING_DIMENSION``.
    """

    def __init__(self):
        """Create the Qdrant client from the URL and API key in ``settings``."""
        self.client = QdrantClient(
            url=settings.QDRANT_URL,
            api_key=settings.QDRANT_API_KEY,
        )
        self.collection_name = "book_documents"
        self.vector_size = settings.EMBEDDING_DIMENSION  # Use dimension from settings

    def create_collection(self, force_recreate: bool = False):
        """Create the Qdrant collection for document embeddings.

        If the collection already exists, its vector size is compared with
        ``self.vector_size``.

        Args:
            force_recreate: If True, delete existing collection and recreate.

        Raises:
            ValueError: The collection already exists with a different vector
                size than ``self.vector_size``.
            Exception: Any other error raised by the client while creating
                the collection is printed and re-raised unchanged.
        """
        if force_recreate:
            try:
                self.client.delete_collection(collection_name=self.collection_name)
                print(f"✓ Deleted existing collection '{self.collection_name}'")
            except Exception as delete_error:
                # Best effort: the collection may simply not exist yet. Report
                # the failure instead of hiding it, then attempt the create,
                # which will surface any real connectivity problem.
                print(
                    f"ℹ Could not delete collection '{self.collection_name}' "
                    f"before recreating: {delete_error}"
                )

        try:
            self.client.create_collection(
                collection_name=self.collection_name,
                vectors_config=VectorParams(
                    size=self.vector_size,  # Use correct dimension (768 for Gemini)
                    distance=Distance.COSINE,
                ),
            )
        except Exception as e:
            if "already exists" not in str(e):
                print(f"✗ Collection creation error: {e}")
                raise
            print(f"ℹ Collection '{self.collection_name}' already exists")
            self._check_existing_dimension()
        else:
            print(
                f"✓ Collection '{self.collection_name}' created "
                f"with dimension {self.vector_size}"
            )

    def _check_existing_dimension(self):
        """Compare the existing collection's vector size with the expected one.

        Raises:
            ValueError: The existing collection's size differs from
                ``self.vector_size``.
        """
        try:
            collection_info = self.client.get_collection(self.collection_name)
            existing_dim = collection_info.config.params.vectors.size
        except AttributeError:
            # The collection info did not expose ``config.params.vectors.size``.
            print(f"⚠ Could not verify collection dimensions")
            return

        if existing_dim == self.vector_size:
            print(f"✓ Dimension matches: {self.vector_size}")
            return

        print(f"⚠ DIMENSION MISMATCH!")
        print(f" Expected: {self.vector_size} (Gemini text-embedding-004)")
        print(f" Found: {existing_dim} (in existing collection)")
        print(f" FIX: Call vector_store.create_collection(force_recreate=True)")
        raise ValueError(
            f"Vector dimension mismatch: collection has {existing_dim}, "
            f"but Gemini embeddings are {self.vector_size}. "
            f"Delete the collection and recreate it."
        )

    @staticmethod
    def _resolve_point_id(raw_id: Any) -> Any:
        """Turn a caller-supplied document ID into a point ID.

        Integers are kept as integers, because stringifying them (``1`` ->
        ``"1"``) yields a value that is neither an unsigned integer nor a UUID.
        A missing or ``None`` ID is replaced by a fresh random UUID string.
        Everything else is converted with ``str``.
        """
        if raw_id is None:
            return str(uuid.uuid4())
        if isinstance(raw_id, int) and not isinstance(raw_id, bool):
            return raw_id
        return str(raw_id)

    def add_documents(self, documents: List[Dict[str, Any]]) -> List[str]:
        """Add documents to the Qdrant collection.

        Args:
            documents: List of document dictionaries with keys:
                - id: document ID (optional; a UUID is generated when absent)
                - vector: embedding vector of length ``self.vector_size``
                - payload: document metadata and content

        Returns:
            List of added document IDs, as strings, in input order.

        Raises:
            ValueError: Any document's vector is ``None`` or does not have
                ``self.vector_size`` elements.
            KeyError: A document lacks the ``vector`` or ``payload`` key.
        """
        if not documents:
            return []

        # Check every vector up front (not just the first) so a bad document
        # is reported before anything is sent to Qdrant.
        for index, doc in enumerate(documents):
            vector = doc["vector"]
            actual_size = None if vector is None else len(vector)
            if actual_size != self.vector_size:
                raise ValueError(
                    f"Vector dimension mismatch!\n"
                    f" Document index: {index}\n"
                    f" Expected: {self.vector_size} (Gemini text-embedding-004)\n"
                    f" Got: {actual_size} (from your embeddings)\n"
                    f" The Qdrant collection needs to be recreated with correct dimensions."
                )

        points = [
            PointStruct(
                id=self._resolve_point_id(doc.get("id")),
                vector=doc["vector"],
                payload=doc["payload"],
            )
            for doc in documents
        ]

        # Pick the write method by feature detection rather than by catching
        # AttributeError, which would also hide AttributeErrors raised inside
        # a perfectly available ``upsert``.
        if hasattr(self.client, "upsert"):
            write_points = self.client.upsert
        else:
            write_points = self.client.upsert_points
        write_points(collection_name=self.collection_name, points=points)

        return [str(point.id) for point in points]

    def search_documents(self, query_vector: List[float], limit: int = 5,
                         chapter_filter: Optional[str] = None) -> List[Dict[str, Any]]:
        """Search for documents using a query vector.

        Args:
            query_vector: The query embedding vector; must have
                ``self.vector_size`` elements.
            limit: Maximum number of results to return.
            chapter_filter: Optional chapter name; when given, only points
                whose ``chapter`` payload field equals it are returned.

        Returns:
            List of dicts with keys ``id`` (str), ``payload`` and ``score``.

        Raises:
            ValueError: ``query_vector`` has the wrong number of elements.
        """
        if len(query_vector) != self.vector_size:
            raise ValueError(
                f"Query vector dimension mismatch: expected {self.vector_size}, "
                f"got {len(query_vector)}"
            )

        search_filter = None
        if chapter_filter:
            search_filter = Filter(
                must=[
                    FieldCondition(
                        key="chapter",
                        match=MatchValue(value=chapter_filter),
                    )
                ]
            )

        # Same preference order as before (``search`` first, ``query_points``
        # as fallback), but chosen via hasattr so that an AttributeError raised
        # inside the call itself is not silently rerouted to the other API.
        if hasattr(self.client, "search"):
            results = self.client.search(
                collection_name=self.collection_name,
                query_vector=query_vector,
                limit=limit,
                query_filter=search_filter,
            )
        else:
            results = self.client.query_points(
                collection_name=self.collection_name,
                query=query_vector,
                limit=limit,
                query_filter=search_filter,
            )

        # ``query_points`` wraps its hits in an object with ``.points``;
        # ``search`` returns the hits directly.
        hits = getattr(results, "points", results)

        processed_results = []
        for hit in hits:
            if hasattr(hit, "id") and hasattr(hit, "payload") and hasattr(hit, "score"):
                processed_results.append({
                    "id": str(hit.id),
                    "payload": hit.payload,
                    "score": hit.score,
                })
            elif isinstance(hit, dict) and "id" in hit and "payload" in hit:
                processed_results.append({
                    "id": str(hit["id"]),
                    "payload": hit["payload"],
                    "score": hit.get("score", 0),
                })
        return processed_results

    def delete_collection(self):
        """Delete the Qdrant collection.

        Best effort: any error raised by the client is printed, not re-raised.
        """
        try:
            self.client.delete_collection(collection_name=self.collection_name)
            print(f"✓ Collection '{self.collection_name}' deleted")
        except Exception as e:
            print(f"✗ Error deleting collection: {e}")
# Module-level shared vector store instance. Constructing it runs
# QdrantVectorStore.__init__, which builds a QdrantClient from settings,
# as soon as this module is imported.
vector_store = QdrantVectorStore()