| |
| from qdrant_client import QdrantClient |
| from qdrant_client.models import Distance, VectorParams, PointStruct, Filter, FieldCondition, MatchValue |
| from typing import List, Dict, Any, Optional |
| import uuid |
| from app.core.config import settings |
|
|
class QdrantVectorStore:
    """Wrapper around one Qdrant collection that stores document embeddings.

    Attributes set by ``__init__``:
        client: ``QdrantClient`` built from ``settings.QDRANT_URL`` and
            ``settings.QDRANT_API_KEY``.
        collection_name: Always ``"book_documents"``.
        vector_size: ``settings.EMBEDDING_DIMENSION``; every stored vector and
            every query vector is checked against this length.
    """

    def __init__(self):
        """Build the Qdrant client and record collection name and vector size."""
        self.client = QdrantClient(
            url=settings.QDRANT_URL,
            api_key=settings.QDRANT_API_KEY,
        )
        self.collection_name = "book_documents"
        self.vector_size = settings.EMBEDDING_DIMENSION

    def create_collection(self, force_recreate: bool = False) -> None:
        """Create the Qdrant collection for document embeddings.

        Args:
            force_recreate: If True, try to delete an existing collection
                before creating it again.

        Raises:
            ValueError: The collection already exists with a vector size that
                differs from ``self.vector_size``.
            Exception: Any other creation error from the client is printed
                and re-raised unchanged.
        """
        if force_recreate:
            try:
                self.client.delete_collection(collection_name=self.collection_name)
                print(f"✓ Deleted existing collection '{self.collection_name}'")
            except Exception as delete_error:
                # Best effort: a failed delete must not prevent the create
                # attempt below, but it is reported instead of being hidden.
                print(
                    f"⚠ Could not delete collection "
                    f"'{self.collection_name}': {delete_error}"
                )

        try:
            self.client.create_collection(
                collection_name=self.collection_name,
                vectors_config=VectorParams(
                    size=self.vector_size,
                    distance=Distance.COSINE,
                ),
            )
            print(
                f"✓ Collection '{self.collection_name}' created "
                f"with dimension {self.vector_size}"
            )
        except Exception as e:
            # The client reports a pre-existing collection through the error
            # text; anything else is a real failure.
            if "already exists" not in str(e):
                print(f"✗ Collection creation error: {e}")
                raise
            print(f"ℹ Collection '{self.collection_name}' already exists")
            self._verify_existing_dimension()

    def _verify_existing_dimension(self) -> None:
        """Check that the existing collection's vector size matches ``vector_size``.

        Raises:
            ValueError: The stored size differs from ``self.vector_size``.
        """
        try:
            collection_info = self.client.get_collection(self.collection_name)
            existing_dim = collection_info.config.params.vectors.size
        except AttributeError:
            # The returned config does not expose ``config.params.vectors.size``,
            # so the size cannot be compared; warn and carry on.
            print("⚠ Could not verify collection dimensions")
            return

        if existing_dim == self.vector_size:
            print(f"✓ Dimension matches: {self.vector_size}")
            return

        print("✗ DIMENSION MISMATCH!")
        print(f" Expected: {self.vector_size} (Gemini text-embedding-004)")
        print(f" Found: {existing_dim} (in existing collection)")
        print(" FIX: Call vector_store.create_collection(force_recreate=True)")
        raise ValueError(
            f"Vector dimension mismatch: collection has {existing_dim}, "
            f"but Gemini embeddings are {self.vector_size}. "
            f"Delete the collection and recreate it."
        )

    @staticmethod
    def _point_id(raw_id: Any) -> Any:
        """Turn a document id into a value usable as a Qdrant point id.

        Missing/None ids get a fresh UUID string, integer ids are kept as
        integers, and everything else is stringified.
        """
        if raw_id is None:
            return str(uuid.uuid4())
        # Qdrant point ids are unsigned integers or UUIDs; "42" (a stringified
        # integer) is neither, so integers must not be converted to str.
        if isinstance(raw_id, int) and not isinstance(raw_id, bool):
            return raw_id
        return str(raw_id)

    def add_documents(self, documents: List[Dict[str, Any]]) -> List[str]:
        """Add documents to the Qdrant collection.

        Args:
            documents: List of document dictionaries with keys:
                - id: optional document ID (a UUID is generated when missing)
                - vector: embedding vector of length ``self.vector_size``
                - payload: document metadata and content

        Returns:
            The IDs of the upserted points, as strings, in input order.

        Raises:
            ValueError: Any supplied vector has the wrong dimension.
            KeyError: A document lacks the ``vector`` or ``payload`` key.
        """
        if not documents:
            return []

        # Validate every vector before building anything, so a bad document
        # in the middle of the batch fails here with a clear message rather
        # than inside the Qdrant request.
        for doc in documents:
            vector = doc.get("vector")
            if vector is not None and len(vector) != self.vector_size:
                raise ValueError(
                    f"Vector dimension mismatch!\n"
                    f" Expected: {self.vector_size} (Gemini text-embedding-004)\n"
                    f" Got: {len(vector)} (from your embeddings)\n"
                    f" The Qdrant collection needs to be recreated with correct dimensions."
                )

        points = [
            PointStruct(
                id=self._point_id(doc.get("id")),
                vector=doc["vector"],
                payload=doc["payload"],
            )
            for doc in documents
        ]

        # Pick the method up front instead of catching AttributeError around
        # the call, which would also hide AttributeErrors raised inside it.
        # NOTE(review): the ``upsert_points`` fallback name is inherited from
        # the original code; confirm it exists in the targeted client version.
        upsert = getattr(self.client, "upsert", None)
        if upsert is None:
            upsert = self.client.upsert_points
        upsert(collection_name=self.collection_name, points=points)

        return [str(point.id) for point in points]

    def search_documents(
        self,
        query_vector: List[float],
        limit: int = 5,
        chapter_filter: Optional[str] = None,
    ) -> List[Dict[str, Any]]:
        """Search for documents using a query vector.

        Args:
            query_vector: Query embedding of length ``self.vector_size``.
            limit: Maximum number of results to return.
            chapter_filter: Optional value the ``chapter`` payload field must
                match.

        Returns:
            A list of ``{"id": str, "payload": ..., "score": ...}`` dicts.

        Raises:
            ValueError: ``query_vector`` has the wrong dimension.
        """
        if len(query_vector) != self.vector_size:
            raise ValueError(
                f"Query vector dimension mismatch: expected {self.vector_size}, "
                f"got {len(query_vector)}"
            )

        search_filter = None
        if chapter_filter:
            search_filter = Filter(
                must=[
                    FieldCondition(
                        key="chapter",
                        match=MatchValue(value=chapter_filter),
                    )
                ]
            )

        # Use ``search`` when the client has it, otherwise ``query_points``.
        # Selecting by attribute avoids masking AttributeErrors raised inside
        # the call itself.
        search = getattr(self.client, "search", None)
        if search is not None:
            results = search(
                collection_name=self.collection_name,
                query_vector=query_vector,
                limit=limit,
                query_filter=search_filter,
            )
        else:
            results = self.client.query_points(
                collection_name=self.collection_name,
                query=query_vector,
                limit=limit,
                query_filter=search_filter,
            )

        # A response object carrying ``.points`` is unwrapped; otherwise the
        # result is iterated directly.
        hits = results.points if hasattr(results, "points") else results

        processed_results = []
        for hit in hits:
            if hasattr(hit, "id") and hasattr(hit, "payload") and hasattr(hit, "score"):
                processed_results.append({
                    "id": str(hit.id),
                    "payload": hit.payload,
                    "score": hit.score,
                })
            elif isinstance(hit, dict) and "id" in hit and "payload" in hit:
                processed_results.append({
                    "id": str(hit["id"]),
                    "payload": hit["payload"],
                    "score": hit.get("score", 0),
                })
            # Items in any other shape are skipped.

        return processed_results

    def delete_collection(self):
        """Delete the Qdrant collection; errors are printed, not raised."""
        try:
            self.client.delete_collection(collection_name=self.collection_name)
            print(f"✓ Collection '{self.collection_name}' deleted")
        except Exception as e:
            print(f"✗ Error deleting collection: {e}")
|
|
| |
# Shared module-level instance. Constructing it runs QdrantVectorStore.__init__
# (and therefore builds the Qdrant client) when this module is imported.
vector_store = QdrantVectorStore()