Tahasaif3 committed on
Commit
bbbd8da
·
verified ·
1 Parent(s): 0651c3f

Update app/services/vector_store.py

Browse files
Files changed (1) hide show
  1. app/services/vector_store.py +69 -38
app/services/vector_store.py CHANGED
@@ -1,3 +1,4 @@
 
1
  from qdrant_client import QdrantClient
2
  from qdrant_client.models import Distance, VectorParams, PointStruct, Filter, FieldCondition, MatchValue
3
  from typing import List, Dict, Any, Optional
@@ -11,51 +12,81 @@ class QdrantVectorStore:
11
  api_key=settings.QDRANT_API_KEY
12
  )
13
  self.collection_name = "book_documents"
 
14
 
15
- def create_collection(self):
16
- """Create the Qdrant collection for document embeddings"""
 
 
 
 
17
  try:
 
 
 
 
 
 
 
18
  self.client.create_collection(
19
  collection_name=self.collection_name,
20
  vectors_config=VectorParams(
21
- size=1536, # OpenAI ada-002 embedding dimension
22
  distance=Distance.COSINE
23
  )
24
  )
25
- print(f"Collection '{self.collection_name}' created successfully")
26
- except AttributeError as e:
27
- # Handle different Qdrant client versions
28
- print(f"AttributeError during collection creation: {e}")
29
- try:
30
- # Alternative method
31
- from qdrant_client.http.models import CreateCollection
32
- self.client.create_collection(
33
- collection_name=self.collection_name,
34
- vectors_config=VectorParams(
35
- size=1536, # OpenAI ada-002 embedding dimension
36
- distance=Distance.COSINE
37
- )
38
- )
39
- print(f"Collection '{self.collection_name}' created successfully (alternative method)")
40
- except Exception as e2:
41
- print(f"Alternative collection creation failed: {e2}")
42
  except Exception as e:
43
- # Collection might already exist
44
- print(f"Collection creation info: {e}")
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
45
 
46
  def add_documents(self, documents: List[Dict[str, Any]]) -> List[str]:
47
- """
48
- Add documents to the Qdrant collection
49
 
50
  Args:
51
  documents: List of document dictionaries with keys:
52
  - id: document ID
53
- - vector: embedding vector
54
  - payload: document metadata and content
55
 
56
  Returns:
57
  List of added document IDs
58
  """
 
 
 
 
 
 
 
 
 
 
 
 
 
59
  points = []
60
  for doc in documents:
61
  point_id = str(doc.get("id", str(uuid.uuid4())))
@@ -72,7 +103,6 @@ class QdrantVectorStore:
72
  points=points
73
  )
74
  except AttributeError:
75
- # Fallback for different Qdrant client versions
76
  self.client.upsert_points(
77
  collection_name=self.collection_name,
78
  points=points
@@ -82,17 +112,23 @@ class QdrantVectorStore:
82
 
83
  def search_documents(self, query_vector: List[float], limit: int = 5,
84
  chapter_filter: Optional[str] = None) -> List[Dict[str, Any]]:
85
- """
86
- Search for documents using a query vector
87
 
88
  Args:
89
- query_vector: The query embedding vector
90
  limit: Maximum number of results to return
91
  chapter_filter: Optional chapter name to filter results
92
 
93
  Returns:
94
  List of matching documents with their payloads and scores
95
  """
 
 
 
 
 
 
 
96
  search_filter = None
97
  if chapter_filter:
98
  search_filter = Filter(
@@ -104,7 +140,6 @@ class QdrantVectorStore:
104
  ]
105
  )
106
 
107
- # Use the correct method signature for the Qdrant client version
108
  try:
109
  results = self.client.search(
110
  collection_name=self.collection_name,
@@ -113,7 +148,6 @@ class QdrantVectorStore:
113
  query_filter=search_filter
114
  )
115
  except AttributeError:
116
- # Fallback for different Qdrant client versions
117
  results = self.client.query_points(
118
  collection_name=self.collection_name,
119
  query=query_vector,
@@ -121,21 +155,18 @@ class QdrantVectorStore:
121
  query_filter=search_filter
122
  )
123
 
124
- # Process results based on the response format
125
  processed_results = []
126
  result_items = results if not hasattr(results, 'points') else results.points
127
 
128
  for result in result_items:
129
- # Handle different result formats
130
  if hasattr(result, 'id') and hasattr(result, 'payload') and hasattr(result, 'score'):
131
- # Standard format
132
  processed_results.append({
133
  "id": str(result.id),
134
  "payload": result.payload,
135
  "score": result.score
136
  })
137
  elif isinstance(result, dict) and 'id' in result and 'payload' in result:
138
- # Dictionary format
139
  processed_results.append({
140
  "id": str(result['id']),
141
  "payload": result['payload'],
@@ -148,9 +179,9 @@ class QdrantVectorStore:
148
  """Delete the Qdrant collection"""
149
  try:
150
  self.client.delete_collection(collection_name=self.collection_name)
151
- except AttributeError:
152
- # Fallback for different Qdrant client versions
153
- self.client.delete_collection_points(collection_name=self.collection_name)
154
 
155
  # Initialize the vector store
156
  vector_store = QdrantVectorStore()
 
1
+ # ===== 2. UPDATE app/services/vector_store.py =====
2
  from qdrant_client import QdrantClient
3
  from qdrant_client.models import Distance, VectorParams, PointStruct, Filter, FieldCondition, MatchValue
4
  from typing import List, Dict, Any, Optional
 
12
  api_key=settings.QDRANT_API_KEY
13
  )
14
  self.collection_name = "book_documents"
15
+ self.vector_size = settings.EMBEDDING_DIMENSION # Use dimension from settings
16
 
17
+ def create_collection(self, force_recreate: bool = False):
18
+ """Create the Qdrant collection for document embeddings
19
+
20
+ Args:
21
+ force_recreate: If True, delete existing collection and recreate
22
+ """
23
  try:
24
+ if force_recreate:
25
+ try:
26
+ self.client.delete_collection(collection_name=self.collection_name)
27
+ print(f"✓ Deleted existing collection '{self.collection_name}'")
28
+ except Exception:
29
+ pass # Collection doesn't exist
30
+
31
  self.client.create_collection(
32
  collection_name=self.collection_name,
33
  vectors_config=VectorParams(
34
+ size=self.vector_size, # Use correct dimension (768 for Gemini)
35
  distance=Distance.COSINE
36
  )
37
  )
38
+ print(f"Collection '{self.collection_name}' created with dimension {self.vector_size}")
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
39
  except Exception as e:
40
+ error_msg = str(e)
41
+ if "already exists" in error_msg:
42
+ print(f"ℹ Collection '{self.collection_name}' already exists")
43
+ # Check dimension mismatch
44
+ try:
45
+ collection_info = self.client.get_collection(self.collection_name)
46
+ existing_dim = collection_info.config.params.vectors.size
47
+ if existing_dim != self.vector_size:
48
+ print(f"⚠ DIMENSION MISMATCH!")
49
+ print(f" Expected: {self.vector_size} (Gemini text-embedding-004)")
50
+ print(f" Found: {existing_dim} (in existing collection)")
51
+ print(f" FIX: Call vector_store.create_collection(force_recreate=True)")
52
+ raise ValueError(
53
+ f"Vector dimension mismatch: collection has {existing_dim}, "
54
+ f"but Gemini embeddings are {self.vector_size}. "
55
+ f"Delete the collection and recreate it."
56
+ )
57
+ else:
58
+ print(f"✓ Dimension matches: {self.vector_size}")
59
+ except AttributeError:
60
+ print(f"⚠ Could not verify collection dimensions")
61
+ else:
62
+ print(f"✗ Collection creation error: {e}")
63
+ raise
64
 
65
  def add_documents(self, documents: List[Dict[str, Any]]) -> List[str]:
66
+ """Add documents to the Qdrant collection
 
67
 
68
  Args:
69
  documents: List of document dictionaries with keys:
70
  - id: document ID
71
+ - vector: embedding vector (length must equal settings.EMBEDDING_DIMENSION, e.g. 768 for Gemini)
72
  - payload: document metadata and content
73
 
74
  Returns:
75
  List of added document IDs
76
  """
77
+ if not documents:
78
+ return []
79
+
80
+ # Validate first vector dimension
81
+ first_vector = documents[0].get("vector")
82
+ if first_vector and len(first_vector) != self.vector_size:
83
+ raise ValueError(
84
+ f"Vector dimension mismatch!\n"
85
+ f" Expected: {self.vector_size} (Gemini text-embedding-004)\n"
86
+ f" Got: {len(first_vector)} (from your embeddings)\n"
87
+ f" The Qdrant collection needs to be recreated with correct dimensions."
88
+ )
89
+
90
  points = []
91
  for doc in documents:
92
  point_id = str(doc.get("id", str(uuid.uuid4())))
 
103
  points=points
104
  )
105
  except AttributeError:
 
106
  self.client.upsert_points(
107
  collection_name=self.collection_name,
108
  points=points
 
112
 
113
  def search_documents(self, query_vector: List[float], limit: int = 5,
114
  chapter_filter: Optional[str] = None) -> List[Dict[str, Any]]:
115
+ """Search for documents using a query vector
 
116
 
117
  Args:
118
+ query_vector: The query embedding vector (length must equal settings.EMBEDDING_DIMENSION, e.g. 768 for Gemini)
119
  limit: Maximum number of results to return
120
  chapter_filter: Optional chapter name to filter results
121
 
122
  Returns:
123
  List of matching documents with their payloads and scores
124
  """
125
+ # Validate query vector dimension
126
+ if len(query_vector) != self.vector_size:
127
+ raise ValueError(
128
+ f"Query vector dimension mismatch: expected {self.vector_size}, "
129
+ f"got {len(query_vector)}"
130
+ )
131
+
132
  search_filter = None
133
  if chapter_filter:
134
  search_filter = Filter(
 
140
  ]
141
  )
142
 
 
143
  try:
144
  results = self.client.search(
145
  collection_name=self.collection_name,
 
148
  query_filter=search_filter
149
  )
150
  except AttributeError:
 
151
  results = self.client.query_points(
152
  collection_name=self.collection_name,
153
  query=query_vector,
 
155
  query_filter=search_filter
156
  )
157
 
158
+ # Process results
159
  processed_results = []
160
  result_items = results if not hasattr(results, 'points') else results.points
161
 
162
  for result in result_items:
 
163
  if hasattr(result, 'id') and hasattr(result, 'payload') and hasattr(result, 'score'):
 
164
  processed_results.append({
165
  "id": str(result.id),
166
  "payload": result.payload,
167
  "score": result.score
168
  })
169
  elif isinstance(result, dict) and 'id' in result and 'payload' in result:
 
170
  processed_results.append({
171
  "id": str(result['id']),
172
  "payload": result['payload'],
 
179
  """Delete the Qdrant collection"""
180
  try:
181
  self.client.delete_collection(collection_name=self.collection_name)
182
+ print(f"✓ Collection '{self.collection_name}' deleted")
183
+ except Exception as e:
184
+ print(f"✗ Error deleting collection: {e}")
185
 
186
  # Initialize the vector store
187
  vector_store = QdrantVectorStore()