cryogenic22 committed on
Commit
e7f2507
·
verified ·
1 Parent(s): 3537d63

Update utils/vector_store.py

Browse files
Files changed (1) hide show
  1. utils/vector_store.py +40 -51
utils/vector_store.py CHANGED
@@ -5,12 +5,13 @@ from sentence_transformers import SentenceTransformer, util
5
  import numpy as np
6
  from datetime import datetime
7
 
 
8
  class VectorStore:
9
  def __init__(self, storage_path: str = "data/vector_store", model_name: str = 'all-MiniLM-L6-v2'):
10
  """Initialize VectorStore with improved chunk handling."""
11
  self.storage_path = storage_path
12
  os.makedirs(storage_path, exist_ok=True)
13
-
14
  self.model = SentenceTransformer(model_name)
15
  self.vectors = self._load_vectors()
16
  self.chunk_size = 512 # Optimal size for most transformer models
@@ -32,11 +33,11 @@ class VectorStore:
32
  """Save vectors with backup and atomic write."""
33
  vector_file = os.path.join(self.storage_path, "vectors.pkl")
34
  backup_file = vector_file + ".backup"
35
-
36
  # Create backup of existing vectors
37
  if os.path.exists(vector_file):
38
  os.replace(vector_file, backup_file)
39
-
40
  try:
41
  with open(vector_file, "wb") as f:
42
  pickle.dump(self.vectors, f)
@@ -53,7 +54,7 @@ class VectorStore:
53
  """Add document with improved chunking and metadata."""
54
  # Create chunks with overlap
55
  chunks = self._create_chunks(text)
56
-
57
  # Add timestamp and chunk info to metadata
58
  base_metadata = {
59
  **metadata,
@@ -69,10 +70,10 @@ class VectorStore:
69
  "chunk_idx": chunk_idx,
70
  "chunk_text": chunk[:200] # Store preview of chunk text
71
  }
72
-
73
  # Encode chunk
74
  vector = self.model.encode(chunk, convert_to_tensor=True)
75
-
76
  # Store chunk with metadata
77
  self.vectors.append({
78
  "doc_id": f"{doc_id}_chunk_{chunk_idx}",
@@ -80,7 +81,7 @@ class VectorStore:
80
  "text": chunk,
81
  "metadata": chunk_metadata
82
  })
83
-
84
  self._save_vectors()
85
 
86
  def _create_chunks(self, text: str) -> List[str]:
@@ -90,10 +91,10 @@ class VectorStore:
90
  chunks = []
91
  current_chunk = []
92
  current_size = 0
93
-
94
  for sentence in sentences:
95
  sentence_size = len(sentence.split())
96
-
97
  if current_size + sentence_size > self.chunk_size:
98
  # Save current chunk
99
  if current_chunk:
@@ -105,39 +106,39 @@ class VectorStore:
105
  else:
106
  current_chunk.append(sentence)
107
  current_size += sentence_size
108
-
109
  # Add final chunk
110
  if current_chunk:
111
  chunks.append(' '.join(current_chunk))
112
-
113
  return chunks
114
 
115
  def similarity_search(self, query: str, k: int = 3) -> List[Dict]:
116
- """Perform similarity search with error handling."""
117
- try:
118
- # If no vectors are stored yet, return empty list
119
- if not self.vectors:
120
- return []
121
 
122
- query_vector = self.model.encode(query, convert_to_tensor=True)
123
- results = []
124
 
125
- for doc in self.vectors:
126
- similarity = util.pytorch_cos_sim(query_vector, doc["vector"]).item()
127
- results.append({
128
- "text": doc["text"],
129
- "metadata": doc["metadata"],
130
- "score": similarity
131
- })
132
 
133
- # Sort by similarity and return top k
134
- results.sort(key=lambda x: x["score"], reverse=True)
135
- return results[:k]
 
 
 
 
136
 
137
- except Exception as e:
138
- st.error(f"Error in similarity search: {str(e)}")
139
- return []
140
-
141
  def _rerank_results(self, results: List[Dict], query: str) -> List[Dict]:
142
  """Re-rank results considering chunk position and metadata relevance."""
143
  for result in results:
@@ -145,24 +146,24 @@ class VectorStore:
145
  chunk_idx = result["metadata"].get("chunk_idx", 0)
146
  total_chunks = result["metadata"].get("total_chunks", 1)
147
  position_score = 1 - (chunk_idx / total_chunks) # Favor earlier chunks
148
-
149
  # Adjust score based on metadata relevance
150
  metadata_score = self._calculate_metadata_relevance(result["metadata"], query)
151
-
152
  # Combine scores
153
  result["final_score"] = (
154
  result["score"] * 0.6 + # Base similarity
155
  position_score * 0.2 + # Position importance
156
  metadata_score * 0.2 # Metadata relevance
157
  )
158
-
159
  return sorted(results, key=lambda x: x["final_score"], reverse=True)
160
 
161
  def _calculate_metadata_relevance(self, metadata: Dict, query: str) -> float:
162
  """Calculate relevance score based on metadata matching."""
163
  relevance_score = 0.0
164
  query_lower = query.lower()
165
-
166
  # Check for metadata field matches
167
  for key, value in metadata.items():
168
  if isinstance(value, str):
@@ -170,19 +171,8 @@ class VectorStore:
170
  relevance_score += 0.2
171
  elif query_lower in value.lower():
172
  relevance_score += 0.1
173
-
174
- return min(1.0, relevance_score) # Normalize to [0,1]
175
 
176
- def _get_nested_dict_value(self, d: Dict, key_path: str):
177
- """Get value from nested dictionary using dot notation."""
178
- keys = key_path.split('.')
179
- value = d
180
- for key in keys:
181
- if isinstance(value, dict):
182
- value = value.get(key)
183
- else:
184
- return None
185
- return value
186
 
187
  def get_document_embeddings(self, doc_id: str) -> List[Dict]:
188
  """Retrieve all embeddings for a specific document."""
@@ -190,8 +180,7 @@ class VectorStore:
190
 
191
  def delete_document(self, doc_id: str):
192
  """Delete all chunks associated with a document."""
193
- self.vectors = [doc for doc in self.vectors
194
- if doc["metadata"]["doc_id"] != doc_id]
195
  self._save_vectors()
196
 
197
  def update_metadata(self, doc_id: str, metadata_updates: Dict):
@@ -199,4 +188,4 @@ class VectorStore:
199
  for doc in self.vectors:
200
  if doc["metadata"]["doc_id"] == doc_id:
201
  doc["metadata"].update(metadata_updates)
202
- self._save_vectors()
 
5
  import numpy as np
6
  from datetime import datetime
7
 
8
+
9
  class VectorStore:
10
  def __init__(self, storage_path: str = "data/vector_store", model_name: str = 'all-MiniLM-L6-v2'):
11
  """Initialize VectorStore with improved chunk handling."""
12
  self.storage_path = storage_path
13
  os.makedirs(storage_path, exist_ok=True)
14
+
15
  self.model = SentenceTransformer(model_name)
16
  self.vectors = self._load_vectors()
17
  self.chunk_size = 512 # Optimal size for most transformer models
 
33
  """Save vectors with backup and atomic write."""
34
  vector_file = os.path.join(self.storage_path, "vectors.pkl")
35
  backup_file = vector_file + ".backup"
36
+
37
  # Create backup of existing vectors
38
  if os.path.exists(vector_file):
39
  os.replace(vector_file, backup_file)
40
+
41
  try:
42
  with open(vector_file, "wb") as f:
43
  pickle.dump(self.vectors, f)
 
54
  """Add document with improved chunking and metadata."""
55
  # Create chunks with overlap
56
  chunks = self._create_chunks(text)
57
+
58
  # Add timestamp and chunk info to metadata
59
  base_metadata = {
60
  **metadata,
 
70
  "chunk_idx": chunk_idx,
71
  "chunk_text": chunk[:200] # Store preview of chunk text
72
  }
73
+
74
  # Encode chunk
75
  vector = self.model.encode(chunk, convert_to_tensor=True)
76
+
77
  # Store chunk with metadata
78
  self.vectors.append({
79
  "doc_id": f"{doc_id}_chunk_{chunk_idx}",
 
81
  "text": chunk,
82
  "metadata": chunk_metadata
83
  })
84
+
85
  self._save_vectors()
86
 
87
  def _create_chunks(self, text: str) -> List[str]:
 
91
  chunks = []
92
  current_chunk = []
93
  current_size = 0
94
+
95
  for sentence in sentences:
96
  sentence_size = len(sentence.split())
97
+
98
  if current_size + sentence_size > self.chunk_size:
99
  # Save current chunk
100
  if current_chunk:
 
106
  else:
107
  current_chunk.append(sentence)
108
  current_size += sentence_size
109
+
110
  # Add final chunk
111
  if current_chunk:
112
  chunks.append(' '.join(current_chunk))
113
+
114
  return chunks
115
 
116
  def similarity_search(self, query: str, k: int = 3) -> List[Dict]:
117
+ """Perform similarity search with error handling."""
118
+ try:
119
+ # If no vectors are stored yet, return empty list
120
+ if not self.vectors:
121
+ return []
122
 
123
+ query_vector = self.model.encode(query, convert_to_tensor=True)
124
+ results = []
125
 
126
+ for doc in self.vectors:
127
+ similarity = util.pytorch_cos_sim(query_vector, doc["vector"]).item()
128
+ results.append({
129
+ "text": doc["text"],
130
+ "metadata": doc["metadata"],
131
+ "score": similarity
132
+ })
133
 
134
+ # Sort by similarity and return top k
135
+ results.sort(key=lambda x: x["score"], reverse=True)
136
+ return results[:k]
137
+
138
+ except Exception as e:
139
+ print(f"Error in similarity search: {str(e)}")
140
+ return []
141
 
 
 
 
 
142
  def _rerank_results(self, results: List[Dict], query: str) -> List[Dict]:
143
  """Re-rank results considering chunk position and metadata relevance."""
144
  for result in results:
 
146
  chunk_idx = result["metadata"].get("chunk_idx", 0)
147
  total_chunks = result["metadata"].get("total_chunks", 1)
148
  position_score = 1 - (chunk_idx / total_chunks) # Favor earlier chunks
149
+
150
  # Adjust score based on metadata relevance
151
  metadata_score = self._calculate_metadata_relevance(result["metadata"], query)
152
+
153
  # Combine scores
154
  result["final_score"] = (
155
  result["score"] * 0.6 + # Base similarity
156
  position_score * 0.2 + # Position importance
157
  metadata_score * 0.2 # Metadata relevance
158
  )
159
+
160
  return sorted(results, key=lambda x: x["final_score"], reverse=True)
161
 
162
  def _calculate_metadata_relevance(self, metadata: Dict, query: str) -> float:
163
  """Calculate relevance score based on metadata matching."""
164
  relevance_score = 0.0
165
  query_lower = query.lower()
166
+
167
  # Check for metadata field matches
168
  for key, value in metadata.items():
169
  if isinstance(value, str):
 
171
  relevance_score += 0.2
172
  elif query_lower in value.lower():
173
  relevance_score += 0.1
 
 
174
 
175
+ return min(1.0, relevance_score) # Normalize to [0,1]
 
 
 
 
 
 
 
 
 
176
 
177
  def get_document_embeddings(self, doc_id: str) -> List[Dict]:
178
  """Retrieve all embeddings for a specific document."""
 
180
 
181
  def delete_document(self, doc_id: str):
182
  """Delete all chunks associated with a document."""
183
+ self.vectors = [doc for doc in self.vectors if doc["metadata"]["doc_id"] != doc_id]
 
184
  self._save_vectors()
185
 
186
  def update_metadata(self, doc_id: str, metadata_updates: Dict):
 
188
  for doc in self.vectors:
189
  if doc["metadata"]["doc_id"] == doc_id:
190
  doc["metadata"].update(metadata_updates)
191
+ self._save_vectors()