cryogenic22 committed on
Commit
d7b10b8
·
verified ·
1 Parent(s): 1a84935

Update utils/vector_store.py

Browse files
Files changed (1) hide show
  1. utils/vector_store.py +82 -74
utils/vector_store.py CHANGED
@@ -7,96 +7,104 @@ from datetime import datetime
7
  import streamlit as st
8
 
9
  class VectorStore:
10
- def __init__(self, storage_path: str = "data/vector_store"):
11
- """Initialize VectorStore with storage management."""
12
- self.storage_path = storage_path
13
- os.makedirs(storage_path, exist_ok=True)
14
-
15
- self.model = SentenceTransformer('all-MiniLM-L6-v2')
16
- self.vectors = []
17
- self._load_vectors()
18
-
19
- def _load_vectors(self):
20
- """Load stored vectors with error handling."""
21
- vector_file = os.path.join(self.storage_path, "vectors.pkl")
22
- try:
23
- if os.path.exists(vector_file):
24
- with open(vector_file, "rb") as f:
25
- self.vectors = pickle.load(f)
26
- if not isinstance(self.vectors, list):
27
- self.vectors = []
28
- except Exception as e:
29
- self.vectors = []
30
-
31
- def _save_vectors(self):
32
- """Save vectors with error handling."""
33
- vector_file = os.path.join(self.storage_path, "vectors.pkl")
34
- try:
35
- with open(vector_file, "wb") as f:
36
- pickle.dump(self.vectors, f)
37
- except Exception as e:
38
- raise Exception(f"Error saving vectors: {str(e)}")
39
-
40
- def add_document(self, doc_id: str, text: str, metadata: Dict[str, Any] = None):
41
- """Add a document to the vector store."""
42
- try:
43
- vector = self.model.encode(text, convert_to_tensor=True)
44
- doc_record = {
45
- "doc_id": doc_id,
46
- "vector": vector,
47
- "text": text,
48
- "metadata": metadata or {}
49
- }
50
- if not isinstance(self.vectors, list):
51
- self.vectors = []
52
- self.vectors.append(doc_record)
53
- self._save_vectors()
54
- except Exception as e:
55
- raise Exception(f"Error adding document: {str(e)}")
56
-
57
  def similarity_search(self, query: str, k: int = 3, filter_docs: Optional[List[str]] = None) -> List[Dict]:
58
- """Perform similarity search with document filtering."""
59
  try:
60
  if not self.vectors:
61
  return []
62
 
 
63
  query_vector = self.model.encode(query, convert_to_tensor=True)
 
 
64
  results = []
65
-
66
  for doc in self.vectors:
67
- # Apply document filter if provided
68
  if filter_docs and doc["doc_id"] not in filter_docs:
69
  continue
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
70
 
71
- similarity = util.pytorch_cos_sim(query_vector, doc["vector"]).item()
72
- results.append({
73
- "doc_id": doc["doc_id"],
74
- "text": doc["text"],
75
- "metadata": doc["metadata"],
76
- "score": float(similarity)
77
- })
78
-
79
  results.sort(key=lambda x: x["score"], reverse=True)
80
  return results[:k]
81
 
82
  except Exception as e:
83
- raise Exception(f"Error in similarity search: {str(e)}")
 
84
 
85
- def delete_document(self, doc_id: str) -> bool:
86
- """Delete a document from the vector store."""
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
87
  try:
88
- initial_length = len(self.vectors)
89
- self.vectors = [doc for doc in self.vectors if doc["doc_id"] != doc_id]
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
90
  self._save_vectors()
91
- return len(self.vectors) < initial_length
92
  except Exception as e:
93
- raise Exception(f"Error deleting document: {str(e)}")
94
-
95
- def clear(self):
96
- """Clear all vectors."""
97
- self.vectors = []
98
- self._save_vectors()
99
-
100
- def __len__(self):
101
- """Get number of documents in store."""
102
- return len(self.vectors) if self.vectors is not None else 0
 
7
  import streamlit as st
8
 
9
  class VectorStore:
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
10
  def similarity_search(self, query: str, k: int = 3, filter_docs: Optional[List[str]] = None) -> List[Dict]:
11
+ """Enhanced similarity search with ontology awareness."""
12
  try:
13
  if not self.vectors:
14
  return []
15
 
16
+ # Encode query
17
  query_vector = self.model.encode(query, convert_to_tensor=True)
18
+
19
+ # Calculate enhanced similarities
20
  results = []
 
21
  for doc in self.vectors:
22
+ # Skip if document is filtered out
23
  if filter_docs and doc["doc_id"] not in filter_docs:
24
  continue
25
+
26
+ try:
27
+ # Base similarity score
28
+ base_similarity = util.pytorch_cos_sim(query_vector, doc["vector"]).item()
29
+
30
+ # Calculate ontology boost
31
+ ontology_boost = self._calculate_ontology_relevance(
32
+ query,
33
+ doc.get('metadata', {}).get('ontology_links', [])
34
+ )
35
+
36
+ # Final score combining vector similarity and ontology relevance
37
+ final_score = (base_similarity * 0.7) + (ontology_boost * 0.3)
38
+
39
+ results.append({
40
+ "doc_id": doc["doc_id"],
41
+ "text": doc["text"],
42
+ "metadata": doc["metadata"],
43
+ "score": float(final_score),
44
+ "base_similarity": float(base_similarity),
45
+ "ontology_boost": float(ontology_boost)
46
+ })
47
+
48
+ except Exception as e:
49
+ st.warning(f"Error processing document: {str(e)}")
50
+ continue
51
 
52
+ # Sort by final score
 
 
 
 
 
 
 
53
  results.sort(key=lambda x: x["score"], reverse=True)
54
  return results[:k]
55
 
56
  except Exception as e:
57
+ st.error(f"Error in similarity search: {str(e)}")
58
+ return []
59
 
60
+ def _calculate_ontology_relevance(self, query: str, ontology_links: List[Dict]) -> float:
61
+ """Calculate ontology-based relevance score."""
62
+ if not ontology_links:
63
+ return 0.0
64
+
65
+ query_lower = query.lower()
66
+ relevance_score = 0.0
67
+
68
+ for link in ontology_links:
69
+ # Direct concept match
70
+ if link['concept'].lower() in query_lower:
71
+ relevance_score += 0.3
72
+
73
+ # Description match
74
+ if 'description' in link and any(term in query_lower
75
+ for term in link['description'].lower().split()):
76
+ relevance_score += 0.2
77
+
78
+ # Related concepts match
79
+ if 'relationships' in link:
80
+ for related in link['relationships']:
81
+ if related.lower() in query_lower:
82
+ relevance_score += 0.1
83
+
84
+ # Normalize score to [0, 1]
85
+ return min(1.0, relevance_score)
86
+
87
+ def add_document(self, doc_id: str, text: str, metadata: Dict[str, Any] = None):
88
+ """Add document with enhanced metadata processing."""
89
  try:
90
+ # Create vector embedding
91
+ vector = self.model.encode(text, convert_to_tensor=True)
92
+
93
+ # Ensure metadata includes ontology links
94
+ if metadata and 'ontology_links' not in metadata:
95
+ metadata['ontology_links'] = []
96
+
97
+ doc_record = {
98
+ "doc_id": doc_id,
99
+ "vector": vector,
100
+ "text": text,
101
+ "metadata": metadata or {}
102
+ }
103
+
104
+ if not isinstance(self.vectors, list):
105
+ self.vectors = []
106
+ self.vectors.append(doc_record)
107
  self._save_vectors()
108
+
109
  except Exception as e:
110
+ raise Exception(f"Error adding document: {str(e)}")