AXZ91 committed on
Commit
e77ad25
·
1 Parent(s): 7f80f00

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +204 -3
app.py CHANGED
@@ -33,13 +33,214 @@ from llama_index import VectorStoreIndex, SimpleDirectoryReader, Document
33
  from llama_index.vector_stores import DeepLakeVectorStore
34
 
35
  # Create an index over the documents
36
- vector_store = DeepLakeVectorStore(dataset_path=dataset_path
37
- )
38
 
39
- storage_context = StorageContext.from_defaults(vector_store=vector_store)
40
 
41
 
42
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
43
  llm = OpenAI(model='gpt-3.5-turbo', temperature=0, max_tokens=3924)
44
  embed_model = OpenAIEmbedding()
45
  node_parser = SimpleNodeParser(
 
33
  from llama_index.vector_stores import DeepLakeVectorStore
34
 
35
  # Create an index over the documents
36
+ #vector_store = DeepLakeVectorStore(dataset_path=dataset_path
 
37
 
 
38
 
39
 
40
 
41
+
42
+
43
+
44
+
45
+
46
+
47
+
48
+
49
+ """LanceDB vector store."""
50
+ from typing import Any, List, Optional
51
+
52
+ from llama_index.schema import MetadataMode, NodeRelationship, RelatedNodeInfo, TextNode
53
+ from llama_index.vector_stores.types import (
54
+ MetadataFilters,
55
+ NodeWithEmbedding,
56
+ VectorStore,
57
+ VectorStoreQuery,
58
+ VectorStoreQueryResult,
59
+ )
60
+ from llama_index.vector_stores.utils import node_to_metadata_dict
61
+
62
+
63
+ def _to_lance_filter(standard_filters: MetadataFilters) -> Any:
64
+ """Translate standard metadata filters to Lance specific spec."""
65
+ filters = []
66
+ for filter in standard_filters.filters:
67
+ if isinstance(filter.value, str):
68
+ filters.append(filter.key + ' = "' + filter.value + '"')
69
+ else:
70
+ filters.append(filter.key + " = " + str(filter.value))
71
+ return " AND ".join(filters)
72
+
73
+
74
class LanceDBVectorStore1(VectorStore):
    """The LanceDB Vector Store.

    Stores text and embeddings in LanceDB. The vector store will open an existing
    LanceDB dataset or create the dataset if it does not exist.

    Args:
        uri (str, required): Location where LanceDB will store its files.
        table_name (str, optional): The table name where the embeddings will be stored.
            Defaults to "vectors".
        nprobes (int, optional): The number of probes used.
            A higher number makes search more accurate but also slower.
            Defaults to 20.
        refine_factor: (int, optional): Refine the results by reading extra elements
            and re-ranking them in memory.
            Defaults to None

    Raises:
        ImportError: Unable to import `lancedb`.

    Returns:
        LanceDBVectorStore: VectorStore that supports creating LanceDB datasets and
            querying it.
    """

    # Node text is stored in the LanceDB table alongside the embedding (see add()).
    stores_text = True
    # Node metadata is flattened into top-level table columns rather than nested.
    flat_metadata: bool = True

    def __init__(
        self,
        uri: str,
        table_name: str = "vectors",
        nprobes: int = 20,
        refine_factor: Optional[int] = None,
        **kwargs: Any,
    ) -> None:
        """Init params."""
        import_err_msg = "`lancedb` package not found, please run `pip install lancedb`"
        try:
            import lancedb  # noqa: F401
        except ImportError:
            raise ImportError(import_err_msg)

        # Opens (or creates) the LanceDB database rooted at `uri`.
        self.connection = lancedb.connect(uri)
        self.uri = uri
        self.table_name = table_name
        self.nprobes = nprobes
        self.refine_factor = refine_factor

    @property
    def client(self) -> None:
        """Get client.

        LanceDB exposes no separate client object, so this is always None.
        """
        return None

    def add(
        self,
        embedding_results: List[NodeWithEmbedding],
    ) -> List[str]:
        # Convert each embedding result into one flat row dict for LanceDB;
        # returns the list of node ids that were written.
        data = []
        ids = []
        for result in embedding_results:
            metadata = node_to_metadata_dict(
                result.node, remove_text=True, flat_metadata=self.flat_metadata
            )
            append_data = {
                "id": result.id,
                "doc_id": result.ref_doc_id,
                "vector": result.embedding,
                # Text without metadata prepended — metadata lives in its own columns.
                "text": result.node.get_content(metadata_mode=MetadataMode.NONE),
            }
            # Flattened node metadata becomes additional table columns.
            append_data.update(metadata)
            data.append(append_data)
            ids.append(result.id)

        # Append when the table already exists; otherwise create it from these rows
        # (LanceDB infers the schema from the first batch).
        if self.table_name in self.connection.table_names():
            tbl = self.connection.open_table(self.table_name)
            tbl.add(data)
        else:
            self.connection.create_table(self.table_name, data)
        return ids

    def delete(self, ref_doc_id: str, **delete_kwargs: Any) -> None:
        """
        Delete nodes using with ref_doc_id.

        Args:
            ref_doc_id (str): The doc_id of the document to delete.

        """
        # NOTE(review): add() writes the column "doc_id"; this filter targets
        # "document_id", which must therefore come from the flattened node
        # metadata — verify that column actually exists in the table, or this
        # delete will silently match nothing.
        table = self.connection.open_table(self.table_name)
        table.delete('document_id = "' + ref_doc_id + '"')

    def query(
        self,
        query: VectorStoreQuery,
        **kwargs: Any,
    ) -> VectorStoreQueryResult:
        """Query index for top k most similar nodes."""
        if query.filters is not None:
            # A filter may come from the generic query OR a lancedb-specific
            # `where` kwarg, but never both at once.
            if "where" in kwargs:
                raise ValueError(
                    "Cannot specify filter via both query and kwargs. "
                    "Use kwargs only for lancedb specific items that are "
                    "not supported via the generic query interface."
                )
            where = _to_lance_filter(query.filters)
        else:
            where = kwargs.pop("where", None)

        table = self.connection.open_table(self.table_name)
        lance_query = (
            table.search(query.query_embedding)
            .limit(query.similarity_top_k)
            .where(where)
            .nprobes(self.nprobes)
        )

        if self.refine_factor is not None:
            # assumes the lancedb query builder mutates in place — the return
            # value is discarded here; confirm against the lancedb API version.
            lance_query.refine_factor(self.refine_factor)

        # Materialize matches as a DataFrame: one row per node, with the
        # distance in "_distance" and the columns written by add().
        results = lance_query.to_df()
        nodes = []
        for _, item in results.iterrows():
            node = TextNode(
                text=item.text,
                id_=item.id,
                relationships={
                    NodeRelationship.SOURCE: RelatedNodeInfo(node_id=item.doc_id),
                },
            )
            nodes.append(node)

        return VectorStoreQueryResult(
            nodes=nodes,
            similarities=results["_distance"].tolist(),
            ids=results["id"].tolist(),
        )
211
+
212
+
213
+
214
+
215
+
216
+
217
+
218
+
219
+
220
+
221
+
222
+
223
+
224
+
225
+
226
import logging
import sys

# Uncomment to see debug logs
# logging.basicConfig(stream=sys.stdout, level=logging.DEBUG)
# logging.getLogger().addHandler(logging.StreamHandler(stream=sys.stdout))

from llama_index import SimpleDirectoryReader, Document, StorageContext
from llama_index.indices.vector_store import VectorStoreIndex
from llama_index.vector_stores import LanceDBVectorStore
import textwrap

# Build the local LanceDB-backed vector store used by the index.
vector_store = LanceDBVectorStore1(uri="sample_data/")

# Fix: removed a stray trailing ')' that made this line a SyntaxError.
storage_context = StorageContext.from_defaults(vector_store=vector_store)
242
+
243
+
244
  llm = OpenAI(model='gpt-3.5-turbo', temperature=0, max_tokens=3924)
245
  embed_model = OpenAIEmbedding()
246
  node_parser = SimpleNodeParser(