Spaces:
Runtime error
Runtime error
File size: 5,403 Bytes
129cd69 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 | import os
from typing import Any, Dict, Iterable, List, Optional, Type
from langchain_core.documents import Document
from langchain_core.embeddings import Embeddings
from langchain_core.vectorstores import VST, VectorStore
# Paragraph ids carry a one-letter field-type code; this maps each code to the
# attribute name that is looked up on a resource's data during search.
FIELD_TYPES = dict(f="files", t="texts", l="links")
class NucliaDB(VectorStore):
    """NucliaDB vector store."""

    # Connection settings ("LOCAL", "BACKEND", "TOKEN"). Created per instance
    # in __init__ so that separate stores never share or overwrite each
    # other's backend URL or token.
    _config: Dict[str, Any]

    def __init__(
        self,
        knowledge_box: str,
        local: bool,
        api_key: Optional[str] = None,
        backend: Optional[str] = None,
    ) -> None:
        """Initialize the NucliaDB client.

        Args:
            knowledge_box: the Knowledge Box id.
            local: Whether to use a local NucliaDB instance or Nuclia Cloud
            api_key: A contributor API key for the kb (needed when local is False)
            backend: The backend url to use when local is True, defaults to
                http://localhost:8080

        Raises:
            ValueError: if the `nuclia` python package cannot be imported.
        """
        try:
            from nuclia.sdk import NucliaAuth
        except ImportError as err:
            raise ValueError(
                "nuclia python package not found. "
                "Please install it with `pip install nuclia`."
            ) from err

        # Fresh dict per instance (a class-level dict would be shared state).
        self._config = {"LOCAL": local}
        zone = os.environ.get("NUCLIA_ZONE", "europe-1")
        self._kb = knowledge_box
        if local:
            if not backend:
                backend = "http://localhost:8080"
            self._config["BACKEND"] = f"{backend}/api/v1"
            self._config["TOKEN"] = None
            NucliaAuth().nucliadb(url=backend)
            NucliaAuth().kb(url=self.kb_url, interactive=False)
        else:
            self._config["BACKEND"] = f"https://{zone}.nuclia.cloud/api/v1"
            self._config["TOKEN"] = api_key
            NucliaAuth().kb(
                url=self.kb_url, token=self._config["TOKEN"], interactive=False
            )

    @property
    def is_local(self) -> bool:
        """Return the `local` flag this store was created with."""
        return self._config["LOCAL"]

    @property
    def kb_url(self) -> str:
        """Return the Knowledge Box URL built from the backend URL and kb id."""
        return f"{self._config['BACKEND']}/kb/{self._kb}"

    def add_texts(
        self,
        texts: Iterable[str],
        metadatas: Optional[List[dict]] = None,
        **kwargs: Any,
    ) -> List[str]:
        """Upload texts to NucliaDB.

        Args:
            texts: The texts to upload; one resource is created per text.
            metadatas: Optional metadata, matched to `texts` by position. When
                omitted or empty, an empty string is stored as the metadata.

        Returns:
            The values returned by the resource creation call, in input order.

        Raises:
            IndexError: if `metadatas` is given but is shorter than `texts`.
        """
        from nuclia.sdk import NucliaResource

        factory = NucliaResource()
        ids = []
        for i, text in enumerate(texts):
            extra: Dict[str, Any] = {"metadata": metadatas[i] if metadatas else ""}
            resource_id = factory.create(
                texts={"text": {"body": text}},
                extra=extra,
                url=self.kb_url,
                api_key=self._config["TOKEN"],
            )
            ids.append(resource_id)
        return ids

    def delete(self, ids: Optional[List[str]] = None, **kwargs: Any) -> Optional[bool]:
        """Delete resources by id.

        Every id is attempted even if an earlier deletion fails.

        Args:
            ids: The resource ids to delete.

        Returns:
            None when no ids are given; otherwise True if no deletion raised
            ValueError, False if at least one did.
        """
        if not ids:
            return None
        from nuclia.sdk import NucliaResource

        factory = NucliaResource()
        all_deleted = True
        for rid in ids:
            try:
                factory.delete(rid=rid, url=self.kb_url, api_key=self._config["TOKEN"])
            except ValueError:
                all_deleted = False
        return all_deleted

    def similarity_search(
        self, query: str, k: int = 4, **kwargs: Any
    ) -> List[Document]:
        """Search the Knowledge Box and return matching paragraphs as documents.

        Args:
            query: The search query text.
            k: Passed as the page size of the find request.

        Returns:
            One Document per matching paragraph, sorted by paragraph order.
            Each document's metadata holds the resource's extra metadata under
            "extra" and the matching field value under "value".
        """
        from nuclia.sdk import NucliaSearch
        from nucliadb_models.search import FindRequest, ResourceProperties

        request = FindRequest(
            query=query,
            page_size=k,
            show=[ResourceProperties.VALUES, ResourceProperties.EXTRA],
        )
        search = NucliaSearch()
        results = search.find(
            query=request, url=self.kb_url, api_key=self._config["TOKEN"]
        )
        paragraphs = []
        for resource in results.resources.values():
            # Same for every paragraph of this resource, so compute it once.
            extra_metadata = getattr(
                getattr(resource, "extra", {}), "metadata", None
            )
            for field in resource.fields.values():
                for paragraph_id, paragraph in field.paragraphs.items():
                    # The code reads the field-type code from part 1 and the
                    # field id from part 2 of the "/"-separated paragraph id.
                    info = paragraph_id.split("/")
                    if len(info) < 3:
                        # Malformed id: skip instead of raising IndexError.
                        continue
                    field_type = FIELD_TYPES.get(info[1])
                    if not field_type:
                        continue
                    field_id = info[2]
                    # The attribute may be missing or set to None; treat both
                    # as "no values" so the lookup cannot fail on None.
                    field_values = getattr(resource.data, field_type, None) or {}
                    paragraphs.append(
                        {
                            "text": paragraph.text,
                            "metadata": {
                                "extra": extra_metadata,
                                "value": field_values.get(field_id),
                            },
                            "order": paragraph.order,
                        }
                    )
        sorted_paragraphs = sorted(paragraphs, key=lambda x: x["order"])
        return [
            Document(page_content=paragraph["text"], metadata=paragraph["metadata"])
            for paragraph in sorted_paragraphs
        ]

    @classmethod
    def from_texts(
        cls: Type[VST],
        texts: List[str],
        embedding: Embeddings,
        metadatas: Optional[List[dict]] = None,
        **kwargs: Any,
    ) -> VST:
        """Return VectorStore initialized from texts and embeddings.

        Not supported by this store; always raises NotImplementedError.
        """
        raise NotImplementedError
|