Spaces:
Runtime error
Runtime error
💾 Adding a vectorstore (first implementation)
Browse files- .gitignore +3 -1
- example.ipynb +93 -21
- megabots/__init__.py +21 -14
- megabots/vectorstores.py +33 -0
.gitignore
CHANGED
|
@@ -6,4 +6,6 @@ dist
|
|
| 6 |
build
|
| 7 |
**.pickle
|
| 8 |
**.pkl
|
| 9 |
-
.env
|
|
|
|
|
|
|
|
|
| 6 |
build
|
| 7 |
**.pickle
|
| 8 |
**.pkl
|
| 9 |
+
.env
|
| 10 |
+
volumes
|
| 11 |
+
docker-compose.yml
|
example.ipynb
CHANGED
|
@@ -1,31 +1,40 @@
|
|
| 1 |
{
|
| 2 |
"cells": [
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 3 |
{
|
| 4 |
"cell_type": "code",
|
| 5 |
-
"execution_count":
|
| 6 |
"metadata": {},
|
| 7 |
-
"outputs": [
|
| 8 |
-
{
|
| 9 |
-
"data": {
|
| 10 |
-
"text/plain": [
|
| 11 |
-
"True"
|
| 12 |
-
]
|
| 13 |
-
},
|
| 14 |
-
"execution_count": 8,
|
| 15 |
-
"metadata": {},
|
| 16 |
-
"output_type": "execute_result"
|
| 17 |
-
}
|
| 18 |
-
],
|
| 19 |
"source": [
|
| 20 |
"from megabots import bot\n",
|
| 21 |
-
"from dotenv import load_dotenv
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 22 |
"\n",
|
| 23 |
-
"
|
| 24 |
]
|
| 25 |
},
|
| 26 |
{
|
| 27 |
"cell_type": "code",
|
| 28 |
-
"execution_count":
|
| 29 |
"metadata": {},
|
| 30 |
"outputs": [
|
| 31 |
{
|
|
@@ -33,7 +42,7 @@
|
|
| 33 |
"output_type": "stream",
|
| 34 |
"text": [
|
| 35 |
"Using model: gpt-3.5-turbo\n",
|
| 36 |
-
"Loading path from
|
| 37 |
]
|
| 38 |
},
|
| 39 |
{
|
|
@@ -42,7 +51,7 @@
|
|
| 42 |
"'The first roster of the Avengers included Iron Man, Thor, Hulk, Ant-Man, and the Wasp.'"
|
| 43 |
]
|
| 44 |
},
|
| 45 |
-
"execution_count":
|
| 46 |
"metadata": {},
|
| 47 |
"output_type": "execute_result"
|
| 48 |
}
|
|
@@ -52,9 +61,19 @@
|
|
| 52 |
"qnabot.ask(\"what was the first roster of the avengers?\")"
|
| 53 |
]
|
| 54 |
},
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 55 |
{
|
| 56 |
"cell_type": "code",
|
| 57 |
-
"execution_count":
|
| 58 |
"metadata": {},
|
| 59 |
"outputs": [
|
| 60 |
{
|
|
@@ -62,7 +81,7 @@
|
|
| 62 |
"output_type": "stream",
|
| 63 |
"text": [
|
| 64 |
"Using model: gpt-3.5-turbo\n",
|
| 65 |
-
"Loading path from
|
| 66 |
]
|
| 67 |
},
|
| 68 |
{
|
|
@@ -71,7 +90,7 @@
|
|
| 71 |
"\"Hmmm! Let me think about that... Ah yes, the original Avengers lineup included Iron Man, Thor, Hulk, Ant-Man, and the Wasp. They were like the ultimate superhero squad, except for maybe the Teenage Mutant Ninja Turtles. But let's be real, they were just a bunch of turtles who liked pizza.\""
|
| 72 |
]
|
| 73 |
},
|
| 74 |
-
"execution_count":
|
| 75 |
"metadata": {},
|
| 76 |
"output_type": "execute_result"
|
| 77 |
}
|
|
@@ -97,6 +116,59 @@
|
|
| 97 |
")\n",
|
| 98 |
"qnabot.ask(\"what was the first roster of the avengers?\")\n"
|
| 99 |
]
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 100 |
}
|
| 101 |
],
|
| 102 |
"metadata": {
|
|
|
|
| 1 |
{
|
| 2 |
"cells": [
|
| 3 |
+
{
|
| 4 |
+
"attachments": {},
|
| 5 |
+
"cell_type": "markdown",
|
| 6 |
+
"metadata": {},
|
| 7 |
+
"source": [
|
| 8 |
+
"# Examples\n",
|
| 9 |
+
"\n",
|
| 10 |
+
"Below you can find some examples of how to use the 🤖 `Megabots` library."
|
| 11 |
+
]
|
| 12 |
+
},
|
| 13 |
{
|
| 14 |
"cell_type": "code",
|
| 15 |
+
"execution_count": 13,
|
| 16 |
"metadata": {},
|
| 17 |
+
"outputs": [],
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 18 |
"source": [
|
| 19 |
"from megabots import bot\n",
|
| 20 |
+
"from dotenv import load_dotenv"
|
| 21 |
+
]
|
| 22 |
+
},
|
| 23 |
+
{
|
| 24 |
+
"attachments": {},
|
| 25 |
+
"cell_type": "markdown",
|
| 26 |
+
"metadata": {},
|
| 27 |
+
"source": [
|
| 28 |
+
"### Creating a bot\n",
|
| 29 |
+
"\n",
|
| 30 |
+
"The `bot` object is the main object of the library. It is used to create a bot and to interact with it.\n",
|
| 31 |
"\n",
|
| 32 |
+
"The `index` argument specifies the index to use for the bot. It can either be a saved index file (e.g., `index.pkl`) or a directory of documents (e.g., `./index`). In the case of a directory, the index will be created automatically. If no index is specified, `bot` will look for `index.pkl` or `./index`."
|
| 33 |
]
|
| 34 |
},
|
| 35 |
{
|
| 36 |
"cell_type": "code",
|
| 37 |
+
"execution_count": 14,
|
| 38 |
"metadata": {},
|
| 39 |
"outputs": [
|
| 40 |
{
|
|
|
|
| 42 |
"output_type": "stream",
|
| 43 |
"text": [
|
| 44 |
"Using model: gpt-3.5-turbo\n",
|
| 45 |
+
"Loading path from pickle file: ./index.pkl ...\n"
|
| 46 |
]
|
| 47 |
},
|
| 48 |
{
|
|
|
|
| 51 |
"'The first roster of the Avengers included Iron Man, Thor, Hulk, Ant-Man, and the Wasp.'"
|
| 52 |
]
|
| 53 |
},
|
| 54 |
+
"execution_count": 14,
|
| 55 |
"metadata": {},
|
| 56 |
"output_type": "execute_result"
|
| 57 |
}
|
|
|
|
| 61 |
"qnabot.ask(\"what was the first roster of the avengers?\")"
|
| 62 |
]
|
| 63 |
},
|
| 64 |
+
{
|
| 65 |
+
"attachments": {},
|
| 66 |
+
"cell_type": "markdown",
|
| 67 |
+
"metadata": {},
|
| 68 |
+
"source": [
|
| 69 |
+
"### Changing the bot's prompt\n",
|
| 70 |
+
"\n",
|
| 71 |
+
"You can change the bot's prompt to customize it to your needs."
|
| 72 |
+
]
|
| 73 |
+
},
|
| 74 |
{
|
| 75 |
"cell_type": "code",
|
| 76 |
+
"execution_count": 15,
|
| 77 |
"metadata": {},
|
| 78 |
"outputs": [
|
| 79 |
{
|
|
|
|
| 81 |
"output_type": "stream",
|
| 82 |
"text": [
|
| 83 |
"Using model: gpt-3.5-turbo\n",
|
| 84 |
+
"Loading path from pickle file: ./index.pkl ...\n"
|
| 85 |
]
|
| 86 |
},
|
| 87 |
{
|
|
|
|
| 90 |
"\"Hmmm! Let me think about that... Ah yes, the original Avengers lineup included Iron Man, Thor, Hulk, Ant-Man, and the Wasp. They were like the ultimate superhero squad, except for maybe the Teenage Mutant Ninja Turtles. But let's be real, they were just a bunch of turtles who liked pizza.\""
|
| 91 |
]
|
| 92 |
},
|
| 93 |
+
"execution_count": 15,
|
| 94 |
"metadata": {},
|
| 95 |
"output_type": "execute_result"
|
| 96 |
}
|
|
|
|
| 116 |
")\n",
|
| 117 |
"qnabot.ask(\"what was the first roster of the avengers?\")\n"
|
| 118 |
]
|
| 119 |
+
},
|
| 120 |
+
{
|
| 121 |
+
"attachments": {},
|
| 122 |
+
"cell_type": "markdown",
|
| 123 |
+
"metadata": {},
|
| 124 |
+
"source": [
|
| 125 |
+
"### Using Megabots with Milvus\n",
|
| 126 |
+
"\n",
|
| 127 |
+
"Megabots `bot` can also use Milvus as a backend for its search engine. You can find an example of how to do it below.\n",
|
| 128 |
+
"\n",
|
| 129 |
+
"In order to run Milvus you need to follow [this guide](https://milvus.io/docs/example_code.md) to download a docker compose file and run it.\n",
|
| 130 |
+
"The command is:\n",
|
| 131 |
+
" \n",
|
| 132 |
+
"```bash\n",
|
| 133 |
+
"wget https://raw.githubusercontent.com/milvus-io/pymilvus/v2.2.7/examples/hello_milvus.py\n",
|
| 134 |
+
"```\n",
|
| 135 |
+
"You can then [install Attu](https://milvus.io/docs/attu_install-docker.md) as a management tool for Milvus"
|
| 136 |
+
]
|
| 137 |
+
},
|
| 138 |
+
{
|
| 139 |
+
"cell_type": "code",
|
| 140 |
+
"execution_count": 11,
|
| 141 |
+
"metadata": {},
|
| 142 |
+
"outputs": [
|
| 143 |
+
{
|
| 144 |
+
"name": "stdout",
|
| 145 |
+
"output_type": "stream",
|
| 146 |
+
"text": [
|
| 147 |
+
"Using model: gpt-3.5-turbo\n"
|
| 148 |
+
]
|
| 149 |
+
},
|
| 150 |
+
{
|
| 151 |
+
"data": {
|
| 152 |
+
"text/plain": [
|
| 153 |
+
"'The first roster of the Avengers included Iron Man, Thor, Hulk, Ant-Man, and the Wasp.'"
|
| 154 |
+
]
|
| 155 |
+
},
|
| 156 |
+
"execution_count": 11,
|
| 157 |
+
"metadata": {},
|
| 158 |
+
"output_type": "execute_result"
|
| 159 |
+
}
|
| 160 |
+
],
|
| 161 |
+
"source": [
|
| 162 |
+
"from megabots import bot, vectorstore\n",
|
| 163 |
+
"\n",
|
| 164 |
+
"# Create a vectorstore object. Default port is 19530 and default host is localhost\n",
|
| 165 |
+
"milvus = vectorstore(\"milvus\")\n",
|
| 166 |
+
"\n",
|
| 167 |
+
"# Point it to your files directory so that it can index the files and add them to the vectorstore\n",
|
| 168 |
+
"bot = bot(\"qna-over-docs\", index=\"./examples/files/\", vectorstore=milvus)\n",
|
| 169 |
+
"\n",
|
| 170 |
+
"bot.ask(\"what was the first roster of the avengers?\")\n"
|
| 171 |
+
]
|
| 172 |
}
|
| 173 |
],
|
| 174 |
"metadata": {
|
megabots/__init__.py
CHANGED
|
@@ -1,7 +1,7 @@
|
|
|
|
|
| 1 |
from langchain.llms import OpenAI
|
| 2 |
from langchain.chat_models import ChatOpenAI
|
| 3 |
from langchain.embeddings import OpenAIEmbeddings
|
| 4 |
-
from langchain.document_loaders import DirectoryLoader, S3DirectoryLoader
|
| 5 |
from langchain.chains.qa_with_sources import load_qa_with_sources_chain
|
| 6 |
from langchain.vectorstores.faiss import FAISS
|
| 7 |
import gradio as gr
|
|
@@ -11,10 +11,9 @@ import os
|
|
| 11 |
from dotenv import load_dotenv
|
| 12 |
from langchain.prompts import PromptTemplate
|
| 13 |
from langchain.chains.question_answering import load_qa_chain
|
| 14 |
-
from langchain.chains.conversational_retrieval.prompts import
|
| 15 |
-
|
| 16 |
-
|
| 17 |
-
)
|
| 18 |
|
| 19 |
load_dotenv()
|
| 20 |
|
|
@@ -25,15 +24,17 @@ class Bot:
|
|
| 25 |
model: str | None = None,
|
| 26 |
prompt_template: str | None = None,
|
| 27 |
prompt_variables: list[str] | None = None,
|
| 28 |
-
memory: str | None = None,
|
| 29 |
index: str | None = None,
|
| 30 |
sources: bool | None = False,
|
|
|
|
|
|
|
|
|
|
| 31 |
verbose: bool = False,
|
| 32 |
temperature: int = 0,
|
| 33 |
):
|
| 34 |
self.select_model(model, temperature)
|
| 35 |
self.create_loader(index)
|
| 36 |
-
self.load_or_create_index(index)
|
| 37 |
|
| 38 |
# Load the question-answering chain for the selected model
|
| 39 |
self.chain = self.create_chain(
|
|
@@ -83,18 +84,25 @@ class Bot:
|
|
| 83 |
)
|
| 84 |
self.loader = DirectoryLoader(index, recursive=True)
|
| 85 |
|
| 86 |
-
def load_or_create_index(self,
|
| 87 |
# Load an existing index from disk or create a new one if not available
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 88 |
|
| 89 |
# Is pickle
|
| 90 |
-
if
|
| 91 |
-
print("Loading path from
|
| 92 |
-
with open(
|
| 93 |
self.search_index = pickle.load(f)
|
| 94 |
return
|
| 95 |
|
| 96 |
# Is directory
|
| 97 |
-
if
|
| 98 |
print("Creating index...")
|
| 99 |
self.search_index = FAISS.from_documents(
|
| 100 |
self.loader.load_and_split(), OpenAIEmbeddings()
|
|
@@ -125,9 +133,8 @@ SUPPORTED_TASKS = {
|
|
| 125 |
"impl": Bot,
|
| 126 |
"default": {
|
| 127 |
"model": "gpt-3.5-turbo",
|
| 128 |
-
"prompt": "",
|
| 129 |
"temperature": 0,
|
| 130 |
-
"index": "./
|
| 131 |
},
|
| 132 |
}
|
| 133 |
}
|
|
|
|
| 1 |
+
from typing import Any
|
| 2 |
from langchain.llms import OpenAI
|
| 3 |
from langchain.chat_models import ChatOpenAI
|
| 4 |
from langchain.embeddings import OpenAIEmbeddings
|
|
|
|
| 5 |
from langchain.chains.qa_with_sources import load_qa_with_sources_chain
|
| 6 |
from langchain.vectorstores.faiss import FAISS
|
| 7 |
import gradio as gr
|
|
|
|
| 11 |
from dotenv import load_dotenv
|
| 12 |
from langchain.prompts import PromptTemplate
|
| 13 |
from langchain.chains.question_answering import load_qa_chain
|
| 14 |
+
from langchain.chains.conversational_retrieval.prompts import QA_PROMPT
|
| 15 |
+
from langchain.document_loaders import DirectoryLoader
|
| 16 |
+
from megabots.vectorstores import vectorstore
|
|
|
|
| 17 |
|
| 18 |
load_dotenv()
|
| 19 |
|
|
|
|
| 24 |
model: str | None = None,
|
| 25 |
prompt_template: str | None = None,
|
| 26 |
prompt_variables: list[str] | None = None,
|
|
|
|
| 27 |
index: str | None = None,
|
| 28 |
sources: bool | None = False,
|
| 29 |
+
# TODO: Fix this typing
|
| 30 |
+
vectorstore: Any | None = None,
|
| 31 |
+
memory: str | None = None,
|
| 32 |
verbose: bool = False,
|
| 33 |
temperature: int = 0,
|
| 34 |
):
|
| 35 |
self.select_model(model, temperature)
|
| 36 |
self.create_loader(index)
|
| 37 |
+
self.load_or_create_index(index, vectorstore)
|
| 38 |
|
| 39 |
# Load the question-answering chain for the selected model
|
| 40 |
self.chain = self.create_chain(
|
|
|
|
| 84 |
)
|
| 85 |
self.loader = DirectoryLoader(index, recursive=True)
|
| 86 |
|
| 87 |
+
def load_or_create_index(self, index: str, vectorstore=None):
|
| 88 |
# Load an existing index from disk or create a new one if not available
|
| 89 |
+
if vectorstore is not None:
|
| 90 |
+
self.search_index = vectorstore.client.from_documents(
|
| 91 |
+
self.loader.load_and_split(),
|
| 92 |
+
OpenAIEmbeddings(),
|
| 93 |
+
connection_args={"host": vectorstore.host, "port": vectorstore.port},
|
| 94 |
+
)
|
| 95 |
+
return
|
| 96 |
|
| 97 |
# Is pickle
|
| 98 |
+
if index is not None and "pkl" in index or "pickle" in index:
|
| 99 |
+
print("Loading path from pickle file: ", index, "...")
|
| 100 |
+
with open(index, "rb") as f:
|
| 101 |
self.search_index = pickle.load(f)
|
| 102 |
return
|
| 103 |
|
| 104 |
# Is directory
|
| 105 |
+
if index is not None and os.path.isdir(index):
|
| 106 |
print("Creating index...")
|
| 107 |
self.search_index = FAISS.from_documents(
|
| 108 |
self.loader.load_and_split(), OpenAIEmbeddings()
|
|
|
|
| 133 |
"impl": Bot,
|
| 134 |
"default": {
|
| 135 |
"model": "gpt-3.5-turbo",
|
|
|
|
| 136 |
"temperature": 0,
|
| 137 |
+
"index": "./index",
|
| 138 |
},
|
| 139 |
}
|
| 140 |
}
|
megabots/vectorstores.py
ADDED
|
@@ -0,0 +1,33 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
from typing import Type, TypeVar
|
| 2 |
+
from langchain.vectorstores import Milvus, Qdrant
|
| 3 |
+
from abc import ABC
|
| 4 |
+
|
| 5 |
+
|
| 6 |
+
class MilvusVectorStore:
|
| 7 |
+
def __init__(self, host: str, port: int):
|
| 8 |
+
self.host = host
|
| 9 |
+
self.port = port
|
| 10 |
+
self.client = Milvus
|
| 11 |
+
|
| 12 |
+
|
| 13 |
+
SUPPORTED_VECTORSTORES = {
|
| 14 |
+
"milvus": {
|
| 15 |
+
"impl": MilvusVectorStore,
|
| 16 |
+
"default": {"host": "localhost", "port": 19530},
|
| 17 |
+
}
|
| 18 |
+
}
|
| 19 |
+
|
| 20 |
+
|
| 21 |
+
def vectorstore(name: str) -> MilvusVectorStore:
|
| 22 |
+
"""Return a vectorstore object."""
|
| 23 |
+
|
| 24 |
+
if name is None:
|
| 25 |
+
raise RuntimeError("Impossible to instantiate a vectorstore without a name.")
|
| 26 |
+
|
| 27 |
+
if name not in SUPPORTED_VECTORSTORES:
|
| 28 |
+
raise ValueError(f"Vectorstore {name} is not supported.")
|
| 29 |
+
|
| 30 |
+
return SUPPORTED_VECTORSTORES[name]["impl"](
|
| 31 |
+
host=SUPPORTED_VECTORSTORES[name]["default"]["host"],
|
| 32 |
+
port=SUPPORTED_VECTORSTORES[name]["default"]["port"],
|
| 33 |
+
)
|