Spaces:
Sleeping
Sleeping
Asaad Almutareb commited on
Commit ·
9237552
1
Parent(s): fa99d8f
corrected streaming callback handler
Browse filesreplaced sentence-transformers with Embed4All from GPT4All
Updated requirements.txt, example.env and README to reflect this repo's settings
- README.md +11 -31
- app/api/v1/agents/hf_mixtral_agent.py +1 -1
- app/structured_tools/structured_tools.py +14 -16
- app/vector_store/chroma_vector_store.py +16 -12
- example.env +16 -3
- requirements.txt +3 -5
README.md
CHANGED
|
@@ -1,37 +1,17 @@
|
|
| 1 |
-
|
| 2 |
-
title: Innovation Pathfinder AI
|
| 3 |
-
emoji: 🚀
|
| 4 |
-
colorFrom: gray
|
| 5 |
-
colorTo: gray
|
| 6 |
-
sdk: gradio
|
| 7 |
-
sdk_version: 4.2.0
|
| 8 |
-
app_file: app.py
|
| 9 |
-
pinned: false
|
| 10 |
-
---
|
| 11 |
-
|
| 12 |
-
# InnovationPathfinderAI
|
| 13 |
GenAI Research Assistant for Innovation Labs
|
| 14 |
|
| 15 |
-
##
|
| 16 |
-
|
| 17 |
-
however it is difficult to manage all of this information in a central location. With our tool we
|
| 18 |
-
want to enable people with the capability to discover and manage knowledge bases.
|
| 19 |
-
|
| 20 |
-
## Vector Store
|
| 21 |
-
Documents are embedded and stored inside of a Chroma vector store
|
| 22 |
-
|
| 23 |
-
## Agents
|
| 24 |
-
|
| 25 |
-
with agents our application is able to discover and refine the information it collects based on
|
| 26 |
-
the needs and sentiment of the user.
|
| 27 |
-
|
| 28 |
-
## Agent Tools
|
| 29 |
-
The tools our agents have access to. More are being created
|
| 30 |
|
| 31 |
-
|
|
|
|
| 32 |
|
| 33 |
-
|
| 34 |
|
| 35 |
-
|
|
|
|
| 36 |
|
| 37 |
-
|
|
|
|
|
|
|
|
|
| 1 |
+
# FastAPI Backend for InnovationPathfinderAI
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 2 |
GenAI Research Assistant for Innovation Labs
|
| 3 |
|
| 4 |
+
## Getting Started
|
| 5 |
+
To get started
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 6 |
|
| 7 |
+
1. install requirements:
|
| 8 |
+
pip install -r requirements.txt
|
| 9 |
|
| 10 |
+
2. copy example.env to .env and add your API keys and variables
|
| 11 |
|
| 12 |
+
3. run uvicorn:
|
| 13 |
+
uvicorn app.main:app
|
| 14 |
|
| 15 |
+
## ToDo
|
| 16 |
+
we are testing replacing the sentence-transformers with GPT4All's Embed4All
|
| 17 |
+
Code still needs to be cleaned
|
app/api/v1/agents/hf_mixtral_agent.py
CHANGED
|
@@ -74,7 +74,7 @@ async def websocket_endpoint(websocket: WebSocket):
|
|
| 74 |
|
| 75 |
await websocket.send_json(resp.model_dump())
|
| 76 |
message_id: str = utils.generate_uuid()
|
| 77 |
-
custom_handler =
|
| 78 |
websocket, message_id=message_id
|
| 79 |
)
|
| 80 |
|
|
|
|
| 74 |
|
| 75 |
await websocket.send_json(resp.model_dump())
|
| 76 |
message_id: str = utils.generate_uuid()
|
| 77 |
+
custom_handler = CustomAsyncCallbackHandler(
|
| 78 |
websocket, message_id=message_id
|
| 79 |
)
|
| 80 |
|
app/structured_tools/structured_tools.py
CHANGED
|
@@ -8,6 +8,7 @@ from langchain_community.utilities import GoogleSearchAPIWrapper
|
|
| 8 |
from langchain_community.embeddings.sentence_transformer import (
|
| 9 |
SentenceTransformerEmbeddings,
|
| 10 |
)
|
|
|
|
| 11 |
from app.core.config import settings
|
| 12 |
from langchain_community.vectorstores import Chroma
|
| 13 |
import arxiv
|
|
@@ -51,10 +52,11 @@ def memory_search(query:str) -> str:
|
|
| 51 |
collection_name = settings.CONVERSATION_COLLECTION_NAME
|
| 52 |
#store using envar
|
| 53 |
|
| 54 |
-
embedding_function = SentenceTransformerEmbeddings(
|
| 55 |
-
|
| 56 |
-
|
| 57 |
-
|
|
|
|
| 58 |
|
| 59 |
vector_db = Chroma(
|
| 60 |
client=client, # client for Chroma
|
|
@@ -78,15 +80,16 @@ def knowledgeBase_search(query:str) -> str:
|
|
| 78 |
collection_name="ArxivPapers"
|
| 79 |
#store using envar
|
| 80 |
|
| 81 |
-
embedding_function = SentenceTransformerEmbeddings(
|
| 82 |
-
|
| 83 |
-
|
| 84 |
-
|
|
|
|
| 85 |
|
| 86 |
vector_db = Chroma(
|
| 87 |
-
|
| 88 |
-
|
| 89 |
-
|
| 90 |
)
|
| 91 |
|
| 92 |
retriever = vector_db.as_retriever()
|
|
@@ -153,11 +156,6 @@ def embed_arvix_paper(paper_id:str) -> None:
|
|
| 153 |
collection_name="ArxivPapers"
|
| 154 |
#store using envar
|
| 155 |
|
| 156 |
-
embedding_function = SentenceTransformerEmbeddings(
|
| 157 |
-
#model_name=os.getenv("EMBEDDING_MODEL"),
|
| 158 |
-
model_name=settings.EMBEDDING_MODEL
|
| 159 |
-
)
|
| 160 |
-
|
| 161 |
full_path = os.path.join(pdf_directory, pdf_file_name)
|
| 162 |
|
| 163 |
add_pdf_to_vector_store(
|
|
|
|
| 8 |
from langchain_community.embeddings.sentence_transformer import (
|
| 9 |
SentenceTransformerEmbeddings,
|
| 10 |
)
|
| 11 |
+
from langchain_community.embeddings import GPT4AllEmbeddings
|
| 12 |
from app.core.config import settings
|
| 13 |
from langchain_community.vectorstores import Chroma
|
| 14 |
import arxiv
|
|
|
|
| 52 |
collection_name = settings.CONVERSATION_COLLECTION_NAME
|
| 53 |
#store using envar
|
| 54 |
|
| 55 |
+
# embedding_function = SentenceTransformerEmbeddings(
|
| 56 |
+
# model_name=settings.EMBEDDING_MODEL
|
| 57 |
+
# #model_name=os.getenv("EMBEDDING_MODEL"),
|
| 58 |
+
# )
|
| 59 |
+
embedding_function = GPT4AllEmbeddings()
|
| 60 |
|
| 61 |
vector_db = Chroma(
|
| 62 |
client=client, # client for Chroma
|
|
|
|
| 80 |
collection_name="ArxivPapers"
|
| 81 |
#store using envar
|
| 82 |
|
| 83 |
+
# embedding_function = SentenceTransformerEmbeddings(
|
| 84 |
+
# #model_name=os.getenv("EMBEDDING_MODEL"),
|
| 85 |
+
# model_name=settings.EMBEDDING_MODEL
|
| 86 |
+
# )
|
| 87 |
+
embedding_function = GPT4AllEmbeddings()
|
| 88 |
|
| 89 |
vector_db = Chroma(
|
| 90 |
+
client=client, # client for Chroma
|
| 91 |
+
collection_name=collection_name,
|
| 92 |
+
embedding_function=embedding_function,
|
| 93 |
)
|
| 94 |
|
| 95 |
retriever = vector_db.as_retriever()
|
|
|
|
| 156 |
collection_name="ArxivPapers"
|
| 157 |
#store using envar
|
| 158 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 159 |
full_path = os.path.join(pdf_directory, pdf_file_name)
|
| 160 |
|
| 161 |
add_pdf_to_vector_store(
|
app/vector_store/chroma_vector_store.py
CHANGED
|
@@ -20,6 +20,7 @@ from langchain_community.vectorstores import Chroma
|
|
| 20 |
from langchain_community.embeddings.sentence_transformer import (
|
| 21 |
SentenceTransformerEmbeddings,
|
| 22 |
)
|
|
|
|
| 23 |
from app.utils.utils import (
|
| 24 |
generate_uuid
|
| 25 |
)
|
|
@@ -97,10 +98,11 @@ def add_markdown_to_collection(
|
|
| 97 |
name=collection_name,
|
| 98 |
)
|
| 99 |
|
| 100 |
-
embedding_function = SentenceTransformerEmbeddings(
|
| 101 |
-
|
| 102 |
-
|
| 103 |
-
|
|
|
|
| 104 |
|
| 105 |
documents_page_content:list = [i.page_content for i in splits]
|
| 106 |
|
|
@@ -180,10 +182,11 @@ def add_pdf_to_vector_store(
|
|
| 180 |
name=collection_name,
|
| 181 |
)
|
| 182 |
|
| 183 |
-
embedding_function = SentenceTransformerEmbeddings(
|
| 184 |
-
|
| 185 |
-
|
| 186 |
-
|
|
|
|
| 187 |
|
| 188 |
documents_page_content:list = [i.page_content for i in split_docs]
|
| 189 |
|
|
@@ -239,10 +242,11 @@ if __name__ == "__main__":
|
|
| 239 |
collection_name="ArxivPapers"
|
| 240 |
|
| 241 |
# create the open-source embedding function
|
| 242 |
-
embedding_function = SentenceTransformerEmbeddings(
|
| 243 |
-
|
| 244 |
-
|
| 245 |
-
|
|
|
|
| 246 |
|
| 247 |
#method of integrating Chroma and Langchain
|
| 248 |
vector_db = Chroma(
|
|
|
|
| 20 |
from langchain_community.embeddings.sentence_transformer import (
|
| 21 |
SentenceTransformerEmbeddings,
|
| 22 |
)
|
| 23 |
+
from langchain_community.embeddings import GPT4AllEmbeddings
|
| 24 |
from app.utils.utils import (
|
| 25 |
generate_uuid
|
| 26 |
)
|
|
|
|
| 98 |
name=collection_name,
|
| 99 |
)
|
| 100 |
|
| 101 |
+
# embedding_function = SentenceTransformerEmbeddings(
|
| 102 |
+
# #model_name=os.getenv("EMBEDDING_MODEL"),
|
| 103 |
+
# model_name=settings.EMBEDDING_MODEL
|
| 104 |
+
# )
|
| 105 |
+
embedding_function = GPT4AllEmbeddings()
|
| 106 |
|
| 107 |
documents_page_content:list = [i.page_content for i in splits]
|
| 108 |
|
|
|
|
| 182 |
name=collection_name,
|
| 183 |
)
|
| 184 |
|
| 185 |
+
# embedding_function = SentenceTransformerEmbeddings(
|
| 186 |
+
# #model_name=os.getenv("EMBEDDING_MODEL"),
|
| 187 |
+
# model_name=settings.EMBEDDING_MODEL
|
| 188 |
+
# )
|
| 189 |
+
embedding_function = GPT4AllEmbeddings()
|
| 190 |
|
| 191 |
documents_page_content:list = [i.page_content for i in split_docs]
|
| 192 |
|
|
|
|
| 242 |
collection_name="ArxivPapers"
|
| 243 |
|
| 244 |
# create the open-source embedding function
|
| 245 |
+
# embedding_function = SentenceTransformerEmbeddings(
|
| 246 |
+
# #model_name=os.getenv("EMBEDDING_MODEL"),
|
| 247 |
+
# model_name=settings.EMBEDDING_MODEL
|
| 248 |
+
# )
|
| 249 |
+
embedding_function = GPT4AllEmbeddings()
|
| 250 |
|
| 251 |
#method of integrating Chroma and Langchain
|
| 252 |
vector_db = Chroma(
|
example.env
CHANGED
|
@@ -5,14 +5,27 @@ HUGGINGFACEHUB_API_TOKEN=
|
|
| 5 |
OLLMA_BASE_URL=
|
| 6 |
|
| 7 |
# environment variables needed to use tools
|
| 8 |
-
SERPAPI_API_KEY=
|
|
|
|
|
|
|
|
|
|
|
|
|
| 9 |
|
| 10 |
# for chromadb
|
| 11 |
-
VECTOR_DATABASE_LOCATION="
|
| 12 |
|
| 13 |
# Name for the Conversation Memory Collection
|
| 14 |
CONVERSATION_COLLECTION_NAME="ConversationMemory"
|
| 15 |
|
| 16 |
EMBEDDING_MODEL="sentence-transformers/all-MiniLM-L6-v2"
|
| 17 |
|
| 18 |
-
SOURCES_CACHE="
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 5 |
OLLMA_BASE_URL=
|
| 6 |
|
| 7 |
# environment variables needed to use tools
|
| 8 |
+
#SERPAPI_API_KEY=
|
| 9 |
+
|
| 10 |
+
# we are using Google Custom Search Engine now
|
| 11 |
+
GOOGLE_CSE_ID=
|
| 12 |
+
GOOGLE_API_KEY=
|
| 13 |
|
| 14 |
# for chromadb
|
| 15 |
+
VECTOR_DATABASE_LOCATION="app/knowledge_base/"
|
| 16 |
|
| 17 |
# Name for the Conversation Memory Collection
|
| 18 |
CONVERSATION_COLLECTION_NAME="ConversationMemory"
|
| 19 |
|
| 20 |
EMBEDDING_MODEL="sentence-transformers/all-MiniLM-L6-v2"
|
| 21 |
|
| 22 |
+
SOURCES_CACHE="app/database/sources_cache.sqlite3"
|
| 23 |
+
|
| 24 |
+
# local cache
|
| 25 |
+
LOCAL_CACHE=".cache.db"
|
| 26 |
+
|
| 27 |
+
# project name
|
| 28 |
+
PROJECT_NAME=innovation_pathfinder_ai
|
| 29 |
+
|
| 30 |
+
# restricting access to the backend resources, for development it's set to * ('all')
|
| 31 |
+
BACKEND_CORS_ORIGINS=["*"]
|
requirements.txt
CHANGED
|
@@ -2,10 +2,8 @@ langchain-community
|
|
| 2 |
langchain
|
| 3 |
google-search-results
|
| 4 |
langchainhub
|
| 5 |
-
text_generation
|
| 6 |
arxiv
|
| 7 |
wikipedia
|
| 8 |
-
gradio==3.48.0
|
| 9 |
chromadb
|
| 10 |
google_api_python_client
|
| 11 |
pypdf2
|
|
@@ -13,6 +11,6 @@ sqlmodel
|
|
| 13 |
rich
|
| 14 |
fastapi
|
| 15 |
uvicorn
|
| 16 |
-
|
| 17 |
-
|
| 18 |
-
|
|
|
|
| 2 |
langchain
|
| 3 |
google-search-results
|
| 4 |
langchainhub
|
|
|
|
| 5 |
arxiv
|
| 6 |
wikipedia
|
|
|
|
| 7 |
chromadb
|
| 8 |
google_api_python_client
|
| 9 |
pypdf2
|
|
|
|
| 11 |
rich
|
| 12 |
fastapi
|
| 13 |
uvicorn
|
| 14 |
+
adaptive-cards-py
|
| 15 |
+
pydantic_settings
|
| 16 |
+
gpt4all
|