Spaces:
Sleeping
Sleeping
Merge pull request #43 from almutareb/one_embedding_model
Browse files
example.env
CHANGED
|
@@ -11,4 +11,6 @@ SERPAPI_API_KEY=
|
|
| 11 |
VECTOR_DATABASE_LOCATION=
|
| 12 |
|
| 13 |
# Name for the Conversation Memory Collection
|
| 14 |
-
CONVERSATION_COLLECTION_NAME="ConversationMemory"
|
|
|
|
|
|
|
|
|
| 11 |
VECTOR_DATABASE_LOCATION=
|
| 12 |
|
| 13 |
# Name for the Conversation Memory Collection
|
| 14 |
+
CONVERSATION_COLLECTION_NAME="ConversationMemory"
|
| 15 |
+
|
| 16 |
+
EMBEDDING_MODEL="sentence-transformers/all-MiniLM-L6-v2"
|
innovation_pathfinder_ai/structured_tools/structured_tools.py
CHANGED
|
@@ -52,7 +52,7 @@ def memory_search(query:str) -> str:
|
|
| 52 |
#store using envar
|
| 53 |
|
| 54 |
embedding_function = SentenceTransformerEmbeddings(
|
| 55 |
-
model_name="
|
| 56 |
)
|
| 57 |
|
| 58 |
vector_db = Chroma(
|
|
@@ -78,7 +78,7 @@ def knowledgeBase_search(query:str) -> str:
|
|
| 78 |
#store using envar
|
| 79 |
|
| 80 |
embedding_function = SentenceTransformerEmbeddings(
|
| 81 |
-
model_name="
|
| 82 |
)
|
| 83 |
|
| 84 |
vector_db = Chroma(
|
|
@@ -152,7 +152,7 @@ def embed_arvix_paper(paper_id:str) -> None:
|
|
| 152 |
#store using envar
|
| 153 |
|
| 154 |
embedding_function = SentenceTransformerEmbeddings(
|
| 155 |
-
model_name="
|
| 156 |
)
|
| 157 |
|
| 158 |
full_path = os.path.join(pdf_directory, pdf_file_name)
|
|
|
|
| 52 |
#store using envar
|
| 53 |
|
| 54 |
embedding_function = SentenceTransformerEmbeddings(
|
| 55 |
+
model_name=os.getenv("EMBEDDING_MODEL"),
|
| 56 |
)
|
| 57 |
|
| 58 |
vector_db = Chroma(
|
|
|
|
| 78 |
#store using envar
|
| 79 |
|
| 80 |
embedding_function = SentenceTransformerEmbeddings(
|
| 81 |
+
model_name=os.getenv("EMBEDDING_MODEL"),
|
| 82 |
)
|
| 83 |
|
| 84 |
vector_db = Chroma(
|
|
|
|
| 152 |
#store using envar
|
| 153 |
|
| 154 |
embedding_function = SentenceTransformerEmbeddings(
|
| 155 |
+
model_name=os.getenv("EMBEDDING_MODEL"),
|
| 156 |
)
|
| 157 |
|
| 158 |
full_path = os.path.join(pdf_directory, pdf_file_name)
|
innovation_pathfinder_ai/vector_store/chroma_vector_store.py
CHANGED
|
@@ -8,7 +8,6 @@
|
|
| 8 |
# https://python.langchain.com/docs/modules/data_connection/retrievers/vectorstore
|
| 9 |
|
| 10 |
import chromadb
|
| 11 |
-
import chromadb.utils.embedding_functions as embedding_functions
|
| 12 |
|
| 13 |
from langchain.text_splitter import CharacterTextSplitter
|
| 14 |
from langchain_text_splitters import MarkdownHeaderTextSplitter
|
|
@@ -99,9 +98,9 @@ def add_markdown_to_collection(
|
|
| 99 |
name=collection_name,
|
| 100 |
)
|
| 101 |
|
| 102 |
-
|
| 103 |
-
|
| 104 |
-
|
| 105 |
|
| 106 |
documents_page_content:list = [i.page_content for i in splits]
|
| 107 |
|
|
@@ -111,7 +110,7 @@ def add_markdown_to_collection(
|
|
| 111 |
collection.add(
|
| 112 |
ids=[generate_uuid()], # give each document a uuid
|
| 113 |
documents=documents_page_content[i], # contents of document
|
| 114 |
-
embeddings=
|
| 115 |
metadatas=data.metadata, # type: ignore
|
| 116 |
)
|
| 117 |
|
|
@@ -181,13 +180,9 @@ def add_pdf_to_vector_store(
|
|
| 181 |
name=collection_name,
|
| 182 |
)
|
| 183 |
|
| 184 |
-
|
| 185 |
-
|
| 186 |
-
|
| 187 |
-
)
|
| 188 |
-
|
| 189 |
-
# create the open-source embedding function
|
| 190 |
-
# embedding_function = SentenceTransformerEmbeddings(model_name="all-MiniLM-L6-v2")
|
| 191 |
|
| 192 |
documents_page_content:list = [i.page_content for i in split_docs]
|
| 193 |
|
|
@@ -198,7 +193,7 @@ def add_pdf_to_vector_store(
|
|
| 198 |
collection.add(
|
| 199 |
ids=[generate_uuid()], # give each document a uuid
|
| 200 |
documents=documents_page_content[i], # contents of document
|
| 201 |
-
embeddings=
|
| 202 |
metadatas=data.metadata, # type: ignore
|
| 203 |
)
|
| 204 |
|
|
@@ -244,7 +239,7 @@ if __name__ == "__main__":
|
|
| 244 |
|
| 245 |
# create the open-source embedding function
|
| 246 |
embedding_function = SentenceTransformerEmbeddings(
|
| 247 |
-
model_name="
|
| 248 |
)
|
| 249 |
|
| 250 |
#method of integrating Chroma and Langchain
|
|
|
|
| 8 |
# https://python.langchain.com/docs/modules/data_connection/retrievers/vectorstore
|
| 9 |
|
| 10 |
import chromadb
|
|
|
|
| 11 |
|
| 12 |
from langchain.text_splitter import CharacterTextSplitter
|
| 13 |
from langchain_text_splitters import MarkdownHeaderTextSplitter
|
|
|
|
| 98 |
name=collection_name,
|
| 99 |
)
|
| 100 |
|
| 101 |
+
embedding_function = SentenceTransformerEmbeddings(
|
| 102 |
+
model_name=os.getenv("EMBEDDING_MODEL"),
|
| 103 |
+
)
|
| 104 |
|
| 105 |
documents_page_content:list = [i.page_content for i in splits]
|
| 106 |
|
|
|
|
| 110 |
collection.add(
|
| 111 |
ids=[generate_uuid()], # give each document a uuid
|
| 112 |
documents=documents_page_content[i], # contents of document
|
| 113 |
+
embeddings=embedding_function(documents_page_content[i]),
|
| 114 |
metadatas=data.metadata, # type: ignore
|
| 115 |
)
|
| 116 |
|
|
|
|
| 180 |
name=collection_name,
|
| 181 |
)
|
| 182 |
|
| 183 |
+
embedding_function = SentenceTransformerEmbeddings(
|
| 184 |
+
model_name=os.getenv("EMBEDDING_MODEL"),
|
| 185 |
+
)
|
|
|
|
|
|
|
|
|
|
|
|
|
| 186 |
|
| 187 |
documents_page_content:list = [i.page_content for i in split_docs]
|
| 188 |
|
|
|
|
| 193 |
collection.add(
|
| 194 |
ids=[generate_uuid()], # give each document a uuid
|
| 195 |
documents=documents_page_content[i], # contents of document
|
| 196 |
+
embeddings=embedding_function(documents_page_content[i]),
|
| 197 |
metadatas=data.metadata, # type: ignore
|
| 198 |
)
|
| 199 |
|
|
|
|
| 239 |
|
| 240 |
# create the open-source embedding function
|
| 241 |
embedding_function = SentenceTransformerEmbeddings(
|
| 242 |
+
model_name=os.getenv("EMBEDDING_MODEL"),
|
| 243 |
)
|
| 244 |
|
| 245 |
#method of integrating Chroma and Langchain
|