Spaces:
Sleeping
Sleeping
Update functions.py
Browse files- functions.py +25 -33
functions.py
CHANGED
|
@@ -69,25 +69,22 @@ async def handle_userinput(user_question, custom_graph):
|
|
| 69 |
|
| 70 |
|
| 71 |
|
| 72 |
-
def create_retriever_from_chroma(vectorstore_path="./docs/chroma/", search_type='mmr', k=7, chunk_size=300, chunk_overlap=30,lambda_mult=0.7):
|
| 73 |
-
|
| 74 |
model_name = "Alibaba-NLP/gte-large-en-v1.5"
|
| 75 |
-
model_kwargs = {'device': 'cpu',
|
| 76 |
-
"trust_remote_code" : 'False'}
|
| 77 |
encode_kwargs = {'normalize_embeddings': True}
|
|
|
|
| 78 |
embeddings = HuggingFaceEmbeddings(
|
| 79 |
model_name=model_name,
|
| 80 |
model_kwargs=model_kwargs,
|
| 81 |
encode_kwargs=encode_kwargs
|
| 82 |
)
|
| 83 |
|
| 84 |
-
|
| 85 |
-
|
| 86 |
if os.path.exists(vectorstore_path) and os.listdir(vectorstore_path):
|
| 87 |
-
vectorstore = Chroma(persist_directory=vectorstore_path,embedding_function=embeddings)
|
| 88 |
-
|
| 89 |
else:
|
| 90 |
-
st.write("Vector store doesn't exist and will be created now")
|
|
|
|
| 91 |
urls = [
|
| 92 |
|
| 93 |
"https://github.com/zedr/clean-code-python",
|
|
@@ -190,38 +187,33 @@ def create_retriever_from_chroma(vectorstore_path="./docs/chroma/", search_type=
|
|
| 190 |
"https://datasciencedojo.com/blog/ensemble-methods-in-machine-learning/",
|
| 191 |
"https://datasciencedojo.com/blog/langgraph-tutorial/",
|
| 192 |
"https://datasciencedojo.com/blog/data-driven-marketing-in-2024/",
|
| 193 |
-
"https://datasciencedojo.com/blog/on-device-ai/"
|
| 194 |
-
|
| 195 |
|
| 196 |
-
|
| 197 |
-
|
| 198 |
-
|
| 199 |
-
|
| 200 |
-
|
| 201 |
-
|
|
|
|
|
|
|
| 202 |
|
|
|
|
| 203 |
|
| 204 |
-
|
| 205 |
-
|
| 206 |
-
|
| 207 |
-
|
| 208 |
-
|
| 209 |
-
|
| 210 |
-
is_separator_regex = True
|
| 211 |
-
)
|
| 212 |
-
split_docs = text_splitter.split_documents(docs)
|
| 213 |
|
| 214 |
-
|
| 215 |
-
vectorstore = Chroma.from_documents(
|
| 216 |
documents=split_docs, embedding=embeddings, persist_directory=vectorstore_path
|
| 217 |
)
|
| 218 |
-
|
| 219 |
-
|
| 220 |
-
retriever=vectorstore.as_retriever(search_type = search_type, search_kwargs={"k": k})
|
| 221 |
|
|
|
|
| 222 |
|
| 223 |
-
|
| 224 |
-
|
| 225 |
return retriever
|
| 226 |
|
| 227 |
|
|
|
|
| 69 |
|
| 70 |
|
| 71 |
|
| 72 |
+
def create_retriever_from_chroma(vectorstore_path="./docs/chroma/", search_type='mmr', k=7, chunk_size=300, chunk_overlap=30, lambda_mult=0.7):
|
|
|
|
| 73 |
model_name = "Alibaba-NLP/gte-large-en-v1.5"
|
| 74 |
+
model_kwargs = {'device': 'cpu', "trust_remote_code": 'False'}
|
|
|
|
| 75 |
encode_kwargs = {'normalize_embeddings': True}
|
| 76 |
+
|
| 77 |
embeddings = HuggingFaceEmbeddings(
|
| 78 |
model_name=model_name,
|
| 79 |
model_kwargs=model_kwargs,
|
| 80 |
encode_kwargs=encode_kwargs
|
| 81 |
)
|
| 82 |
|
|
|
|
|
|
|
| 83 |
if os.path.exists(vectorstore_path) and os.listdir(vectorstore_path):
|
| 84 |
+
vectorstore = Chroma(persist_directory=vectorstore_path, embedding_function=embeddings)
|
|
|
|
| 85 |
else:
|
| 86 |
+
st.write("Vector store doesn't exist and will be created now")
|
| 87 |
+
|
| 88 |
urls = [
|
| 89 |
|
| 90 |
"https://github.com/zedr/clean-code-python",
|
|
|
|
| 187 |
"https://datasciencedojo.com/blog/ensemble-methods-in-machine-learning/",
|
| 188 |
"https://datasciencedojo.com/blog/langgraph-tutorial/",
|
| 189 |
"https://datasciencedojo.com/blog/data-driven-marketing-in-2024/",
|
| 190 |
+
"https://datasciencedojo.com/blog/on-device-ai/",
|
| 191 |
+
|
| 192 |
|
| 193 |
+
]
|
| 194 |
+
|
| 195 |
+
def extract_sentences_from_web(links, chunk_size=500, chunk_overlap=30):
|
| 196 |
+
data = []
|
| 197 |
+
for link in links:
|
| 198 |
+
loader = NewsURLLoader(urls=[link])
|
| 199 |
+
data += loader.load()
|
| 200 |
+
return data
|
| 201 |
|
| 202 |
+
docs = extract_sentences_from_web(links=urls)
|
| 203 |
|
| 204 |
+
text_splitter = RecursiveCharacterTextSplitter.from_tiktoken_encoder(
|
| 205 |
+
chunk_size=chunk_size, chunk_overlap=chunk_overlap,
|
| 206 |
+
separators=["\n\n \n\n", "\n\n\n", "\n\n", r"In \[[0-9]+\]", r"\n+", r"\s+"],
|
| 207 |
+
is_separator_regex=True
|
| 208 |
+
)
|
| 209 |
+
split_docs = text_splitter.split_documents(docs)
|
|
|
|
|
|
|
|
|
|
| 210 |
|
| 211 |
+
vectorstore = Chroma.from_documents(
|
|
|
|
| 212 |
documents=split_docs, embedding=embeddings, persist_directory=vectorstore_path
|
| 213 |
)
|
|
|
|
|
|
|
|
|
|
| 214 |
|
| 215 |
+
retriever = vectorstore.as_retriever(search_type=search_type, search_kwargs={"k": k})
|
| 216 |
|
|
|
|
|
|
|
| 217 |
return retriever
|
| 218 |
|
| 219 |
|