Spaces:
Runtime error
Runtime error
Commit
·
bc388c1
1
Parent(s):
adc6014
Update utils.py
Browse files
utils.py
CHANGED
|
@@ -6,8 +6,8 @@ import asyncio
|
|
| 6 |
from langchain.document_loaders.sitemap import SitemapLoader
|
| 7 |
|
| 8 |
|
| 9 |
-
#
|
| 10 |
-
|
| 11 |
def get_website_data(sitemap_url):
|
| 12 |
|
| 13 |
loop = asyncio.new_event_loop()
|
|
@@ -20,7 +20,7 @@ def get_website_data(sitemap_url):
|
|
| 20 |
|
| 21 |
return docs
|
| 22 |
|
| 23 |
-
#
|
| 24 |
def split_data(docs):
|
| 25 |
|
| 26 |
text_splitter = RecursiveCharacterTextSplitter(
|
|
@@ -32,13 +32,13 @@ def split_data(docs):
|
|
| 32 |
docs_chunks = text_splitter.split_documents(docs)
|
| 33 |
return docs_chunks
|
| 34 |
|
| 35 |
-
#Function to create embeddings instance
|
| 36 |
def create_embeddings():
|
| 37 |
|
| 38 |
embeddings = SentenceTransformerEmbeddings(model_name="all-MiniLM-L6-v2")
|
| 39 |
return embeddings
|
| 40 |
|
| 41 |
-
#
|
| 42 |
def push_to_pinecone(pinecone_apikey,pinecone_environment,pinecone_index_name,embeddings,docs):
|
| 43 |
|
| 44 |
pinecone.init(
|
|
@@ -50,7 +50,7 @@ def push_to_pinecone(pinecone_apikey,pinecone_environment,pinecone_index_name,em
|
|
| 50 |
index = Pinecone.from_documents(docs, embeddings, index_name=index_name)
|
| 51 |
return index
|
| 52 |
|
| 53 |
-
#
|
| 54 |
def pull_from_pinecone(pinecone_apikey,pinecone_environment,pinecone_index_name,embeddings):
|
| 55 |
|
| 56 |
pinecone.init(
|
|
@@ -63,7 +63,7 @@ def pull_from_pinecone(pinecone_apikey,pinecone_environment,pinecone_index_name,
|
|
| 63 |
index = Pinecone.from_existing_index(index_name, embeddings)
|
| 64 |
return index
|
| 65 |
|
| 66 |
-
#
|
| 67 |
def get_similar_docs(index,query,k=2):
|
| 68 |
|
| 69 |
similar_docs = index.similarity_search(query, k=k)
|
|
|
|
| 6 |
from langchain.document_loaders.sitemap import SitemapLoader
|
| 7 |
|
| 8 |
|
| 9 |
+
#Step 1: Loading data from website
|
| 10 |
+
|
| 11 |
def get_website_data(sitemap_url):
|
| 12 |
|
| 13 |
loop = asyncio.new_event_loop()
|
|
|
|
| 20 |
|
| 21 |
return docs
|
| 22 |
|
| 23 |
+
#Step 2: Split data into smaller chunks
|
| 24 |
def split_data(docs):
|
| 25 |
|
| 26 |
text_splitter = RecursiveCharacterTextSplitter(
|
|
|
|
| 32 |
docs_chunks = text_splitter.split_documents(docs)
|
| 33 |
return docs_chunks
|
| 34 |
|
| 35 |
+
#Step 3: Embedding - function to create the embeddings instance
|
| 36 |
def create_embeddings():
|
| 37 |
|
| 38 |
embeddings = SentenceTransformerEmbeddings(model_name="all-MiniLM-L6-v2")
|
| 39 |
return embeddings
|
| 40 |
|
| 41 |
+
#Step 3: Push data to Pinecone
|
| 42 |
def push_to_pinecone(pinecone_apikey,pinecone_environment,pinecone_index_name,embeddings,docs):
|
| 43 |
|
| 44 |
pinecone.init(
|
|
|
|
| 50 |
index = Pinecone.from_documents(docs, embeddings, index_name=index_name)
|
| 51 |
return index
|
| 52 |
|
| 53 |
+
#Step 4 & 5: Pull index data from Pinecone
|
| 54 |
def pull_from_pinecone(pinecone_apikey,pinecone_environment,pinecone_index_name,embeddings):
|
| 55 |
|
| 56 |
pinecone.init(
|
|
|
|
| 63 |
index = Pinecone.from_existing_index(index_name, embeddings)
|
| 64 |
return index
|
| 65 |
|
| 66 |
+
#Step 4 & 5: Fetch the top relevant documents from our vector store - Pinecone Index
|
| 67 |
def get_similar_docs(index,query,k=2):
|
| 68 |
|
| 69 |
similar_docs = index.similarity_search(query, k=k)
|