Spaces:
Sleeping
Sleeping
Update app.py
Browse files
app.py
CHANGED
|
@@ -2,11 +2,13 @@ import gradio as gr
|
|
| 2 |
import bs4
|
| 3 |
from langchain.text_splitter import RecursiveCharacterTextSplitter
|
| 4 |
from langchain_community.document_loaders import WebBaseLoader
|
| 5 |
-
from langchain_community.vectorstores import
|
| 6 |
-
from langchain_community.
|
|
|
|
|
|
|
| 7 |
import ollama
|
| 8 |
|
| 9 |
-
# Function to load, split, and retrieve documents
|
| 10 |
def load_and_retrieve_docs(url):
|
| 11 |
loader = WebBaseLoader(
|
| 12 |
web_paths=(url,),
|
|
@@ -15,10 +17,27 @@ def load_and_retrieve_docs(url):
|
|
| 15 |
docs = loader.load()
|
| 16 |
text_splitter = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=200)
|
| 17 |
splits = text_splitter.split_documents(docs)
|
| 18 |
-
embeddings =
|
| 19 |
-
|
|
|
|
|
|
|
| 20 |
return vectorstore.as_retriever()
|
| 21 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 22 |
# Function to format documents
|
| 23 |
def format_docs(docs):
|
| 24 |
return "\n\n".join(doc.page_content for doc in docs)
|
|
|
|
| 2 |
import bs4
|
| 3 |
from langchain.text_splitter import RecursiveCharacterTextSplitter
|
| 4 |
from langchain_community.document_loaders import WebBaseLoader
|
| 5 |
+
from langchain_community.vectorstores import FAISS
|
| 6 |
+
#from langchain_community.vectorstores import Chroma
|
| 7 |
+
from langchain_community.embeddings import HuggingFaceBgeEmbeddings
|
| 8 |
+
#from langchain_community.embeddings import OllamaEmbeddings
|
| 9 |
import ollama
|
| 10 |
|
| 11 |
+
# Function to load a web page from a URL, split it into chunks, embed the chunks into a FAISS index, and return a retriever over that index
|
| 12 |
def load_and_retrieve_docs(url):
|
| 13 |
loader = WebBaseLoader(
|
| 14 |
web_paths=(url,),
|
|
|
|
| 17 |
docs = loader.load()
|
| 18 |
text_splitter = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=200)
|
| 19 |
splits = text_splitter.split_documents(docs)
|
| 20 |
+
embeddings = HuggingFaceBgeEmbeddings(model_name="BAAI/bge-small-en-v1.5",
|
| 21 |
+
model_kwargs={'device':'cpu'},
|
| 22 |
+
encode_kwargs={'normalize_embeddings':True})
|
| 23 |
+
vectorstore = FAISS.from_documents(documents=splits, embedding=embeddings)
|
| 24 |
return vectorstore.as_retriever()
|
| 25 |
|
| 26 |
+
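# Function to build a FAISS vector store from the first 20 PDF documents loaded from ./Data_Science and cache it in Streamlit session state (NOTE(review): `st` and `PyPDFDirectoryLoader` are not among the imports shown in this diff — confirm they are imported, otherwise calling this raises NameError)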
|
| 27 |
+
def vector_embedding():
|
| 28 |
+
if "vectors" not in st.session_state:
|
| 29 |
+
st.session_state.embeddings = HuggingFaceBgeEmbeddings(model_name="BAAI/bge-small-en-v1.5",
|
| 30 |
+
model_kwargs={'device':'cpu'},
|
| 31 |
+
encode_kwargs={'normalize_embeddings':True})
|
| 32 |
+
st.session_state.loader = PyPDFDirectoryLoader("./Data_Science") # Data Ingestion
|
| 33 |
+
st.session_state.docs = st.session_state.loader.load() # Document Loading
|
| 34 |
+
st.session_state.text_splitter = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=200) # Chunk Creation
|
| 35 |
+
st.session_state.final_documents = st.session_state.text_splitter.split_documents(st.session_state.docs[:20]) # Splitting
|
| 36 |
+
st.session_state.vectors = FAISS.from_documents(st.session_state.final_documents, st.session_state.embeddings) # Vector HuggingFace embeddings
|
| 37 |
+
st.write("Vector Store DB Is Ready")
|
| 38 |
+
else:
|
| 39 |
+
st.write("Vectors already initialized.")
|
| 40 |
+
|
| 41 |
# Function to format documents
|
| 42 |
def format_docs(docs):
|
| 43 |
return "\n\n".join(doc.page_content for doc in docs)
|