amalsp committed on
Commit
bf8919e
·
verified ·
1 Parent(s): add1033

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +24 -5
app.py CHANGED
@@ -2,11 +2,13 @@ import gradio as gr
2
  import bs4
3
  from langchain.text_splitter import RecursiveCharacterTextSplitter
4
  from langchain_community.document_loaders import WebBaseLoader
5
- from langchain_community.vectorstores import Chroma
6
- from langchain_community.embeddings import OllamaEmbeddings
 
 
7
  import ollama
8
 
9
- # Function to load, split, and retrieve documents
10
  def load_and_retrieve_docs(url):
11
  loader = WebBaseLoader(
12
  web_paths=(url,),
@@ -15,10 +17,27 @@ def load_and_retrieve_docs(url):
15
  docs = loader.load()
16
  text_splitter = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=200)
17
  splits = text_splitter.split_documents(docs)
18
- embeddings = OllamaEmbeddings(model="nomic-embed-text")
19
- vectorstore = Chroma.from_documents(documents=splits, embedding=embeddings)
 
 
20
  return vectorstore.as_retriever()
21
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
22
  # Function to format documents
23
  def format_docs(docs):
24
  return "\n\n".join(doc.page_content for doc in docs)
 
2
  import bs4
3
  from langchain.text_splitter import RecursiveCharacterTextSplitter
4
  from langchain_community.document_loaders import WebBaseLoader
5
+ from langchain_community.vectorstores import FAISS
6
+ #from langchain_community.vectorstores import Chroma
7
+ from langchain_community.embeddings import HuggingFaceBgeEmbeddings
8
+ #from langchain_community.embeddings import OllamaEmbeddings
9
  import ollama
10
 
11
+ # Function to load, split, and retrieve documents from a URL
12
  def load_and_retrieve_docs(url):
13
  loader = WebBaseLoader(
14
  web_paths=(url,),
 
17
  docs = loader.load()
18
  text_splitter = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=200)
19
  splits = text_splitter.split_documents(docs)
20
+ embeddings = HuggingFaceBgeEmbeddings(model_name="BAAI/bge-small-en-v1.5",
21
+ model_kwargs={'device':'cpu'},
22
+ encode_kwargs={'normalize_embeddings':True})
23
+ vectorstore = FAISS.from_documents(documents=splits, embedding=embeddings)
24
  return vectorstore.as_retriever()
25
 
26
+ # Function to initialize vector embedding with FAISS vector store
27
+ def vector_embedding():
28
+ if "vectors" not in st.session_state:
29
+ st.session_state.embeddings = HuggingFaceBgeEmbeddings(model_name="BAAI/bge-small-en-v1.5",
30
+ model_kwargs={'device':'cpu'},
31
+ encode_kwargs={'normalize_embeddings':True})
32
+ st.session_state.loader = PyPDFDirectoryLoader("./Data_Science") # Data Ingestion
33
+ st.session_state.docs = st.session_state.loader.load() # Document Loading
34
+ st.session_state.text_splitter = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=200) # Chunk Creation
35
+ st.session_state.final_documents = st.session_state.text_splitter.split_documents(st.session_state.docs[:20]) # Splitting
36
+ st.session_state.vectors = FAISS.from_documents(st.session_state.final_documents, st.session_state.embeddings) # Vector HuggingFace embeddings
37
+ st.write("Vector Store DB Is Ready")
38
+ else:
39
+ st.write("Vectors already initialized.")
40
+
41
  # Function to format documents
42
  def format_docs(docs):
43
  return "\n\n".join(doc.page_content for doc in docs)