import gradio as gr
import bs4
import streamlit as st  # used only by the optional vector_embedding() helper below
import ollama
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_community.document_loaders import WebBaseLoader
from langchain_community.document_loaders import PyPDFDirectoryLoader  # used only by vector_embedding()
from langchain_community.vectorstores import FAISS
# from langchain_community.vectorstores import Chroma
from langchain_community.embeddings import HuggingFaceBgeEmbeddings
# from langchain_community.embeddings import OllamaEmbeddings
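# Assumed Python dependencies for this script (a sketch, not a pinned requirements.txt):
# gradio, beautifulsoup4, langchain, langchain-community, faiss-cpu,
# sentence-transformers, ollama, streamlit, pypdf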
# Function to load, split, and retrieve documents from a URL
def load_and_retrieve_docs(url):
    loader = WebBaseLoader(
        web_paths=(url,),
        bs_kwargs=dict()  # no BeautifulSoup filtering
    )
    docs = loader.load()
    text_splitter = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=200)
    splits = text_splitter.split_documents(docs)
    embeddings = HuggingFaceBgeEmbeddings(
        model_name="BAAI/bge-small-en-v1.5",
        model_kwargs={'device': 'cpu'},
        encode_kwargs={'normalize_embeddings': True}
    )
    vectorstore = FAISS.from_documents(documents=splits, embedding=embeddings)
    return vectorstore.as_retriever()
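# A minimal sketch of the Chroma alternative hinted at by the commented-out import
# above (assumes the chromadb package is installed); it mirrors the FAISS path:
#   from langchain_community.vectorstores import Chroma
#   vectorstore = Chroma.from_documents(documents=splits, embedding=embeddings)
#   return vectorstore.as_retriever()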
# Optional helper (Streamlit-based, not wired into the Gradio UI below):
# builds a FAISS vector store from local PDFs and caches it in st.session_state
def vector_embedding():
    if "vectors" not in st.session_state:
        st.session_state.embeddings = HuggingFaceBgeEmbeddings(
            model_name="BAAI/bge-small-en-v1.5",
            model_kwargs={'device': 'cpu'},
            encode_kwargs={'normalize_embeddings': True}
        )
        st.session_state.loader = PyPDFDirectoryLoader("./Data_Science")  # data ingestion
        st.session_state.docs = st.session_state.loader.load()  # document loading
        st.session_state.text_splitter = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=200)  # chunk creation
        st.session_state.final_documents = st.session_state.text_splitter.split_documents(st.session_state.docs[:20])  # splitting
        st.session_state.vectors = FAISS.from_documents(st.session_state.final_documents, st.session_state.embeddings)  # FAISS store from HuggingFace embeddings
        st.write("Vector Store DB Is Ready")
    else:
        st.write("Vectors already initialized.")
# Function to format retrieved documents into a single context string
def format_docs(docs):
    return "\n\n".join(doc.page_content for doc in docs)
# Function that defines the RAG chain
def rag_chain(url, question):
    retriever = load_and_retrieve_docs(url)
    retrieved_docs = retriever.invoke(question)
    formatted_context = format_docs(retrieved_docs)
    formatted_prompt = f"Question: {question}\n\nContext: {formatted_context}"
    response = ollama.chat(model='llama3', messages=[{'role': 'user', 'content': formatted_prompt}])
    return response['message']['content']
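# Example of calling the chain directly (hypothetical URL; assumes a local Ollama
# server with the llama3 model pulled, e.g. `ollama pull llama3`):
#   answer = rag_chain("https://example.com/article", "What is this page about?")
#   print(answer)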
# Gradio interface
iface = gr.Interface(
    fn=rag_chain,
    inputs=["text", "text"],
    outputs="text",
    title="Rocky Bot",
    description="Enter a URL and a query to get answers from the RAG chain."
)

# Launch the app
iface.launch()
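# To expose a temporary public link instead of serving only on localhost,
# one option is: iface.launch(share=True)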