# Streamlit_RAG_WebSite/src/Web_RAG_App.py
import os

import streamlit as st
from dotenv import load_dotenv
from langchain_groq import ChatGroq
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.chains.combine_documents import create_stuff_documents_chain
from langchain_core.prompts import ChatPromptTemplate
from langchain.chains import create_retrieval_chain
from langchain_community.vectorstores import FAISS
from langchain_community.document_loaders import WebBaseLoader
from langchain_huggingface import HuggingFaceEmbeddings
from langchain_community.utilities import SerpAPIWrapper
from langchain_core.documents import Document
# Load environment variables for local development (ignored on Streamlit Cloud)
load_dotenv()

# Set up API keys using Streamlit's secrets management.
# For local development they are read from .streamlit/secrets.toml;
# on Streamlit Cloud they come from the app's secrets settings.
groq_api_key = st.secrets.get("GROQ_API_KEY")
serpapi_api_key = st.secrets.get("SERPAPI_API_KEY")
if serpapi_api_key:
    # SerpAPIWrapper reads the key from this environment variable. Guarding
    # the assignment avoids a TypeError when the secret is missing, since
    # os.environ values must be strings, never None.
    os.environ["SERPAPI_API_KEY"] = serpapi_api_key
st.set_page_config(page_title="Web RAG with Groq & SerpApi", layout="wide")
st.image("PragyanAI_Transperent_github.png")
st.title("Web RAG: Q&A with Website Content and Google Search")
# Initialize session state and the embeddings model
if "vector" not in st.session_state:
    st.session_state.vector = None
if "chat_history" not in st.session_state:
    st.session_state.chat_history = []
if "embeddings" not in st.session_state:
    st.session_state.embeddings = HuggingFaceEmbeddings(model_name="all-MiniLM-L6-v2")
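# Note: "all-MiniLM-L6-v2" resolves to the sentence-transformers checkpoint of
# the same name, which embeds text into 384-dimensional vectors. Caching it in
# session_state means the model is loaded once per session rather than on
# every Streamlit rerun.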
# Sidebar for user input
with st.sidebar:
    st.header("Input Source")
    website_url = st.text_input("Enter Website URL to scrape")
    if st.button("Process Website"):
        if not website_url:
            st.warning("Please enter a website URL.")
        else:
            with st.spinner("Processing website..."):
                try:
                    # Fetch and parse the page, then split it into overlapping
                    # chunks so each fits comfortably in the prompt context.
                    loader = WebBaseLoader(website_url)
                    web_docs = loader.load()
                    text_splitter = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=200)
                    final_documents = text_splitter.split_documents(web_docs)
                    # Index the chunks in an in-memory FAISS vector store.
                    st.session_state.vector = FAISS.from_documents(final_documents, st.session_state.embeddings)
                    st.success("Website processed successfully!")
                except Exception as e:
                    st.error(f"Failed to scrape or process website: {e}")
# Main chat interface
st.header("Chat with the Web")

# Initialize the language model, but only if both API keys are available.
if groq_api_key and os.environ.get("SERPAPI_API_KEY"):
    llm = ChatGroq(groq_api_key=groq_api_key, model_name="llama3-8b-8192")

    # Create the prompt template
    prompt_template = ChatPromptTemplate.from_template(
        """
        Answer the question using only the provided context.
        Give the most accurate and complete response you can.
        <context>
        {context}
        </context>
        Question: {input}
        """
    )
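    # create_stuff_documents_chain "stuffs" all retrieved documents into the
    # prompt's {context} placeholder; create_retrieval_chain puts a retriever
    # in front of it and returns a dict with "input", "context", and "answer"
    # keys (only "answer" is used below).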
    # Display previous chat messages
    for message in st.session_state.chat_history:
        with st.chat_message(message["role"]):
            st.markdown(message["content"])

    # Get user input
    if prompt_input := st.chat_input("Ask a question..."):
        if st.session_state.vector is None:
            st.warning("Please process a website URL first.")
        else:
            with st.chat_message("user"):
                st.markdown(prompt_input)
            st.session_state.chat_history.append({"role": "user", "content": prompt_input})

            # --- Part 1: Get answer from the website context ---
            with st.spinner("Analyzing website content..."):
                website_retriever = st.session_state.vector.as_retriever()
                website_chain = create_retrieval_chain(website_retriever, create_stuff_documents_chain(llm, prompt_template))
                response_website = website_chain.invoke({"input": prompt_input})
                website_answer = response_website["answer"]
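            # Note: as_retriever() returns the top 4 chunks by default; a wider
            # context could be requested with
            # as_retriever(search_kwargs={"k": 8}) above.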
            # --- Part 2: Get answer from Google Search ---
            with st.spinner("Searching Google and analyzing results..."):
                search = SerpAPIWrapper()
                # .results() returns the raw SerpApi JSON payload; organic
                # search hits live under the "organic_results" key.
                search_results = search.results(prompt_input)
                google_docs = []
                reference_links = []
                if "organic_results" in search_results:
                    for result in search_results["organic_results"]:
                        google_docs.append(Document(page_content=result.get("snippet", ""), metadata={"source": result.get("link", "")}))
                        if result.get("link"):
                            reference_links.append(f"- [{result.get('title', 'Source')}]({result.get('link')})")
                google_answer = "Could not find relevant information from Google search."
                if google_docs:
                    # Build a throwaway FAISS index over the result snippets
                    # and run the same retrieval chain against it.
                    google_vector_store = FAISS.from_documents(google_docs, st.session_state.embeddings)
                    google_retriever = google_vector_store.as_retriever()
                    google_chain = create_retrieval_chain(google_retriever, create_stuff_documents_chain(llm, prompt_template))
                    response_google = google_chain.invoke({"input": prompt_input})
                    google_answer = response_google["answer"]
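            # An alternative design (sketch only, not used here): LangChain's
            # EnsembleRetriever could merge the website and Google retrievers
            # into a single retrieval pass instead of running two chains.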
            # --- Part 3: Display combined results ---
            with st.chat_message("assistant"):
                st.markdown("### From Website Content")
                st.markdown(website_answer)
                st.markdown("---")
                st.markdown("### From Google Search")
                st.markdown(google_answer)
                if reference_links:
                    st.markdown("#### References:")
                    st.markdown("\n".join(reference_links))

            # Save the combined answer to chat history
            combined_answer = (
                f"**From Website Content:**\n{website_answer}\n\n---\n\n"
                f"**From Google Search:**\n{google_answer}"
            )
            if reference_links:
                combined_answer += "\n\n**References:**\n" + "\n".join(reference_links)
            st.session_state.chat_history.append({"role": "assistant", "content": combined_answer})
else:
    st.error("API keys not found. Please set them in your Streamlit secrets.")