File size: 6,578 Bytes
9092984
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
import streamlit as st
from dotenv import load_dotenv
import os
import requests
from bs4 import BeautifulSoup
from langchain_groq import ChatGroq
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.chains.combine_documents import create_stuff_documents_chain
from langchain_core.prompts import ChatPromptTemplate
from langchain.chains import create_retrieval_chain
from langchain_community.vectorstores import FAISS
from langchain_community.document_loaders import WebBaseLoader
from langchain_huggingface import HuggingFaceEmbeddings
from langchain_community.utilities import SerpAPIWrapper
from langchain_core.documents import Document
import time

# Load environment variables for local development (will be ignored by Streamlit Cloud)
load_dotenv()

# Set up API keys using Streamlit's secrets management.
# For local development, these are read from .streamlit/secrets.toml;
# on Streamlit Cloud, they come from the repository's secrets.
groq_api_key = st.secrets.get("GROQ_API_KEY")

# BUGFIX: only export the SerpApi key when it is actually present.
# st.secrets.get(...) returns None for a missing secret, and assigning None
# into os.environ raises TypeError — previously crashing the app on startup
# instead of reaching the "API keys not found" error shown further below.
_serpapi_key = st.secrets.get("SERPAPI_API_KEY")
if _serpapi_key:
    os.environ["SERPAPI_API_KEY"] = _serpapi_key


st.set_page_config(page_title="Web RAG with Groq & SerpApi", layout="wide")
st.image("PragyanAI_Transperent_github.png")
st.title("Web RAG: Q&A with Website Content and Google Search")

# Initialize session state:
#   vector       — FAISS index built from the scraped website (None until processed)
#   chat_history — list of {"role", "content"} dicts rendered on each rerun
#   embeddings   — HuggingFace sentence-embedding model, created once and reused
if "vector" not in st.session_state:
    st.session_state.vector = None
if "chat_history" not in st.session_state:
    st.session_state.chat_history = []
if "embeddings" not in st.session_state:
    st.session_state.embeddings = HuggingFaceEmbeddings(model_name="all-MiniLM-L6-v2")


# Sidebar: collect a URL from the user and build the FAISS index from it.
with st.sidebar:
    st.header("Input Source")
    website_url = st.text_input("Enter Website URL to scrape")

    if st.button("Process Website"):
        if website_url:
            with st.spinner("Processing website..."):
                try:
                    # Fetch the page, chunk it, and embed the chunks into a
                    # FAISS index kept in session state for the chat below.
                    raw_docs = WebBaseLoader(website_url).load()
                    splitter = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=200)
                    chunks = splitter.split_documents(raw_docs)
                    st.session_state.vector = FAISS.from_documents(
                        chunks, st.session_state.embeddings
                    )
                    st.success("Website processed successfully!")
                except Exception as e:
                    # Surface scraping/indexing failures to the user instead of crashing.
                    st.error(f"Failed to scrape or process website: {e}")
        else:
            st.warning("Please enter a website URL.")


# Main chat interface: answers each question twice — once from the scraped
# website's FAISS index, once from fresh Google (SerpApi) results — and shows
# both answers side by side with reference links.
st.header("Chat with the Web")

# Initialize the language model
# Check if the API keys are available before initializing the model
if groq_api_key and os.environ.get("SERPAPI_API_KEY"):
    llm = ChatGroq(groq_api_key=groq_api_key, model_name="Llama3-8b-8192")

    # Create the prompt template.
    # NOTE(review): the closing tag is "<context>" rather than "</context>" —
    # presumably a typo in the original prompt; left as-is here since prompt
    # text is runtime behavior. Confirm whether the model output is affected.
    prompt_template = ChatPromptTemplate.from_template(
        """
        Answer the questions based on the provided context only.
        Please provide the most accurate and comprehensive response based on the question.
        <context>
        {context}
        <context>
        Questions:{input}
        """
    )

    # Display previous chat messages (Streamlit reruns the whole script on
    # every interaction, so history must be re-rendered each time).
    for message in st.session_state.chat_history:
        with st.chat_message(message["role"]):
            st.markdown(message["content"])

    # Get user input (walrus: only truthy input enters the branch)
    if prompt_input := st.chat_input("Ask a question..."):
        if st.session_state.vector is None:
            st.warning("Please process a website URL first.")
        else:
            # Echo the user's message and persist it to history.
            with st.chat_message("user"):
                st.markdown(prompt_input)
            st.session_state.chat_history.append({"role": "user", "content": prompt_input})

            # --- Part 1: Get answer from the website context ---
            # Retrieval chain over the FAISS index built in the sidebar.
            with st.spinner("Analyzing website content..."):
                website_retriever = st.session_state.vector.as_retriever()
                website_chain = create_retrieval_chain(website_retriever, create_stuff_documents_chain(llm, prompt_template))
                response_website = website_chain.invoke({"input": prompt_input})
                website_answer = response_website['answer']

            # --- Part 2: Get answer from Google Search ---
            # Wrap SerpApi organic-result snippets as Documents, embed them into
            # a throwaway FAISS index, and run the same retrieval chain over it.
            with st.spinner("Searching Google and analyzing results..."):
                search = SerpAPIWrapper()
                search_results = search.results(prompt_input)
                
                google_docs = []
                reference_links = []
                if "organic_results" in search_results:
                    for result in search_results["organic_results"]:
                        # Empty-snippet results still become (empty) Documents;
                        # the link, when present, is collected for the references list.
                        google_docs.append(Document(page_content=result.get("snippet", ""), metadata={"source": result.get("link", "")}))
                        if result.get("link"):
                            reference_links.append(f"- [{result.get('title', 'Source')}]({result.get('link')})")

                # Fallback message used when SerpApi returned no organic results.
                google_answer = "Could not find relevant information from Google search."
                if google_docs:
                    google_vector_store = FAISS.from_documents(google_docs, st.session_state.embeddings)
                    google_retriever = google_vector_store.as_retriever()
                    google_chain = create_retrieval_chain(google_retriever, create_stuff_documents_chain(llm, prompt_template))
                    response_google = google_chain.invoke({"input": prompt_input})
                    google_answer = response_google['answer']

            # --- Part 3: Display combined results ---
            with st.chat_message("assistant"):
                st.markdown("### From Website Content")
                st.markdown(website_answer)
                st.markdown("---")
                st.markdown("### From Google Search")
                st.markdown(google_answer)
                if reference_links:
                    st.markdown("#### References:")
                    st.markdown("\n".join(reference_links))
            
            # Save combined answer to history so it survives the next rerun.
            combined_answer = (
                f"**From Website Content:**\n{website_answer}\n\n---\n\n"
                f"**From Google Search:**\n{google_answer}"
            )
            if reference_links:
                combined_answer += "\n\n**References:**\n" + "\n".join(reference_links)
            st.session_state.chat_history.append({"role": "assistant", "content": combined_answer})

else:
    # Reached when either secret is absent (see the key setup near the top of the file).
    st.error("API keys not found. Please set them in your Streamlit secrets.")