File size: 6,578 Bytes
9092984
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
import streamlit as st
from dotenv import load_dotenv
import os
import requests
from bs4 import BeautifulSoup
from langchain_groq import ChatGroq
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.chains.combine_documents import create_stuff_documents_chain
from langchain_core.prompts import ChatPromptTemplate
from langchain.chains import create_retrieval_chain
from langchain_community.vectorstores import FAISS
from langchain_community.document_loaders import WebBaseLoader
from langchain_huggingface import HuggingFaceEmbeddings
from langchain_community.utilities import SerpAPIWrapper
from langchain_core.documents import Document
import time

# Load environment variables for local development (will be ignored by Streamlit Cloud)
load_dotenv()

# Set up API keys using Streamlit's secrets management.
# For local development, these are read from .streamlit/secrets.toml;
# on Streamlit Cloud, they come from the repository's secrets.
groq_api_key = st.secrets.get("GROQ_API_KEY")

# BUGFIX: only export the SerpApi key when it is actually present.
# st.secrets.get(...) returns None for a missing secret, and assigning None
# into os.environ raises TypeError — previously crashing the app on startup
# instead of reaching the "API keys not found" error shown further below.
_serpapi_key = st.secrets.get("SERPAPI_API_KEY")
if _serpapi_key:
    os.environ["SERPAPI_API_KEY"] = _serpapi_key


st.set_page_config(page_title="Web RAG with Groq & SerpApi", layout="wide")
st.image("PragyanAI_Transperent_github.png")
st.title("Web RAG: Q&A with Website Content and Google Search")

# Initialize session state:
#   vector       — FAISS index built from the scraped website (None until processed)
#   chat_history — list of {"role", "content"} dicts rendered on each rerun
#   embeddings   — HuggingFace sentence-embedding model, created once and reused
if "vector" not in st.session_state:
    st.session_state.vector = None
if "chat_history" not in st.session_state:
    st.session_state.chat_history = []
if "embeddings" not in st.session_state:
    st.session_state.embeddings = HuggingFaceEmbeddings(model_name="all-MiniLM-L6-v2")


# Sidebar: collect a URL from the user and build the FAISS index from it.
with st.sidebar:
    st.header("Input Source")
    website_url = st.text_input("Enter Website URL to scrape")

    if st.button("Process Website"):
        if website_url:
            with st.spinner("Processing website..."):
                try:
                    # Fetch the page, chunk it, and embed the chunks into a
                    # FAISS index kept in session state for the chat below.
                    raw_docs = WebBaseLoader(website_url).load()
                    splitter = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=200)
                    chunks = splitter.split_documents(raw_docs)
                    st.session_state.vector = FAISS.from_documents(
                        chunks, st.session_state.embeddings
                    )
                    st.success("Website processed successfully!")
                except Exception as e:
                    # Surface scraping/indexing failures to the user instead of crashing.
                    st.error(f"Failed to scrape or process website: {e}")
        else:
            st.warning("Please enter a website URL.")


# Main chat interface: answers each question twice — once from the scraped
# website's FAISS index, once from fresh Google (SerpApi) results — and shows
# both answers side by side with reference links.
st.header("Chat with the Web")

# Initialize the language model
# Check if the API keys are available before initializing the model
if groq_api_key and os.environ.get("SERPAPI_API_KEY"):
    llm = ChatGroq(groq_api_key=groq_api_key, model_name="Llama3-8b-8192")

    # Create the prompt template.
    # NOTE(review): the closing tag is "<context>" rather than "</context>" —
    # presumably a typo in the original prompt; left as-is here since prompt
    # text is runtime behavior. Confirm whether the model output is affected.
    prompt_template = ChatPromptTemplate.from_template(
        """
        Answer the questions based on the provided context only.
        Please provide the most accurate and comprehensive response based on the question.
        <context>
        {context}
        <context>
        Questions:{input}
        """
    )

    # Display previous chat messages (Streamlit reruns the whole script on
    # every interaction, so history must be re-rendered each time).
    for message in st.session_state.chat_history:
        with st.chat_message(message["role"]):
            st.markdown(message["content"])

    # Get user input (walrus: only truthy input enters the branch)
    if prompt_input := st.chat_input("Ask a question..."):
        if st.session_state.vector is None:
            st.warning("Please process a website URL first.")
        else:
            # Echo the user's message and persist it to history.
            with st.chat_message("user"):
                st.markdown(prompt_input)
            st.session_state.chat_history.append({"role": "user", "content": prompt_input})

            # --- Part 1: Get answer from the website context ---
            # Retrieval chain over the FAISS index built in the sidebar.
            with st.spinner("Analyzing website content..."):
                website_retriever = st.session_state.vector.as_retriever()
                website_chain = create_retrieval_chain(website_retriever, create_stuff_documents_chain(llm, prompt_template))
                response_website = website_chain.invoke({"input": prompt_input})
                website_answer = response_website['answer']

            # --- Part 2: Get answer from Google Search ---
            # Wrap SerpApi organic-result snippets as Documents, embed them into
            # a throwaway FAISS index, and run the same retrieval chain over it.
            with st.spinner("Searching Google and analyzing results..."):
                search = SerpAPIWrapper()
                search_results = search.results(prompt_input)
                
                google_docs = []
                reference_links = []
                if "organic_results" in search_results:
                    for result in search_results["organic_results"]:
                        # Empty-snippet results still become (empty) Documents;
                        # the link, when present, is collected for the references list.
                        google_docs.append(Document(page_content=result.get("snippet", ""), metadata={"source": result.get("link", "")}))
                        if result.get("link"):
                            reference_links.append(f"- [{result.get('title', 'Source')}]({result.get('link')})")

                # Fallback message used when SerpApi returned no organic results.
                google_answer = "Could not find relevant information from Google search."
                if google_docs:
                    google_vector_store = FAISS.from_documents(google_docs, st.session_state.embeddings)
                    google_retriever = google_vector_store.as_retriever()
                    google_chain = create_retrieval_chain(google_retriever, create_stuff_documents_chain(llm, prompt_template))
                    response_google = google_chain.invoke({"input": prompt_input})
                    google_answer = response_google['answer']

            # --- Part 3: Display combined results ---
            with st.chat_message("assistant"):
                st.markdown("### From Website Content")
                st.markdown(website_answer)
                st.markdown("---")
                st.markdown("### From Google Search")
                st.markdown(google_answer)
                if reference_links:
                    st.markdown("#### References:")
                    st.markdown("\n".join(reference_links))
            
            # Save combined answer to history so it survives the next rerun.
            combined_answer = (
                f"**From Website Content:**\n{website_answer}\n\n---\n\n"
                f"**From Google Search:**\n{google_answer}"
            )
            if reference_links:
                combined_answer += "\n\n**References:**\n" + "\n".join(reference_links)
            st.session_state.chat_history.append({"role": "assistant", "content": combined_answer})

else:
    # Reached when either secret is absent (see the key setup near the top of the file).
    st.error("API keys not found. Please set them in your Streamlit secrets.")