File size: 6,578 Bytes
9092984 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 |
import streamlit as st
from dotenv import load_dotenv
import os
import requests
from bs4 import BeautifulSoup
from langchain_groq import ChatGroq
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.chains.combine_documents import create_stuff_documents_chain
from langchain_core.prompts import ChatPromptTemplate
from langchain.chains import create_retrieval_chain
from langchain_community.vectorstores import FAISS
from langchain_community.document_loaders import WebBaseLoader
from langchain_huggingface import HuggingFaceEmbeddings
from langchain_community.utilities import SerpAPIWrapper
from langchain_core.documents import Document
import time
# Load environment variables for local development (will be ignored by Streamlit Cloud)
load_dotenv()

# Set up API keys using Streamlit's secrets management.
# For local development, these are read from .streamlit/secrets.toml;
# on Streamlit Cloud, they are read from the repository's secrets.
# Each key falls back to the process environment (populated by load_dotenv
# above) when the secret is absent.
# NOTE(review): st.secrets may itself raise when no secrets file exists at
# all -- confirm against the Streamlit version this app is deployed with.
groq_api_key = st.secrets.get("GROQ_API_KEY") or os.environ.get("GROQ_API_KEY")

# os.environ only accepts strings: assigning a missing secret (None) raises
# TypeError before the app can render its "API keys not found" message, so the
# variable is exported only when a value is actually configured.
serpapi_api_key = st.secrets.get("SERPAPI_API_KEY")
if serpapi_api_key:
    os.environ["SERPAPI_API_KEY"] = serpapi_api_key

st.set_page_config(page_title="Web RAG with Groq & SerpApi", layout="wide")
st.image("PragyanAI_Transperent_github.png")
st.title("Web RAG: Q&A with Website Content and Google Search")
# Seed the per-session state on the first run of the script.
# Each entry maps a session key to a zero-argument factory, so a default
# (notably the embeddings model) is only built when its key is still missing.
_SESSION_DEFAULTS = {
    "vector": lambda: None,
    "chat_history": list,
    "embeddings": lambda: HuggingFaceEmbeddings(model_name="all-MiniLM-L6-v2"),
}
for _state_key, _make_default in _SESSION_DEFAULTS.items():
    if _state_key not in st.session_state:
        st.session_state[_state_key] = _make_default()
# Sidebar for user input: load a website, split it into chunks and index the
# chunks in a FAISS vector store kept in session state.
with st.sidebar:
    st.header("Input Source")
    # Strip surrounding whitespace so a blank-looking entry is treated as
    # empty instead of being handed to the loader.
    website_url = st.text_input("Enter Website URL to scrape").strip()
    if st.button("Process Website"):
        if not website_url:
            st.warning("Please enter a website URL.")
        else:
            with st.spinner("Processing website..."):
                try:
                    loader = WebBaseLoader(website_url)
                    web_docs = loader.load()
                    text_splitter = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=200)
                    final_documents = text_splitter.split_documents(web_docs)
                    if not final_documents:
                        # Nothing to index: report it explicitly rather than
                        # letting the vector store fail on an empty list.
                        st.error("No text content could be extracted from that website.")
                    else:
                        st.session_state.vector = FAISS.from_documents(final_documents, st.session_state.embeddings)
                        st.success("Website processed successfully!")
                except Exception as e:
                    # UI boundary: loader, splitter and embedding errors are
                    # all surfaced to the user instead of crashing the app.
                    st.error(f"Failed to scrape or process website: {e}")
# Main chat interface
st.header("Chat with the Web")


def _collect_search_documents(search_results):
    """Convert a search payload into retrievable documents and reference links.

    Args:
        search_results: Mapping returned by the search wrapper; only its
            "organic_results" entry is read.

    Returns:
        A ``(documents, reference_links)`` tuple. ``documents`` holds one
        ``Document`` per organic result with a non-empty snippet;
        ``reference_links`` holds one Markdown bullet per result with a link.
    """
    documents = []
    links = []
    if "organic_results" not in search_results:
        return documents, links
    for result in search_results["organic_results"]:
        link = result.get("link", "")
        snippet = result.get("snippet", "")
        # Results without a snippet carry no text worth embedding, so they
        # are kept as references only.
        if snippet:
            documents.append(Document(page_content=snippet, metadata={"source": link}))
        if link:
            links.append(f"- [{result.get('title', 'Source')}]({link})")
    return documents, links


def _answer_from_retriever(llm, prompt, retriever, question):
    """Build a stuff-documents retrieval chain and return its answer text."""
    chain = create_retrieval_chain(retriever, create_stuff_documents_chain(llm, prompt))
    return chain.invoke({"input": question})["answer"]


# Initialize the language model only when both API keys are available.
if groq_api_key and os.environ.get("SERPAPI_API_KEY"):
    # NOTE(review): confirm this model id is still served by Groq.
    llm = ChatGroq(groq_api_key=groq_api_key, model_name="Llama3-8b-8192")

    # Prompt template; the context section is closed with a proper
    # </context> tag so the retrieved text is clearly delimited.
    prompt_template = ChatPromptTemplate.from_template(
        "\n"
        "Answer the questions based on the provided context only.\n"
        "Please provide the most accurate and comprehensive response based on the question.\n"
        "<context>\n"
        "{context}\n"
        "</context>\n"
        "Questions:{input}\n"
    )

    # Display previous chat messages
    for message in st.session_state.chat_history:
        with st.chat_message(message["role"]):
            st.markdown(message["content"])

    # Get user input
    if prompt_input := st.chat_input("Ask a question..."):
        if st.session_state.vector is None:
            st.warning("Please process a website URL first.")
        else:
            with st.chat_message("user"):
                st.markdown(prompt_input)
            st.session_state.chat_history.append({"role": "user", "content": prompt_input})

            # --- Part 1: Get answer from the website context ---
            with st.spinner("Analyzing website content..."):
                try:
                    website_answer = _answer_from_retriever(
                        llm,
                        prompt_template,
                        st.session_state.vector.as_retriever(),
                        prompt_input,
                    )
                except Exception as e:
                    # UI boundary: keep the conversation going so the user
                    # turn already stored in history still gets a reply.
                    website_answer = (
                        "Could not generate an answer from the website content "
                        f"({type(e).__name__})."
                    )

            # --- Part 2: Get answer from Google Search ---
            google_answer = "Could not find relevant information from Google search."
            reference_links = []
            with st.spinner("Searching Google and analyzing results..."):
                try:
                    search_results = SerpAPIWrapper().results(prompt_input)
                    google_docs, reference_links = _collect_search_documents(search_results)
                    if google_docs:
                        google_vector_store = FAISS.from_documents(google_docs, st.session_state.embeddings)
                        google_answer = _answer_from_retriever(
                            llm,
                            prompt_template,
                            google_vector_store.as_retriever(),
                            prompt_input,
                        )
                except Exception as e:
                    # Only the exception type is shown: request errors may
                    # embed the query URL, which can carry the API key.
                    google_answer = f"Google search failed ({type(e).__name__})."

            # --- Part 3: Display combined results ---
            with st.chat_message("assistant"):
                st.markdown("### From Website Content")
                st.markdown(website_answer)
                st.markdown("---")
                st.markdown("### From Google Search")
                st.markdown(google_answer)
                if reference_links:
                    st.markdown("#### References:")
                    st.markdown("\n".join(reference_links))

            # Save combined answer to history
            combined_answer = (
                f"**From Website Content:**\n{website_answer}\n\n---\n\n"
                f"**From Google Search:**\n{google_answer}"
            )
            if reference_links:
                combined_answer += "\n\n**References:**\n" + "\n".join(reference_links)
            st.session_state.chat_history.append({"role": "assistant", "content": combined_answer})
else:
    st.error("API keys not found. Please set them in your Streamlit secrets.")
|