Spaces:
Sleeping
Sleeping
File size: 3,134 Bytes
1f33f1c 5585981 bf2dd99 5585981 1f33f1c 08d1605 bf2dd99 08d1605 1f33f1c 08d1605 1f33f1c 08d1605 bf2dd99 08d1605 bf2dd99 08d1605 5585981 08d1605 5585981 08d1605 5585981 08d1605 5585981 08d1605 5585981 08d1605 5585981 08d1605 5585981 08d1605 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 |
import os
import streamlit as st
from dotenv import load_dotenv
from tavily import TavilyClient
from langchain.schema import Document
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_google_genai import GoogleGenerativeAIEmbeddings, ChatGoogleGenerativeAI
from langchain.chains import RetrievalQA
from langchain_chroma import Chroma
# === Fix protobuf issue ===
# Force the pure-Python protobuf implementation to avoid binary-wheel
# incompatibilities between the installed grpc/protobuf versions.
os.environ["PROTOCOL_BUFFERS_PYTHON_IMPLEMENTATION"] = "python"

# === Load environment variables ===
load_dotenv()


def _get_secret(name: str) -> str:
    """Return the secret `name` from the environment, falling back to
    Streamlit secrets.

    Returns "" when the key is not found anywhere. Accessing `st.secrets`
    raises (e.g. FileNotFoundError / StreamlitSecretNotFoundError) when no
    secrets.toml exists, so the lookup is guarded — otherwise running the
    app locally with only a .env file would crash.
    """
    value = os.getenv(name)
    if value:
        return value
    try:
        return st.secrets.get(name, "")
    except Exception:  # no secrets file configured — fall back to empty
        return ""


GOOGLE_API_KEY = _get_secret("GOOGLE_API_KEY")
TAVILY_API_KEY = _get_secret("TAVILY_API_KEY")

# === Validate keys: both services are required, so fail fast with a message ===
if not GOOGLE_API_KEY or not TAVILY_API_KEY:
    st.error("API keys missing! Please check your .env file or Streamlit secrets.")
    st.stop()

# === Set up clients ===
tavily_client = TavilyClient(api_key=TAVILY_API_KEY)
embedding_model = GoogleGenerativeAIEmbeddings(
    model="models/embedding-001", google_api_key=GOOGLE_API_KEY
)
llm = ChatGoogleGenerativeAI(
    model="models/gemini-1.5-flash", google_api_key=GOOGLE_API_KEY
)
# === Streamlit UI ===
st.title("Ask Questions About Any Website!")

# --- Step 1: Website input ---
url = st.text_input("Enter a website URL:")
if st.button("Extract and Index"):
    if not url.strip():
        st.warning("Please enter a valid URL.")
    else:
        with st.spinner("Extracting content..."):
            try:
                # Tavily `extract` returns {"results": [{"raw_content": ...}, ...]};
                # some client versions also expose a top-level "text" field,
                # so try both before giving up.
                data = tavily_client.extract(urls=url)
                raw_text = (
                    data.get("text")
                    or data.get("results", [{}])[0].get("raw_content")
                    or ""  # guard: raw_content may be None, which would break .strip()
                )
                if not raw_text.strip():
                    st.error("Failed to extract content from the website.")
                    st.stop()

                # Chunk the page so each piece fits the embedding model's
                # context; overlap preserves continuity across chunk borders.
                doc = Document(page_content=raw_text)
                splitter = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=200)
                chunks = splitter.split_documents([doc])

                # Vectorstore with Chroma, persisted to disk so re-runs can reuse it.
                vectorstore = Chroma.from_documents(
                    chunks,
                    embedding=embedding_model,
                    collection_name="website_collection",
                    persist_directory="./chroma_db",
                )
                # Keep the store in session state so Step 2 can query it.
                st.session_state.vectorstore = vectorstore
                # NOTE: the original success-message literal was split across
                # two physical lines (a SyntaxError) — rejoined onto one line.
                st.success("Website content indexed successfully!")
            except Exception as e:
                st.error(f"Error during extraction/indexing: {str(e)}")
# --- Step 2: Ask a question ---
# Only runs once Step 1 has stored a vectorstore in session state.
question = st.text_input("Ask a question about the website content:")
if question and "vectorstore" in st.session_state:
    with st.spinner("Thinking..."):
        try:
            # Retrieve the most relevant chunks and let the LLM answer
            # grounded on them ("stuff" chain by default).
            retriever = st.session_state.vectorstore.as_retriever()
            qa_chain = RetrievalQA.from_chain_type(llm=llm, retriever=retriever)
            result = qa_chain.run(question)
            # NOTE: the original subheader literal was split across two
            # physical lines (a SyntaxError) — rejoined onto one line.
            st.subheader("Answer")
            st.write(result)
        except Exception as e:
            st.error(f"Failed to generate answer: {str(e)}")
|