"""Streamlit app: chat with the content of user-supplied URLs via RAG.

Flow:
  1. The user pastes URLs in the sidebar and clicks "Process URLs".
  2. The pages are loaded, split into overlapping chunks, embedded with a
     sentence-transformers model and indexed in a Chroma collection.
  3. Questions are answered by a Groq-hosted LLM using only the top-k
     retrieved chunks as context; the source URLs are listed below the answer.

The Groq API key is read from the ``GROQ_API_KEY`` environment variable.
"""

import os
import time

import streamlit as st
from langchain_community.document_loaders import UnstructuredURLLoader
from langchain_community.embeddings import HuggingFaceEmbeddings
from langchain_community.vectorstores import Chroma
from langchain_core.output_parsers import StrOutputParser
from langchain_core.prompts import ChatPromptTemplate
from langchain_core.runnables import RunnablePassthrough, RunnableLambda
from langchain_groq import ChatGroq
from langchain_text_splitters import RecursiveCharacterTextSplitter

# ---------------------------------
# Streamlit config
# ---------------------------------
# set_page_config must be the first Streamlit command executed, so it runs
# before the API-key check below (which may call st.error / st.stop).
st.set_page_config(page_title="RAG URL Chat", layout="wide")
st.title("🧠 RAG Chatbot with URLs")

# ---------------------------------
# LLM
# ---------------------------------
# SECURITY: the key used to be hard-coded in this file. Secrets must never be
# committed; the previously committed key should be considered compromised
# and revoked. The key is now taken from the environment.
groq_api_key = os.environ.get("GROQ_API_KEY")
if not groq_api_key:
    st.error(
        "GROQ_API_KEY environment variable is not set. "
        "Set it and restart the app."
    )
    st.stop()

llm = ChatGroq(
    model="llama-3.1-8b-instant",
    temperature=0,
    api_key=groq_api_key,
    max_tokens=100,
)

# ---------------------------------
# Prompt
# ---------------------------------
rag_prompt = ChatPromptTemplate.from_messages([
    ("system",
     "You are a helpful AI assistant.\n"
     "Answer ONLY using the context provided.\n"
     "If the context does not contain the answer, say "
     "'I don't have enough information.'"),
    ("human",
     "Context:\n{context}\n\nQuestion:\n{question}"),
])

# ---------------------------------
# Session state
# ---------------------------------
# The retriever (and the vector store backing it) must survive Streamlit
# reruns, so both live in session state.
if "retriever" not in st.session_state:
    st.session_state.retriever = None
if "vectorstore" not in st.session_state:
    st.session_state.vectorstore = None

# ---------------------------------
# Sidebar
# ---------------------------------
st.sidebar.header("🔗 Input URLs")

urls_text = st.sidebar.text_area(
    "Enter URLs (one per line)",
    height=200,
    placeholder="https://example.com\nhttps://another.com",
)

process_btn = st.sidebar.button("🚀 Process URLs")

# ---------------------------------
# Process URLs
# ---------------------------------
if process_btn:
    if not urls_text.strip():
        st.sidebar.warning("Please enter at least one URL")
    else:
        # st.spinner is not available as st.sidebar.spinner(); it has to be
        # used with "with" notation inside the sidebar container.
        with st.sidebar:
            with st.spinner("Processing URLs..."):
                # Drop any previous index so a failed run cannot leave a
                # stale retriever behind.
                st.session_state.retriever = None
                st.session_state.vectorstore = None

                urls = [
                    line.strip()
                    for line in urls_text.splitlines()
                    if line.strip()
                ]

                headers = {
                    "User-Agent": "Mozilla/5.0 (compatible; RAGBot/1.0; +https://example.com)"
                }

                loader = UnstructuredURLLoader(urls=urls, headers=headers)
                docs = loader.load()

                splitter = RecursiveCharacterTextSplitter(
                    chunk_size=1000,
                    chunk_overlap=200,
                )
                splits = splitter.split_documents(docs)

                if splits:
                    embeddings = HuggingFaceEmbeddings(
                        model_name="sentence-transformers/all-MiniLM-L6-v2"
                    )

                    # A fresh collection name per run keeps chunks from
                    # earlier runs out of the new index.
                    vectorstore = Chroma.from_documents(
                        splits,
                        embeddings,
                        collection_name=f"rag-{time.time()}",
                    )

                    st.session_state.vectorstore = vectorstore
                    st.session_state.retriever = vectorstore.as_retriever(
                        search_kwargs={"k": 4}
                    )

            # Report the outcome after the spinner has closed.
            if st.session_state.retriever is not None:
                st.success("✅ URLs processed successfully!")
            else:
                # Nothing could be loaded/split; building an index from an
                # empty document list would fail, so report it instead.
                st.error(
                    "No content could be extracted from the provided URLs"
                )

# ---------------------------------
# Main UI
# ---------------------------------
st.subheader("💬 Ask a Question")

with st.form("chat_form", clear_on_submit=False):
    question = st.text_input(
        "Enter your question",
        placeholder="Ask something from the provided URLs...",
    )
    ask_btn = st.form_submit_button("Ask")

# ---------------------------------
# Answer + Sources
# ---------------------------------
if ask_btn:
    if st.session_state.retriever is None:
        st.warning("Please process URLs first")
    elif not question.strip():
        st.warning("Please enter a question")
    else:
        with st.spinner("🤖 Generating answer..."):
            time.sleep(0.3)  # ensures spinner renders

            retriever = st.session_state.retriever

            # Retrieve once and reuse the result for both the prompt context
            # and the source list, so the cited sources are exactly the
            # chunks the model saw.
            docs = retriever.invoke(question)

            # Pass plain chunk text to the prompt rather than the repr of
            # Document objects.
            context = "\n\n".join(doc.page_content for doc in docs)

            answer_chain = rag_prompt | llm | StrOutputParser()
            answer = answer_chain.invoke(
                {"context": context, "question": question}
            )

        # Answer
        st.markdown("### ✅ Answer")
        st.write(answer)

        # Sources (several chunks may come from the same URL; list each
        # source once, in retrieval order)
        st.markdown("### 📚 Sources")
        sources = dict.fromkeys(
            doc.metadata.get("source", "Unknown source") for doc in docs
        )
        for i, source in enumerate(sources):
            st.write(f"{i+1}. {source}")