"""News Research Tool — Streamlit app.

Loads news articles from up to three user-supplied URLs, splits them into
chunks, embeds them into a persisted FAISS index, and answers questions
against that index (with source attribution) using a Groq-hosted LLM.
"""

import os
import time

import streamlit as st
from dotenv import load_dotenv
from langchain.chains.qa_with_sources.retrieval import RetrievalQAWithSourcesChain
from langchain.embeddings import HuggingFaceEmbeddings
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_community.document_loaders import WebBaseLoader
from langchain_community.vectorstores.faiss import FAISS
from langchain_groq import ChatGroq

# Load environment variables from a local .env file, if present.
load_dotenv()

# SECURITY FIX: the Groq API key was previously hardcoded in this file (and is
# therefore leaked — rotate it). Read it from the environment / .env instead.
GROQ_API_KEY = os.getenv("GROQ_API_KEY", "")

EMBEDDING_MODEL = "sentence-transformers/all-MiniLM-L6-v2"
faiss_index_path = "faiss_index"

# Set Streamlit app title
st.title("News Research Tool 📈")
st.sidebar.title("News Article URLs")

if not GROQ_API_KEY:
    st.sidebar.error("GROQ_API_KEY is not set. Add it to your environment or .env file.")

# Track across reruns whether a FAISS index has been built this session.
if "index_created" not in st.session_state:
    st.session_state.index_created = False

# Collect up to three article URLs from the sidebar, skipping blanks.
urls = []
for i in range(3):
    url = st.sidebar.text_input(f"URL {i+1}")
    if url:
        urls.append(url)

process_url_clicked = st.sidebar.button("Process URLs")

# Placeholder used only for transient status / error messages.
main_placeholder = st.empty()

# Groq-hosted Llama 3 model used for question answering.
llm = ChatGroq(
    api_key=GROQ_API_KEY,
    model="llama3-70b-8192"
)


@st.cache_resource
def get_embeddings():
    """Return the HuggingFace embedding model, loaded once per process.

    Cached with st.cache_resource so indexing and querying share a single
    model instance instead of reloading it on every Streamlit rerun.
    """
    return HuggingFaceEmbeddings(model_name=EMBEDDING_MODEL)


def save_faiss_index(vectorstore, path):
    """Persist *vectorstore* to disk at *path*."""
    vectorstore.save_local(path)


def load_faiss_index(path, embeddings):
    """Load a FAISS index from *path* using *embeddings*.

    allow_dangerous_deserialization is required by recent LangChain versions
    because FAISS persistence uses pickle — only load indexes this app wrote.
    """
    return FAISS.load_local(path, embeddings, allow_dangerous_deserialization=True)


if process_url_clicked:
    if not urls:
        main_placeholder.error("Please provide at least one valid URL.")
    else:
        try:
            main_placeholder.text("Data Loading...Started...✅✅✅")
            loader = WebBaseLoader(urls)
            data = loader.load()

            # Guard against pages that returned no usable text at all.
            if not data or all(len(doc.page_content.strip()) == 0 for doc in data):
                # BUG FIX: this message was split across a raw line break in
                # the original source, which is invalid in a plain string.
                main_placeholder.error("No content loaded from URLs. Try different URLs.")
                st.stop()

            main_placeholder.text("Text Splitter...Started...✅✅✅")
            text_splitter = RecursiveCharacterTextSplitter(
                separators=['\n\n', '\n', '.', ','],
                chunk_size=1000
            )
            docs = text_splitter.split_documents(data)
            main_placeholder.text(f"Split into {len(docs)} document chunks.")

            main_placeholder.text("Embedding Vector Started Building...✅✅✅")
            # Renamed from `vectorstore_openai`: the embeddings are
            # HuggingFace sentence-transformers, not OpenAI.
            vectorstore = FAISS.from_documents(docs, get_embeddings())
            save_faiss_index(vectorstore, faiss_index_path)
            st.session_state.index_created = True

            main_placeholder.text("FAISS index saved successfully! ✅✅✅")
            time.sleep(2)
            main_placeholder.empty()
        except Exception as e:
            # Top-level boundary for the whole pipeline: surface the error in
            # the UI instead of crashing the Streamlit script run.
            main_placeholder.error(f"Error processing URLs: {str(e)}")

# BUG FIX: the question input previously lived inside `main_placeholder`, so
# any later main_placeholder.error()/warning() call silently destroyed the
# input widget. Render it as a regular widget and report errors via st.error.
query = st.text_input("Question: ")
if query:
    if not st.session_state.index_created or not os.path.exists(faiss_index_path):
        st.error("No FAISS index found. Please process URLs first.")
    else:
        with st.spinner("Processing your question..."):
            try:
                vectorstore = load_faiss_index(faiss_index_path, get_embeddings())
                chain = RetrievalQAWithSourcesChain.from_llm(
                    llm=llm, retriever=vectorstore.as_retriever()
                )
                result = chain({"question": query}, return_only_outputs=True)

                if not result.get("answer"):
                    st.warning("No answer generated. Try a different question or URLs.")
                    st.stop()

                st.header("Answer")
                st.write(result["answer"])

                # "sources" is a newline-separated string of source URLs.
                sources = result.get("sources", "")
                if sources:
                    st.subheader("Sources:")
                    for source in sources.split("\n"):
                        st.write(source)
                else:
                    st.write("No sources found.")
            except Exception as e:
                st.error(f"Error answering query: {str(e)}")