import os
import pickle
import requests
import streamlit as st
from bs4 import BeautifulSoup

# ---- LangChain Community Packages ----
from langchain_community.embeddings import HuggingFaceEmbeddings
from langchain_community.vectorstores import FAISS

# ---- LangChain Chains (new-style retrieval helpers) ----
from langchain.chains import create_retrieval_chain
from langchain.chains.combine_documents import create_stuff_documents_chain

# ---- LangChain Core Packages ----
from langchain_core.prompts import ChatPromptTemplate
from langchain_core.output_parsers import StrOutputParser
from langchain_core.documents import Document  # ✅ Correct import

# ---- Text Splitters ----
from langchain_text_splitters import RecursiveCharacterTextSplitter

# ---- LLM ----
from langchain_groq import ChatGroq

st.title("RockyBot: News Research Tool 📈")
st.sidebar.title("News Article URLs")

# Collect URLs from user input
urls = [st.sidebar.text_input(f"URL {i+1}") for i in range(3)]

process_url_clicked = st.sidebar.button("Process URLs")
file_path = "faiss_store_openai.pkl"
main_placeholder = st.empty()

llm = ChatGroq(
    api_key=os.environ["GROQ_API_KEY"],
    model_name="llama-3.1-8b-instant",
)


def fetch_web_content(url):
    """Fetch text content from a URL using BeautifulSoup."""
    try:
        response = requests.get(url, timeout=10)
        response.raise_for_status()
        soup = BeautifulSoup(response.text, "html.parser")
        return soup.get_text()
    except Exception as e:
        return f"Error fetching {url}: {str(e)}"


if process_url_clicked:
    main_placeholder.text("Data Loading...Started...✅")
    data = [(url, fetch_web_content(url)) for url in urls if url.strip()]
    main_placeholder.text("Data Loading...Completed...✅")

    # Split into chunks
    text_splitter = RecursiveCharacterTextSplitter(
        separators=["\n\n", "\n", ".", ","],
        chunk_size=1000,
    )
    main_placeholder.text("Text Splitting...Started...")
    docs = []
    for url, text in data:
        split_docs = text_splitter.split_text(text)
        docs.extend(
            [Document(page_content=chunk, metadata={"source": url}) for chunk in split_docs]
        )
    main_placeholder.text("Text Splitting...Completed...")

    # Embeddings + FAISS
    embedding_model = HuggingFaceEmbeddings(model_name="sentence-transformers/all-MiniLM-L6-v2")
    vectorstore = FAISS.from_documents(docs, embedding_model)

    with open(file_path, "wb") as f:
        pickle.dump(vectorstore, f)

    main_placeholder.text("Vectorstore Ready! ✅")

# --------------------------
# 🔥 NEW CHAIN (REPLACES RetrievalQAWithSourcesChain)
# --------------------------
# Note: create_retrieval_chain passes the user query under the "input" key,
# so the prompt uses {input} rather than {question}.
prompt = ChatPromptTemplate.from_template("""
You are a financial news analyst.
Use ONLY the provided context to answer.

Context:
{context}

Question:
{input}

Provide:
- a concise answer
- a list of sources at the end
""")


def build_qa_with_sources_chain(retriever):
    """
    New-style LangChain retrieval chain that returns answer + sources.
""" # LLM + prompt → document chain document_chain = create_stuff_documents_chain( llm=llm, prompt=prompt, output_parser=StrOutputParser() ) # Retriever + doc chain retrieval_chain = create_retrieval_chain( retriever=retriever, combine_docs_chain=document_chain, ) # Wrapper to format consistent outputs class QAWrapper: def __init__(self, chain): self.chain = chain def invoke(self, query): result = self.chain.invoke({"question": query}) answer = result.get("answer") or result.get("result") or "" docs = result.get("context") or [] sources = [d.metadata.get("source", "N/A") for d in docs] return {"answer": answer, "sources": sources} return QAWrapper(retrieval_chain) # -------------------------- # QUERY EXECUTION # -------------------------- query = st.text_input("Question: ") if query: if os.path.exists(file_path): # Load FAISS DB with open(file_path, "rb") as f: vectorstore = pickle.load(f) retriever = vectorstore.as_retriever(search_kwargs={"k": 4}) qa_chain = build_qa_with_sources_chain(retriever) result = qa_chain.invoke(query) # Display result st.header("Answer") st.write(result["answer"]) st.subheader("Sources") if result["sources"]: for s in result["sources"]: st.write("🔗", s) else: st.write("No sources found.")