research-tool / app.py
Deaksh's picture
Update app.py
9de15c5 verified
import os
import pickle
import time

import requests
import streamlit as st
from bs4 import BeautifulSoup

# ---- LangChain Community Packages ----
from langchain_community.embeddings import HuggingFaceEmbeddings
from langchain_community.vectorstores import FAISS

# ---- LangChain Chains ----
from langchain.chains import RetrievalQAWithSourcesChain
from langchain.chains import create_retrieval_chain
from langchain.chains.combine_documents import create_stuff_documents_chain

# ---- LangChain Core Packages ----
from langchain_core.prompts import ChatPromptTemplate
from langchain_core.output_parsers import StrOutputParser
from langchain_core.documents import Document  # ✅ Correct import

# ---- Text Splitters ----
from langchain_text_splitters import RecursiveCharacterTextSplitter

# ---- LLM ----
from langchain_groq import ChatGroq
# ---- Page layout ----
st.title("RockyBot: News Research Tool 📈")
st.sidebar.title("News Article URLs")

# Collect up to three article URLs from the sidebar.
urls = [st.sidebar.text_input(f"URL {i+1}") for i in range(3)]
process_url_clicked = st.sidebar.button("Process URLs")

# Pickle file the FAISS vector store is written to and read back from.
file_path = "faiss_store_openai.pkl"

# Single slot that is overwritten with progress/status messages.
main_placeholder = st.empty()

# Fail with a readable message instead of a raw KeyError traceback when the
# API key has not been configured.
groq_api_key = os.environ.get("GROQ_API_KEY")
if not groq_api_key:
    st.error(
        "GROQ_API_KEY environment variable is not set. "
        "Configure it and reload the app."
    )
    st.stop()

llm = ChatGroq(
    api_key=groq_api_key,
    model_name="llama-3.1-8b-instant",
)
def fetch_web_content(url):
    """Fetch the readable text of a web page.

    Args:
        url: Address of the page to download.

    Returns:
        The text extracted from the page's HTML with BeautifulSoup. If the
        download or parsing fails for any reason, a string of the form
        ``"Error fetching <url>: <reason>"`` is returned instead of raising.
    """
    try:
        response = requests.get(url, timeout=10)
        response.raise_for_status()
        soup = BeautifulSoup(response.text, "html.parser")
        # get_text() would otherwise include the raw JavaScript/CSS held in
        # these elements, which is noise for downstream text processing.
        for tag in soup.find_all(["script", "style", "noscript"]):
            tag.decompose()
        return soup.get_text()
    except Exception as e:
        # Deliberately best-effort: every failure is reported to the caller
        # as an error string rather than an exception.
        return f"Error fetching {url}: {str(e)}"
if process_url_clicked:
    # ---- Download every non-empty URL ----
    main_placeholder.text("Data Loading...Started...✅")
    data = []
    fetch_errors = []
    for raw_url in urls:
        url = raw_url.strip()
        if not url:
            continue
        text = fetch_web_content(url)
        # fetch_web_content reports failures as an "Error fetching <url>: ..."
        # string; such messages must not be indexed as article content.
        if text.startswith(f"Error fetching {url}: "):
            fetch_errors.append(text)
        else:
            data.append((url, text))
    for message in fetch_errors:
        st.sidebar.warning(message)
    main_placeholder.text("Data Loading...Completed...✅")

    # ---- Split into chunks, tagging each chunk with its source URL ----
    text_splitter = RecursiveCharacterTextSplitter(
        separators=['\n\n', '\n', '.', ','],
        chunk_size=1000
    )
    main_placeholder.text("Text Splitting...Started...")
    docs = []
    for url, text in data:
        split_docs = text_splitter.split_text(text)
        docs.extend(
            Document(page_content=chunk, metadata={"source": url})
            for chunk in split_docs
        )
    main_placeholder.text("Text Splitting...Completed...")

    if not docs:
        # Building a FAISS index from zero documents fails, so stop here
        # with a message instead (no URLs given, or every fetch failed).
        main_placeholder.text(
            "No article text could be loaded. Check the URLs and try again."
        )
    else:
        # ---- Embeddings + FAISS ----
        embedding_model = HuggingFaceEmbeddings(
            model_name="sentence-transformers/all-MiniLM-L6-v2"
        )
        vectorstore = FAISS.from_documents(docs, embedding_model)
        # Persist the index so later reruns can answer questions without
        # re-processing the URLs.
        with open(file_path, "wb") as f:
            pickle.dump(vectorstore, f)
        main_placeholder.text("Vectorstore Ready! ✅")
# --------------------------
# QA PROMPT FOR THE NEW CHAIN (replaces RetrievalQAWithSourcesChain)
# --------------------------
# Template text; {context} and {question} are filled in when the chain runs.
_QA_TEMPLATE = """
You are a financial news analyst.
Use ONLY the provided context to answer.
Context:
{context}
Question: {question}
Provide:
- a concise answer
- list of sources at the end
"""

prompt = ChatPromptTemplate.from_template(_QA_TEMPLATE)
def build_qa_with_sources_chain(retriever):
    """Build a retrieval QA chain that returns an answer plus its sources.

    Args:
        retriever: Retriever used to look up the documents relevant to a
            question.

    Returns:
        An object with an ``invoke(query)`` method. ``invoke`` returns a dict
        with two keys: ``"answer"`` (the model's text, ``""`` if none) and
        ``"sources"`` (the distinct ``"source"`` metadata values of the
        retrieved documents, in retrieval order; ``"N/A"`` for a document
        without one).
    """
    # LLM + prompt -> chain that stuffs the retrieved documents into {context}.
    document_chain = create_stuff_documents_chain(
        llm=llm,
        prompt=prompt,
        output_parser=StrOutputParser(),
    )

    # Retriever + document chain.
    retrieval_chain = create_retrieval_chain(
        retriever=retriever,
        combine_docs_chain=document_chain,
    )

    class QAWrapper:
        """Adapter that normalises the chain output to answer + sources."""

        def __init__(self, chain):
            self.chain = chain

        def invoke(self, query):
            # create_retrieval_chain reads the query from the "input" key,
            # while the prompt template refers to {question}; supply both so
            # retrieval and prompt formatting each find what they need.
            result = self.chain.invoke({"input": query, "question": query})
            answer = result.get("answer") or result.get("result") or ""
            docs = result.get("context") or []
            # Several chunks usually come from the same article; keep each
            # source once, preserving first-seen order.
            sources = list(
                dict.fromkeys(d.metadata.get("source", "N/A") for d in docs)
            )
            return {"answer": answer, "sources": sources}

    return QAWrapper(retrieval_chain)
# --------------------------
# QUERY EXECUTION
# --------------------------
query = st.text_input("Question: ")
if query:
    if os.path.exists(file_path):
        # Load the FAISS store saved by the "Process URLs" step.
        # SECURITY: pickle.load can execute arbitrary code embedded in the
        # file. Only ever load a file this app wrote itself; never point
        # file_path at data from an untrusted source.
        with open(file_path, "rb") as f:
            vectorstore = pickle.load(f)

        retriever = vectorstore.as_retriever(search_kwargs={"k": 4})
        qa_chain = build_qa_with_sources_chain(retriever)
        result = qa_chain.invoke(query)

        # Display result
        st.header("Answer")
        st.write(result["answer"])

        st.subheader("Sources")
        if result["sources"]:
            for s in result["sources"]:
                st.write("🔗", s)
        else:
            st.write("No sources found.")
    else:
        # Without this branch a question typed before any URLs were
        # processed produced no output at all.
        st.warning(
            "No processed articles found. Enter URLs in the sidebar and "
            "click 'Process URLs' first."
        )