import os
import pickle
import time

import requests
import streamlit as st
from bs4 import BeautifulSoup

# ---- LangChain Chains ----
from langchain.chains import RetrievalQAWithSourcesChain
from langchain.chains import create_retrieval_chain
from langchain.chains.combine_documents import create_stuff_documents_chain
# ---- LangChain Community Packages ----
from langchain_community.embeddings import HuggingFaceEmbeddings
from langchain_community.vectorstores import FAISS
# ---- LangChain Core Packages ----
from langchain_core.documents import Document  # Correct import
from langchain_core.output_parsers import StrOutputParser
from langchain_core.prompts import ChatPromptTemplate
# ---- LLM ----
from langchain_groq import ChatGroq
# ---- Text Splitters ----
from langchain_text_splitters import RecursiveCharacterTextSplitter
# Page chrome and sidebar inputs.
# NOTE(review): the trailing "π" in the title looks like a mis-decoded emoji;
# it is left untouched here -- confirm the intended glyph.
st.title("RockyBot: News Research Tool π")
st.sidebar.title("News Article URLs")

# Collect up to three article URLs from the sidebar; blank fields stay "".
urls = [st.sidebar.text_input(f"URL {i+1}") for i in range(3)]
process_url_clicked = st.sidebar.button("Process URLs")

# On-disk location of the pickled vector store shared by the indexing and
# query sections of this script.
file_path = "faiss_store_openai.pkl"

# Single placeholder reused for progress/status messages.
main_placeholder = st.empty()

# Fail with a readable message instead of a raw KeyError traceback when the
# API key has not been configured.
groq_api_key = os.environ.get("GROQ_API_KEY")
if not groq_api_key:
    st.error("GROQ_API_KEY environment variable is not set.")
    st.stop()

# Chat model used to answer questions over the retrieved chunks.
llm = ChatGroq(
    api_key=groq_api_key,
    model_name="llama-3.1-8b-instant",
)
def fetch_web_content(url):
    """Fetch a web page and return its text content.

    The page is downloaded with a 10 second timeout and parsed with
    BeautifulSoup's ``html.parser``.

    Args:
        url: Address of the page to download.

    Returns:
        The text extracted from the page, or -- on any failure -- a string
        of the form ``"Error fetching <url>: <reason>"``. The function never
        raises; callers must inspect the returned text to detect failures.
    """
    try:
        response = requests.get(url, timeout=10)
        response.raise_for_status()
        # When the server sends text/html without a charset, requests falls
        # back to ISO-8859-1 and response.text turns UTF-8 pages into
        # mojibake. Hand BeautifulSoup the raw bytes instead and only pass
        # along an encoding the server explicitly declared, so the parser
        # can otherwise detect it from the document itself.
        content_type = response.headers.get("content-type", "").lower()
        declared_encoding = response.encoding if "charset" in content_type else None
        soup = BeautifulSoup(
            response.content,
            "html.parser",
            from_encoding=declared_encoding,
        )
        return soup.get_text()
    except Exception as e:
        # Deliberately best-effort: report the failure as text so one bad
        # URL does not abort processing of the others.
        return f"Error fetching {url}: {str(e)}"
# Build the vector index when the user clicks "Process URLs".
if process_url_clicked:
    # NOTE(review): the check mark in the status messages below replaces a
    # mis-decoded character that split each literal across two lines;
    # confirm the intended glyph.
    main_placeholder.text("Data Loading...Started...✅")
    data = []
    for raw_url in urls:
        url = raw_url.strip()
        if not url:
            continue
        text = fetch_web_content(url)
        # fetch_web_content reports failures as an "Error fetching <url>: ..."
        # string instead of raising. Skip those so the error message is not
        # embedded and later cited as if it were article content.
        if text.startswith(f"Error fetching {url}: "):
            st.sidebar.warning(text)
            continue
        data.append((url, text))
    main_placeholder.text("Data Loading...Completed...✅")

    # Split into chunks
    text_splitter = RecursiveCharacterTextSplitter(
        separators=['\n\n', '\n', '.', ','],
        chunk_size=1000,
    )
    main_placeholder.text("Text Splitting...Started...")
    docs = []
    for url, text in data:
        # Tag every chunk with its URL so answers can list their sources.
        docs.extend(
            Document(page_content=chunk, metadata={"source": url})
            for chunk in text_splitter.split_text(text)
        )
    main_placeholder.text("Text Splitting...Completed...")

    if docs:
        # Embeddings + FAISS
        embedding_model = HuggingFaceEmbeddings(
            model_name="sentence-transformers/all-MiniLM-L6-v2"
        )
        vectorstore = FAISS.from_documents(docs, embedding_model)
        # Persist the index so the query section can reload it on later
        # Streamlit reruns.
        with open(file_path, "wb") as f:
            pickle.dump(vectorstore, f)
        main_placeholder.text("Vectorstore Ready! ✅")
    else:
        # Nothing to index: no URL was entered, every fetch failed, or the
        # pages had no text. Building a FAISS index from zero documents
        # would fail, so report it instead.
        main_placeholder.text(
            "No article text could be loaded. Enter at least one reachable URL."
        )
# --------------------------
# NEW CHAIN (replaces RetrievalQAWithSourcesChain)
# --------------------------
# Template text for the question-answering prompt. It has two placeholders:
# {context} and {question}.
QA_PROMPT_TEMPLATE = """
You are a financial news analyst.
Use ONLY the provided context to answer.
Context:
{context}
Question: {question}
Provide:
- a concise answer
- list of sources at the end
"""

# Chat prompt built from the template above.
prompt = ChatPromptTemplate.from_template(QA_PROMPT_TEMPLATE)
def build_qa_with_sources_chain(retriever):
    """Build a retrieval chain that returns an answer plus its sources.

    Combines the module-level ``llm`` and ``prompt`` into a stuff-documents
    chain, puts ``retriever`` in front of it, and wraps the result so
    callers get a small, stable dictionary back.

    Args:
        retriever: Retriever used to look up the documents for a question.

    Returns:
        An object with an ``invoke(query)`` method. ``invoke`` returns
        ``{"answer": str, "sources": list[str]}`` where ``sources`` holds
        the distinct ``"source"`` metadata values of the retrieved
        documents, in retrieval order (``"N/A"`` when a document has none).
    """
    # LLM + prompt -> document chain
    document_chain = create_stuff_documents_chain(
        llm=llm,
        prompt=prompt,
        output_parser=StrOutputParser(),
    )

    # Retriever + doc chain
    retrieval_chain = create_retrieval_chain(
        retriever=retriever,
        combine_docs_chain=document_chain,
    )

    # Wrapper to format consistent outputs
    class QAWrapper:
        """Adapter exposing ``invoke(query)`` over the retrieval chain."""

        def __init__(self, chain):
            self.chain = chain

        def invoke(self, query):
            """Run the chain for ``query`` and normalise its output."""
            # create_retrieval_chain reads the query from the "input" key,
            # while the prompt template fills in {question}. Both are
            # required: omitting "input" fails in the retrieval step, and
            # extra keys are passed through to the prompt.
            result = self.chain.invoke({"input": query, "question": query})
            answer = result.get("answer") or result.get("result") or ""
            docs = result.get("context") or []
            # Several chunks usually come from the same article; keep each
            # source once, preserving first-seen order.
            sources = list(
                dict.fromkeys(d.metadata.get("source", "N/A") for d in docs)
            )
            return {"answer": answer, "sources": sources}

    return QAWrapper(retrieval_chain)
# --------------------------
# QUERY EXECUTION
# --------------------------
query = st.text_input("Question: ")
if query:
    if os.path.exists(file_path):
        # Load FAISS DB
        # NOTE(security): pickle.load executes arbitrary code embedded in
        # the file. That is only acceptable while file_path is written
        # exclusively by this app; never point it at an untrusted file.
        with open(file_path, "rb") as f:
            vectorstore = pickle.load(f)

        # Retrieve the 4 most similar chunks for each question.
        retriever = vectorstore.as_retriever(search_kwargs={"k": 4})
        qa_chain = build_qa_with_sources_chain(retriever)
        result = qa_chain.invoke(query)

        # Display result
        st.header("Answer")
        st.write(result["answer"])
        st.subheader("Sources")
        if result["sources"]:
            for s in result["sources"]:
                # NOTE(review): "π" looks like a mis-decoded emoji; left
                # untouched here -- confirm the intended glyph.
                st.write("π", s)
        else:
            st.write("No sources found.")
    else:
        # A question was asked before any index exists; say so instead of
        # silently showing nothing.
        st.warning(
            "No processed articles found. Enter URLs in the sidebar and "
            "click 'Process URLs' first."
        )