Spaces:
Sleeping
Sleeping
Update app.py
Browse files
app.py
CHANGED
|
@@ -8,6 +8,7 @@ import os
|
|
| 8 |
import time
|
| 9 |
from langchain_groq import ChatGroq
|
| 10 |
from langchain.chains.qa_with_sources.retrieval import RetrievalQAWithSourcesChain
|
|
|
|
| 11 |
from bs4 import SoupStrainer
|
| 12 |
|
| 13 |
# Load environment variables (optional)
|
|
@@ -77,15 +78,36 @@ st.markdown("""
|
|
| 77 |
.stSpinner > div {
|
| 78 |
color: #00ddeb;
|
| 79 |
}
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 80 |
</style>
|
| 81 |
""", unsafe_allow_html=True)
|
| 82 |
|
|
|
|
|
|
|
|
|
|
| 83 |
# Set Streamlit app title
|
| 84 |
st.title("WebChatter π¬")
|
| 85 |
|
| 86 |
# Initialize session state
|
| 87 |
if "index_created" not in st.session_state:
|
| 88 |
st.session_state.index_created = False
|
|
|
|
|
|
|
| 89 |
|
| 90 |
# Sidebar for URL input
|
| 91 |
with st.sidebar:
|
|
@@ -99,9 +121,31 @@ main_container = st.container()
|
|
| 99 |
# Initialize the Groq LLM
|
| 100 |
llm = ChatGroq(
|
| 101 |
api_key=GROQ_API_KEY,
|
| 102 |
-
model="llama3-70b-8192"
|
|
|
|
| 103 |
)
|
| 104 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 105 |
def save_faiss_index(vectorstore, path):
|
| 106 |
vectorstore.save_local(path)
|
| 107 |
|
|
@@ -117,7 +161,6 @@ if process_url_clicked:
|
|
| 117 |
with st.spinner("Processing URL..."):
|
| 118 |
try:
|
| 119 |
st.text("Data Loading...Started...β
β
β
")
|
| 120 |
-
# Use SoupStrainer to specify tags to parse
|
| 121 |
parse_only = SoupStrainer(['title', 'p', 'h1', 'h2', 'h3'])
|
| 122 |
loader = WebBaseLoader(
|
| 123 |
web_paths=[url.strip()],
|
|
@@ -130,6 +173,9 @@ if process_url_clicked:
|
|
| 130 |
st.error("No content loaded from URL. Try a different URL (e.g., https://www.bbc.com/news/science-environment-67299122).")
|
| 131 |
st.stop()
|
| 132 |
|
|
|
|
|
|
|
|
|
|
| 133 |
st.text("Text Splitter...Started...β
β
β
")
|
| 134 |
text_splitter = RecursiveCharacterTextSplitter(
|
| 135 |
separators=['\n\n', '\n', '.', ','],
|
|
@@ -154,6 +200,14 @@ if process_url_clicked:
|
|
| 154 |
except Exception as e:
|
| 155 |
st.error(f"Error processing URL: {str(e)}")
|
| 156 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 157 |
# Query input with Ask button
|
| 158 |
with main_container:
|
| 159 |
st.header("Ask a Question")
|
|
@@ -169,7 +223,11 @@ if ask_clicked and query:
|
|
| 169 |
try:
|
| 170 |
embeddings = HuggingFaceEmbeddings(model_name="sentence-transformers/all-MiniLM-L6-v2")
|
| 171 |
vectorstore = load_faiss_index("faiss_index", embeddings)
|
| 172 |
-
chain = RetrievalQAWithSourcesChain.from_llm(
|
|
|
|
|
|
|
|
|
|
|
|
|
| 173 |
result = chain({"question": query}, return_only_outputs=True)
|
| 174 |
|
| 175 |
if not result.get("answer"):
|
|
@@ -188,4 +246,15 @@ if ask_clicked and query:
|
|
| 188 |
else:
|
| 189 |
st.write("No sources found.")
|
| 190 |
except Exception as e:
|
| 191 |
-
st.error(f"Error answering query: {str(e)}")
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 8 |
import time
|
| 9 |
from langchain_groq import ChatGroq
|
| 10 |
from langchain.chains.qa_with_sources.retrieval import RetrievalQAWithSourcesChain
|
| 11 |
+
from langchain.prompts import PromptTemplate
|
| 12 |
from bs4 import SoupStrainer
|
| 13 |
|
| 14 |
# Load environment variables (optional)
|
|
|
|
| 78 |
.stSpinner > div {
|
| 79 |
color: #00ddeb;
|
| 80 |
}
|
| 81 |
+
.footer {
|
| 82 |
+
display: flex;
|
| 83 |
+
align-items: center;
|
| 84 |
+
justify-content: center;
|
| 85 |
+
padding: 10px;
|
| 86 |
+
background: rgba(255, 255, 255, 0.1);
|
| 87 |
+
border-top: 1px solid rgba(255, 255, 255, 0.2);
|
| 88 |
+
position: fixed;
|
| 89 |
+
bottom: 0;
|
| 90 |
+
width: 100%;
|
| 91 |
+
color: #e0e0e0;
|
| 92 |
+
font-size: 14px;
|
| 93 |
+
}
|
| 94 |
+
.footer img {
|
| 95 |
+
margin-right: 10px;
|
| 96 |
+
}
|
| 97 |
</style>
|
| 98 |
""", unsafe_allow_html=True)
|
| 99 |
|
| 100 |
+
# Display large logo at the top of the main page
st.image("https://i.postimg.cc/2j0QWF3Z/Removal-575.png", width=300)

# Set Streamlit app title.
# The speech-balloon emoji is spelled as a Unicode escape so the literal cannot
# be corrupted again if this file is re-saved or served in another encoding.
st.title("WebChatter \U0001F4AC")

# Initialize session state.
# Each key is only seeded when it is missing, so a value stored by an earlier
# run of the script is not reset here.
if "index_created" not in st.session_state:
    # Flag initialised to False here; it is updated outside this block.
    st.session_state.index_created = False
if "url_content" not in st.session_state:
    # Placeholder for the text loaded from the URL; None until one is processed.
    st.session_state.url_content = None
|
| 111 |
|
| 112 |
# Sidebar for URL input
|
| 113 |
with st.sidebar:
|
|
|
|
| 121 |
# Initialize the Groq LLM
# The model ID can be overridden through the GROQ_MODEL environment variable so
# a deployment can switch models without a code change; when the variable is
# unset or empty the original hard-coded ID is used.
# NOTE(review): Groq has published a deprecation notice for "llama3-70b-8192" -
# confirm its current status and update the fallback if it has been retired.
llm = ChatGroq(
    api_key=GROQ_API_KEY,
    model=os.getenv("GROQ_MODEL") or "llama3-70b-8192",
    max_tokens=2048,  # Increased for detailed answers
)
|
| 127 |
|
| 128 |
+
# Custom prompt for detailed answers
# The template exposes two placeholders, {context} and {question}.
# NOTE(review): this object is handed to RetrievalQAWithSourcesChain.from_llm as
# `question_prompt`; presumably that is the per-document prompt rather than the
# prompt that writes the final answer - verify against the LangChain docs.
_QA_TEMPLATE_TEXT = (
    "You are an expert assistant tasked with providing detailed, extensive, "
    "and comprehensive answers. "
    "Use the provided context to answer the question thoroughly, including "
    "explanations, examples, and additional relevant information. "
    "If the context is limited, expand on the topic with your knowledge to "
    "ensure a complete response.\n"
    "\n"
    "Context: {context}\n"
    "\n"
    "Question: {question}\n"
    "\n"
    "Answer with sources: "
)
qa_prompt = PromptTemplate(template=_QA_TEMPLATE_TEXT)
|
| 138 |
+
|
| 139 |
+
# Function to summarize URL content
def summarize_content(content, llm, max_chars=12000):
    """Ask the LLM for a 3-5 sentence summary of ``content``.

    Args:
        content: Text to summarize. ``None``, empty, or whitespace-only input
            yields an empty summary without calling the LLM.
        llm: Object with an ``invoke(prompt)`` method whose result has a
            ``content`` attribute holding the generated text.
        max_chars: Upper bound on how many leading characters of ``content``
            are placed in the prompt, so a very long page cannot overflow the
            model's input limit. ``None`` disables the cap.

    Returns:
        The summary text produced by the LLM, or ``""`` for empty input.

    Raises:
        ValueError: If ``max_chars`` is negative.
    """
    if max_chars is not None and max_chars < 0:
        raise ValueError("max_chars must be non-negative or None")

    # Nothing to summarize: skip the (slow, billable) model call entirely.
    if not content or not content.strip():
        return ""

    # Only the head of the document is sent when it exceeds the cap.
    excerpt = content if max_chars is None else content[:max_chars]

    summary_prompt = (
        "Summarize the following content in 3-5 sentences, "
        "capturing the main points and key details:\n"
        "\n"
        f"{excerpt}\n"
        "\n"
        "Summary: "
    )
    return llm.invoke(summary_prompt).content
|
| 148 |
+
|
| 149 |
def save_faiss_index(vectorstore, path):
    """Write ``vectorstore`` to ``path`` by delegating to its ``save_local``.

    Returns nothing; whatever ``save_local`` returns is discarded.
    """
    persist = vectorstore.save_local
    persist(path)
|
| 151 |
|
|
|
|
| 161 |
with st.spinner("Processing URL..."):
|
| 162 |
try:
|
| 163 |
st.text("Data Loading...Started...β
β
β
")
|
|
|
|
| 164 |
parse_only = SoupStrainer(['title', 'p', 'h1', 'h2', 'h3'])
|
| 165 |
loader = WebBaseLoader(
|
| 166 |
web_paths=[url.strip()],
|
|
|
|
| 173 |
st.error("No content loaded from URL. Try a different URL (e.g., https://www.bbc.com/news/science-environment-67299122).")
|
| 174 |
st.stop()
|
| 175 |
|
| 176 |
+
# Store content for summarization
|
| 177 |
+
st.session_state.url_content = "\n".join([doc.page_content for doc in data])
|
| 178 |
+
|
| 179 |
st.text("Text Splitter...Started...β
β
β
")
|
| 180 |
text_splitter = RecursiveCharacterTextSplitter(
|
| 181 |
separators=['\n\n', '\n', '.', ','],
|
|
|
|
| 200 |
except Exception as e:
|
| 201 |
st.error(f"Error processing URL: {str(e)}")
|
| 202 |
|
| 203 |
+
# Display summary if content is available
if st.session_state.url_content:
    with main_container:
        st.header("Summary of the URL Content")
        page_text = st.session_state.url_content

        # Streamlit re-executes this script on every widget interaction, so the
        # summary is cached in session state together with the text it was
        # generated from. The LLM is only called again when that text changes.
        if st.session_state.get("summary_source") != page_text:
            try:
                with st.spinner("Generating summary..."):
                    st.session_state["summary_text"] = summarize_content(page_text, llm)
                st.session_state["summary_source"] = page_text
            except Exception as e:
                # Report the failure in the page instead of aborting the run;
                # the cache is left untouched so the next rerun retries.
                st.error(f"Error generating summary: {str(e)}")

        # Only show a summary that belongs to the currently loaded content.
        if st.session_state.get("summary_source") == page_text:
            st.write(st.session_state["summary_text"])
|
| 210 |
+
|
| 211 |
# Query input with Ask button
|
| 212 |
with main_container:
|
| 213 |
st.header("Ask a Question")
|
|
|
|
| 223 |
try:
|
| 224 |
embeddings = HuggingFaceEmbeddings(model_name="sentence-transformers/all-MiniLM-L6-v2")
|
| 225 |
vectorstore = load_faiss_index("faiss_index", embeddings)
|
| 226 |
+
chain = RetrievalQAWithSourcesChain.from_llm(
|
| 227 |
+
llm=llm,
|
| 228 |
+
retriever=vectorstore.as_retriever(),
|
| 229 |
+
question_prompt=qa_prompt
|
| 230 |
+
)
|
| 231 |
result = chain({"question": query}, return_only_outputs=True)
|
| 232 |
|
| 233 |
if not result.get("answer"):
|
|
|
|
| 246 |
else:
|
| 247 |
st.write("No sources found.")
|
| 248 |
except Exception as e:
|
| 249 |
+
st.error(f"Error answering query: {str(e)}")
|
| 250 |
+
|
| 251 |
+
# Footer with tiny logo and text
# The copyright sign is written as the HTML entity &copy; so the markup stays
# pure ASCII and cannot be mis-decoded; it is rendered as HTML because
# unsafe_allow_html is enabled. The image carries alt text for screen readers.
st.markdown(
    """
<div class="footer">
<img src="https://i.postimg.cc/2j0QWF3Z/Removal-575.png" width="20" alt="WebChatter logo">
WebChatter &copy; 2025 | Developed by Mahatir Ahmed Tusher
</div>
""",
    unsafe_allow_html=True,
)
|