Spaces:
Sleeping
Sleeping
Update app.py
Browse files
app.py
CHANGED
|
@@ -12,11 +12,14 @@ from langchain.prompts import PromptTemplate
|
|
| 12 |
from bs4 import SoupStrainer
|
| 13 |
import PyPDF2
|
| 14 |
|
| 15 |
-
# Load environment variables
|
| 16 |
load_dotenv()
|
| 17 |
|
| 18 |
-
#
|
| 19 |
-
GROQ_API_KEY = "
|
|
|
|
|
|
|
|
|
|
| 20 |
|
| 21 |
# Custom CSS
|
| 22 |
st.markdown("""
|
|
@@ -98,12 +101,9 @@ st.markdown("""
|
|
| 98 |
</style>
|
| 99 |
""", unsafe_allow_html=True)
|
| 100 |
|
| 101 |
-
# Display
|
| 102 |
st.image("https://i.postimg.cc/2j0QWF3Z/Removal-575.png", width=390)
|
| 103 |
|
| 104 |
-
# Set Streamlit app title
|
| 105 |
-
st.title("WebChatter 💬")
|
| 106 |
-
|
| 107 |
# Initialize session state
|
| 108 |
if "url_content" not in st.session_state:
|
| 109 |
st.session_state.url_content = None
|
|
@@ -115,6 +115,8 @@ if "index_created" not in st.session_state:
|
|
| 115 |
st.session_state.index_created = False
|
| 116 |
if "content_type" not in st.session_state:
|
| 117 |
st.session_state.content_type = None
|
|
|
|
|
|
|
| 118 |
|
| 119 |
# Initialize LLM once at the start
|
| 120 |
if "llm" not in st.session_state:
|
|
@@ -131,7 +133,7 @@ with st.sidebar:
|
|
| 131 |
process_url_clicked = st.button("Process URL")
|
| 132 |
|
| 133 |
st.header("Upload PDF File")
|
| 134 |
-
pdf_file = st.file_uploader("Upload a PDF", type=["pdf"])
|
| 135 |
process_pdf_clicked = st.button("Process PDF")
|
| 136 |
|
| 137 |
# Main content container
|
|
@@ -148,6 +150,13 @@ Question: {question}
|
|
| 148 |
Answer with sources: """
|
| 149 |
)
|
| 150 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 151 |
# Function to summarize content
|
| 152 |
def summarize_content(content, llm):
|
| 153 |
# Shorter summary for web URLs and PDFs (5-10 sentences)
|
|
@@ -168,6 +177,9 @@ def extract_text_from_pdf(pdf_file):
|
|
| 168 |
page_text = page.extract_text()
|
| 169 |
if page_text:
|
| 170 |
text += page_text + "\n"
|
|
|
|
|
|
|
|
|
|
| 171 |
return text
|
| 172 |
except Exception as e:
|
| 173 |
st.error(f"Error extracting text from PDF: {str(e)}")
|
|
@@ -204,6 +216,17 @@ def create_qa_chain(vectorstore, llm):
|
|
| 204 |
)
|
| 205 |
return qa_chain
|
| 206 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 207 |
# Process Web URL
|
| 208 |
if process_url_clicked:
|
| 209 |
with main_container:
|
|
@@ -212,6 +235,8 @@ if process_url_clicked:
|
|
| 212 |
else:
|
| 213 |
with st.spinner("Processing URL..."):
|
| 214 |
try:
|
|
|
|
|
|
|
| 215 |
st.text("Data Loading...Started...✅✅✅")
|
| 216 |
parse_only = SoupStrainer(['title', 'p', 'h1', 'h2', 'h3'])
|
| 217 |
loader = WebBaseLoader(
|
|
@@ -233,7 +258,8 @@ if process_url_clicked:
|
|
| 233 |
st.session_state.vectorstore = process_content(st.session_state.url_content, embeddings, source=url.strip())
|
| 234 |
st.session_state.index_created = True
|
| 235 |
st.session_state.content_type = "web"
|
| 236 |
-
st.session_state.
|
|
|
|
| 237 |
st.text("Content processed successfully! ✅✅✅")
|
| 238 |
except Exception as e:
|
| 239 |
st.error(f"Error processing URL: {str(e)}")
|
|
@@ -247,11 +273,12 @@ if process_pdf_clicked:
|
|
| 247 |
else:
|
| 248 |
with st.spinner("Processing PDF..."):
|
| 249 |
try:
|
|
|
|
|
|
|
| 250 |
st.text("Extracting Text from PDF...Started...✅✅✅")
|
| 251 |
pdf_text = extract_text_from_pdf(pdf_file)
|
| 252 |
|
| 253 |
if not pdf_text:
|
| 254 |
-
st.error("No text could be extracted from the PDF. Try a different file.")
|
| 255 |
st.stop()
|
| 256 |
|
| 257 |
# Initialize embeddings only when needed
|
|
@@ -263,17 +290,31 @@ if process_pdf_clicked:
|
|
| 263 |
st.session_state.vectorstore = process_content(st.session_state.url_content, embeddings, source=pdf_file.name)
|
| 264 |
st.session_state.index_created = True
|
| 265 |
st.session_state.content_type = "pdf"
|
| 266 |
-
st.session_state.
|
|
|
|
| 267 |
st.text("PDF processed successfully! ✅✅✅")
|
| 268 |
except Exception as e:
|
| 269 |
st.error(f"Error processing PDF: {str(e)}")
|
| 270 |
st.stop()
|
| 271 |
|
| 272 |
-
# Summary button
|
| 273 |
with main_container:
|
| 274 |
-
if st.session_state.url_content
|
| 275 |
-
|
| 276 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 277 |
|
| 278 |
# Display summary if generated
|
| 279 |
if st.session_state.summary:
|
|
@@ -322,7 +363,7 @@ if st.session_state.url_content and st.session_state.index_created:
|
|
| 322 |
st.markdown(
|
| 323 |
"""
|
| 324 |
<div class="footer">
|
| 325 |
-
<img src="https://i.postimg.cc/2j0QWF3Z/Removal-575.png" width="
|
| 326 |
WebChatter © 2025 | Developed by Mahatir Ahmed Tusher
|
| 327 |
</div>
|
| 328 |
""",
|
|
|
|
| 12 |
from bs4 import SoupStrainer
|
| 13 |
import PyPDF2
|
| 14 |
|
| 15 |
+
# Load environment variables
|
| 16 |
load_dotenv()
|
| 17 |
|
| 18 |
+
# Read the Groq API key from the environment only (e.g. a .env file picked up
# by load_dotenv() or a deployment secret). A literal key must never be
# committed: anything published in a repository has to be treated as
# compromised, and the key that used to be hard-coded here should be revoked.
GROQ_API_KEY = os.getenv("GROQ_API_KEY")
if not GROQ_API_KEY:
    # Without a key no LLM call can succeed, so fail fast with a clear message
    # instead of silently falling back to a shared credential.
    st.error("GROQ_API_KEY not found in environment variables. Set it in your environment or deployment secrets and restart the app.")
    st.stop()
|
| 23 |
|
| 24 |
# Custom CSS
|
| 25 |
st.markdown("""
|
|
|
|
| 101 |
</style>
|
| 102 |
""", unsafe_allow_html=True)
|
| 103 |
|
| 104 |
+
# Display logo as the title
|
| 105 |
st.image("https://i.postimg.cc/2j0QWF3Z/Removal-575.png", width=390)
|
| 106 |
|
|
|
|
|
|
|
|
|
|
| 107 |
# Initialize session state
|
| 108 |
if "url_content" not in st.session_state:
|
| 109 |
st.session_state.url_content = None
|
|
|
|
| 115 |
st.session_state.index_created = False
|
| 116 |
if "content_type" not in st.session_state:
|
| 117 |
st.session_state.content_type = None
|
| 118 |
+
if "token_count" not in st.session_state:
|
| 119 |
+
st.session_state.token_count = 0
|
| 120 |
|
| 121 |
# Initialize LLM once at the start
|
| 122 |
if "llm" not in st.session_state:
|
|
|
|
| 133 |
process_url_clicked = st.button("Process URL")
|
| 134 |
|
| 135 |
st.header("Upload PDF File")
|
| 136 |
+
pdf_file = st.file_uploader("Upload a PDF", type=["pdf"], help="Upload a text-based PDF for best results.")
|
| 137 |
process_pdf_clicked = st.button("Process PDF")
|
| 138 |
|
| 139 |
# Main content container
|
|
|
|
| 150 |
Answer with sources: """
|
| 151 |
)
|
| 152 |
|
| 153 |
+
# Function to estimate token count (approximation: 1 token is roughly 4 characters of English text)
def estimate_token_count(text, chars_per_token=4):
    """Return a rough estimate of how many LLM tokens *text* occupies.

    The estimate is a character-count heuristic, not a real tokenizer: the
    length of the text (spaces and punctuation included) is divided by
    ``chars_per_token`` and rounded up.

    Args:
        text: The content to measure. ``None`` or an empty value counts as 0.
        chars_per_token: Average number of characters per token. Defaults to
            4, the approximation this file uses for English text.

    Returns:
        The estimated token count as a non-negative integer.

    Raises:
        ValueError: If ``chars_per_token`` is not positive.
    """
    if not text:
        return 0
    if chars_per_token <= 0:
        raise ValueError("chars_per_token must be a positive integer")
    # Round up rather than down: any non-empty text costs at least one token,
    # and callers use this value as a guard against a rate limit, so a slight
    # overestimate is the safe direction.
    return -(-len(text) // chars_per_token)
|
| 159 |
+
|
| 160 |
# Function to summarize content
|
| 161 |
def summarize_content(content, llm):
|
| 162 |
# Shorter summary for web URLs and PDFs (5-10 sentences)
|
|
|
|
| 177 |
page_text = page.extract_text()
|
| 178 |
if page_text:
|
| 179 |
text += page_text + "\n"
|
| 180 |
+
if not text.strip():
|
| 181 |
+
st.error("No text could be extracted from the PDF. This may be a scanned or image-based PDF. Please upload a text-based PDF.")
|
| 182 |
+
return None
|
| 183 |
return text
|
| 184 |
except Exception as e:
|
| 185 |
st.error(f"Error extracting text from PDF: {str(e)}")
|
|
|
|
| 216 |
)
|
| 217 |
return qa_chain
|
| 218 |
|
| 219 |
+
# Reset session state when switching content types
def reset_session_state():
    """Clear every session value derived from previously processed content.

    Invoked at the start of URL and PDF processing so that data from an
    earlier source is not mixed with the new one. The cached LLM
    (``st.session_state.llm``) is deliberately left untouched.
    """
    state = st.session_state
    cleared_values = {
        "url_content": None,
        "summary": None,
        "vectorstore": None,
        "index_created": False,
        "content_type": None,
        "token_count": 0,
    }
    for key, blank in cleared_values.items():
        state[key] = blank
    # The QA chain is only cleared if one already exists; the key is never
    # introduced here.
    if "qa_chain" in state:
        state["qa_chain"] = None
|
| 229 |
+
|
| 230 |
# Process Web URL
|
| 231 |
if process_url_clicked:
|
| 232 |
with main_container:
|
|
|
|
| 235 |
else:
|
| 236 |
with st.spinner("Processing URL..."):
|
| 237 |
try:
|
| 238 |
+
# Reset session state to avoid stale data
|
| 239 |
+
reset_session_state()
|
| 240 |
st.text("Data Loading...Started...✅✅✅")
|
| 241 |
parse_only = SoupStrainer(['title', 'p', 'h1', 'h2', 'h3'])
|
| 242 |
loader = WebBaseLoader(
|
|
|
|
| 258 |
st.session_state.vectorstore = process_content(st.session_state.url_content, embeddings, source=url.strip())
|
| 259 |
st.session_state.index_created = True
|
| 260 |
st.session_state.content_type = "web"
|
| 261 |
+
st.session_state.token_count = estimate_token_count(st.session_state.url_content)
|
| 262 |
+
st.text(f"Estimated token count: {st.session_state.token_count}")
|
| 263 |
st.text("Content processed successfully! ✅✅✅")
|
| 264 |
except Exception as e:
|
| 265 |
st.error(f"Error processing URL: {str(e)}")
|
|
|
|
| 273 |
else:
|
| 274 |
with st.spinner("Processing PDF..."):
|
| 275 |
try:
|
| 276 |
+
# Reset session state to avoid stale data
|
| 277 |
+
reset_session_state()
|
| 278 |
st.text("Extracting Text from PDF...Started...✅✅✅")
|
| 279 |
pdf_text = extract_text_from_pdf(pdf_file)
|
| 280 |
|
| 281 |
if not pdf_text:
|
|
|
|
| 282 |
st.stop()
|
| 283 |
|
| 284 |
# Initialize embeddings only when needed
|
|
|
|
| 290 |
st.session_state.vectorstore = process_content(st.session_state.url_content, embeddings, source=pdf_file.name)
|
| 291 |
st.session_state.index_created = True
|
| 292 |
st.session_state.content_type = "pdf"
|
| 293 |
+
st.session_state.token_count = estimate_token_count(st.session_state.url_content)
|
| 294 |
+
st.text(f"Estimated token count: {st.session_state.token_count}")
|
| 295 |
st.text("PDF processed successfully! ✅✅✅")
|
| 296 |
except Exception as e:
|
| 297 |
st.error(f"Error processing PDF: {str(e)}")
|
| 298 |
st.stop()
|
| 299 |
|
| 300 |
+
# Summary button with token limit check
|
| 301 |
with main_container:
|
| 302 |
+
if st.session_state.url_content:
|
| 303 |
+
# Check if content is too large for summarization (threshold: 5,000 tokens to stay under 6,000 TPM limit)
|
| 304 |
+
if st.session_state.token_count > 5000 and st.session_state.content_type == "pdf":
|
| 305 |
+
st.warning("If the PDF is large, users are requested not to summarize it, rather they can keep asking questions.")
|
| 306 |
+
elif st.session_state.token_count > 5000 and st.session_state.content_type == "web":
|
| 307 |
+
st.warning("The web content is too large to summarize (estimated tokens: " + str(st.session_state.token_count) + "). Please ask questions instead.")
|
| 308 |
+
else:
|
| 309 |
+
if st.button("Generate Summary"):
|
| 310 |
+
with st.spinner("Generating summary..."):
|
| 311 |
+
try:
|
| 312 |
+
st.session_state.summary = summarize_content(st.session_state.url_content, st.session_state.llm)
|
| 313 |
+
except Exception as e:
|
| 314 |
+
st.error(f"Error generating summary: {str(e)}")
|
| 315 |
+
if "rate_limit_exceeded" in str(e):
|
| 316 |
+
st.warning("The content is too large for summarization due to API rate limits. Please ask questions instead or try a smaller document.")
|
| 317 |
+
st.stop()
|
| 318 |
|
| 319 |
# Display summary if generated
|
| 320 |
if st.session_state.summary:
|
|
|
|
| 363 |
st.markdown(
|
| 364 |
"""
|
| 365 |
<div class="footer">
|
| 366 |
+
<img src="https://i.postimg.cc/2j0QWF3Z/Removal-575.png" width="120">
|
| 367 |
WebChatter © 2025 | Developed by Mahatir Ahmed Tusher
|
| 368 |
</div>
|
| 369 |
""",
|