MahatirTusher commited on
Commit
c2d337a
·
verified Β·
1 Parent(s): eca7766

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +57 -16
app.py CHANGED
@@ -12,11 +12,14 @@ from langchain.prompts import PromptTemplate
12
  from bs4 import SoupStrainer
13
  import PyPDF2
14
 
15
- # Load environment variables (optional)
16
  load_dotenv()
17
 
18
- # Hardcoded Groq API key
19
- GROQ_API_KEY = "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"
 
 
 
20
 
21
  # Custom CSS
22
  st.markdown("""
@@ -98,12 +101,9 @@ st.markdown("""
98
  </style>
99
  """, unsafe_allow_html=True)
100
 
101
- # Display large logo at the top of the main page
102
  st.image("https://i.postimg.cc/2j0QWF3Z/Removal-575.png", width=390)
103
 
104
- # Set Streamlit app title
105
- st.title("WebChatter 💬")
106
-
107
  # Initialize session state
108
  if "url_content" not in st.session_state:
109
  st.session_state.url_content = None
@@ -115,6 +115,8 @@ if "index_created" not in st.session_state:
115
  st.session_state.index_created = False
116
  if "content_type" not in st.session_state:
117
  st.session_state.content_type = None
 
 
118
 
119
  # Initialize LLM once at the start
120
  if "llm" not in st.session_state:
@@ -131,7 +133,7 @@ with st.sidebar:
131
  process_url_clicked = st.button("Process URL")
132
 
133
  st.header("Upload PDF File")
134
- pdf_file = st.file_uploader("Upload a PDF", type=["pdf"])
135
  process_pdf_clicked = st.button("Process PDF")
136
 
137
  # Main content container
@@ -148,6 +150,13 @@ Question: {question}
148
  Answer with sources: """
149
  )
150
 
 
 
 
 
 
 
 
151
  # Function to summarize content
152
  def summarize_content(content, llm):
153
  # Shorter summary for web URLs and PDFs (5-10 sentences)
@@ -168,6 +177,9 @@ def extract_text_from_pdf(pdf_file):
168
  page_text = page.extract_text()
169
  if page_text:
170
  text += page_text + "\n"
 
 
 
171
  return text
172
  except Exception as e:
173
  st.error(f"Error extracting text from PDF: {str(e)}")
@@ -204,6 +216,17 @@ def create_qa_chain(vectorstore, llm):
204
  )
205
  return qa_chain
206
 
 
 
 
 
 
 
 
 
 
 
 
207
  # Process Web URL
208
  if process_url_clicked:
209
  with main_container:
@@ -212,6 +235,8 @@ if process_url_clicked:
212
  else:
213
  with st.spinner("Processing URL..."):
214
  try:
 
 
215
  st.text("Data Loading...Started...✅✅✅")
216
  parse_only = SoupStrainer(['title', 'p', 'h1', 'h2', 'h3'])
217
  loader = WebBaseLoader(
@@ -233,7 +258,8 @@ if process_url_clicked:
233
  st.session_state.vectorstore = process_content(st.session_state.url_content, embeddings, source=url.strip())
234
  st.session_state.index_created = True
235
  st.session_state.content_type = "web"
236
- st.session_state.summary = None
 
237
  st.text("Content processed successfully! ✅✅✅")
238
  except Exception as e:
239
  st.error(f"Error processing URL: {str(e)}")
@@ -247,11 +273,12 @@ if process_pdf_clicked:
247
  else:
248
  with st.spinner("Processing PDF..."):
249
  try:
 
 
250
  st.text("Extracting Text from PDF...Started...✅✅✅")
251
  pdf_text = extract_text_from_pdf(pdf_file)
252
 
253
  if not pdf_text:
254
- st.error("No text could be extracted from the PDF. Try a different file.")
255
  st.stop()
256
 
257
  # Initialize embeddings only when needed
@@ -263,17 +290,31 @@ if process_pdf_clicked:
263
  st.session_state.vectorstore = process_content(st.session_state.url_content, embeddings, source=pdf_file.name)
264
  st.session_state.index_created = True
265
  st.session_state.content_type = "pdf"
266
- st.session_state.summary = None
 
267
  st.text("PDF processed successfully! ✅✅✅")
268
  except Exception as e:
269
  st.error(f"Error processing PDF: {str(e)}")
270
  st.stop()
271
 
272
- # Summary button
273
  with main_container:
274
- if st.session_state.url_content and st.button("Generate Summary"):
275
- with st.spinner("Generating summary..."):
276
- st.session_state.summary = summarize_content(st.session_state.url_content, st.session_state.llm)
 
 
 
 
 
 
 
 
 
 
 
 
 
277
 
278
  # Display summary if generated
279
  if st.session_state.summary:
@@ -322,7 +363,7 @@ if st.session_state.url_content and st.session_state.index_created:
322
  st.markdown(
323
  """
324
  <div class="footer">
325
- <img src="https://i.postimg.cc/2j0QWF3Z/Removal-575.png" width="80">
326
  WebChatter © 2025 | Developed by Mahatir Ahmed Tusher
327
  </div>
328
  """,
 
12
  from bs4 import SoupStrainer
13
  import PyPDF2
14
 
15
+ # Load environment variables
16
  load_dotenv()
17
 
18
+ # Get Groq API key from environment variable (recommended) or use hardcoded fallback
19
+ GROQ_API_KEY = os.getenv("GROQ_API_KEY")
20
+ if not GROQ_API_KEY:
21
+ st.warning("GROQ_API_KEY not found in environment variables. Using hardcoded key (not recommended for production).")
22
+ GROQ_API_KEY = "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"
23
 
24
  # Custom CSS
25
  st.markdown("""
 
101
  </style>
102
  """, unsafe_allow_html=True)
103
 
104
+ # Display logo as the title
105
  st.image("https://i.postimg.cc/2j0QWF3Z/Removal-575.png", width=390)
106
 
 
 
 
107
  # Initialize session state
108
  if "url_content" not in st.session_state:
109
  st.session_state.url_content = None
 
115
  st.session_state.index_created = False
116
  if "content_type" not in st.session_state:
117
  st.session_state.content_type = None
118
+ if "token_count" not in st.session_state:
119
+ st.session_state.token_count = 0
120
 
121
  # Initialize LLM once at the start
122
  if "llm" not in st.session_state:
 
133
  process_url_clicked = st.button("Process URL")
134
 
135
  st.header("Upload PDF File")
136
+ pdf_file = st.file_uploader("Upload a PDF", type=["pdf"], help="Upload a text-based PDF for best results.")
137
  process_pdf_clicked = st.button("Process PDF")
138
 
139
  # Main content container
 
150
  Answer with sources: """
151
  )
152
 
153
+ # Function to estimate token count (approximation: 1 token ≈ 4 characters for English text)
154
+ def estimate_token_count(text):
155
+ if not text:
156
+ return 0
157
+ # Approximate token count: 1 token ≈ 4 characters (including spaces and punctuation)
158
+ return len(text) // 4
159
+
160
  # Function to summarize content
161
  def summarize_content(content, llm):
162
  # Shorter summary for web URLs and PDFs (5-10 sentences)
 
177
  page_text = page.extract_text()
178
  if page_text:
179
  text += page_text + "\n"
180
+ if not text.strip():
181
+ st.error("No text could be extracted from the PDF. This may be a scanned or image-based PDF. Please upload a text-based PDF.")
182
+ return None
183
  return text
184
  except Exception as e:
185
  st.error(f"Error extracting text from PDF: {str(e)}")
 
216
  )
217
  return qa_chain
218
 
219
+ # Reset session state when switching content types
220
+ def reset_session_state():
221
+ st.session_state.url_content = None
222
+ st.session_state.summary = None
223
+ st.session_state.vectorstore = None
224
+ st.session_state.index_created = False
225
+ st.session_state.content_type = None
226
+ st.session_state.token_count = 0
227
+ if "qa_chain" in st.session_state:
228
+ st.session_state.qa_chain = None
229
+
230
  # Process Web URL
231
  if process_url_clicked:
232
  with main_container:
 
235
  else:
236
  with st.spinner("Processing URL..."):
237
  try:
238
+ # Reset session state to avoid stale data
239
+ reset_session_state()
240
  st.text("Data Loading...Started...✅✅✅")
241
  parse_only = SoupStrainer(['title', 'p', 'h1', 'h2', 'h3'])
242
  loader = WebBaseLoader(
 
258
  st.session_state.vectorstore = process_content(st.session_state.url_content, embeddings, source=url.strip())
259
  st.session_state.index_created = True
260
  st.session_state.content_type = "web"
261
+ st.session_state.token_count = estimate_token_count(st.session_state.url_content)
262
+ st.text(f"Estimated token count: {st.session_state.token_count}")
263
  st.text("Content processed successfully! ✅✅✅")
264
  except Exception as e:
265
  st.error(f"Error processing URL: {str(e)}")
 
273
  else:
274
  with st.spinner("Processing PDF..."):
275
  try:
276
+ # Reset session state to avoid stale data
277
+ reset_session_state()
278
  st.text("Extracting Text from PDF...Started...✅✅✅")
279
  pdf_text = extract_text_from_pdf(pdf_file)
280
 
281
  if not pdf_text:
 
282
  st.stop()
283
 
284
  # Initialize embeddings only when needed
 
290
  st.session_state.vectorstore = process_content(st.session_state.url_content, embeddings, source=pdf_file.name)
291
  st.session_state.index_created = True
292
  st.session_state.content_type = "pdf"
293
+ st.session_state.token_count = estimate_token_count(st.session_state.url_content)
294
+ st.text(f"Estimated token count: {st.session_state.token_count}")
295
  st.text("PDF processed successfully! ✅✅✅")
296
  except Exception as e:
297
  st.error(f"Error processing PDF: {str(e)}")
298
  st.stop()
299
 
300
+ # Summary button with token limit check
301
  with main_container:
302
+ if st.session_state.url_content:
303
+ # Check if content is too large for summarization (threshold: 5,000 tokens to stay under 6,000 TPM limit)
304
+ if st.session_state.token_count > 5000 and st.session_state.content_type == "pdf":
305
+ st.warning("If the PDF is large, users are requested not to summarize it, rather they can keep asking questions.")
306
+ elif st.session_state.token_count > 5000 and st.session_state.content_type == "web":
307
+ st.warning("The web content is too large to summarize (estimated tokens: " + str(st.session_state.token_count) + "). Please ask questions instead.")
308
+ else:
309
+ if st.button("Generate Summary"):
310
+ with st.spinner("Generating summary..."):
311
+ try:
312
+ st.session_state.summary = summarize_content(st.session_state.url_content, st.session_state.llm)
313
+ except Exception as e:
314
+ st.error(f"Error generating summary: {str(e)}")
315
+ if "rate_limit_exceeded" in str(e):
316
+ st.warning("The content is too large for summarization due to API rate limits. Please ask questions instead or try a smaller document.")
317
+ st.stop()
318
 
319
  # Display summary if generated
320
  if st.session_state.summary:
 
363
  st.markdown(
364
  """
365
  <div class="footer">
366
+ <img src="https://i.postimg.cc/2j0QWF3Z/Removal-575.png" width="120">
367
  WebChatter © 2025 | Developed by Mahatir Ahmed Tusher
368
  </div>
369
  """,