Spaces:

MahatirTusher
/

WebChatter

Sleeping

App Files Files Community

MahatirTusher committed on Apr 23, 2025

Commit

c2d337a

verified ·

1 Parent(s): eca7766

Update app.py

Browse files

Files changed (1) hide show

app.py +57 -16

app.py CHANGED Viewed

@@ -12,11 +12,14 @@ from langchain.prompts import PromptTemplate
 from bs4 import SoupStrainer
 import PyPDF2
-# Load environment variables (optional)
 load_dotenv()
-# Hardcoded Groq API key
-GROQ_API_KEY = "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"
 # Custom CSS
 st.markdown("""
@@ -98,12 +101,9 @@ st.markdown("""
     </style>
 """, unsafe_allow_html=True)
-# Display large logo at the top of the main page
 st.image("https://i.postimg.cc/2j0QWF3Z/Removal-575.png", width=390)
-# Set Streamlit app title
-st.title("WebChatter 💬")
 # Initialize session state
 if "url_content" not in st.session_state:
     st.session_state.url_content = None
@@ -115,6 +115,8 @@ if "index_created" not in st.session_state:
     st.session_state.index_created = False
 if "content_type" not in st.session_state:
     st.session_state.content_type = None
 # Initialize LLM once at the start
 if "llm" not in st.session_state:
@@ -131,7 +133,7 @@ with st.sidebar:
     process_url_clicked = st.button("Process URL")
     st.header("Upload PDF File")
-    pdf_file = st.file_uploader("Upload a PDF", type=["pdf"])
     process_pdf_clicked = st.button("Process PDF")
 # Main content container
@@ -148,6 +150,13 @@ Question: {question}
 Answer with sources: """
 )
 # Function to summarize content
 def summarize_content(content, llm):
     # Shorter summary for web URLs and PDFs (5-10 sentences)
@@ -168,6 +177,9 @@ def extract_text_from_pdf(pdf_file):
             page_text = page.extract_text()
             if page_text:
                 text += page_text + "\n"
         return text
     except Exception as e:
         st.error(f"Error extracting text from PDF: {str(e)}")
@@ -204,6 +216,17 @@ def create_qa_chain(vectorstore, llm):
     )
     return qa_chain
 # Process Web URL
 if process_url_clicked:
     with main_container:
@@ -212,6 +235,8 @@ if process_url_clicked:
         else:
             with st.spinner("Processing URL..."):
                 try:
                     st.text("Data Loading...Started...✅✅✅")
                     parse_only = SoupStrainer(['title', 'p', 'h1', 'h2', 'h3'])
                     loader = WebBaseLoader(
@@ -233,7 +258,8 @@ if process_url_clicked:
                     st.session_state.vectorstore = process_content(st.session_state.url_content, embeddings, source=url.strip())
                     st.session_state.index_created = True
                     st.session_state.content_type = "web"
-                    st.session_state.summary = None
                     st.text("Content processed successfully! ✅✅✅")
                 except Exception as e:
                     st.error(f"Error processing URL: {str(e)}")
@@ -247,11 +273,12 @@ if process_pdf_clicked:
         else:
             with st.spinner("Processing PDF..."):
                 try:
                     st.text("Extracting Text from PDF...Started...✅✅✅")
                     pdf_text = extract_text_from_pdf(pdf_file)
                     if not pdf_text:
-                        st.error("No text could be extracted from the PDF. Try a different file.")
                         st.stop()
                     # Initialize embeddings only when needed
@@ -263,17 +290,31 @@ if process_pdf_clicked:
                     st.session_state.vectorstore = process_content(st.session_state.url_content, embeddings, source=pdf_file.name)
                     st.session_state.index_created = True
                     st.session_state.content_type = "pdf"
-                    st.session_state.summary = None
                     st.text("PDF processed successfully! ✅✅✅")
                 except Exception as e:
                     st.error(f"Error processing PDF: {str(e)}")
                     st.stop()
-# Summary button
 with main_container:
-    if st.session_state.url_content and st.button("Generate Summary"):
-        with st.spinner("Generating summary..."):
-            st.session_state.summary = summarize_content(st.session_state.url_content, st.session_state.llm)
 # Display summary if generated
 if st.session_state.summary:
@@ -322,7 +363,7 @@ if st.session_state.url_content and st.session_state.index_created:
 st.markdown(
     """
     <div class="footer">
-        <img src="https://i.postimg.cc/2j0QWF3Z/Removal-575.png" width="80">
         WebChatter © 2025 | Developed by Mahatir Ahmed Tusher
     </div>
     """,

 from bs4 import SoupStrainer
 import PyPDF2
+# Load environment variables
 load_dotenv()
+# Read the Groq API key from the environment only; a key committed to source is public and must be treated as compromised
+GROQ_API_KEY = os.getenv("GROQ_API_KEY")
+if not GROQ_API_KEY:
+    st.error("GROQ_API_KEY not found in environment variables. Add it as a Space secret or to a local .env file, then restart the app.")
+    st.stop()
 # Custom CSS
 st.markdown("""
     </style>
 """, unsafe_allow_html=True)
+# Display logo as the title
 st.image("https://i.postimg.cc/2j0QWF3Z/Removal-575.png", width=390)
 # Initialize session state
 if "url_content" not in st.session_state:
     st.session_state.url_content = None
     st.session_state.index_created = False
 if "content_type" not in st.session_state:
     st.session_state.content_type = None
+if "token_count" not in st.session_state:
+    st.session_state.token_count = 0
 # Initialize LLM once at the start
 if "llm" not in st.session_state:
     process_url_clicked = st.button("Process URL")
     st.header("Upload PDF File")
+    pdf_file = st.file_uploader("Upload a PDF", type=["pdf"], help="Upload a text-based PDF for best results.")
     process_pdf_clicked = st.button("Process PDF")
 # Main content container
 Answer with sources: """
 )
+# Rough token estimate for a piece of text (heuristic: 1 token ≈ 4 characters for English text)
+def estimate_token_count(text):
+    """Return an approximate token count for *text*.
+
+    Falsy input (``None`` or an empty string) yields 0. Otherwise the result
+    is the character count floor-divided by 4; spaces and punctuation are
+    counted as characters.
+    """
+    return len(text) // 4 if text else 0
 # Function to summarize content
 def summarize_content(content, llm):
     # Shorter summary for web URLs and PDFs (5-10 sentences)
             page_text = page.extract_text()
             if page_text:
                 text += page_text + "\n"
+        if not text.strip():
+            st.error("No text could be extracted from the PDF. This may be a scanned or image-based PDF. Please upload a text-based PDF.")
+            return None
         return text
     except Exception as e:
         st.error(f"Error extracting text from PDF: {str(e)}")
     )
     return qa_chain
+# Clear everything derived from previously loaded content before new content is processed
+def reset_session_state():
+    """Reset the content-related entries of ``st.session_state`` to their empty values.
+
+    Sets ``url_content``, ``summary``, ``vectorstore`` and ``content_type`` to
+    ``None``, ``index_created`` to ``False`` and ``token_count`` to 0. An
+    existing ``qa_chain`` entry is set to ``None``; it is not created when absent.
+    """
+    session = st.session_state
+    for cleared_key in ("url_content", "summary", "vectorstore", "content_type"):
+        setattr(session, cleared_key, None)
+    session.index_created = False
+    session.token_count = 0
+    # Only touch qa_chain when it already exists, so no new key is introduced here
+    if "qa_chain" in session:
+        session.qa_chain = None
 # Process Web URL
 if process_url_clicked:
     with main_container:
         else:
             with st.spinner("Processing URL..."):
                 try:
+                    # Reset session state to avoid stale data
+                    reset_session_state()
                     st.text("Data Loading...Started...✅✅✅")
                     parse_only = SoupStrainer(['title', 'p', 'h1', 'h2', 'h3'])
                     loader = WebBaseLoader(
                     st.session_state.vectorstore = process_content(st.session_state.url_content, embeddings, source=url.strip())
                     st.session_state.index_created = True
                     st.session_state.content_type = "web"
+                    st.session_state.token_count = estimate_token_count(st.session_state.url_content)
+                    st.text(f"Estimated token count: {st.session_state.token_count}")
                     st.text("Content processed successfully! ✅✅✅")
                 except Exception as e:
                     st.error(f"Error processing URL: {str(e)}")
         else:
             with st.spinner("Processing PDF..."):
                 try:
+                    # Reset session state to avoid stale data
+                    reset_session_state()
                     st.text("Extracting Text from PDF...Started...✅✅✅")
                     pdf_text = extract_text_from_pdf(pdf_file)
                     if not pdf_text:
                         st.stop()
                     # Initialize embeddings only when needed
                     st.session_state.vectorstore = process_content(st.session_state.url_content, embeddings, source=pdf_file.name)
                     st.session_state.index_created = True
                     st.session_state.content_type = "pdf"
+                    st.session_state.token_count = estimate_token_count(st.session_state.url_content)
+                    st.text(f"Estimated token count: {st.session_state.token_count}")
                     st.text("PDF processed successfully! ✅✅✅")
                 except Exception as e:
                     st.error(f"Error processing PDF: {str(e)}")
                     st.stop()
+# Offer the summary button only when the loaded content is small enough to summarize
 with main_container:
+    if st.session_state.url_content:
+        session = st.session_state
+        # Cutoff of 5,000 estimated tokens, chosen to stay under the 6,000 TPM limit
+        too_large = session.token_count > 5000
+        if too_large and session.content_type == "pdf":
+            st.warning("If the PDF is large, users are requested not to summarize it, rather they can keep asking questions.")
+        elif too_large and session.content_type == "web":
+            st.warning("The web content is too large to summarize (estimated tokens: " + str(session.token_count) + "). Please ask questions instead.")
+        elif st.button("Generate Summary"):
+            with st.spinner("Generating summary..."):
+                try:
+                    session.summary = summarize_content(session.url_content, session.llm)
+                except Exception as e:
+                    failure = str(e)
+                    st.error(f"Error generating summary: {failure}")
+                    # Rate-limit failures get an extra hint pointing the user to Q&A instead
+                    if "rate_limit_exceeded" in failure:
+                        st.warning("The content is too large for summarization due to API rate limits. Please ask questions instead or try a smaller document.")
+                    st.stop()
 # Display summary if generated
 if st.session_state.summary:
 st.markdown(
     """
     <div class="footer">
+        <img src="https://i.postimg.cc/2j0QWF3Z/Removal-575.png" width="120">
         WebChatter © 2025 | Developed by Mahatir Ahmed Tusher
     </div>
     """,