Spaces:

aaporosh
/

SmartPDF_Q_A

Sleeping

App Files Files Community

aaporosh committed on Aug 18

Commit

c06d586

verified ·

1 Parent(s): 7678f2a

Update app.py

Browse files

Files changed (1) hide show

app.py +167 -66

app.py CHANGED Viewed

@@ -2,136 +2,237 @@ import streamlit as st
 import logging
 import os
 from io import BytesIO
-from PyPDF2 import PdfReader
-from langchain.text_splitter import CharacterTextSplitter
 from langchain_community.vectorstores import FAISS
 from sentence_transformers import SentenceTransformer
 from transformers import pipeline
 # Setup logging for Spaces
 logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')
 logger = logging.getLogger(__name__)
-# Lazy load models
-@st.cache_resource(ttl=3600)
 def load_embeddings_model():
     logger.info("Loading embeddings model")
     try:
-        return SentenceTransformer("all-MiniLM-L6-v2")
     except Exception as e:
         logger.error(f"Embeddings load error: {str(e)}")
         st.error(f"Embedding model error: {str(e)}")
         return None
-@st.cache_resource(ttl=3600)
 def load_qa_pipeline():
     logger.info("Loading QA pipeline")
     try:
-        return pipeline("text2text-generation", model="google/flan-t5-small", max_length=200)
     except Exception as e:
         logger.error(f"QA model load error: {str(e)}")
         st.error(f"QA model error: {str(e)}")
         return None
-# Process PDF
 def process_pdf(uploaded_file):
-    logger.info("Processing PDF")
     try:
-        pdf_reader = PdfReader(BytesIO(uploaded_file.getvalue()))
-        text = "".join(page.extract_text() or "" for page in pdf_reader.pages)
         if not text:
-            # Optional OCR (uncomment if needed, requires pdf2image, pytesseract)
-            # from pdf2image import convert_from_bytes
-            # import pytesseract
-            # images = convert_from_bytes(uploaded_file.getvalue())
-            # text = "".join(pytesseract.image_to_string(img) for img in images)
-            if not text:
-                raise ValueError("No text extracted from PDF")
-        text_splitter = CharacterTextSplitter(separator="\n", chunk_size=600, chunk_overlap=150)
-        chunks = text_splitter.split_text(text)
         embeddings_model = load_embeddings_model()
         if not embeddings_model:
-            return None
-        embeddings = [embeddings_model.encode(chunk) for chunk in chunks]
-        vector_store = FAISS.from_embeddings(zip(chunks, embeddings), embeddings_model.encode)
-        logger.info("PDF processed successfully")
-        return vector_store
     except Exception as e:
         logger.error(f"PDF processing error: {str(e)}")
         st.error(f"PDF error: {str(e)}")
-        return None
-# Answer question
-def answer_question(vector_store, query):
     logger.info(f"Processing query: {query}")
     try:
-        if not vector_store:
-            return "Please upload a PDF first."
         qa_pipeline = load_qa_pipeline()
         if not qa_pipeline:
-            return "QA model unavailable."
-        docs = vector_store.similarity_search(query, k=3)
         context = "\n".join(doc.page_content for doc in docs)
-        prompt = f"Context: {context}\nQuestion: {query}\nAnswer concisely:"
         response = qa_pipeline(prompt)[0]['generated_text']
         logger.info("Answer generated")
-        return response.strip()
     except Exception as e:
         logger.error(f"Query error: {str(e)}")
-        return f"Error answering: {str(e)}"
 # Streamlit UI
 try:
-    st.set_page_config(page_title="Smart PDF Q&A", page_icon="📄")
-    st.title("Smart PDF Q&A")
     st.markdown("""
-        Upload a PDF and ask questions about its content. Chat history is preserved.
         <style>
-        .stChatMessage { border-radius: 10px; padding: 10px; margin: 5px; }
-        .stChatMessage.user { background-color: #e6f3ff; }
-        .stChatMessage.assistant { background-color: #f0f0f0; }
         </style>
     """, unsafe_allow_html=True)
     # Initialize session state
     if "messages" not in st.session_state:
         st.session_state.messages = []
-    if "vector_store" not in st.session_state:
-        st.session_state.vector_store = None
-    # PDF upload
     uploaded_file = st.file_uploader("Upload a PDF", type=["pdf"])
-    if uploaded_file and st.button("Process PDF"):
-        with st.spinner("Processing PDF..."):
-            st.session_state.vector_store = process_pdf(uploaded_file)
-            if st.session_state.vector_store:
-                st.success("PDF processed! Ask questions below.")
-                st.session_state.messages = []
-            else:
-                st.error("Failed to process PDF.")
     # Chat interface
-    if st.session_state.vector_store:
-        prompt = st.chat_input("Ask a question about the PDF:")
         if prompt:
             st.session_state.messages.append({"role": "user", "content": prompt})
             with st.chat_message("user"):
-                st.markdown(prompt)
             with st.chat_message("assistant"):
-                with st.spinner("Generating answer..."):
-                    answer = answer_question(st.session_state.vector_store, prompt)
-                st.markdown(answer)
             st.session_state.messages.append({"role": "assistant", "content": answer})
     # Display chat history
     for message in st.session_state.messages:
-        with st.chat_message(message["role"]):
-            st.markdown(message["content"])
     # Download chat history
     if st.session_state.messages:
@@ -140,4 +241,4 @@ try:
 except Exception as e:
     logger.error(f"App initialization failed: {str(e)}")
-    st.error(f"App failed to start: {str(e)}. Check Spaces logs or contact support.")

 import logging
 import os
 from io import BytesIO
+import pdfplumber
+from langchain.text_splitter import RecursiveCharacterTextSplitter
 from langchain_community.vectorstores import FAISS
 from sentence_transformers import SentenceTransformer
 from transformers import pipeline
+import re
 # Setup logging for Spaces
 logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')
 logger = logging.getLogger(__name__)
+# Lazy load models with caching
+@st.cache_resource(ttl=1800)
 def load_embeddings_model():
     logger.info("Loading embeddings model")
     try:
+        return SentenceTransformer("all-MiniLM-L12-v2")
     except Exception as e:
         logger.error(f"Embeddings load error: {str(e)}")
         st.error(f"Embedding model error: {str(e)}")
         return None
+@st.cache_resource(ttl=1800)
 def load_qa_pipeline():
     logger.info("Loading QA pipeline")
     try:
+        return pipeline("text2text-generation", model="google/flan-t5-small", max_length=300)
     except Exception as e:
         logger.error(f"QA model load error: {str(e)}")
         st.error(f"QA model error: {str(e)}")
         return None
+@st.cache_resource(ttl=1800)
+def load_summary_pipeline():
+    logger.info("Loading summary pipeline")
+    try:
+        return pipeline("summarization", model="sshleifer/distilbart-cnn-6-6", max_length=150)
+    except Exception as e:
+        logger.error(f"Summary model load error: {str(e)}")
+        st.error(f"Summary model error: {str(e)}")
+        return None
+# Process PDF with improved extraction
 def process_pdf(uploaded_file):
+    logger.info("Processing PDF with enhanced extraction")
     try:
+        text = ""
+        code_blocks = []
+        with pdfplumber.open(BytesIO(uploaded_file.getvalue())) as pdf:
+            for page in pdf.pages[:20]:
+                extracted = page.extract_text(layout=False)
+                if extracted:
+                    text += extracted + "\n"
+                for char in page.chars:
+                    if 'fontname' in char and 'mono' in char['fontname'].lower():
+                        code_blocks.append(char['text'])
+                code_text_page = page.extract_text()
+                code_matches = re.finditer(r'(^\s{2,}.*?(?:\n\s{2,}.*?)*)', code_text_page or "", re.MULTILINE)
+                for match in code_matches:
+                    code_blocks.append(match.group().strip())
+                tables = page.extract_tables()
+                if tables:
+                    for table in tables:
+                        text += "\n".join([" | ".join(map(str, row)) for row in table if row]) + "\n"
+                for obj in page.extract_words():
+                    if obj.get('size', 0) > 12:
+                        text += f"\n{obj['text']}\n"
+        code_text = "\n".join(code_blocks).strip()
         if not text:
+            raise ValueError("No text extracted from PDF")
+        # Use RecursiveCharacterTextSplitter for better semantic splitting
+        text_splitter = RecursiveCharacterTextSplitter(
+            chunk_size=500, chunk_overlap=100, separators=["\n\n", "\n", ".", " "]
+        )
+        text_chunks = text_splitter.split_text(text)[:50]
+        code_chunks = text_splitter.split_text(code_text)[:25] if code_text else []
         embeddings_model = load_embeddings_model()
         if not embeddings_model:
+            return None, None, text, code_text
+        # Build FAISS vector stores efficiently
+        text_vectors = [embeddings_model.encode(chunk) for chunk in text_chunks]
+        code_vectors = [embeddings_model.encode(chunk) for chunk in code_chunks]
+        text_vector_store = FAISS.from_embeddings(zip(text_chunks, text_vectors), embeddings_model.encode) if text_chunks else None
+        code_vector_store = FAISS.from_embeddings(zip(code_chunks, code_vectors), embeddings_model.encode) if code_chunks else None
+        logger.info("PDF processed successfully with enhanced extraction")
+        return text_vector_store, code_vector_store, text, code_text
     except Exception as e:
         logger.error(f"PDF processing error: {str(e)}")
         st.error(f"PDF error: {str(e)}")
+        return None, None, "", ""
+# Summarize PDF
+def summarize_pdf(text):
+    logger.info("Generating summary")
+    try:
+        summary_pipeline = load_summary_pipeline()
+        if not summary_pipeline:
+            return "Summary model unavailable."
+        text_splitter = RecursiveCharacterTextSplitter(
+            chunk_size=500, chunk_overlap=50, separators=["\n\n", "\n", ".", " "]
+        )
+        chunks = text_splitter.split_text(text)[:2]
+        summaries = []
+        for chunk in chunks:
+            summary = summary_pipeline(chunk[:500], max_length=100, min_length=30, do_sample=False)[0]['summary_text']
+            summaries.append(summary.strip())
+        combined_summary = " ".join(summaries)
+        if len(combined_summary.split()) > 150:
+            combined_summary = " ".join(combined_summary.split()[:150])
+        logger.info("Summary generated")
+        return f"Sure, here's a concise summary of the PDF:\n{combined_summary}"
+    except Exception as e:
+        logger.error(f"Summary error: {str(e)}")
+        return f"Oops, something went wrong summarizing: {str(e)}"
+# Answer question with improved response
+def answer_question(text_vector_store, code_vector_store, query):
     logger.info(f"Processing query: {query}")
     try:
+        if not text_vector_store and not code_vector_store:
+            return "Please upload a PDF first!"
         qa_pipeline = load_qa_pipeline()
         if not qa_pipeline:
+            return "Sorry, the QA model is unavailable right now."
+        is_code_query = any(keyword in query.lower() for keyword in ["code", "script", "function", "programming", "give me code", "show code"])
+        if is_code_query and code_vector_store:
+            return f"Here's the code from the PDF:\n```python\n{st.session_state.code_text}\n```"
+        vector_store = text_vector_store
+        if not vector_store:
+            return "No relevant content found for your query."
+        docs = vector_store.similarity_search(query, k=5)
         context = "\n".join(doc.page_content for doc in docs)
+        prompt = f"Context: {context}\nQuestion: {query}\nProvide a detailed, accurate answer based on the context, prioritizing relevant information. Respond as a helpful assistant:"
         response = qa_pipeline(prompt)[0]['generated_text']
         logger.info("Answer generated")
+        return f"Got it! Here's a detailed answer:\n{response.strip()}"
     except Exception as e:
         logger.error(f"Query error: {str(e)}")
+        return f"Sorry, something went wrong: {str(e)}"
 # Streamlit UI
 try:
+    st.set_page_config(page_title="Smart PDF Q&A", page_icon="📄", layout="wide")
     st.markdown("""
         <style>
+        .main { max-width: 900px; margin: 0 auto; padding: 20px; }
+        .sidebar { background-color: #f8f9fa; padding: 10px; border-radius: 5px; }
+        .chat-container { border: 1px solid #ddd; border-radius: 10px; padding: 10px; height: 65vh; overflow-y: auto; margin-top: 20px; }
+        .user-bubble { background-color: #e6f3ff; border-radius: 15px; padding: 10px; margin: 5px; text-align: right; }
+        .assistant-bubble { background-color: #f0f0f0; border-radius: 15px; padding: 10px; margin: 5px; text-align: left; }
+        .stButton>button { background-color: #4CAF50; color: white; border: none; padding: 8px 16px; border-radius: 5px; }
+        .stButton>button:hover { background-color: #45a049; }
+        pre { background-color: #f8f8f8; padding: 10px; border-radius: 5px; overflow-x: auto; }
+        .header { background: linear-gradient(90deg, #4CAF50, #81C784); color: white; padding: 10px; border-radius: 5px; text-align: center; }
+        .stChatInput { position: fixed; bottom: 10px; width: 80%; }
         </style>
     """, unsafe_allow_html=True)
+    st.markdown('<div class="header"><h1>Smart PDF Q&A</h1></div>', unsafe_allow_html=True)
+    st.markdown("Upload a PDF to ask questions, summarize (~150 words), or extract code with 'give me code'. Fast and friendly responses!")
     # Initialize session state
     if "messages" not in st.session_state:
         st.session_state.messages = []
+    if "text_vector_store" not in st.session_state:
+        st.session_state.text_vector_store = None
+    if "code_vector_store" not in st.session_state:
+        st.session_state.code_vector_store = None
+    if "pdf_text" not in st.session_state:
+        st.session_state.pdf_text = ""
+    if "code_text" not in st.session_state:
+        st.session_state.code_text = ""
+    # Sidebar
+    with st.sidebar:
+        st.markdown('<div class="sidebar">', unsafe_allow_html=True)
+        theme = st.radio("Theme", ["Light", "Dark"], index=0)
+        st.markdown('</div>', unsafe_allow_html=True)
+    # PDF upload and processing
     uploaded_file = st.file_uploader("Upload a PDF", type=["pdf"])
+    col1, col2 = st.columns([1, 1])
+    with col1:
+        if st.button("Process PDF") and uploaded_file:
+            with st.spinner("Processing PDF..."):
+                st.session_state.text_vector_store, st.session_state.code_vector_store, st.session_state.pdf_text, st.session_state.code_text = process_pdf(uploaded_file)
+                if st.session_state.text_vector_store or st.session_state.code_vector_store:
+                    st.success("PDF processed! Ask away or summarize.")
+                    st.session_state.messages = []
+                else:
+                    st.error("Failed to process PDF.")
+    with col2:
+        if st.button("Summarize PDF") and st.session_state.pdf_text:
+            with st.spinner("Summarizing..."):
+                summary = summarize_pdf(st.session_state.pdf_text)
+                st.session_state.messages.append({"role": "assistant", "content": summary})
+                st.markdown(summary, unsafe_allow_html=True)
     # Chat interface
+    st.markdown('<div class="chat-container">', unsafe_allow_html=True)
+    if st.session_state.text_vector_store or st.session_state.code_vector_store:
+        prompt = st.chat_input("Ask a question (e.g., 'Give me code' or 'What’s the main idea?'):")
         if prompt:
             st.session_state.messages.append({"role": "user", "content": prompt})
             with st.chat_message("user"):
+                st.markdown(f"<div class='user-bubble'>{prompt}</div>", unsafe_allow_html=True)
             with st.chat_message("assistant"):
+                with st.spinner('<div class="spinner">⏳</div>'):
+                    answer = answer_question(st.session_state.text_vector_store, st.session_state.code_vector_store, prompt)
+                st.markdown(f"<div class='assistant-bubble'>{answer}</div>", unsafe_allow_html=True)
             st.session_state.messages.append({"role": "assistant", "content": answer})
     # Display chat history
     for message in st.session_state.messages:
+        css_class = "user-bubble" if message["role"] == "user" else "assistant-bubble"
+        st.markdown(f"<div class='{css_class}'>{message['content']}</div>", unsafe_allow_html=True)
+    st.markdown('</div>', unsafe_allow_html=True)
     # Download chat history
     if st.session_state.messages:
 except Exception as e:
     logger.error(f"App initialization failed: {str(e)}")
+    st.error(f"App failed to start: {str(e)}. Check Spaces logs or contact support.")