Spaces:

SimranShaikh
/

enterprise-rag-assistant

Sleeping

App Files Files Community

SimranShaikh committed on Jun 29, 2025

Commit

cadd6a8

verified ·

1 Parent(s): 48716e1

commit

Browse files

Files changed (1) hide show

src/streamlit_app.py +199 -73

src/streamlit_app.py CHANGED Viewed

@@ -1,4 +1,4 @@
-# Solution 1: Updated SimplePDFRAG with cache directory fix
 import streamlit as st
 import PyPDF2
 from sentence_transformers import SentenceTransformer
@@ -22,6 +22,7 @@ class SimplePDFRAG:
         self.embedding_model = None
         self.granite_model = None
         self.tokenizer = None
     def setup_cache_directory(self):
         """Setup a custom cache directory with proper permissions"""
@@ -77,60 +78,113 @@ class SimplePDFRAG:
             return False
     def extract_pdf_text(self, pdf_file):
-        """Extract text from PDF file"""
         try:
             pdf_reader = PyPDF2.PdfReader(pdf_file)
             text = ""
             for page_num, page in enumerate(pdf_reader.pages):
-                page_text = page.extract_text()
-                if page_text:
-                    text += page_text + "\n"
-            return text
         except Exception as e:
-            st.error(f"Error extracting PDF text: {e}")
             return None
     def chunk_text(self, text, chunk_size=500):
         """Split text into chunks"""
         words = text.split()
         chunks = []
         for i in range(0, len(words), chunk_size):
             chunk = " ".join(words[i:i + chunk_size])
-            chunks.append(chunk)
         return chunks
-    def process_pdf(self, pdf_file):
         """Process PDF and create embeddings"""
-        # Extract text
-        text = self.extract_pdf_text(pdf_file)
-        if not text:
-            return False
-        # Chunk text
-        chunks = self.chunk_text(text)
-        # Create embeddings
-        st.info(f"Creating embeddings for {len(chunks)} chunks...")
         try:
-            embeddings = self.embedding_model.encode(chunks)
-            # Store documents and embeddings
-            self.documents = chunks
-            self.embeddings = embeddings
-            st.success(f"Processed PDF: {len(chunks)} chunks created")
-            return True
         except Exception as e:
-            st.error(f"Error creating embeddings: {e}")
             return False
     def search_documents(self, query, top_k=3):
         """Search for relevant documents"""
-        if not self.documents:
             return []
         try:
@@ -151,9 +205,12 @@ class SimplePDFRAG:
                         'score': similarities[idx]
                     })
             return results
         except Exception as e:
             st.error(f"Error searching documents: {e}")
             return []
     def generate_answer(self, query, context_docs):
@@ -210,6 +267,12 @@ Answer:"""
     def answer_question(self, query):
         """Main function to answer questions"""
         # Search for relevant documents
         relevant_docs = self.search_documents(query)
@@ -234,7 +297,7 @@ def main():
         layout="wide"
     )
-    st.title("📄 Simple PDF RAG with IBM Granite (Cache Fixed)")
     st.write("Upload a PDF and ask questions about its content")
     # Initialize session state
@@ -246,75 +309,138 @@ def main():
     if 'pdf_processed' not in st.session_state:
         st.session_state.pdf_processed = False
     # Load models button
     if not st.session_state.models_loaded:
-        if st.button("🤖 Load Models"):
             with st.spinner("Loading models... This may take a few minutes"):
                 success = st.session_state.rag_system.load_models()
-                st.session_state.models_loaded = success
     # Only show PDF upload if models are loaded
     if st.session_state.models_loaded:
-        st.success("✅ Models loaded successfully!")
         # PDF Upload
-        uploaded_file = st.file_uploader("Upload PDF", type=['pdf'])
-        if uploaded_file and st.button("📖 Process PDF"):
-            with st.spinner("Processing PDF..."):
-                success = st.session_state.rag_system.process_pdf(uploaded_file)
-                st.session_state.pdf_processed = success
-        # Question answering
         if st.session_state.pdf_processed:
-            st.success("✅ PDF processed successfully!")
-            query = st.text_input("❓ Ask a question about the PDF:")
-            if query and st.button("🔍 Get Answer"):
-                with st.spinner("Searching and generating answer..."):
-                    result = st.session_state.rag_system.answer_question(query)
-                # Display answer
-                st.subheader("🤖 Answer:")
-                st.write(result['answer'])
-                # Display sources
-                if result.get('sources'):
-                    st.subheader("📚 Relevant Sources:")
-                    for i, source in enumerate(result['sources']):
-                        with st.expander(f"Source {i+1} (Score: {source['score']:.3f})"):
-                            st.write(source['text'][:300] + "..." if len(source['text']) > 300 else source['text'])
-    # Instructions with troubleshooting
     with st.sidebar:
         st.header("📋 Instructions")
         st.write("""
-        1. Click 'Load Models' to initialize the system
-        2. Upload a PDF file
-        3. Click 'Process PDF' to extract and index content
-        4. Ask questions about the PDF content
-        5. Get AI-generated answers with source citations
         """)
-        st.header("🔧 Troubleshooting")
-        st.write("""
-        **Cache Permission Error Fixed:**
-        - Uses temporary directory for model cache
-        - Automatically handles permission issues
-        - No manual cache cleanup needed
-        """)
-        st.header("⚙️ Alternative Solutions")
-        st.code("""
-# Manual cache cleanup (if needed):
-rm -rf ~/.cache/huggingface/
-rm -rf ~/.cache/torch/
-# Or set environment variables:
-export HF_HOME=/tmp/hf_cache
-export TRANSFORMERS_CACHE=/tmp/transformers_cache
         """)
 if __name__ == "__main__":

+# Fixed SimplePDFRAG with better state management and debugging
 import streamlit as st
 import PyPDF2
 from sentence_transformers import SentenceTransformer
         self.embedding_model = None
         self.granite_model = None
         self.tokenizer = None
+        self.pdf_name = None
     def setup_cache_directory(self):
         """Setup a custom cache directory with proper permissions"""
             return False
     def extract_pdf_text(self, pdf_file):
+        """Extract text from PDF file with better error handling"""
         try:
+            # Reset file pointer to beginning
+            pdf_file.seek(0)
             pdf_reader = PyPDF2.PdfReader(pdf_file)
             text = ""
+            st.info(f"PDF has {len(pdf_reader.pages)} pages")
             for page_num, page in enumerate(pdf_reader.pages):
+                try:
+                    page_text = page.extract_text()
+                    if page_text:
+                        text += page_text + "\n"
+                        st.write(f"✅ Extracted text from page {page_num + 1}")
+                    else:
+                        st.warning(f"⚠️ No text found on page {page_num + 1}")
+                except Exception as page_error:
+                    st.error(f"Error extracting page {page_num + 1}: {page_error}")
+                    continue
+            if text.strip():
+                st.success(f"Total extracted text length: {len(text)} characters")
+                # Show preview of extracted text
+                st.write("📄 **Text Preview:**")
+                st.text(text[:500] + "..." if len(text) > 500 else text)
+                return text
+            else:
+                st.error("No text could be extracted from the PDF")
+                return None
         except Exception as e:
+            st.error(f"Error reading PDF file: {e}")
+            logger.error(f"PDF extraction error: {e}")
             return None
     def chunk_text(self, text, chunk_size=500):
         """Split text into chunks"""
+        if not text or not text.strip():
+            return []
         words = text.split()
         chunks = []
         for i in range(0, len(words), chunk_size):
             chunk = " ".join(words[i:i + chunk_size])
+            if chunk.strip():  # Only add non-empty chunks
+                chunks.append(chunk)
+        st.info(f"Created {len(chunks)} text chunks")
         return chunks
+    def process_pdf(self, pdf_file, pdf_name):
         """Process PDF and create embeddings"""
         try:
+            # Store PDF name
+            self.pdf_name = pdf_name
+            # Extract text
+            st.info("🔍 Extracting text from PDF...")
+            text = self.extract_pdf_text(pdf_file)
+            if not text:
+                st.error("❌ Failed to extract text from PDF")
+                return False
+            # Chunk text
+            st.info("✂️ Splitting text into chunks...")
+            chunks = self.chunk_text(text)
+            if not chunks:
+                st.error("❌ No text chunks created")
+                return False
+            # Create embeddings
+            st.info(f"🔄 Creating embeddings for {len(chunks)} chunks...")
+            try:
+                embeddings = self.embedding_model.encode(chunks, show_progress_bar=True)
+                # Store documents and embeddings
+                self.documents = chunks
+                self.embeddings = embeddings
+                st.success(f"✅ Successfully processed PDF: {len(chunks)} chunks created with embeddings")
+                # Show some stats
+                st.info(f"📊 **Processing Summary:**")
+                st.write(f"- PDF Name: {pdf_name}")
+                st.write(f"- Text length: {len(text)} characters")
+                st.write(f"- Number of chunks: {len(chunks)}")
+                st.write(f"- Embeddings shape: {embeddings.shape}")
+                return True
+            except Exception as e:
+                st.error(f"❌ Error creating embeddings: {e}")
+                logger.error(f"Embedding error: {e}")
+                return False
         except Exception as e:
+            st.error(f"❌ Error processing PDF: {e}")
+            logger.error(f"PDF processing error: {e}")
             return False
     def search_documents(self, query, top_k=3):
         """Search for relevant documents"""
+        if not self.documents or len(self.embeddings) == 0:
+            st.warning("No documents available for search")
             return []
         try:
                         'score': similarities[idx]
                     })
+            st.info(f"Found {len(results)} relevant document chunks")
             return results
         except Exception as e:
             st.error(f"Error searching documents: {e}")
+            logger.error(f"Search error: {e}")
             return []
     def generate_answer(self, query, context_docs):
     def answer_question(self, query):
         """Main function to answer questions"""
+        if not self.documents:
+            return {
+                'answer': "No PDF has been processed yet. Please upload and process a PDF first.",
+                'sources': []
+            }
         # Search for relevant documents
         relevant_docs = self.search_documents(query)
         layout="wide"
     )
+    st.title("📄 Simple PDF RAG with IBM Granite (Fixed)")
     st.write("Upload a PDF and ask questions about its content")
     # Initialize session state
     if 'pdf_processed' not in st.session_state:
         st.session_state.pdf_processed = False
+    if 'current_pdf_name' not in st.session_state:
+        st.session_state.current_pdf_name = None
+    # Status display
+    col1, col2, col3 = st.columns(3)
+    with col1:
+        if st.session_state.models_loaded:
+            st.success("🤖 Models: Loaded")
+        else:
+            st.error("🤖 Models: Not Loaded")
+    with col2:
+        if st.session_state.pdf_processed and st.session_state.current_pdf_name:
+            st.success(f"📄 PDF: {st.session_state.current_pdf_name}")
+        else:
+            st.error("📄 PDF: Not Processed")
+    with col3:
+        if st.session_state.models_loaded and st.session_state.pdf_processed:
+            st.success("🟢 Ready for Questions")
+        else:
+            st.error("🔴 Not Ready")
     # Load models button
     if not st.session_state.models_loaded:
+        if st.button("🤖 Load Models", key="load_models"):
             with st.spinner("Loading models... This may take a few minutes"):
                 success = st.session_state.rag_system.load_models()
+                if success:
+                    st.session_state.models_loaded = True
+                    st.rerun()
     # Only show PDF upload if models are loaded
     if st.session_state.models_loaded:
+        st.markdown("---")
+        st.subheader("📁 PDF Upload and Processing")
         # PDF Upload
+        uploaded_file = st.file_uploader("Upload PDF", type=['pdf'], key="pdf_uploader")
+        if uploaded_file is not None:
+            st.info(f"📄 Uploaded: {uploaded_file.name}")
+            if st.button("📖 Process PDF", key="process_pdf"):
+                with st.spinner("Processing PDF..."):
+                    success = st.session_state.rag_system.process_pdf(uploaded_file, uploaded_file.name)
+                    if success:
+                        st.session_state.pdf_processed = True
+                        st.session_state.current_pdf_name = uploaded_file.name
+                        st.rerun()
+                    else:
+                        st.session_state.pdf_processed = False
+                        st.session_state.current_pdf_name = None
+        # Question answering section
         if st.session_state.pdf_processed:
+            st.markdown("---")
+            st.subheader("❓ Ask Questions")
+            # Show current document info
+            st.info(f"📚 Current document: {st.session_state.current_pdf_name}")
+            st.info(f"📊 Document chunks: {len(st.session_state.rag_system.documents)}")
+            query = st.text_input("Ask a question about the PDF:", key="question_input",
+                                placeholder="e.g., What is the main topic of this document?")
+            if query:
+                if st.button("🔍 Get Answer", key="get_answer"):
+                    with st.spinner("Searching and generating answer..."):
+                        result = st.session_state.rag_system.answer_question(query)
+                    # Display answer
+                    st.markdown("### 🤖 Answer:")
+                    st.write(result['answer'])
+                    # Display sources
+                    if result.get('sources'):
+                        st.markdown("### 📚 Relevant Sources:")
+                        for i, source in enumerate(result['sources']):
+                            with st.expander(f"Source {i+1} (Relevance Score: {source['score']:.3f})"):
+                                st.write(source['text'][:500] + "..." if len(source['text']) > 500 else source['text'])
+                # Add some example questions
+                st.markdown("### 💡 Example Questions:")
+                example_questions = [
+                    "What is the main topic of this document?",
+                    "Can you summarize the key points?",
+                    "What are the important details mentioned?",
+                    "Who are the main people or entities discussed?"
+                ]
+                for i, example in enumerate(example_questions):
+                    if st.button(f"📝 {example}", key=f"example_{i}"):
+                        st.session_state.question_input = example
+                        st.rerun()
+    # Sidebar with instructions and debugging
     with st.sidebar:
         st.header("📋 Instructions")
         st.write("""
+        1. **Load Models**: Click to initialize AI models
+        2. **Upload PDF**: Select a PDF file to analyze
+        3. **Process PDF**: Extract and index PDF content
+        4. **Ask Questions**: Get AI-powered answers
         """)
+        st.header("🔧 Debug Info")
+        if st.session_state.models_loaded:
+            st.write("✅ Models loaded")
+        else:
+            st.write("❌ Models not loaded")
+        if st.session_state.pdf_processed:
+            st.write(f"✅ PDF processed: {st.session_state.current_pdf_name}")
+            if hasattr(st.session_state.rag_system, 'documents'):
+                st.write(f"📊 Chunks: {len(st.session_state.rag_system.documents)}")
+        else:
+            st.write("❌ No PDF processed")
+        # Reset button
+        if st.button("🔄 Reset All", key="reset_all"):
+            for key in list(st.session_state.keys()):
+                del st.session_state[key]
+            st.rerun()
+        st.header("⚙️ Tips")
+        st.write("""
+        - **PDF not working?** Try a different PDF file
+        - **No text extracted?** PDF might be image-based
+        - **Poor answers?** Try more specific questions
+        - **Slow performance?** Use smaller PDF files
         """)
 if __name__ == "__main__":