Spaces:

SimranShaikh
/

enterprise-rag-assistant

Sleeping

App Files Files Community

SimranShaikh committed on Jun 29, 2025

Commit

a2146e1

verified ·

1 Parent(s): cadd6a8

commit

Browse files

Files changed (1) hide show

src/streamlit_app.py +67 -262

src/streamlit_app.py CHANGED Viewed

@@ -1,4 +1,4 @@
-# Fixed SimplePDFRAG with better state management and debugging
 import streamlit as st
 import PyPDF2
 from sentence_transformers import SentenceTransformer
@@ -9,7 +9,6 @@ from sklearn.metrics.pairwise import cosine_similarity
 import logging
 import os
 import tempfile
-import shutil
 # Configure logging
 logging.basicConfig(level=logging.INFO)
@@ -25,14 +24,11 @@ class SimplePDFRAG:
         self.pdf_name = None
     def setup_cache_directory(self):
-        """Setup a custom cache directory with proper permissions"""
         try:
-            # Create a temporary directory for models
             cache_dir = tempfile.mkdtemp(prefix="model_cache_")
             os.environ['HF_HOME'] = cache_dir
             os.environ['TRANSFORMERS_CACHE'] = cache_dir
             os.environ['SENTENCE_TRANSFORMERS_HOME'] = cache_dir
             st.info(f"Using cache directory: {cache_dir}")
             return cache_dir
         except Exception as e:
@@ -40,54 +36,33 @@ class SimplePDFRAG:
             return None
     def load_models(self):
-        """Load embedding model and Granite model with cache fix"""
         try:
-            # Setup cache directory
             cache_dir = self.setup_cache_directory()
-            # Load embedding model with cache directory
             st.info("Loading embedding model...")
             self.embedding_model = SentenceTransformer(
-                'all-MiniLM-L6-v2',
-                cache_folder=cache_dir
             )
-            # Load IBM Granite model
             st.info("Loading IBM Granite model...")
-            model_name = "ibm-granite/granite-3.0-2b-instruct"  # IBM Granite model
-            self.tokenizer = AutoTokenizer.from_pretrained(
-                model_name,
-                cache_dir=cache_dir
-            )
             self.granite_model = AutoModelForCausalLM.from_pretrained(
-                model_name,
-                cache_dir=cache_dir,
-                torch_dtype=torch.float32  # Use float32 for compatibility
             )
             if self.tokenizer.pad_token is None:
                 self.tokenizer.pad_token = self.tokenizer.eos_token
             st.success("Models loaded successfully!")
             return True
         except Exception as e:
             st.error(f"Error loading models: {e}")
             logger.error(f"Model loading error: {e}")
             return False
     def extract_pdf_text(self, pdf_file):
-        """Extract text from PDF file with better error handling"""
         try:
-            # Reset file pointer to beginning
             pdf_file.seek(0)
             pdf_reader = PyPDF2.PdfReader(pdf_file)
             text = ""
             st.info(f"PDF has {len(pdf_reader.pages)} pages")
             for page_num, page in enumerate(pdf_reader.pages):
                 try:
                     page_text = page.extract_text()
@@ -98,130 +73,66 @@ class SimplePDFRAG:
                         st.warning(f"⚠️ No text found on page {page_num + 1}")
                 except Exception as page_error:
                     st.error(f"Error extracting page {page_num + 1}: {page_error}")
-                    continue
             if text.strip():
-                st.success(f"Total extracted text length: {len(text)} characters")
-                # Show preview of extracted text
                 st.write("📄 **Text Preview:**")
                 st.text(text[:500] + "..." if len(text) > 500 else text)
                 return text
             else:
                 st.error("No text could be extracted from the PDF")
                 return None
         except Exception as e:
             st.error(f"Error reading PDF file: {e}")
             logger.error(f"PDF extraction error: {e}")
             return None
     def chunk_text(self, text, chunk_size=500):
-        """Split text into chunks"""
         if not text or not text.strip():
             return []
         words = text.split()
-        chunks = []
-        for i in range(0, len(words), chunk_size):
-            chunk = " ".join(words[i:i + chunk_size])
-            if chunk.strip():  # Only add non-empty chunks
-                chunks.append(chunk)
-        st.info(f"Created {len(chunks)} text chunks")
-        return chunks
     def process_pdf(self, pdf_file, pdf_name):
-        """Process PDF and create embeddings"""
         try:
-            # Store PDF name
             self.pdf_name = pdf_name
-            # Extract text
             st.info("🔍 Extracting text from PDF...")
             text = self.extract_pdf_text(pdf_file)
             if not text:
-                st.error("❌ Failed to extract text from PDF")
                 return False
-            # Chunk text
             st.info("✂️ Splitting text into chunks...")
             chunks = self.chunk_text(text)
             if not chunks:
-                st.error("❌ No text chunks created")
                 return False
-            # Create embeddings
             st.info(f"🔄 Creating embeddings for {len(chunks)} chunks...")
-            try:
-                embeddings = self.embedding_model.encode(chunks, show_progress_bar=True)
-                # Store documents and embeddings
-                self.documents = chunks
-                self.embeddings = embeddings
-                st.success(f"✅ Successfully processed PDF: {len(chunks)} chunks created with embeddings")
-                # Show some stats
-                st.info(f"📊 **Processing Summary:**")
-                st.write(f"- PDF Name: {pdf_name}")
-                st.write(f"- Text length: {len(text)} characters")
-                st.write(f"- Number of chunks: {len(chunks)}")
-                st.write(f"- Embeddings shape: {embeddings.shape}")
-                return True
-            except Exception as e:
-                st.error(f"❌ Error creating embeddings: {e}")
-                logger.error(f"Embedding error: {e}")
-                return False
         except Exception as e:
             st.error(f"❌ Error processing PDF: {e}")
             logger.error(f"PDF processing error: {e}")
             return False
     def search_documents(self, query, top_k=3):
-        """Search for relevant documents"""
         if not self.documents or len(self.embeddings) == 0:
             st.warning("No documents available for search")
             return []
         try:
-            # Get query embedding
             query_embedding = self.embedding_model.encode([query])
-            # Calculate similarities
             similarities = cosine_similarity(query_embedding, self.embeddings)[0]
-            # Get top k results
             top_indices = np.argsort(similarities)[-top_k:][::-1]
-            results = []
-            for idx in top_indices:
-                if similarities[idx] > 0.1:  # Minimum similarity threshold
-                    results.append({
-                        'text': self.documents[idx],
-                        'score': similarities[idx]
-                    })
-            st.info(f"Found {len(results)} relevant document chunks")
-            return results
         except Exception as e:
             st.error(f"Error searching documents: {e}")
             logger.error(f"Search error: {e}")
             return []
     def generate_answer(self, query, context_docs):
-        """Generate answer using the language model"""
         if not self.granite_model or not context_docs:
             return "I don't have enough information to answer your question."
-        # Prepare context
-        context = "\n\n".join([doc['text'][:200] for doc in context_docs])  # Limit context
-        # Create a more sophisticated prompt for Granite
         prompt = f"""You are a helpful AI assistant. Based on the following context, provide a clear and accurate answer to the question.
 Context:
@@ -230,218 +141,112 @@ Context:
 Question: {query}
 Answer:"""
         try:
-            # Tokenize
-            inputs = self.tokenizer.encode(
-                prompt,
-                return_tensors='pt',
-                max_length=512,  # Reduced length
-                truncation=True
-            )
-            # Generate response
             with torch.no_grad():
                 outputs = self.granite_model.generate(
                     inputs,
-                    max_length=inputs.shape[1] + 100,  # Shorter response
                     temperature=0.7,
                     do_sample=True,
                     pad_token_id=self.tokenizer.eos_token_id,
                     eos_token_id=self.tokenizer.eos_token_id
                 )
-            # Decode response
             response = self.tokenizer.decode(outputs[0][inputs.shape[1]:], skip_special_tokens=True)
-            # If response is empty or too short, provide context-based answer
-            if not response or len(response.strip()) < 10:
-                response = f"Based on the document: {context[:300]}..."
-            return response.strip()
         except Exception as e:
             logger.error(f"Generation error: {e}")
-            # Fallback to simple context-based answer
-            return f"Based on the available information: {context[:300]}..."
     def answer_question(self, query):
-        """Main function to answer questions"""
         if not self.documents:
-            return {
-                'answer': "No PDF has been processed yet. Please upload and process a PDF first.",
-                'sources': []
-            }
-        # Search for relevant documents
         relevant_docs = self.search_documents(query)
         if not relevant_docs:
-            return {
-                'answer': "I couldn't find relevant information in the PDF to answer your question.",
-                'sources': []
-            }
-        # Generate answer
-        answer = self.generate_answer(query, relevant_docs)
         return {
-            'answer': answer,
             'sources': relevant_docs
         }
 def main():
-    st.set_page_config(
-        page_title="Simple PDF RAG with IBM Granite (Fixed)",
-        page_icon="📄",
-        layout="wide"
-    )
     st.title("📄 Simple PDF RAG with IBM Granite (Fixed)")
     st.write("Upload a PDF and ask questions about its content")
-    # Initialize session state
     if 'rag_system' not in st.session_state:
         st.session_state.rag_system = SimplePDFRAG()
     if 'models_loaded' not in st.session_state:
         st.session_state.models_loaded = False
     if 'pdf_processed' not in st.session_state:
         st.session_state.pdf_processed = False
     if 'current_pdf_name' not in st.session_state:
         st.session_state.current_pdf_name = None
-    # Status display
     col1, col2, col3 = st.columns(3)
     with col1:
-        if st.session_state.models_loaded:
-            st.success("🤖 Models: Loaded")
-        else:
-            st.error("🤖 Models: Not Loaded")
     with col2:
-        if st.session_state.pdf_processed and st.session_state.current_pdf_name:
-            st.success(f"📄 PDF: {st.session_state.current_pdf_name}")
-        else:
-            st.error("📄 PDF: Not Processed")
     with col3:
-        if st.session_state.models_loaded and st.session_state.pdf_processed:
-            st.success("🟢 Ready for Questions")
-        else:
-            st.error("🔴 Not Ready")
-    # Load models button
     if not st.session_state.models_loaded:
-        if st.button("🤖 Load Models", key="load_models"):
-            with st.spinner("Loading models... This may take a few minutes"):
                 success = st.session_state.rag_system.load_models()
-                if success:
-                    st.session_state.models_loaded = True
-                    st.rerun()
-    # Only show PDF upload if models are loaded
     if st.session_state.models_loaded:
         st.markdown("---")
         st.subheader("📁 PDF Upload and Processing")
-        # PDF Upload
-        uploaded_file = st.file_uploader("Upload PDF", type=['pdf'], key="pdf_uploader")
-        if uploaded_file is not None:
-            st.info(f"📄 Uploaded: {uploaded_file.name}")
-            if st.button("📖 Process PDF", key="process_pdf"):
                 with st.spinner("Processing PDF..."):
-                    success = st.session_state.rag_system.process_pdf(uploaded_file, uploaded_file.name)
                     if success:
                         st.session_state.pdf_processed = True
-                        st.session_state.current_pdf_name = uploaded_file.name
                         st.rerun()
-                    else:
-                        st.session_state.pdf_processed = False
-                        st.session_state.current_pdf_name = None
-        # Question answering section
         if st.session_state.pdf_processed:
             st.markdown("---")
             st.subheader("❓ Ask Questions")
-            # Show current document info
             st.info(f"📚 Current document: {st.session_state.current_pdf_name}")
-            st.info(f"📊 Document chunks: {len(st.session_state.rag_system.documents)}")
-            query = st.text_input("Ask a question about the PDF:", key="question_input",
-                                placeholder="e.g., What is the main topic of this document?")
-            if query:
-                if st.button("🔍 Get Answer", key="get_answer"):
-                    with st.spinner("Searching and generating answer..."):
-                        result = st.session_state.rag_system.answer_question(query)
-                    # Display answer
-                    st.markdown("### 🤖 Answer:")
-                    st.write(result['answer'])
-                    # Display sources
-                    if result.get('sources'):
-                        st.markdown("### 📚 Relevant Sources:")
-                        for i, source in enumerate(result['sources']):
-                            with st.expander(f"Source {i+1} (Relevance Score: {source['score']:.3f})"):
-                                st.write(source['text'][:500] + "..." if len(source['text']) > 500 else source['text'])
-                # Add some example questions
-                st.markdown("### 💡 Example Questions:")
-                example_questions = [
-                    "What is the main topic of this document?",
-                    "Can you summarize the key points?",
-                    "What are the important details mentioned?",
-                    "Who are the main people or entities discussed?"
-                ]
-                for i, example in enumerate(example_questions):
-                    if st.button(f"📝 {example}", key=f"example_{i}"):
-                        st.session_state.question_input = example
-                        st.rerun()
-    # Sidebar with instructions and debugging
     with st.sidebar:
         st.header("📋 Instructions")
-        st.write("""
-        1. **Load Models**: Click to initialize AI models
-        2. **Upload PDF**: Select a PDF file to analyze
-        3. **Process PDF**: Extract and index PDF content
-        4. **Ask Questions**: Get AI-powered answers
-        """)
         st.header("🔧 Debug Info")
-        if st.session_state.models_loaded:
-            st.write("✅ Models loaded")
-        else:
-            st.write("❌ Models not loaded")
-        if st.session_state.pdf_processed:
-            st.write(f"✅ PDF processed: {st.session_state.current_pdf_name}")
-            if hasattr(st.session_state.rag_system, 'documents'):
-                st.write(f"📊 Chunks: {len(st.session_state.rag_system.documents)}")
-        else:
-            st.write("❌ No PDF processed")
-        # Reset button
-        if st.button("🔄 Reset All", key="reset_all"):
             for key in list(st.session_state.keys()):
                 del st.session_state[key]
             st.rerun()
-        st.header("⚙️ Tips")
-        st.write("""
-        - **PDF not working?** Try a different PDF file
-        - **No text extracted?** PDF might be image-based
-        - **Poor answers?** Try more specific questions
-        - **Slow performance?** Use smaller PDF files
-        """)
 if __name__ == "__main__":
-    main()

+# Fixed SimplePDFRAG with better state management and PDF caching
 import streamlit as st
 import PyPDF2
 from sentence_transformers import SentenceTransformer
 import logging
 import os
 import tempfile
 # Configure logging
 logging.basicConfig(level=logging.INFO)
         self.pdf_name = None
     def setup_cache_directory(self):
         try:
             cache_dir = tempfile.mkdtemp(prefix="model_cache_")
             os.environ['HF_HOME'] = cache_dir
             os.environ['TRANSFORMERS_CACHE'] = cache_dir
             os.environ['SENTENCE_TRANSFORMERS_HOME'] = cache_dir
             st.info(f"Using cache directory: {cache_dir}")
             return cache_dir
         except Exception as e:
             return None
     def load_models(self):
         """Load the embedding model, the tokenizer and the IBM Granite LLM.

         All three objects are loaded into locals first and only published on
         ``self`` once every load has succeeded, so a failure part-way through
         never leaves the instance with a half-initialised set of models.

         Returns:
             bool: True when every model loaded; False otherwise (the error is
             shown in the Streamlit UI and logged with its traceback).
         """
         try:
             # May be None when the cache directory could not be created; it is
             # passed through unchanged to the loaders below.
             cache_dir = self.setup_cache_directory()

             st.info("Loading embedding model...")
             embedding_model = SentenceTransformer(
                 'all-MiniLM-L6-v2', cache_folder=cache_dir
             )

             st.info("Loading IBM Granite model...")
             model_name = "ibm-granite/granite-3.0-2b-instruct"
             tokenizer = AutoTokenizer.from_pretrained(model_name, cache_dir=cache_dir)
             granite_model = AutoModelForCausalLM.from_pretrained(
                 model_name, cache_dir=cache_dir, torch_dtype=torch.float32
             )

             # generate() is later called with pad_token_id, so make sure the
             # tokenizer has a pad token; fall back to the EOS token.
             if tokenizer.pad_token is None:
                 tokenizer.pad_token = tokenizer.eos_token

             # Publish everything together now that nothing can fail any more.
             self.embedding_model = embedding_model
             self.tokenizer = tokenizer
             self.granite_model = granite_model

             st.success("Models loaded successfully!")
             return True
         except Exception as e:
             st.error(f"Error loading models: {e}")
             # logger.exception keeps the traceback, which logger.error dropped.
             logger.exception("Model loading error: %s", e)
             return False
     def extract_pdf_text(self, pdf_file):
         try:
             pdf_file.seek(0)
             pdf_reader = PyPDF2.PdfReader(pdf_file)
             text = ""
             st.info(f"PDF has {len(pdf_reader.pages)} pages")
             for page_num, page in enumerate(pdf_reader.pages):
                 try:
                     page_text = page.extract_text()
                         st.warning(f"⚠️ No text found on page {page_num + 1}")
                 except Exception as page_error:
                     st.error(f"Error extracting page {page_num + 1}: {page_error}")
             if text.strip():
+                st.success(f"Extracted {len(text)} characters")
                 st.write("📄 **Text Preview:**")
                 st.text(text[:500] + "..." if len(text) > 500 else text)
                 return text
             else:
                 st.error("No text could be extracted from the PDF")
                 return None
         except Exception as e:
             st.error(f"Error reading PDF file: {e}")
             logger.error(f"PDF extraction error: {e}")
             return None
     def chunk_text(self, text, chunk_size=500):
         """Split text into consecutive chunks of whitespace-separated words.

         Args:
             text: The text to split. ``None``, empty and whitespace-only
                 input all yield an empty list.
             chunk_size: Maximum number of words per chunk; must be >= 1.

         Returns:
             list[str]: Chunks in document order, each made of up to
             ``chunk_size`` words joined by single spaces. The last chunk may
             be shorter.

         Raises:
             ValueError: If ``chunk_size`` is smaller than 1.
         """
         # A zero step makes range() fail with an unrelated-looking message and
         # a negative step silently produces no chunks at all; reject both.
         if chunk_size < 1:
             raise ValueError(f"chunk_size must be a positive integer, got {chunk_size}")
         if not text or not text.strip():
             return []
         words = text.split()
         return [" ".join(words[i:i + chunk_size]) for i in range(0, len(words), chunk_size)]
     def process_pdf(self, pdf_file, pdf_name):
         """Extract, chunk and embed a PDF, then store the result on the instance.

         The instance state (``pdf_name``, ``documents``, ``embeddings``) is
         only replaced after every step has succeeded, so a failed run leaves
         the previously processed document fully intact and consistently named.

         Args:
             pdf_file: Binary file-like object passed to ``extract_pdf_text``.
             pdf_name: Display name recorded for the document on success.

         Returns:
             bool: True when embeddings were created and stored; False when no
             text or no chunks could be produced, or when any step raised.
         """
         try:
             st.info("🔍 Extracting text from PDF...")
             text = self.extract_pdf_text(pdf_file)
             if not text:
                 return False

             st.info("✂️ Splitting text into chunks...")
             chunks = self.chunk_text(text)
             if not chunks:
                 return False

             st.info(f"🔄 Creating embeddings for {len(chunks)} chunks...")
             embeddings = self.embedding_model.encode(chunks, show_progress_bar=True)

             # Commit name, chunks and embeddings together so they always
             # describe the same document.
             self.pdf_name = pdf_name
             self.documents = chunks
             self.embeddings = embeddings
             st.success(f"✅ Successfully processed PDF: {len(chunks)} chunks created with embeddings")
             return True
         except Exception as e:
             st.error(f"❌ Error processing PDF: {e}")
             # logger.exception keeps the traceback for debugging.
             logger.exception("PDF processing error: %s", e)
             return False
     def search_documents(self, query, top_k=3):
         """Return the stored chunks most similar to the query.

         Args:
             query: Question text to embed and compare against the chunks.
             top_k: Maximum number of chunks to return. Values below 1 yield
                 an empty list.

         Returns:
             list[dict]: Up to ``top_k`` entries with keys ``'text'`` (the
             chunk) and ``'score'`` (its cosine similarity), ordered from most
             to least similar. Chunks scoring 0.1 or less are dropped. An empty
             list is returned when nothing is indexed or the search fails.
         """
         if not self.documents or len(self.embeddings) == 0:
             st.warning("No documents available for search")
             return []
         # Without this guard top_k == 0 would slice with [-0:], which selects
         # every chunk instead of none; negative values slice arbitrarily.
         if top_k < 1:
             return []
         try:
             query_embedding = self.embedding_model.encode([query])
             similarities = cosine_similarity(query_embedding, self.embeddings)[0]
             # argsort is ascending: take the last top_k, then reverse so the
             # best match comes first.
             top_indices = np.argsort(similarities)[-top_k:][::-1]
             return [{'text': self.documents[i], 'score': similarities[i]}
                     for i in top_indices if similarities[i] > 0.1]
         except Exception as e:
             st.error(f"Error searching documents: {e}")
             logger.error(f"Search error: {e}")
             return []
     def generate_answer(self, query, context_docs):
         if not self.granite_model or not context_docs:
             return "I don't have enough information to answer your question."
+        context = "\n\n".join([doc['text'][:200] for doc in context_docs])
         prompt = f"""You are a helpful AI assistant. Based on the following context, provide a clear and accurate answer to the question.
 Context:
 Question: {query}
 Answer:"""
         try:
+            inputs = self.tokenizer.encode(prompt, return_tensors='pt', max_length=512, truncation=True)
             with torch.no_grad():
                 outputs = self.granite_model.generate(
                     inputs,
+                    max_length=inputs.shape[1] + 100,
                     temperature=0.7,
                     do_sample=True,
                     pad_token_id=self.tokenizer.eos_token_id,
                     eos_token_id=self.tokenizer.eos_token_id
                 )
             response = self.tokenizer.decode(outputs[0][inputs.shape[1]:], skip_special_tokens=True)
+            return response.strip() if len(response.strip()) >= 10 else context[:300] + "..."
         except Exception as e:
             logger.error(f"Generation error: {e}")
+            return context[:300] + "..."
     def answer_question(self, query):
         """Answer a question about the processed PDF.

         Args:
             query: The user's question.

         Returns:
             dict: ``'answer'`` holds the reply text and ``'sources'`` the
             chunks returned by ``search_documents``. ``'sources'`` is empty
             when no PDF has been processed or no chunk matched.
         """
         result = {'answer': "No PDF has been processed yet.", 'sources': []}
         if self.documents:
             matches = self.search_documents(query)
             if matches:
                 # Only invoke the language model when there is context for it.
                 result['answer'] = self.generate_answer(query, matches)
                 result['sources'] = matches
             else:
                 result['answer'] = "No relevant information found."
         return result
 def _remove_staged_upload(path):
     """Best-effort removal of the temp file that holds a staged PDF upload."""
     if not path:
         return
     try:
         os.remove(path)
     except OSError:
         # Already gone or not removable; a leftover temp file is harmless.
         pass


 def main():
     """Render the Streamlit UI for the PDF question-answering app.

     The page walks through four steps: load the models, upload a PDF (staged
     to a temp file so it is still available on later reruns), process it into
     embeddings, and ask questions. Progress flags live in ``st.session_state``.
     """
     st.set_page_config(page_title="Simple PDF RAG with IBM Granite (Fixed)", page_icon="📄", layout="wide")
     st.title("📄 Simple PDF RAG with IBM Granite (Fixed)")
     st.write("Upload a PDF and ask questions about its content")

     # Seed session state on the first run of a session.
     if 'rag_system' not in st.session_state:
         st.session_state.rag_system = SimplePDFRAG()
     if 'models_loaded' not in st.session_state:
         st.session_state.models_loaded = False
     if 'pdf_processed' not in st.session_state:
         st.session_state.pdf_processed = False
     if 'current_pdf_name' not in st.session_state:
         st.session_state.current_pdf_name = None

     models_loaded = st.session_state.models_loaded
     pdf_processed = st.session_state.pdf_processed

     # Status tiles: green only for the positive state, red otherwise, so a
     # "Not Loaded" / "Not Ready" message is never shown as a success.
     col1, col2, col3 = st.columns(3)
     with col1:
         if models_loaded:
             st.success("🤖 Models: Loaded")
         else:
             st.error("🤖 Models: Not Loaded")
     with col2:
         if pdf_processed:
             st.success(f"📄 PDF: {st.session_state.current_pdf_name}")
         else:
             st.error("📄 PDF: Not Processed")
     with col3:
         if models_loaded and pdf_processed:
             st.success("🟢 Ready")
         else:
             st.error("🔴 Not Ready")

     if not st.session_state.models_loaded:
         if st.button("🤖 Load Models"):
             with st.spinner("Loading models..."):
                 success = st.session_state.rag_system.load_models()
             st.session_state.models_loaded = success
             # Rerun only on success: an immediate rerun after a failure would
             # wipe the error message load_models() has just displayed.
             if success:
                 st.rerun()

     if st.session_state.models_loaded:
         st.markdown("---")
         st.subheader("📁 PDF Upload and Processing")
         uploaded_file = st.file_uploader("Upload PDF", type=['pdf'])
         if uploaded_file is not None:
             # Identify the upload by name and size so that choosing a
             # different PDF replaces the staged copy instead of being ignored.
             upload_id = (uploaded_file.name, uploaded_file.size)
             if st.session_state.get('uploaded_file_id') != upload_id:
                 _remove_staged_upload(st.session_state.get('uploaded_file_path'))
                 with tempfile.NamedTemporaryFile(delete=False, suffix=".pdf") as tmp:
                     tmp.write(uploaded_file.getvalue())
                 st.session_state.uploaded_file_path = tmp.name
                 st.session_state.uploaded_file_name = uploaded_file.name
                 st.session_state.uploaded_file_id = upload_id
                 st.rerun()

         if 'uploaded_file_path' in st.session_state:
             st.info(f"📄 Uploaded: {st.session_state.uploaded_file_name}")
             if st.button("📖 Process PDF"):
                 with st.spinner("Processing PDF..."):
                     with open(st.session_state.uploaded_file_path, "rb") as f:
                         success = st.session_state.rag_system.process_pdf(f, st.session_state.uploaded_file_name)
                     if success:
                         st.session_state.pdf_processed = True
                         st.session_state.current_pdf_name = st.session_state.uploaded_file_name
                         st.rerun()

         if st.session_state.pdf_processed:
             st.markdown("---")
             st.subheader("❓ Ask Questions")
             st.info(f"📚 Current document: {st.session_state.current_pdf_name}")
             query = st.text_input("Ask a question:", placeholder="e.g., What is the main topic?")
             if query and st.button("🔍 Get Answer"):
                 with st.spinner("Searching and generating answer..."):
                     result = st.session_state.rag_system.answer_question(query)
                 st.markdown("### 🤖 Answer:")
                 st.write(result['answer'])
                 if result.get('sources'):
                     st.markdown("### 📚 Sources:")
                     for i, src in enumerate(result['sources']):
                         with st.expander(f"Source {i+1} (Score: {src['score']:.3f})"):
                             # Show at most the first 500 characters of a chunk.
                             st.write(src['text'][:500] + "..." if len(src['text']) > 500 else src['text'])

     with st.sidebar:
         st.header("📋 Instructions")
         st.markdown("1. Load Models\n2. Upload PDF\n3. Process PDF\n4. Ask Questions")
         st.header("🔧 Debug Info")
         st.write("✅ Models loaded" if st.session_state.models_loaded else "❌ Models not loaded")
         st.write(f"✅ PDF: {st.session_state.current_pdf_name}" if st.session_state.pdf_processed else "❌ No PDF processed")
         if st.button("🔄 Reset All"):
             # Delete the staged temp file before its path is forgotten.
             _remove_staged_upload(st.session_state.get('uploaded_file_path'))
             for key in list(st.session_state.keys()):
                 del st.session_state[key]
             st.rerun()


 if __name__ == "__main__":
     main()