Spaces:

SimranShaikh
/

enterprise-rag-assistant

Sleeping

App Files Files Community

SimranShaikh committed on Jun 29, 2025

Commit

4088bc5

verified ·

1 Parent(s): c6df280

commit

Browse files

Files changed (1) hide show

src/streamlit_app.py +218 -106

src/streamlit_app.py CHANGED Viewed

@@ -7,7 +7,6 @@ os.environ['TRANSFORMERS_CACHE'] = tempfile.gettempdir()
 os.environ['HF_HOME'] = tempfile.gettempdir()
 os.environ['SENTENCE_TRANSFORMERS_HOME'] = tempfile.gettempdir()
 from transformers import AutoTokenizer, AutoModelForCausalLM, pipeline
 import torch
 import PyPDF2
@@ -19,7 +18,7 @@ from chromadb.config import Settings
 import tempfile
 import uuid
-# Page config
 st.set_page_config(
     page_title="FinanceGPT - Enterprise AI Assistant",
     page_icon="💰",
@@ -55,69 +54,127 @@ st.markdown("""
 @st.cache_resource
 def load_models():
     """Load and cache all models"""
-    # Initialize embedding model
-    embedding_model = SentenceTransformer('all-MiniLM-L6-v2')
-    # Initialize Granite model (using a smaller model for demo)
-    model_name = "microsoft/DialoGPT-medium"  # Fallback for demo
-    tokenizer = AutoTokenizer.from_pretrained(model_name)
-    model = AutoModelForCausalLM.from_pretrained(model_name)
-    # Initialize vector database (in-memory for HF Spaces)
-    client = chromadb.Client()
-    collection = client.create_collection(
-        name="documents",
-        metadata={"hnsw:space": "cosine"}
-    )
-    return embedding_model, tokenizer, model, collection
 @st.cache_data
 def process_document(uploaded_file):
-    """Process uploaded document"""
-    # Create temporary file
-    with tempfile.NamedTemporaryFile(delete=False, suffix=f".{uploaded_file.name.split('.')[-1]}") as tmp_file:
-        tmp_file.write(uploaded_file.getvalue())
-        tmp_path = tmp_file.name
     try:
         file_extension = uploaded_file.name.split('.')[-1].lower()
         if file_extension == 'pdf':
-            with open(tmp_path, 'rb') as file:
-                reader = PyPDF2.PdfReader(file)
-                text = ""
-                for page in reader.pages:
-                    text += page.extract_text() + "\n"
         elif file_extension == 'docx':
-            doc = docx.Document(tmp_path)
-            text = ""
-            for paragraph in doc.paragraphs:
-                text += paragraph.text + "\n"
         elif file_extension == 'txt':
-            with open(tmp_path, 'r', encoding='utf-8') as file:
-                text = file.read()
         elif file_extension in ['xlsx', 'xls']:
-            df = pd.read_excel(tmp_path)
-            text = df.to_string()
         else:
-            text = "Unsupported file format"
         return text, uploaded_file.name
     finally:
-        # Clean up
-        if os.path.exists(tmp_path):
-            os.remove(tmp_path)
 def chunk_text(text, chunk_size=1000, overlap=200):
     """Split text into chunks"""
     chunks = []
     start = 0
@@ -131,7 +188,9 @@ def chunk_text(text, chunk_size=1000, overlap=200):
                 end = start + last_period + 1
                 chunk = text[start:end]
-        chunks.append(chunk.strip())
         start = end - overlap
         if start >= len(text):
@@ -142,33 +201,45 @@ def chunk_text(text, chunk_size=1000, overlap=200):
 def search_documents(query, collection, embedding_model, n_results=3):
     """Search for relevant document chunks"""
     try:
         query_embedding = embedding_model.encode([query]).tolist()
         results = collection.query(
             query_embeddings=query_embedding,
-            n_results=n_results,
             include=['documents', 'metadatas', 'distances']
         )
         search_results = []
-        for i in range(len(results['documents'][0])):
-            search_results.append({
-                'content': results['documents'][0][i],
-                'metadata': results['metadatas'][0][i],
-                'score': 1 - results['distances'][0][i]
-            })
         return search_results
-    except:
         return []
 def generate_response(query, context_chunks):
     """Generate response using available model"""
     # Build context
     context = ""
     for i, chunk in enumerate(context_chunks):
-        context += f"[Document {i+1}: {chunk['metadata']['filename']}]\n"
         context += f"{chunk['content'][:500]}...\n\n"
     # For demo purposes, create a structured response
@@ -178,9 +249,9 @@ def generate_response(query, context_chunks):
 {context[:800]}...
 💡 **Analysis:**
-The documents contain relevant information that addresses your question. The most relevant sections have been identified and analyzed.
-📚 **Sources:** {len(context_chunks)} document sections were used to generate this response.
 """
     return response
@@ -196,60 +267,92 @@ def main():
     </div>
     """, unsafe_allow_html=True)
-    # Load models
     with st.spinner("🔄 Loading AI models..."):
-        embedding_model, tokenizer, model, collection = load_models()
     # Sidebar for document upload
     with st.sidebar:
         st.header("📁 Document Management")
         st.markdown("Upload your financial documents to get started!")
         uploaded_files = st.file_uploader(
             "Choose files",
             accept_multiple_files=True,
             type=['pdf', 'docx', 'txt', 'xlsx'],
-            help="Supported formats: PDF, DOCX, TXT, XLSX"
         )
         if uploaded_files:
-            st.success(f"✅ {len(uploaded_files)} files uploaded!")
-            if st.button("🔄 Process Documents", type="primary"):
-                progress_bar = st.progress(0)
-                status_text = st.empty()
-                for i, file in enumerate(uploaded_files):
-                    status_text.text(f"Processing {file.name}...")
-                    try:
-                        # Process document
-                        text, filename = process_document(file)
-                        # Create chunks
-                        chunks = chunk_text(text)
-                        # Generate embeddings and store
-                        for j, chunk in enumerate(chunks):
-                            chunk_id = f"{filename}_{j}"
-                            embedding = embedding_model.encode([chunk]).tolist()
-                            collection.add(
-                                embeddings=embedding,
-                                documents=[chunk],
-                                metadatas=[{'filename': filename, 'chunk_id': j}],
-                                ids=[chunk_id]
-                            )
-                        st.success(f"✅ {filename}")
-                    except Exception as e:
-                        st.error(f"❌ Error processing {file.name}: {str(e)}")
-                    progress_bar.progress((i + 1) / len(uploaded_files))
-                status_text.text("✅ All documents processed!")
-                st.balloons()
     # Main interface
     col1, col2 = st.columns([2, 1])
@@ -289,24 +392,30 @@ def main():
                 return
             with st.spinner("🤖 Analyzing documents and generating response..."):
-                # Search for relevant context
-                search_results = search_documents(query, collection, embedding_model)
-                if search_results:
-                    # Generate response
-                    response = generate_response(query, search_results)
-                    # Display response
-                    st.markdown("### 🤖 AI Response")
-                    st.markdown(f'<div class="chat-message">{response}</div>', unsafe_allow_html=True)
-                    # Show sources
-                    st.markdown("### 📚 Sources")
-                    for i, result in enumerate(search_results):
-                        with st.expander(f"📄 Source {i+1}: {result['metadata']['filename']} (Relevance: {result['score']:.1%})"):
-                            st.markdown(f'<div class="source-box">{result["content"][:500]}...</div>', unsafe_allow_html=True)
-                else:
-                    st.error("❌ No relevant information found in the uploaded documents.")
     with col2:
         st.header("📊 Project Info")
@@ -337,9 +446,11 @@ def main():
         """)
         # Stats
-        if 'collection' in locals():
             doc_count = collection.count()
-            st.metric("📄 Documents Processed", doc_count)
         # Demo link
         st.markdown("""
@@ -348,9 +459,10 @@ def main():
         This is a fully functional prototype!
         **Try it:**
-        1. Upload financial documents
-        2. Ask intelligent questions
-        3. Get instant answers with sources
         """)
 if __name__ == "__main__":

 os.environ['HF_HOME'] = tempfile.gettempdir()
 os.environ['SENTENCE_TRANSFORMERS_HOME'] = tempfile.gettempdir()
 from transformers import AutoTokenizer, AutoModelForCausalLM, pipeline
 import torch
 import PyPDF2
 import tempfile
 import uuid
+# Page config - ADD FILE SIZE LIMIT
 st.set_page_config(
     page_title="FinanceGPT - Enterprise AI Assistant",
     page_icon="💰",
 @st.cache_resource
 def load_models():
     """Load the embedding model, the language model and the vector store.

     Builds a SentenceTransformer embedder, a tokenizer / causal-LM pair and
     an in-memory Chroma collection named "documents" configured for cosine
     distance. The function is wrapped in ``st.cache_resource``.

     Returns:
         tuple: ``(embedding_model, tokenizer, model, collection)`` on
         success, or ``(None, None, None, None)`` after the failure has been
         reported through ``st.error``.
     """
     try:
         embedding_model = SentenceTransformer('all-MiniLM-L6-v2')

         # Smaller conversational model used as the demo fallback.
         model_name = "microsoft/DialoGPT-medium"
         tokenizer = AutoTokenizer.from_pretrained(model_name)
         if tokenizer.pad_token is None:
             # Reuse EOS as the pad token when the tokenizer defines none.
             tokenizer.pad_token = tokenizer.eos_token
         model = AutoModelForCausalLM.from_pretrained(model_name)

         # In-memory vector database (chosen for HF Spaces).
         client = chromadb.Client()
         # A single get-or-create call replaces get_collection() guarded by a
         # bare ``except:``, which would also have hidden unrelated failures.
         collection = client.get_or_create_collection(
             name="documents",
             metadata={"hnsw:space": "cosine"},
         )
         return embedding_model, tokenizer, model, collection
     except Exception as e:
         # UI boundary: callers test the first element for None.
         st.error(f"Error loading models: {str(e)}")
         return None, None, None, None
+# FIX: Add file validation function
def validate_file(uploaded_file, max_size=50 * 1024 * 1024):
    """Check an uploaded file's size and extension before processing.

    Args:
        uploaded_file: Object exposing ``name`` (str) and ``size`` (bytes).
        max_size: Largest accepted size in bytes; defaults to 50MB.

    Returns:
        tuple[bool, str]: ``(True, "Valid file")`` when accepted, otherwise
        ``(False, reason)`` with a user-facing explanation.
    """
    if uploaded_file.size > max_size:
        limit_mb = max_size / (1024 * 1024)
        return False, f"File {uploaded_file.name} is too large. Maximum size is {limit_mb:g}MB."

    allowed_extensions = {'pdf', 'docx', 'txt', 'xlsx', 'xls'}
    # rpartition reveals whether the name contains a dot at all: a bare name
    # such as "pdf" must not be mistaken for a file with the .pdf extension.
    _, dot, suffix = uploaded_file.name.rpartition('.')
    if not dot:
        return False, f"File {uploaded_file.name} has no file extension."

    file_extension = suffix.lower()
    if file_extension not in allowed_extensions:
        return False, f"File type .{file_extension} is not supported."
    return True, "Valid file"
 @st.cache_data
 def process_document(uploaded_file):
     """Extract plain text from an uploaded PDF, DOCX, TXT or Excel file.

     The upload is validated, copied to a temporary file, parsed according to
     its extension, and the temporary file is removed afterwards.

     Args:
         uploaded_file: Upload object providing ``name`` and ``getvalue()``
             (plus whatever ``validate_file`` reads from it).

     Returns:
         tuple[str, str]: ``(text, uploaded_file.name)``.

     Raises:
         ValueError: If validation fails, the temporary file cannot be
             written, a parser fails, the format is unsupported, or the file
             yields no text. Parser and temp-file errors keep the original
             exception as ``__cause__``.
     """
     is_valid, message = validate_file(uploaded_file)
     if not is_valid:
         raise ValueError(message)

     # Derived once: the raw form names the temp file, the lowered form
     # selects the parser.
     raw_extension = uploaded_file.name.split('.')[-1]
     file_extension = raw_extension.lower()

     try:
         with tempfile.NamedTemporaryFile(delete=False, suffix=f".{raw_extension}") as tmp_file:
             tmp_file.write(uploaded_file.getvalue())
             tmp_path = tmp_file.name
     except Exception as e:
         raise ValueError(f"Failed to create temporary file: {str(e)}") from e

     try:
         text = ""

         if file_extension == 'pdf':
             try:
                 with open(tmp_path, 'rb') as file:
                     reader = PyPDF2.PdfReader(file)
                     # ``or ""`` tolerates a page whose extract_text() result
                     # is None or empty instead of failing the whole file.
                     text = "".join(
                         (page.extract_text() or "") + "\n"
                         for page in reader.pages
                     )
             except Exception as e:
                 raise ValueError(f"Error reading PDF: {str(e)}") from e

         elif file_extension == 'docx':
             try:
                 doc = docx.Document(tmp_path)
                 text = "".join(paragraph.text + "\n" for paragraph in doc.paragraphs)
             except Exception as e:
                 raise ValueError(f"Error reading DOCX: {str(e)}") from e

         elif file_extension == 'txt':
             try:
                 with open(tmp_path, 'r', encoding='utf-8') as file:
                     text = file.read()
             except UnicodeDecodeError:
                 # Not UTF-8: retry with latin-1.
                 with open(tmp_path, 'r', encoding='latin-1') as file:
                     text = file.read()
             except Exception as e:
                 raise ValueError(f"Error reading TXT: {str(e)}") from e

         elif file_extension in ['xlsx', 'xls']:
             try:
                 df = pd.read_excel(tmp_path)
                 text = df.to_string()
             except Exception as e:
                 raise ValueError(f"Error reading Excel: {str(e)}") from e

         else:
             raise ValueError("Unsupported file format")

         if not text.strip():
             raise ValueError("No text content found in the file")

         return text, uploaded_file.name
     finally:
         # Best-effort cleanup: a failed delete must not mask the result or
         # the real error, but only filesystem errors are ignored.
         try:
             os.remove(tmp_path)
         except OSError:
             pass
 def chunk_text(text, chunk_size=1000, overlap=200):
     """Split text into chunks"""
+    if not text or not text.strip():
+        return []
     chunks = []
     start = 0
                 end = start + last_period + 1
                 chunk = text[start:end]
+        if chunk.strip():  # Only add non-empty chunks
+            chunks.append(chunk.strip())
         start = end - overlap
         if start >= len(text):
 def search_documents(query, collection, embedding_model, n_results=3):
     """Return the stored chunks most similar to *query*.

     Args:
         query: Question text to embed.
         collection: Vector collection providing ``count()`` and ``query()``.
         embedding_model: Model whose ``encode([...])`` result has ``tolist()``.
         n_results: Upper bound on the number of hits requested.

     Returns:
         list[dict]: One dict per hit with keys ``content``, ``metadata`` and
         ``score`` (``1 - distance``, or ``1.0`` when the distance is falsy).
         Empty when the collection is empty, nothing matches, or the search
         raises (the error is reported through ``st.error``).
     """
     try:
         # Read the size once: it both gates the search and caps n_results.
         total = collection.count()
         if total == 0:
             return []

         query_embedding = embedding_model.encode([query]).tolist()
         results = collection.query(
             query_embeddings=query_embedding,
             n_results=min(n_results, total),
             include=['documents', 'metadatas', 'distances']
         )

         documents = results['documents']
         if not documents or not documents[0]:
             return []

         # Results are nested per query; only one query was sent, so use [0].
         return [
             {
                 'content': content,
                 'metadata': metadata,
                 'score': 1 - distance if distance else 1.0,
             }
             for content, metadata, distance in zip(
                 documents[0], results['metadatas'][0], results['distances'][0]
             )
         ]
     except Exception as e:
         st.error(f"Search error: {str(e)}")
         return []
 def generate_response(query, context_chunks):
     """Generate response using available model"""
+    if not context_chunks:
+        return "No relevant information found in the uploaded documents."
     # Build context
     context = ""
+    source_files = set()
     for i, chunk in enumerate(context_chunks):
+        filename = chunk['metadata'].get('filename', 'Unknown')
+        source_files.add(filename)
+        context += f"[Document {i+1}: {filename}]\n"
         context += f"{chunk['content'][:500]}...\n\n"
     # For demo purposes, create a structured response
 {context[:800]}...
 💡 **Analysis:**
+The documents contain relevant information that addresses your question. The analysis is based on {len(context_chunks)} relevant sections from your uploaded documents.
+📚 **Sources:** {len(source_files)} document(s) - {', '.join(source_files)}
 """
     return response
     </div>
     """, unsafe_allow_html=True)
+    # Load models with error handling
     with st.spinner("🔄 Loading AI models..."):
+        models = load_models()
+        if models[0] is None:
+            st.error("Failed to load AI models. Please refresh the page.")
+            return
+        embedding_model, tokenizer, model, collection = models
     # Sidebar for document upload
     with st.sidebar:
         st.header("📁 Document Management")
         st.markdown("Upload your financial documents to get started!")
+        # ADD FILE SIZE WARNING
+        st.info("📋 **File Requirements:**\n- Max size: 50MB per file\n- Formats: PDF, DOCX, TXT, XLSX")
         uploaded_files = st.file_uploader(
             "Choose files",
             accept_multiple_files=True,
             type=['pdf', 'docx', 'txt', 'xlsx'],
+            help="Supported formats: PDF, DOCX, TXT, XLSX (Max 50MB each)"
         )
         if uploaded_files:
+            # Validate files before processing
+            valid_files = []
+            for file in uploaded_files:
+                is_valid, message = validate_file(file)
+                if is_valid:
+                    valid_files.append(file)
+                else:
+                    st.error(f"❌ {message}")
+            if valid_files:
+                st.success(f"✅ {len(valid_files)} valid files ready for processing!")
+                if st.button("🔄 Process Documents", type="primary"):
+                    progress_bar = st.progress(0)
+                    status_text = st.empty()
+                    processed_count = 0
+                    for i, file in enumerate(valid_files):
+                        status_text.text(f"Processing {file.name}...")
+                        try:
+                            # Process document
+                            text, filename = process_document(file)
+                            # Create chunks
+                            chunks = chunk_text(text)
+                            if not chunks:
+                                st.warning(f"⚠️ No content extracted from {filename}")
+                                continue
+                            # Generate embeddings and store
+                            for j, chunk in enumerate(chunks):
+                                try:
+                                    chunk_id = f"{filename}_{j}_{uuid.uuid4().hex[:8]}"
+                                    embedding = embedding_model.encode([chunk]).tolist()
+                                    collection.add(
+                                        embeddings=embedding,
+                                        documents=[chunk],
+                                        metadatas=[{'filename': filename, 'chunk_id': j}],
+                                        ids=[chunk_id]
+                                    )
+                                except Exception as e:
+                                    st.warning(f"⚠️ Error adding chunk {j} from {filename}: {str(e)}")
+                                    continue
+                            st.success(f"✅ {filename} ({len(chunks)} chunks)")
+                            processed_count += 1
+                        except Exception as e:
+                            st.error(f"❌ Error processing {file.name}: {str(e)}")
+                        progress_bar.progress((i + 1) / len(valid_files))
+                    if processed_count > 0:
+                        status_text.text(f"✅ {processed_count} documents processed successfully!")
+                        st.balloons()
+                    else:
+                        status_text.text("❌ No documents were processed successfully.")
+            else:
+                st.error("❌ No valid files to process!")
     # Main interface
     col1, col2 = st.columns([2, 1])
                 return
             with st.spinner("🤖 Analyzing documents and generating response..."):
+                try:
+                    # Search for relevant context
+                    search_results = search_documents(query, collection, embedding_model)
+                    if search_results:
+                        # Generate response
+                        response = generate_response(query, search_results)
+                        # Display response
+                        st.markdown("### 🤖 AI Response")
+                        st.markdown(f'<div class="chat-message">{response}</div>', unsafe_allow_html=True)
+                        # Show sources
+                        st.markdown("### 📚 Sources")
+                        for i, result in enumerate(search_results):
+                            score_percent = f"{result['score']:.1%}" if result['score'] else "N/A"
+                            filename = result['metadata'].get('filename', 'Unknown')
+                            with st.expander(f"📄 Source {i+1}: {filename} (Relevance: {score_percent})"):
+                                st.markdown(f'<div class="source-box">{result["content"][:500]}...</div>', unsafe_allow_html=True)
+                    else:
+                        st.error("❌ No relevant information found in the uploaded documents.")
+                except Exception as e:
+                    st.error(f"❌ Error processing your question: {str(e)}")
     with col2:
         st.header("📊 Project Info")
         """)
         # Stats
+        try:
             doc_count = collection.count()
+            st.metric("📄 Document Chunks", doc_count)
+        except:
+            st.metric("📄 Document Chunks", 0)
         # Demo link
         st.markdown("""
         This is a fully functional prototype!
         **Try it:**
+        1. Upload financial documents (max 50MB each)
+        2. Process the documents
+        3. Ask intelligent questions
+        4. Get instant answers with sources
         """)
 if __name__ == "__main__":