SimranShaikh committed
Commit b8bcf74 · verified
1 Parent(s): cd9e823
Files changed (1)
  1. src/streamlit_app.py  +109 -49
src/streamlit_app.py CHANGED
@@ -7,7 +7,6 @@ os.environ['TRANSFORMERS_CACHE'] = tempfile.gettempdir()
 os.environ['HF_HOME'] = tempfile.gettempdir()
 os.environ['SENTENCE_TRANSFORMERS_HOME'] = tempfile.gettempdir()
 
-from transformers import AutoTokenizer, AutoModelForCausalLM, pipeline
 import torch
 import PyPDF2
 import docx
@@ -126,15 +125,14 @@ ANALYSIS_TYPES = {
 
 @st.cache_resource
 def load_models():
-    """Load and cache all models"""
+    """Load and cache models with better error handling"""
     try:
+        # Load embedding model first (most reliable)
+        st.info("Loading embedding model...")
         embedding_model = SentenceTransformer('all-MiniLM-L6-v2')
-        model_name = "microsoft/DialoGPT-medium"
-        tokenizer = AutoTokenizer.from_pretrained(model_name)
-        if tokenizer.pad_token is None:
-            tokenizer.pad_token = tokenizer.eos_token
-        model = AutoModelForCausalLM.from_pretrained(model_name)
 
+        # Initialize ChromaDB
+        st.info("Initializing vector database...")
         client = chromadb.Client()
         try:
             collection = client.get_collection("documents")
@@ -144,10 +142,13 @@ def load_models():
                 metadata={"hnsw:space": "cosine"}
             )
 
-        return embedding_model, tokenizer, model, collection
+        st.success("✅ Models loaded successfully!")
+        return embedding_model, collection
+
     except Exception as e:
-        st.error(f"Error loading models: {str(e)}")
-        return None, None, None, None
+        st.error(f"❌ Error loading models: {str(e)}")
+        st.error("Please check your internet connection and try refreshing the page.")
+        return None, None
 
 def validate_file(uploaded_file):
     """Validate uploaded file"""
@@ -168,7 +169,7 @@ def analyze_document_structure(text, filename):
         'filename': filename,
         'word_count': len(text.split()),
         'char_count': len(text),
-        'estimated_pages': len(text) // 2000,  # Rough estimate
+        'estimated_pages': max(1, len(text) // 2000),  # Minimum 1 page
         'has_financial_data': bool(re.search(r'\$|€|£|₹|\d+\.\d+%|\d+,\d+', text)),
         'has_tables': bool(re.search(r'\|\s*\w+\s*\|', text)),
         'sections': [],
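The max(1, ...) guard matters for short documents, which the old integer division reported as zero pages:

```python
text = "short memo"                    # 10 characters
print(len(text) // 2000)               # 0  (old estimate)
print(max(1, len(text) // 2000))       # 1  (new estimate)
```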
@@ -177,28 +178,33 @@
     }
 
     # Detect document type
-    if any(term in text.lower() for term in ['financial statement', 'balance sheet', 'income statement']):
+    text_lower = text.lower()
+    if any(term in text_lower for term in ['financial statement', 'balance sheet', 'income statement']):
         analysis['document_type'] = 'Financial Statement'
-    elif any(term in text.lower() for term in ['annual report', '10-k', '10-q']):
+    elif any(term in text_lower for term in ['annual report', '10-k', '10-q']):
         analysis['document_type'] = 'Annual Report'
-    elif any(term in text.lower() for term in ['investment', 'portfolio', 'fund']):
+    elif any(term in text_lower for term in ['investment', 'portfolio', 'fund']):
         analysis['document_type'] = 'Investment Document'
-    elif any(term in text.lower() for term in ['contract', 'agreement', 'terms']):
+    elif any(term in text_lower for term in ['contract', 'agreement', 'terms']):
         analysis['document_type'] = 'Legal Document'
+    elif any(term in text_lower for term in ['budget', 'forecast', 'projection']):
+        analysis['document_type'] = 'Financial Planning'
+    else:
+        analysis['document_type'] = 'Business Document'
 
     # Extract sections (headers)
     headers = re.findall(r'^[A-Z][A-Za-z\s]{10,50}$', text, re.MULTILINE)
     analysis['sections'] = headers[:10]  # Top 10 sections
 
     # Extract key financial terms
-    financial_terms = re.findall(r'\b(?:revenue|profit|loss|assets|liabilities|equity|cash|debt|investment|ROI|EBITDA|margin)\b', text, re.IGNORECASE)
+    financial_terms = re.findall(r'\b(?:revenue|profit|loss|assets|liabilities|equity|cash|debt|investment|ROI|EBITDA|margin|expenses|income|growth|risk|return)\b', text, re.IGNORECASE)
     analysis['key_terms'] = list(set(financial_terms))[:15]
 
     return analysis
 
 @st.cache_data
 def process_document(uploaded_file):
-    """Process uploaded document with enhanced analysis"""
+    """Process uploaded document with enhanced error handling"""
     is_valid, message = validate_file(uploaded_file)
     if not is_valid:
         raise ValueError(message)
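For a sense of what the detection patterns above actually match, here is a small standalone check (the sample text is invented for illustration):

```python
import re

sample = "Annual Report 2023. Revenue grew 12.5% to $4,200 million and the EBITDA margin improved."

has_financial_data = bool(re.search(r'\$|€|£|₹|\d+\.\d+%|\d+,\d+', sample))
terms = re.findall(
    r'\b(?:revenue|profit|loss|assets|liabilities|equity|cash|debt|investment|ROI|EBITDA|margin)\b',
    sample,
    re.IGNORECASE,
)

print(has_financial_data)                        # True: both "12.5%" and "4,200" match
print(sorted(set(t.lower() for t in terms)))     # ['ebitda', 'margin', 'revenue']
```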
@@ -218,8 +224,14 @@ def process_document(uploaded_file):
         try:
             with open(tmp_path, 'rb') as file:
                 reader = PyPDF2.PdfReader(file)
+                if len(reader.pages) == 0:
+                    raise ValueError("PDF file appears to be empty")
                 for page in reader.pages:
-                    text += page.extract_text() + "\n"
+                    page_text = page.extract_text()
+                    if page_text:
+                        text += page_text + "\n"
+            if not text.strip():
+                raise ValueError("Could not extract text from PDF")
         except Exception as e:
             raise ValueError(f"Error reading PDF: {str(e)}")
 
@@ -227,29 +239,43 @@
         try:
             doc = docx.Document(tmp_path)
             for paragraph in doc.paragraphs:
-                text += paragraph.text + "\n"
+                if paragraph.text.strip():
+                    text += paragraph.text + "\n"
+            if not text.strip():
+                raise ValueError("DOCX file appears to be empty")
         except Exception as e:
             raise ValueError(f"Error reading DOCX: {str(e)}")
 
     elif file_extension == 'txt':
         try:
+            # Try UTF-8 first
             with open(tmp_path, 'r', encoding='utf-8') as file:
                 text = file.read()
         except UnicodeDecodeError:
-            with open(tmp_path, 'r', encoding='latin-1') as file:
-                text = file.read()
+            try:
+                # Fallback to latin-1
+                with open(tmp_path, 'r', encoding='latin-1') as file:
+                    text = file.read()
+            except Exception as e:
+                raise ValueError(f"Error reading TXT file: {str(e)}")
         except Exception as e:
-            raise ValueError(f"Error reading TXT: {str(e)}")
+            raise ValueError(f"Error reading TXT file: {str(e)}")
 
     elif file_extension in ['xlsx', 'xls']:
         try:
-            df = pd.read_excel(tmp_path)
-            text = df.to_string()
+            df = pd.read_excel(tmp_path, sheet_name=0)  # Read first sheet
+            if df.empty:
+                raise ValueError("Excel file appears to be empty")
+            text = df.to_string(index=False)
         except Exception as e:
-            raise ValueError(f"Error reading Excel: {str(e)}")
+            raise ValueError(f"Error reading Excel file: {str(e)}")
 
-    if not text.strip():
-        raise ValueError("No text content found in the file")
+    if not text or not text.strip():
+        raise ValueError("No readable text content found in the file")
+
+    # Clean up text
+    text = re.sub(r'\n\s*\n', '\n\n', text)  # Remove excessive newlines
+    text = text.strip()
 
     # Analyze document structure
     analysis = analyze_document_structure(text, uploaded_file.name)
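The new cleanup step collapses runs of blank lines before the text is analysed; a quick demonstration of that substitution on a made-up string:

```python
import re

raw = "Section 1\n\n\n   \nSection 2\n"
cleaned = re.sub(r'\n\s*\n', '\n\n', raw).strip()
print(repr(cleaned))    # 'Section 1\n\nSection 2'
```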
@@ -275,12 +301,30 @@ def generate_analysis_by_type(text, analysis_type, analysis_info):
     for keyword in keywords:
         if keyword in text_lower:
             # Find context around keywords
-            pattern = rf'.{0,200}\b{keyword}\b.{0,200}'
+            pattern = rf'.{{0,200}}\b{keyword}\b.{{0,200}}'
             matches = re.findall(pattern, text, re.IGNORECASE | re.DOTALL)
-            relevant_sections.extend(matches[:3])  # Max 3 matches per keyword
+            relevant_sections.extend(matches[:2])  # Max 2 matches per keyword
 
     if not relevant_sections:
-        return f"No specific information found for {analysis_type} in this document."
+        # If no keyword matches, provide general analysis
+        words = text.split()
+        if len(words) > 500:
+            sample_text = ' '.join(words[:500]) + "..."
+        else:
+            sample_text = text
+
+        return f"""
+## {analysis_type}
+
+**Analysis Focus**: {description}
+
+**Document Analysis**:
+Based on the document content, here are the key insights related to {analysis_type.lower()}:
+
+{sample_text}
+
+**Summary**: The document has been analyzed for {analysis_type.lower()} content. While specific keywords weren't found, the above content provides relevant context for your analysis needs.
+"""
 
     # Create structured analysis
     analysis_result = f"""
@@ -293,31 +337,43 @@ def generate_analysis_by_type(text, analysis_type, analysis_info):
 
     for i, section in enumerate(relevant_sections[:5], 1):
         cleaned_section = re.sub(r'\s+', ' ', section.strip())
-        analysis_result += f"\n{i}. {cleaned_section[:300]}...\n"
+        if len(cleaned_section) > 300:
+            cleaned_section = cleaned_section[:300] + "..."
+        analysis_result += f"\n**Finding {i}**: {cleaned_section}\n"
 
-    analysis_result += f"\n**Summary**: Based on the document analysis, {len(relevant_sections)} relevant sections were identified related to {analysis_type.lower()}."
+    analysis_result += f"\n**Summary**: Based on the document analysis, {len(relevant_sections)} relevant sections were identified related to {analysis_type.lower()}. These findings provide insights into the document's content from the perspective of {description.lower()}."
 
     return analysis_result
 
 def chunk_text(text, chunk_size=1000, overlap=200):
-    """Split text into chunks"""
+    """Split text into chunks with better handling"""
     if not text or not text.strip():
         return []
 
+    # Clean text first
+    text = re.sub(r'\s+', ' ', text.strip())
+
     chunks = []
     start = 0
 
     while start < len(text):
         end = start + chunk_size
-        chunk = text[start:end]
 
-        if end < len(text):
+        if end >= len(text):
+            # Last chunk
+            chunk = text[start:]
+        else:
+            chunk = text[start:end]
+            # Try to break at sentence boundary
             last_period = chunk.rfind('.')
-            if last_period > chunk_size * 0.7:
-                end = start + last_period + 1
+            last_newline = chunk.rfind('\n')
+            break_point = max(last_period, last_newline)
+
+            if break_point > chunk_size * 0.5:  # If we found a good break point
+                end = start + break_point + 1
                 chunk = text[start:end]
 
-        if chunk.strip():
+        if chunk.strip() and len(chunk.strip()) > 50:  # Only add substantial chunks
            chunks.append(chunk.strip())
 
         start = end - overlap
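With the revised loop, each chunk prefers to end at a sentence or line boundary when one falls past the midpoint of the window, and fragments shorter than about 50 characters are dropped. A rough usage sketch against the function above (the sample text is synthetic; exact boundaries depend on the input):

```python
sample = ("This is sentence one. " * 30).strip()        # roughly 660 characters
pieces = chunk_text(sample, chunk_size=200, overlap=50)

print(len(pieces))                                      # a few overlapping chunks (5 for this sample)
print(all(p.endswith('.') for p in pieces))             # True: each chunk ends on a sentence boundary
```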
@@ -328,13 +384,15 @@ def chunk_text(text, chunk_size=1000, overlap=200):
     return chunks
 
 def search_documents(query, collection, embedding_model, n_results=3):
-    """Search for relevant document chunks"""
+    """Search for relevant document chunks with better error handling"""
     try:
         if collection.count() == 0:
             return []
-
+
+        # Generate query embedding
         query_embedding = embedding_model.encode([query]).tolist()
 
+        # Search the collection
         results = collection.query(
             query_embeddings=query_embedding,
             n_results=min(n_results, collection.count()),
@@ -346,7 +404,7 @@
         for i in range(len(results['documents'][0])):
             search_results.append({
                 'content': results['documents'][0][i],
-                'metadata': results['metadatas'][0][i],
+                'metadata': results['metadatas'][0][i] if results['metadatas'][0] else {},
                 'score': 1 - results['distances'][0][i] if results['distances'][0][i] else 1.0
             })
 
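Because the collection is created with "hnsw:space": "cosine", the distances returned by query() are cosine distances, and the expression above converts them back into a similarity-style score (a falsy distance of 0.0, i.e. an exact match, also ends up as 1.0). A tiny illustration of the mapping, with invented values:

```python
distances = [0.0, 0.25, 0.9]                      # cosine distances as ChromaDB would report them
scores = [1 - d if d else 1.0 for d in distances]
print(scores)                                     # [1.0, 0.75, 0.09999999999999998]
```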
@@ -361,7 +419,7 @@ def main():
 
     st.markdown("""
     <div style="text-align: center; font-size: 1.2rem; color: #666; margin-bottom: 2rem;">
-        🚀 Powered by IBM Granite Models | 📊 Advanced Document Intelligence | 🔒 Secure & Compliant
+        🚀 Powered by Advanced AI | 📊 Document Intelligence | 🔒 Secure & Compliant
     </div>
     """, unsafe_allow_html=True)
 
@@ -369,9 +427,10 @@
     with st.spinner("🔄 Loading AI models..."):
         models = load_models()
     if models[0] is None:
-        st.error("Failed to load AI models. Please refresh the page.")
-        return
-    embedding_model, tokenizer, model, collection = models
+        st.error("❌ Failed to load AI models. Please refresh the page and check your internet connection.")
+        st.stop()
+
+    embedding_model, collection = models
 
     # Sidebar for document management
     with st.sidebar:
@@ -425,13 +484,14 @@
             chunk_id = f"{filename}_{j}_{uuid.uuid4().hex[:8]}"
             embedding = embedding_model.encode([chunk]).tolist()
 
-            collection.add(
+            collection.upsert(
                 embeddings=embedding,
                 documents=[chunk],
                 metadatas=[{'filename': filename, 'chunk_id': j}],
                 ids=[chunk_id]
             )
         except Exception as e:
+            st.warning(f"Warning: Could not process chunk {j} of {filename}")
             continue
 
         st.success(f"✅ {filename}")
@@ -593,7 +653,7 @@
     **Query**: {query}
 
     **Key Findings**:
-    {context[:1000]}...
+    {context[:1500]}...
 
     **Summary**: Based on analysis of {len(search_results)} relevant sections from {len(source_files)} document(s), the information above directly addresses your question.
 
@@ -650,10 +710,10 @@
     st.header("🎯 Project Info")
 
     st.markdown("""
-    ### **Built For IBM Hackathon**
+    ### **Enterprise AI Assistant**
 
     **🔧 Technology Stack:**
-    - 🧠 IBM Granite Models
+    - 🧠 Advanced AI Models
     - 🔍 RAG (Retrieval-Augmented Generation)
     - 📊 Streamlit UI
     - 🗄️ ChromaDB Vector Database