SimranShaikh committed on
Commit cd9e823 · verified · 1 Parent(s): 203cee9
Files changed (1):
  1. src/streamlit_app.py +582 -351
src/streamlit_app.py CHANGED
@@ -1,19 +1,159 @@
 import streamlit as st
 import os
 import tempfile
 import PyPDF2
- import fitz  # PyMuPDF - better for large PDFs
- import io
- from PIL import Image
- import pytesseract  # For OCR if needed
 
- # Enhanced file validation with configurable size limits
- def validate_file(uploaded_file, max_size_mb=100):  # Increased default to 100MB
-     """Enhanced file validation with configurable size limits"""
-     max_size = max_size_mb * 1024 * 1024  # Convert MB to bytes
-
     if uploaded_file.size > max_size:
-         return False, f"File {uploaded_file.name} is too large. Maximum size is {max_size_mb}MB."
 
     allowed_extensions = ['pdf', 'docx', 'txt', 'xlsx', 'xls']
     file_extension = uploaded_file.name.split('.')[-1].lower()
@@ -22,120 +162,47 @@ def validate_file(uploaded_file, max_size_mb=100): # Increased default to 100MB
 
     return True, "Valid file"
 
- def process_large_pdf_pymupdf(file_path, max_pages=None, progress_callback=None):
-     """
-     Process large PDF using PyMuPDF (faster and more memory efficient)
-     """
-     text = ""
-     try:
-         doc = fitz.open(file_path)
-         total_pages = len(doc)
-
-         # Limit pages if specified
-         pages_to_process = min(total_pages, max_pages) if max_pages else total_pages
-
-         for page_num in range(pages_to_process):
-             if progress_callback:
-                 progress_callback(page_num + 1, pages_to_process)
-
-             page = doc[page_num]
-             page_text = page.get_text()
-
-             # If page has very little text, try OCR on images
-             if len(page_text.strip()) < 50:
-                 try:
-                     # Get page as image and apply OCR
-                     pix = page.get_pixmap()
-                     img_data = pix.tobytes("png")
-                     img = Image.open(io.BytesIO(img_data))
-                     ocr_text = pytesseract.image_to_string(img)
-                     if len(ocr_text.strip()) > len(page_text.strip()):
-                         page_text = ocr_text
-                 except Exception as e:
-                     # OCR failed, continue with extracted text
-                     pass
-
-             text += f"\n--- Page {page_num + 1} ---\n{page_text}\n"
-
-         doc.close()
-         return text, total_pages
-
-     except Exception as e:
-         raise ValueError(f"Error processing PDF with PyMuPDF: {str(e)}")
-
- def process_large_pdf_streaming(file_path, chunk_size=1024*1024, max_pages=None):
-     """
-     Process large PDF in streaming fashion to handle memory constraints
-     """
-     text = ""
-     try:
-         with open(file_path, 'rb') as file:
-             reader = PyPDF2.PdfReader(file)
-             total_pages = len(reader.pages)
-
-             # Limit pages if specified
-             pages_to_process = min(total_pages, max_pages) if max_pages else total_pages
-
-             for page_num in range(pages_to_process):
-                 try:
-                     page = reader.pages[page_num]
-                     page_text = page.extract_text()
-                     text += f"\n--- Page {page_num + 1} ---\n{page_text}\n"
-
-                     # Yield control periodically to prevent blocking
-                     if page_num % 10 == 0:  # Every 10 pages
-                         # In Streamlit, you might want to update progress here
-                         pass
-
-                 except Exception as e:
-                     # Skip problematic pages
-                     text += f"\n--- Page {page_num + 1} (Error) ---\nError extracting text: {str(e)}\n"
-                     continue
-
-             return text, total_pages
-
-     except Exception as e:
-         raise ValueError(f"Error processing PDF with streaming: {str(e)}")
-
- def compress_pdf_text(text, compression_ratio=0.7):
-     """
-     Compress extracted text by removing redundant content
-     """
-     lines = text.split('\n')
-     compressed_lines = []
-     seen_lines = set()
 
-     for line in lines:
-         # Remove extra whitespace
-         cleaned_line = ' '.join(line.split())
-
-         # Skip empty lines and very short lines
-         if len(cleaned_line) < 3:
-             continue
-
-         # Skip duplicate lines (common in headers/footers)
-         if cleaned_line in seen_lines:
-             continue
-
-         seen_lines.add(cleaned_line)
-         compressed_lines.append(cleaned_line)
-
-         # Stop if we've reached the compression target
-         if len(compressed_lines) >= len(lines) * compression_ratio:
-             break
 
-     return '\n'.join(compressed_lines)
 
 @st.cache_data
- def process_document_enhanced(uploaded_file, max_size_mb=100, max_pages=None, use_compression=True):
-     """
-     Enhanced document processing with support for larger files
-     """
-     is_valid, message = validate_file(uploaded_file, max_size_mb)
     if not is_valid:
         raise ValueError(message)
 
-     # Create temporary file
     try:
         with tempfile.NamedTemporaryFile(delete=False, suffix=f".{uploaded_file.name.split('.')[-1]}") as tmp_file:
             tmp_file.write(uploaded_file.getvalue())
@@ -146,58 +213,24 @@ def process_document_enhanced(uploaded_file, max_size_mb=100, max_pages=None, us
     try:
         file_extension = uploaded_file.name.split('.')[-1].lower()
         text = ""
-         total_pages = 0
 
         if file_extension == 'pdf':
-             # Progress callback for Streamlit
-             progress_bar = st.progress(0)
-             status_text = st.empty()
-
-             def update_progress(current_page, total_pages):
-                 progress = current_page / total_pages
-                 progress_bar.progress(progress)
-                 status_text.text(f"Processing page {current_page} of {total_pages}")
-
-             # Try PyMuPDF first (better for large files)
             try:
-                 status_text.text("Using PyMuPDF for better performance...")
-                 text, total_pages = process_large_pdf_pymupdf(
-                     tmp_path,
-                     max_pages,
-                     update_progress
-                 )
             except Exception as e:
-                 # Fallback to PyPDF2 with streaming
-                 status_text.text("Falling back to PyPDF2...")
-                 text, total_pages = process_large_pdf_streaming(tmp_path, max_pages=max_pages)
-
-             # Clean up progress indicators
-             progress_bar.empty()
-             status_text.empty()
-
         elif file_extension == 'docx':
-             # Handle large DOCX files
             try:
-                 import docx
                 doc = docx.Document(tmp_path)
-                 paragraphs_processed = 0
-                 total_paragraphs = len(doc.paragraphs)
-
-                 progress_bar = st.progress(0)
-                 for i, paragraph in enumerate(doc.paragraphs):
                     text += paragraph.text + "\n"
-                     paragraphs_processed += 1
-
-                     # Update progress every 100 paragraphs
-                     if paragraphs_processed % 100 == 0:
-                         progress_bar.progress(paragraphs_processed / total_paragraphs)
-
-                 progress_bar.empty()
-
             except Exception as e:
                 raise ValueError(f"Error reading DOCX: {str(e)}")
 
-         # Handle other file types (TXT, Excel) - existing code
         elif file_extension == 'txt':
             try:
                 with open(tmp_path, 'r', encoding='utf-8') as file:
@@ -205,10 +238,11 @@ def process_document_enhanced(uploaded_file, max_size_mb=100, max_pages=None, us
             except UnicodeDecodeError:
                 with open(tmp_path, 'r', encoding='latin-1') as file:
                     text = file.read()
 
         elif file_extension in ['xlsx', 'xls']:
             try:
-                 import pandas as pd
                 df = pd.read_excel(tmp_path)
                 text = df.to_string()
             except Exception as e:
@@ -217,234 +251,431 @@ def process_document_enhanced(uploaded_file, max_size_mb=100, max_pages=None, us
         if not text.strip():
             raise ValueError("No text content found in the file")
 
-         # Apply compression if requested
-         if use_compression and len(text) > 50000:  # Compress if text > 50k chars
-             original_length = len(text)
-             text = compress_pdf_text(text)
-             st.info(f"📊 Text compressed: {original_length:,} → {len(text):,} characters")
-
-         # Enhanced analysis
-         analysis = analyze_document_structure_enhanced(text, uploaded_file.name, total_pages)
 
         return text, uploaded_file.name, analysis
 
     finally:
-         # Clean up temporary file
         try:
             if os.path.exists(tmp_path):
                 os.remove(tmp_path)
         except:
             pass
 
- def analyze_document_structure_enhanced(text, filename, total_pages=0):
-     """Enhanced document structure analysis"""
-     import re
-
-     analysis = {
-         'filename': filename,
-         'word_count': len(text.split()),
-         'char_count': len(text),
-         'total_pages': total_pages,
-         'estimated_pages': total_pages or len(text) // 2000,
-         'has_financial_data': bool(re.search(r'\$|€|£|₹|\d+\.\d+%|\d+,\d+', text)),
-         'has_tables': bool(re.search(r'\|\s*\w+\s*\|', text)),
-         'sections': [],
-         'key_terms': [],
-         'document_type': 'Unknown',
-         'language_detected': 'English',  # You could add language detection here
-         'complexity_score': 0
-     }
-
-     # Calculate complexity score based on various factors
-     complexity_factors = [
-         len(text) > 100000,  # Very long document
-         analysis['has_financial_data'],  # Contains financial data
-         analysis['has_tables'],  # Contains tables
-         len(re.findall(r'\d+', text)) > 1000,  # Many numbers
-         len(re.findall(r'[A-Z]{2,}', text)) > 100,  # Many acronyms
-     ]
-     analysis['complexity_score'] = sum(complexity_factors)
 
-     # Detect document type with more sophisticated rules
     text_lower = text.lower()
-     if any(term in text_lower for term in ['financial statement', 'balance sheet', 'income statement', 'cash flow']):
-         analysis['document_type'] = 'Financial Statement'
-     elif any(term in text_lower for term in ['annual report', '10-k', '10-q', 'sec filing']):
-         analysis['document_type'] = 'Annual Report'
-     elif any(term in text_lower for term in ['investment', 'portfolio', 'fund', 'prospectus']):
-         analysis['document_type'] = 'Investment Document'
-     elif any(term in text_lower for term in ['contract', 'agreement', 'terms', 'legal']):
-         analysis['document_type'] = 'Legal Document'
-     elif any(term in text_lower for term in ['research', 'analysis', 'study', 'report']):
-         analysis['document_type'] = 'Research Report'
-
-     # Extract sections (improved)
-     headers = re.findall(r'^[A-Z][A-Za-z\s]{5,50}$', text, re.MULTILINE)
-     # Also look for numbered sections
-     numbered_sections = re.findall(r'^\d+\.\s+[A-Z][A-Za-z\s]{5,50}$', text, re.MULTILINE)
 
-     all_headers = list(set(headers + numbered_sections))
-     analysis['sections'] = all_headers[:15]  # Top 15 sections
 
-     # Extract key financial and business terms
-     important_terms = re.findall(
-         r'\b(?:revenue|profit|loss|assets|liabilities|equity|cash|debt|investment|ROI|EBITDA|margin|growth|risk|compliance|strategy|market|competition|valuation|dividend|earnings|expenses|budget|forecast)\b',
-         text,
-         re.IGNORECASE
-     )
-     analysis['key_terms'] = list(set(important_terms))[:20]
 
-     return analysis
 
- # Configuration options for the sidebar
- def add_file_processing_options():
-     """Add file processing options to sidebar"""
-     st.sidebar.markdown("### ⚙️ Processing Options")
 
-     # File size limit
-     max_size = st.sidebar.slider(
-         "Max File Size (MB)",
-         min_value=10,
-         max_value=500,
-         value=100,
-         step=10,
-         help="Increase for larger files, but may consume more memory"
-     )
 
-     # Page limit for PDFs
-     limit_pages = st.sidebar.checkbox("Limit PDF Pages", value=False)
-     max_pages = None
-     if limit_pages:
-         max_pages = st.sidebar.number_input(
-             "Max Pages to Process",
-             min_value=1,
-             max_value=1000,
-             value=100,
-             help="Process only first N pages to save time and memory"
-         )
 
-     # Text compression
-     use_compression = st.sidebar.checkbox(
-         "Enable Text Compression",
-         value=True,
-         help="Compress extracted text to reduce memory usage"
-     )
 
-     # Processing method for PDFs
-     pdf_method = st.sidebar.selectbox(
-         "PDF Processing Method",
-         ["PyMuPDF (Recommended)", "PyPDF2 (Fallback)"],
-         help="PyMuPDF is faster and more reliable for large files"
-     )
 
-     return {
-         'max_size_mb': max_size,
-         'max_pages': max_pages,
-         'use_compression': use_compression,
-         'pdf_method': pdf_method
-     }
 
- # Memory management utilities
- def get_memory_usage():
-     """Get current memory usage (if psutil is available)"""
     try:
-         import psutil
-         process = psutil.Process()
-         memory_mb = process.memory_info().rss / 1024 / 1024
-         return f"{memory_mb:.1f} MB"
-     except ImportError:
-         return "N/A"
-
- def clear_large_variables():
-     """Clear large variables from session state to free memory"""
-     keys_to_clear = []
-     for key in st.session_state.keys():
-         if key.startswith('temp_') or key.endswith('_large'):
-             keys_to_clear.append(key)
-
-     for key in keys_to_clear:
-         del st.session_state[key]
-
-     # Force garbage collection
-     import gc
-     gc.collect()
 
- # Example usage in your main function:
- def enhanced_file_upload_section():
-     """Enhanced file upload section with better large file handling"""
-
-     # Add processing options
-     processing_options = add_file_processing_options()
 
-     # Memory usage display
-     memory_usage = get_memory_usage()
-     st.sidebar.write(f"💾 Memory Usage: {memory_usage}")
 
-     # Clear memory button
-     if st.sidebar.button("🧹 Clear Memory"):
-         clear_large_variables()
-         st.sidebar.success("Memory cleared!")
 
-     # File upload with dynamic size limit
-     st.sidebar.info(f"📋 **File Requirements:**\n- Max size: {processing_options['max_size_mb']}MB per file\n- Formats: PDF, DOCX, TXT, XLSX")
 
-     uploaded_files = st.file_uploader(
-         "Choose files",
-         accept_multiple_files=True,
-         type=['pdf', 'docx', 'txt', 'xlsx'],
-         help=f"Supported formats: PDF, DOCX, TXT, XLSX (Max {processing_options['max_size_mb']}MB each)"
-     )
 
-     if uploaded_files:
-         valid_files = []
-         for file in uploaded_files:
-             is_valid, message = validate_file(file, processing_options['max_size_mb'])
-             if is_valid:
-                 valid_files.append(file)
-             else:
-                 st.error(f"❌ {message}")
 
-         if valid_files:
-             st.success(f"✅ {len(valid_files)} valid files ready!")
 
-             # Show processing options
-             if processing_options['max_pages']:
-                 st.info(f"📄 Will process first {processing_options['max_pages']} pages of PDF files")
 
-             if st.button("🔄 Process Documents", type="primary"):
-                 process_files_with_options(valid_files, processing_options)
 
- def process_files_with_options(files, options):
-     """Process files with the specified options"""
-     progress_bar = st.progress(0)
-     status_text = st.empty()
 
-     for i, file in enumerate(files):
-         status_text.text(f"Processing {file.name}...")
 
-         try:
-             # Use enhanced processing function
-             text, filename, analysis = process_document_enhanced(
-                 file,
-                 max_size_mb=options['max_size_mb'],
-                 max_pages=options['max_pages'],
-                 use_compression=options['use_compression']
-             )
 
-             # Store in session state
-             st.session_state.processed_docs[filename] = {
-                 'text': text,
-                 'analysis': analysis,
-                 'processed_at': datetime.now().strftime("%Y-%m-%d %H:%M:%S"),
-                 'processing_options': options
-             }
 
-             st.success(f"✅ {filename} - {analysis['word_count']:,} words, {analysis['total_pages']} pages")
 
-         except Exception as e:
-             st.error(f"❌ Error processing {file.name}: {str(e)}")
 
-         progress_bar.progress((i + 1) / len(files))
-
-     status_text.text("✅ Processing complete!")
-     st.balloons()
 
 import streamlit as st
 import os
 import tempfile
+
+ # Fix cache permission issues in HF Spaces
+ os.environ['TRANSFORMERS_CACHE'] = tempfile.gettempdir()
+ os.environ['HF_HOME'] = tempfile.gettempdir()
+ os.environ['SENTENCE_TRANSFORMERS_HOME'] = tempfile.gettempdir()
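+ # (Note: TRANSFORMERS_CACHE is deprecated in newer transformers releases in
+ # favor of HF_HOME; setting both, as above, covers older and newer versions.)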
+
+ from transformers import AutoTokenizer, AutoModelForCausalLM, pipeline
+ import torch
 import PyPDF2
+ import docx
+ import pandas as pd
+ from sentence_transformers import SentenceTransformer
+ import chromadb
+ from chromadb.config import Settings
+ import tempfile
+ import uuid
+ import re
+ from datetime import datetime
 
+ # Page config
+ st.set_page_config(
+     page_title="FinanceGPT - Enterprise AI Assistant",
+     page_icon="💰",
+     layout="wide",
+     initial_sidebar_state="expanded"
+ )
+
+ # Custom CSS
+ st.markdown("""
+ <style>
+     .main-header {
+         font-size: 3rem;
+         color: #1f77b4;
+         text-align: center;
+         margin-bottom: 2rem;
+     }
+     .chat-message {
+         padding: 1rem;
+         border-radius: 0.5rem;
+         margin: 1rem 0;
+         background-color: #f0f2f6;
+     }
+     .source-box {
+         background-color: #e8f4f8;
+         padding: 1rem;
+         border-radius: 0.5rem;
+         border-left: 4px solid #1f77b4;
+     }
+     .doc-summary {
+         background-color: #f8f9fa;
+         padding: 1rem;
+         border-radius: 0.5rem;
+         border: 1px solid #dee2e6;
+         margin: 1rem 0;
+     }
+     .analysis-card {
+         background: linear-gradient(135deg, #667eea 0%, #764ba2 100%);
+         color: white;
+         padding: 1rem;
+         border-radius: 0.5rem;
+         margin: 0.5rem 0;
+     }
+     .metric-card {
+         background-color: #ffffff;
+         padding: 1rem;
+         border-radius: 0.5rem;
+         box-shadow: 0 2px 4px rgba(0,0,0,0.1);
+         text-align: center;
+         margin: 0.5rem 0;
+     }
+ </style>
+ """, unsafe_allow_html=True)
+
+ # Initialize session state
+ if 'processed_docs' not in st.session_state:
+     st.session_state.processed_docs = {}
+ if 'analysis_cache' not in st.session_state:
+     st.session_state.analysis_cache = {}
+
+ # Document analysis types
+ ANALYSIS_TYPES = {
+     "📊 Financial Summary": {
+         "description": "Extract key financial metrics, ratios, and performance indicators",
+         "keywords": ["revenue", "profit", "loss", "assets", "liabilities", "cash flow", "ROI", "margin"],
+         "icon": "📊"
+     },
+     "⚠️ Risk Analysis": {
+         "description": "Identify potential risks, threats, and vulnerability factors",
+         "keywords": ["risk", "threat", "vulnerability", "exposure", "mitigation", "hedge", "insurance"],
+         "icon": "⚠️"
+     },
+     "📈 Market Trends": {
+         "description": "Analyze market conditions, trends, and competitive landscape",
+         "keywords": ["market", "trend", "growth", "competition", "industry", "outlook", "forecast"],
+         "icon": "📈"
+     },
+     "✅ Compliance Check": {
+         "description": "Review regulatory compliance and legal requirements",
+         "keywords": ["compliance", "regulation", "legal", "audit", "governance", "policy", "standard"],
+         "icon": "✅"
+     },
+     "💡 Investment Insights": {
+         "description": "Extract investment recommendations and opportunities",
+         "keywords": ["investment", "opportunity", "recommendation", "valuation", "return", "portfolio"],
+         "icon": "💡"
+     },
+     "📋 Executive Summary": {
+         "description": "Generate high-level overview and key takeaways",
+         "keywords": ["summary", "overview", "highlights", "conclusion", "recommendation", "action"],
+         "icon": "📋"
+     },
+     "🔍 Detailed Analysis": {
+         "description": "Comprehensive deep-dive analysis of all content",
+         "keywords": ["analysis", "detailed", "comprehensive", "thorough", "complete", "full"],
+         "icon": "🔍"
+     },
+     "📊 Data Extraction": {
+         "description": "Extract tables, numbers, and structured data",
+         "keywords": ["data", "table", "number", "figure", "statistic", "metric", "KPI"],
+         "icon": "📊"
+     }
+ }
+
+ @st.cache_resource
+ def load_models():
+     """Load and cache all models"""
+     try:
+         embedding_model = SentenceTransformer('all-MiniLM-L6-v2')
+         model_name = "microsoft/DialoGPT-medium"
+         tokenizer = AutoTokenizer.from_pretrained(model_name)
+         if tokenizer.pad_token is None:
+             tokenizer.pad_token = tokenizer.eos_token
+         model = AutoModelForCausalLM.from_pretrained(model_name)
+
+         client = chromadb.Client()
+         try:
+             collection = client.get_collection("documents")
+         except:
+             collection = client.create_collection(
+                 name="documents",
+                 metadata={"hnsw:space": "cosine"}
+             )
+
+         return embedding_model, tokenizer, model, collection
+     except Exception as e:
+         st.error(f"Error loading models: {str(e)}")
+         return None, None, None, None
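+ # (Note: the checkpoint loaded above is microsoft/DialoGPT-medium, while the UI
+ # banner advertises IBM Granite; pointing model_name at an ibm-granite/*
+ # checkpoint would make the two agree.)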
+
+ def validate_file(uploaded_file):
+     """Validate uploaded file"""
+     max_size = 50 * 1024 * 1024  # 50MB
     if uploaded_file.size > max_size:
+         return False, f"File {uploaded_file.name} is too large. Maximum size is 50MB."
 
     allowed_extensions = ['pdf', 'docx', 'txt', 'xlsx', 'xls']
     file_extension = uploaded_file.name.split('.')[-1].lower()
 
     return True, "Valid file"
 
+ def analyze_document_structure(text, filename):
+     """Analyze document structure and extract metadata"""
+     analysis = {
+         'filename': filename,
+         'word_count': len(text.split()),
+         'char_count': len(text),
+         'estimated_pages': len(text) // 2000,  # Rough estimate
+         'has_financial_data': bool(re.search(r'\$|€|£|₹|\d+\.\d+%|\d+,\d+', text)),
+         'has_tables': bool(re.search(r'\|\s*\w+\s*\|', text)),
+         'sections': [],
+         'key_terms': [],
+         'document_type': 'Unknown'
+     }
 
+     # Detect document type
+     if any(term in text.lower() for term in ['financial statement', 'balance sheet', 'income statement']):
+         analysis['document_type'] = 'Financial Statement'
+     elif any(term in text.lower() for term in ['annual report', '10-k', '10-q']):
+         analysis['document_type'] = 'Annual Report'
+     elif any(term in text.lower() for term in ['investment', 'portfolio', 'fund']):
+         analysis['document_type'] = 'Investment Document'
+     elif any(term in text.lower() for term in ['contract', 'agreement', 'terms']):
+         analysis['document_type'] = 'Legal Document'
+
+     # Extract sections (headers)
+     headers = re.findall(r'^[A-Z][A-Za-z\s]{10,50}$', text, re.MULTILINE)
+     analysis['sections'] = headers[:10]  # Top 10 sections
+
+     # Extract key financial terms
+     financial_terms = re.findall(r'\b(?:revenue|profit|loss|assets|liabilities|equity|cash|debt|investment|ROI|EBITDA|margin)\b', text, re.IGNORECASE)
+     analysis['key_terms'] = list(set(financial_terms))[:15]
 
+     return analysis
 
 @st.cache_data
+ def process_document(uploaded_file):
+     """Process uploaded document with enhanced analysis"""
+     is_valid, message = validate_file(uploaded_file)
 
     if not is_valid:
         raise ValueError(message)
 
     try:
         with tempfile.NamedTemporaryFile(delete=False, suffix=f".{uploaded_file.name.split('.')[-1]}") as tmp_file:
             tmp_file.write(uploaded_file.getvalue())
 
     try:
         file_extension = uploaded_file.name.split('.')[-1].lower()
         text = ""
 
         if file_extension == 'pdf':
             try:
+                 with open(tmp_path, 'rb') as file:
+                     reader = PyPDF2.PdfReader(file)
+                     for page in reader.pages:
+                         text += page.extract_text() + "\n"
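+                         # (Note: extract_text() may yield an empty string for
+                         # pages with no text layer, e.g. scans; this rewrite
+                         # drops the old OCR fallback, so such pages add nothing.)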
 
 
             except Exception as e:
+                 raise ValueError(f"Error reading PDF: {str(e)}")
+
         elif file_extension == 'docx':
             try:
                 doc = docx.Document(tmp_path)
+                 for paragraph in doc.paragraphs:
                     text += paragraph.text + "\n"
             except Exception as e:
                 raise ValueError(f"Error reading DOCX: {str(e)}")
 
         elif file_extension == 'txt':
             try:
                 with open(tmp_path, 'r', encoding='utf-8') as file:
                     text = file.read()
             except UnicodeDecodeError:
                 with open(tmp_path, 'r', encoding='latin-1') as file:
                     text = file.read()
+             except Exception as e:
+                 raise ValueError(f"Error reading TXT: {str(e)}")
 
         elif file_extension in ['xlsx', 'xls']:
             try:
                 df = pd.read_excel(tmp_path)
                 text = df.to_string()
             except Exception as e:
 
         if not text.strip():
             raise ValueError("No text content found in the file")
 
+         # Analyze document structure
+         analysis = analyze_document_structure(text, uploaded_file.name)
 
         return text, uploaded_file.name, analysis
 
     finally:
         try:
             if os.path.exists(tmp_path):
                 os.remove(tmp_path)
         except:
             pass
 
+ def generate_analysis_by_type(text, analysis_type, analysis_info):
+     """Generate specific analysis based on type"""
+     keywords = analysis_info['keywords']
+     description = analysis_info['description']
 
+     # Find relevant sections based on keywords
+     relevant_sections = []
     text_lower = text.lower()
 
+     for keyword in keywords:
+         if keyword in text_lower:
+             # Find context around keywords
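+             # (Note: regex quantifier braces must be doubled inside an
+             # rf-string, {{0,200}}; a bare {0,200} would be evaluated by
+             # Python as the tuple (0, 200) and silently corrupt the pattern.)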
+             pattern = rf'.{{0,200}}\b{keyword}\b.{{0,200}}'
+             matches = re.findall(pattern, text, re.IGNORECASE | re.DOTALL)
+             relevant_sections.extend(matches[:3])  # Max 3 matches per keyword
 
+     if not relevant_sections:
+         return f"No specific information found for {analysis_type} in this document."
 
+     # Create structured analysis
+     analysis_result = f"""
+ ## {analysis_type}
+
+ **Analysis Focus**: {description}
+
+ **Key Findings**:
+ """
 
+     for i, section in enumerate(relevant_sections[:5], 1):
+         cleaned_section = re.sub(r'\s+', ' ', section.strip())
+         analysis_result += f"\n{i}. {cleaned_section[:300]}...\n"
 
+     analysis_result += f"\n**Summary**: Based on the document analysis, {len(relevant_sections)} relevant sections were identified related to {analysis_type.lower()}."
 
+     return analysis_result
+
+ def chunk_text(text, chunk_size=1000, overlap=200):
+     """Split text into chunks"""
+     if not text or not text.strip():
+         return []
+
+     chunks = []
+     start = 0
+
+     while start < len(text):
+         end = start + chunk_size
+         chunk = text[start:end]
+
+         if end < len(text):
+             last_period = chunk.rfind('.')
+             if last_period > chunk_size * 0.7:
+                 end = start + last_period + 1
+                 chunk = text[start:end]
+
+         if chunk.strip():
+             chunks.append(chunk.strip())
+
+         start = end - overlap
+
+         if start >= len(text):
+             break
+
+     return chunks
 
330
+ def search_documents(query, collection, embedding_model, n_results=3):
331
+ """Search for relevant document chunks"""
 
332
  try:
333
+ if collection.count() == 0:
334
+ return []
335
+
336
+ query_embedding = embedding_model.encode([query]).tolist()
337
+
338
+ results = collection.query(
339
+ query_embeddings=query_embedding,
340
+ n_results=min(n_results, collection.count()),
341
+ include=['documents', 'metadatas', 'distances']
342
+ )
343
+
344
+ search_results = []
345
+ if results['documents'] and results['documents'][0]:
346
+ for i in range(len(results['documents'][0])):
347
+ search_results.append({
348
+ 'content': results['documents'][0][i],
349
+ 'metadata': results['metadatas'][0][i],
350
+ 'score': 1 - results['distances'][0][i] if results['distances'][0][i] else 1.0
351
+ })
352
+
353
+ return search_results
354
+ except Exception as e:
355
+ st.error(f"Search error: {str(e)}")
356
+ return []
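+ # (Note: the collection is created with "hnsw:space": "cosine", so Chroma
+ # reports cosine distance and 1 - distance recovers a similarity; a 0.0
+ # distance, i.e. an exact match, falls through the `if` to the 1.0 default,
+ # which happens to agree.)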
 
+ def main():
+     # Header
+     st.markdown('<h1 class="main-header">💰 FinanceGPT - Enhanced Enterprise AI Assistant</h1>', unsafe_allow_html=True)
 
+     st.markdown("""
+     <div style="text-align: center; font-size: 1.2rem; color: #666; margin-bottom: 2rem;">
+         🚀 Powered by IBM Granite Models | 📊 Advanced Document Intelligence | 🔒 Secure & Compliant
+     </div>
+     """, unsafe_allow_html=True)
 
+     # Load models
+     with st.spinner("🔄 Loading AI models..."):
+         models = load_models()
+         if models[0] is None:
+             st.error("Failed to load AI models. Please refresh the page.")
+             return
+         embedding_model, tokenizer, model, collection = models
 
+     # Sidebar for document management
+     with st.sidebar:
+         st.header("📁 Enhanced Document Management")
+
+         # File upload section
+         st.markdown("### 📤 Upload Documents")
+         st.info("📋 **File Requirements:**\n- Max size: 50MB per file\n- Formats: PDF, DOCX, TXT, XLSX")
+
+         uploaded_files = st.file_uploader(
+             "Choose files",
+             accept_multiple_files=True,
+             type=['pdf', 'docx', 'txt', 'xlsx'],
+             help="Supported formats: PDF, DOCX, TXT, XLSX (Max 50MB each)"
+         )
+
+         if uploaded_files:
+             valid_files = []
+             for file in uploaded_files:
+                 is_valid, message = validate_file(file)
+                 if is_valid:
+                     valid_files.append(file)
+                 else:
+                     st.error(f"❌ {message}")
+
+             if valid_files:
+                 st.success(f"✅ {len(valid_files)} valid files ready!")
+
+                 if st.button("🔄 Process Documents", type="primary"):
+                     progress_bar = st.progress(0)
+                     status_text = st.empty()
+
+                     for i, file in enumerate(valid_files):
+                         status_text.text(f"Processing {file.name}...")
+
+                         try:
+                             text, filename, analysis = process_document(file)
+
+                             # Store document analysis
+                             st.session_state.processed_docs[filename] = {
+                                 'text': text,
+                                 'analysis': analysis,
+                                 'processed_at': datetime.now().strftime("%Y-%m-%d %H:%M:%S")
+                             }
+
+                             # Create and store chunks
+                             chunks = chunk_text(text)
+                             if chunks:
+                                 for j, chunk in enumerate(chunks):
+                                     try:
+                                         chunk_id = f"{filename}_{j}_{uuid.uuid4().hex[:8]}"
+                                         embedding = embedding_model.encode([chunk]).tolist()
+
+                                         collection.add(
+                                             embeddings=embedding,
+                                             documents=[chunk],
+                                             metadatas=[{'filename': filename, 'chunk_id': j}],
+                                             ids=[chunk_id]
+                                         )
+ except Exception as e:
435
+ continue
436
+
437
+ st.success(f"βœ… {filename}")
438
+
439
+ except Exception as e:
440
+ st.error(f"❌ Error processing {file.name}: {str(e)}")
441
+
442
+ progress_bar.progress((i + 1) / len(valid_files))
443
+
444
+ status_text.text("βœ… Processing complete!")
445
+ st.balloons()
446
+
447
+ # Document analysis section
448
+ if st.session_state.processed_docs:
449
+ st.markdown("---")
450
+ st.markdown("### πŸ“Š Document Analysis Options")
451
+
452
+ # Select document
453
+ doc_names = list(st.session_state.processed_docs.keys())
454
+ selected_doc = st.selectbox("Select Document:", doc_names)
455
+
456
+ if selected_doc:
457
+ doc_info = st.session_state.processed_docs[selected_doc]
458
+
459
+ # Document overview
460
+ st.markdown("#### πŸ“‹ Document Overview")
461
+ analysis = doc_info['analysis']
462
+
463
+ col1, col2 = st.columns(2)
464
+ with col1:
465
+ st.metric("Word Count", f"{analysis['word_count']:,}")
466
+ st.metric("Pages (Est.)", analysis['estimated_pages'])
467
+
468
+ with col2:
469
+ st.metric("Document Type", analysis['document_type'])
470
+ financial_status = "βœ… Yes" if analysis['has_financial_data'] else "❌ No"
471
+ st.write(f"**Financial Data**: {financial_status}")
472
+
473
+ # Key terms
474
+ if analysis['key_terms']:
475
+ st.markdown("**Key Terms Found:**")
476
+ st.write(", ".join(analysis['key_terms'][:10]))
477
+
478
+ # Analysis type selection
479
+ st.markdown("#### πŸ” Analysis Types")
480
+ analysis_type = st.selectbox(
481
+ "Choose Analysis Type:",
482
+ list(ANALYSIS_TYPES.keys()),
483
+ format_func=lambda x: f"{ANALYSIS_TYPES[x]['icon']} {x.split(' ', 1)[1]}"
484
+ )
485
+
486
+ if st.button(f"πŸš€ Generate {analysis_type}", use_container_width=True):
487
+ cache_key = f"{selected_doc}_{analysis_type}"
488
+
489
+ if cache_key not in st.session_state.analysis_cache:
490
+ with st.spinner(f"Generating {analysis_type}..."):
491
+ analysis_result = generate_analysis_by_type(
492
+ doc_info['text'],
493
+ analysis_type,
494
+ ANALYSIS_TYPES[analysis_type]
495
+ )
496
+ st.session_state.analysis_cache[cache_key] = analysis_result
497
+
498
+ # Display in main area
499
+ st.session_state.current_analysis = st.session_state.analysis_cache[cache_key]
500
+ st.session_state.current_analysis_type = analysis_type
501
 
502
+ # Main content area
503
+ col1, col2 = st.columns([2, 1])
 
 
 
 
504
 
505
+ with col1:
506
+ # Display analysis results if available
507
+ if hasattr(st.session_state, 'current_analysis'):
508
+ st.markdown(f"## {st.session_state.current_analysis_type}")
509
+ st.markdown(f'<div class="analysis-card">{st.session_state.current_analysis}</div>', unsafe_allow_html=True)
510
+
511
+ # Clear analysis button
512
+ if st.button("πŸ—‘οΈ Clear Analysis"):
513
+ if hasattr(st.session_state, 'current_analysis'):
514
+ del st.session_state.current_analysis
515
+ if hasattr(st.session_state, 'current_analysis_type'):
516
+ del st.session_state.current_analysis_type
517
+ st.rerun()
518
+
519
+ st.header("πŸ’¬ Interactive Q&A")
520
+
521
+ # Smart question suggestions
522
+ if st.session_state.processed_docs:
523
+ with st.expander("πŸ’‘ Smart Question Suggestions"):
524
+ # Generate context-aware questions
525
+ doc_types = set(doc['analysis']['document_type'] for doc in st.session_state.processed_docs.values())
526
+
527
+ smart_questions = []
528
+ if 'Financial Statement' in doc_types:
529
+ smart_questions.extend([
530
+ "What are the key financial ratios mentioned?",
531
+ "Analyze the profitability trends",
532
+ "What are the major expense categories?"
533
+ ])
534
+ if 'Investment Document' in doc_types:
535
+ smart_questions.extend([
536
+ "What are the investment recommendations?",
537
+ "What risks are associated with these investments?",
538
+ "What is the expected return on investment?"
539
+ ])
540
+ if 'Annual Report' in doc_types:
541
+ smart_questions.extend([
542
+ "Summarize the company's performance this year",
543
+ "What are the future growth strategies?",
544
+ "What challenges does the company face?"
545
+ ])
546
+
547
+ # Default questions if no specific type detected
548
+ if not smart_questions:
549
+ smart_questions = [
550
+ "What are the key points in this document?",
551
+ "Summarize the main findings",
552
+ "What are the most important numbers mentioned?"
553
+ ]
554
+
555
+ for question in smart_questions[:6]:
556
+ if st.button(question, key=f"smart_{question}", use_container_width=True):
557
+ st.session_state.query = question
558
 
559
+ # Query input
560
+ query = st.text_area(
561
+ "Enter your question:",
562
+ value=st.session_state.get('query', ''),
563
+ placeholder="e.g., What are the main financial risks identified in the documents?",
564
+ height=100
565
+ )
566
+
567
+ if st.button("πŸ” Ask Question", type="primary", use_container_width=True):
568
+ if not query:
569
+ st.warning("⚠️ Please enter a question!")
570
+ return
571
 
572
+ if collection.count() == 0:
573
+ st.warning("⚠️ Please upload and process some documents first!")
574
+ return
575
 
576
+ with st.spinner("πŸ€– Analyzing documents and generating response..."):
577
+ try:
578
+ search_results = search_documents(query, collection, embedding_model)
579
+
580
+ if search_results:
581
+ # Enhanced response generation
582
+ context = ""
583
+ source_files = set()
584
+
585
+ for i, chunk in enumerate(search_results):
586
+ filename = chunk['metadata'].get('filename', 'Unknown')
587
+ source_files.add(filename)
588
+ context += f"[Source {i+1}: {filename}]\n{chunk['content'][:400]}...\n\n"
589
+
590
+ response = f"""
591
+ ### πŸ€– AI Analysis Results
592
+
593
+ **Query**: {query}
594
+
595
+ **Key Findings**:
596
+ {context[:1000]}...
597
+
598
+ **Summary**: Based on analysis of {len(search_results)} relevant sections from {len(source_files)} document(s), the information above directly addresses your question.
599
 
600
+ **Documents Analyzed**: {', '.join(source_files)}
601
+ """
602
+
603
+ st.markdown(response)
604
+
605
+ # Enhanced source display
606
+ st.markdown("### πŸ“š Detailed Sources")
607
+ for i, result in enumerate(search_results):
608
+ score_percent = f"{result['score']:.1%}"
609
+ filename = result['metadata'].get('filename', 'Unknown')
610
+
611
+ with st.expander(f"πŸ“„ Source {i+1}: {filename} (Relevance: {score_percent})"):
612
+ st.markdown(f'<div class="source-box">{result["content"]}</div>', unsafe_allow_html=True)
613
+ else:
614
+ st.error("❌ No relevant information found in the uploaded documents.")
615
+
616
+ except Exception as e:
617
+ st.error(f"❌ Error processing your question: {str(e)}")
618
 
619
+ with col2:
620
+ st.header("πŸ“Š Dashboard")
621
 
622
+ # Document statistics
623
+ if st.session_state.processed_docs:
624
+ st.markdown("### πŸ“ˆ Document Statistics")
 
 
 
 
 
625
 
626
+ total_words = sum(doc['analysis']['word_count'] for doc in st.session_state.processed_docs.values())
627
+ total_pages = sum(doc['analysis']['estimated_pages'] for doc in st.session_state.processed_docs.values())
628
+ doc_types = [doc['analysis']['document_type'] for doc in st.session_state.processed_docs.values()]
 
 
 
 
629
 
630
+ col_a, col_b = st.columns(2)
631
+ with col_a:
632
+ st.metric("πŸ“„ Documents", len(st.session_state.processed_docs))
633
+ st.metric("πŸ“Š Total Words", f"{total_words:,}")
634
+ with col_b:
635
+ st.metric("πŸ“‘ Total Pages", total_pages)
636
+ st.metric("πŸ—‚οΈ Document Types", len(set(doc_types)))
637
 
638
+ # Document type breakdown
639
+ if doc_types:
640
+ st.markdown("**Document Types:**")
641
+ type_counts = {}
642
+ for doc_type in doc_types:
643
+ type_counts[doc_type] = type_counts.get(doc_type, 0) + 1
644
+
645
+ for doc_type, count in type_counts.items():
646
+ st.write(f"β€’ {doc_type}: {count}")
647
 
+         # Project info
+         st.markdown("---")
+         st.header("🎯 Project Info")
+
+         st.markdown("""
+         ### **Built For IBM Hackathon**
+
+         **🔧 Technology Stack:**
+         - 🧠 IBM Granite Models
+         - 🔍 RAG (Retrieval-Augmented Generation)
+         - 📊 Streamlit UI
+         - 🗄️ ChromaDB Vector Database
+         - 🔒 Enterprise Security
+
+         **💼 Analysis Types:**
+         - 📊 Financial Summary
+         - ⚠️ Risk Analysis
+         - 📈 Market Trends
+         - ✅ Compliance Check
+         - 💡 Investment Insights
+         - 📋 Executive Summary
+         - 🔍 Detailed Analysis
+         - 📊 Data Extraction
+         """)
+
+         # Statistics
+         try:
+             doc_count = collection.count()
+             st.metric("🔗 Vector Chunks", doc_count)
+         except:
+             st.metric("🔗 Vector Chunks", 0)
+
+ if __name__ == "__main__":
+     main()