commit
Browse files

src/streamlit_app.py  CHANGED  (+424 -690)

@@ -1,741 +1,475 @@
-import streamlit as st
-import os
-import tempfile
-
-# Fix cache permission issues in HF Spaces
-os.environ['TRANSFORMERS_CACHE'] = tempfile.gettempdir()
-os.environ['HF_HOME'] = tempfile.gettempdir()
-os.environ['SENTENCE_TRANSFORMERS_HOME'] = tempfile.gettempdir()
-
-import torch
 import PyPDF2
-import
 import pandas as pd
-from sentence_transformers import SentenceTransformer
-import chromadb
-from chromadb.config import Settings
-import tempfile
-import uuid
 import re
-

-#
-
-st.set_page_config(
-    page_icon="💰",
-    layout="wide",
-    initial_sidebar_state="expanded"
-)

-
-st.markdown("""
-<style>
-    .main-header {
-        font-size: 3rem;
-        color: #1f77b4;
-        text-align: center;
-        margin-bottom: 2rem;
-    }
-    .chat-message {
-        padding: 1rem;
-        border-radius: 0.5rem;
-        margin: 1rem 0;
-        background-color: #f0f2f6;
-    }
-    .source-box {
-        background-color: #e8f4f8;
-        padding: 1rem;
-        border-radius: 0.5rem;
-        border-left: 4px solid #1f77b4;
-    }
-    .doc-summary {
-        background-color: #f8f9fa;
-        padding: 1rem;
-        border-radius: 0.5rem;
-        border: 1px solid #dee2e6;
-        margin: 1rem 0;
-    }
-    .analysis-card {
-        background: linear-gradient(135deg, #667eea 0%, #764ba2 100%);
-        color: white;
-        padding: 1rem;
-        border-radius: 0.5rem;
-        margin: 0.5rem 0;
-    }
-    .metric-card {
-        background-color: #ffffff;
-        padding: 1rem;
-        border-radius: 0.5rem;
-        box-shadow: 0 2px 4px rgba(0,0,0,0.1);
-        text-align: center;
-        margin: 0.5rem 0;
-    }
-</style>
-""", unsafe_allow_html=True)

-
-ANALYSIS_TYPES = {
-
-
-    "⚠️ Risk Analysis": {
-        "description": "Identify potential risks, threats, and vulnerability factors",
-        "keywords": ["risk", "threat", "vulnerability", "exposure", "mitigation", "hedge", "insurance"],
-        "icon": "⚠️"
-    },
-    "📈 Market Trends": {
-        "description": "Analyze market conditions, trends, and competitive landscape",
-        "keywords": ["market", "trend", "growth", "competition", "industry", "outlook", "forecast"],
-        "icon": "📈"
-    },
-    "✅ Compliance Check": {
-        "description": "Review regulatory compliance and legal requirements",
-        "keywords": ["compliance", "regulation", "legal", "audit", "governance", "policy", "standard"],
-        "icon": "✅"
-    },
-    "💡 Investment Insights": {
-        "description": "Extract investment recommendations and opportunities",
-        "keywords": ["investment", "opportunity", "recommendation", "valuation", "return", "portfolio"],
-        "icon": "💡"
-    },
-    "📋 Executive Summary": {
-        "description": "Generate high-level overview and key takeaways",
-        "keywords": ["summary", "overview", "highlights", "conclusion", "recommendation", "action"],
-        "icon": "📋"
-    },
-    "🔍 Detailed Analysis": {
-        "description": "Comprehensive deep-dive analysis of all content",
-        "keywords": ["analysis", "detailed", "comprehensive", "thorough", "complete", "full"],
-        "icon": "🔍"
-    },
-    "📊 Data Extraction": {
-        "description": "Extract tables, numbers, and structured data",
-        "keywords": ["data", "table", "number", "figure", "statistic", "metric", "KPI"],
-        "icon": "📊"
     }
-}
-
-@st.cache_resource
-def load_models():
-    """Load and cache models with better error handling"""
-    try:
-        # Load embedding model first (most reliable)
-        st.info("Loading embedding model...")
-        embedding_model = SentenceTransformer('all-MiniLM-L6-v2')
-
-        # Initialize ChromaDB
-        st.info("Initializing vector database...")
-        client = chromadb.Client()
-        try:
-            collection = client.get_collection("documents")
-        except:
-            collection = client.create_collection(
-                name="documents",
-                metadata={"hnsw:space": "cosine"}
-            )
-
-        st.success("✅ Models loaded successfully!")
-        return embedding_model, collection
-
-    except Exception as e:
-        st.error(f"❌ Error loading models: {str(e)}")
-        st.error("Please check your internet connection and try refreshing the page.")
-        return None, None
-
-def validate_file(uploaded_file):
-    """Validate uploaded file"""
-    max_size = 50 * 1024 * 1024  # 50MB
-    if uploaded_file.size > max_size:
-        return False, f"File {uploaded_file.name} is too large. Maximum size is 50MB."

-
-
-    return

-
-def analyze_document_structure(text, filename):
-    """Analyze document structure and extract metadata"""
-    analysis = {
-        'filename': filename,
-        'word_count': len(text.split()),
-        'char_count': len(text),
-        'estimated_pages': max(1, len(text) // 2000),  # Minimum 1 page
-        'has_financial_data': bool(re.search(r'\$|€|£|₹|\d+\.\d+%|\d+,\d+', text)),
-        'has_tables': bool(re.search(r'\|\s*\w+\s*\|', text)),
-        'sections': [],
-        'key_terms': [],
-        'document_type': 'Unknown'
-    }

-    #
-    text_lower = text.lower()
-
-
|
| 187 |
-
analysis['document_type'] = 'Investment Document'
|
| 188 |
-
elif any(term in text_lower for term in ['contract', 'agreement', 'terms']):
|
| 189 |
-
analysis['document_type'] = 'Legal Document'
|
| 190 |
-
elif any(term in text_lower for term in ['budget', 'forecast', 'projection']):
|
| 191 |
-
analysis['document_type'] = 'Financial Planning'
|
| 192 |
-
else:
|
| 193 |
-
analysis['document_type'] = 'Business Document'
|
| 194 |
|
| 195 |
-
|
| 196 |
-
|
| 197 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 198 |
|
| 199 |
-
-    #
-
-

-    return analysis

-
-
-
-
    try:
-        with
-

    except Exception as e:
-        raise

    try:
-
-        text = ""

-        if
-            try:
-                with open(tmp_path, 'rb') as file:
-                    reader = PyPDF2.PdfReader(file)
-                    if len(reader.pages) == 0:
-                        raise ValueError("PDF file appears to be empty")
-                    for page in reader.pages:
-                        page_text = page.extract_text()
-                        if page_text:
-                            text += page_text + "\n"
-                    if not text.strip():
-                        raise ValueError("Could not extract text from PDF")
-            except Exception as e:
-                raise ValueError(f"Error reading PDF: {str(e)}")

-
            try:
-
-
-
-
-                raise ValueError("DOCX file appears to be empty")
            except Exception as e:
-

-
-            try:
-                # Try UTF-8 first
-                with open(tmp_path, 'r', encoding='utf-8') as file:
-                    text = file.read()
-            except UnicodeDecodeError:
-                try:
-                    # Fallback to latin-1
-                    with open(tmp_path, 'r', encoding='latin-1') as file:
-                        text = file.read()
-                except Exception as e:
-                    raise ValueError(f"Error reading TXT file: {str(e)}")
-            except Exception as e:
-                raise ValueError(f"Error reading TXT file: {str(e)}")

-
-
-

-
-

-        #
-        text =
-        text = text.

-        #
-

-        return text
-
-    finally:
-        try:
-            if os.path.exists(tmp_path):
-                os.remove(tmp_path)
-        except:
-            pass
-
-def generate_analysis_by_type(text, analysis_type, analysis_info):
-    """Generate specific analysis based on type"""
-    keywords = analysis_info['keywords']
-    description = analysis_info['description']
-
-    # Find relevant sections based on keywords
-    relevant_sections = []
-    text_lower = text.lower()
-
-    for keyword in keywords:
-        if keyword in text_lower:
-            # Find context around keywords
-            pattern = rf'.{{0,200}}\b{keyword}\b.{{0,200}}'
-            matches = re.findall(pattern, text, re.IGNORECASE | re.DOTALL)
-            relevant_sections.extend(matches[:2])  # Max 2 matches per keyword
-
-    if not relevant_sections:
-        # If no keyword matches, provide general analysis
-        words = text.split()
-        if len(words) > 500:
-            sample_text = ' '.join(words[:500]) + "..."
-        else:
-            sample_text = text

-        return f"""
-## {analysis_type}
-
-**Analysis Focus**: {description}
-
-**Document Analysis**:
-Based on the document content, here are the key insights related to {analysis_type.lower()}:
-
-{sample_text}
-
-**Summary**: The document has been analyzed for {analysis_type.lower()} content. While specific keywords weren't found, the above content provides relevant context for your analysis needs.
-"""
-
-    # Create structured analysis
-    analysis_result = f"""
-## {analysis_type}
-
-**Analysis Focus**: {description}
-
-**Key Findings**:
-"""
-
-    for i, section in enumerate(relevant_sections[:5], 1):
-        cleaned_section = re.sub(r'\s+', ' ', section.strip())
-        if len(cleaned_section) > 300:
-            cleaned_section = cleaned_section[:300] + "..."
-        analysis_result += f"\n**Finding {i}**: {cleaned_section}\n"
-
-    analysis_result += f"\n**Summary**: Based on the document analysis, {len(relevant_sections)} relevant sections were identified related to {analysis_type.lower()}. These findings provide insights into the document's content from the perspective of {description.lower()}."
-
-    return analysis_result

|
| 349 |
-
"""
|
| 350 |
-
if not
|
| 351 |
-
return
|
| 352 |
-
|
| 353 |
-
# Clean text first
|
| 354 |
-
text = re.sub(r'\s+', ' ', text.strip())
|
| 355 |
-
|
| 356 |
-
chunks = []
|
| 357 |
-
start = 0
|
| 358 |
|
| 359 |
-
|
| 360 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 361 |
|
| 362 |
-
|
| 363 |
-
|
| 364 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 365 |
        else:
-
-        # Try to break at sentence boundary
-        last_period = chunk.rfind('.')
-        last_newline = chunk.rfind('\n')
-        break_point = max(last_period, last_newline)

-
-

-    if
-

-

-
-
-

-def search_documents(query, collection, embedding_model):
-    """
    try:
-
-

    except Exception as e:
-
-        return

-def main():
-
-
-
-    st.markdown("""
-    <div style="text-align: center; font-size: 1.2rem; color: #666; margin-bottom: 2rem;">
-        🚀 Powered by Advanced AI | 📊 Document Intelligence | 🔒 Secure & Compliant
-    </div>
-    """, unsafe_allow_html=True)

-
-    models = load_models()
-
-
-        st.stop()
-
-    embedding_model, collection = models
-
-    # Sidebar for document management
-    with st.sidebar:
-        st.header("📁 Enhanced Document Management")
-
-        # File upload section
-        st.markdown("### 📤 Upload Documents")
-        st.info("📋 **File Requirements:**\n- Max size: 50MB per file\n- Formats: PDF, DOCX, TXT, XLSX")
-
-        uploaded_files = st.file_uploader(
-            "Choose files",
-            accept_multiple_files=True,
-            type=['pdf', 'docx', 'txt', 'xlsx'],
-            help="Supported formats: PDF, DOCX, TXT, XLSX (Max 50MB each)"
-        )
-
-        if uploaded_files:
-            valid_files = []
-            for file in uploaded_files:
-                is_valid, message = validate_file(file)
-                if is_valid:
-                    valid_files.append(file)
-                else:
-                    st.error(f"❌ {message}")
-
-            if valid_files:
-                st.success(f"✅ {len(valid_files)} valid files ready!")
-
-                if st.button("🔄 Process Documents", type="primary"):
-                    progress_bar = st.progress(0)
-                    status_text = st.empty()
-
-                    for i, file in enumerate(valid_files):
-                        status_text.text(f"Processing {file.name}...")
-
-                        try:
-                            text, filename, analysis = process_document(file)
-
-                            # Store document analysis
-                            st.session_state.processed_docs[filename] = {
-                                'text': text,
-                                'analysis': analysis,
-                                'processed_at': datetime.now().strftime("%Y-%m-%d %H:%M:%S")
-                            }
-
-                            # Create and store chunks
-                            chunks = chunk_text(text)
-                            if chunks:
-                                for j, chunk in enumerate(chunks):
-                                    try:
-                                        chunk_id = f"{filename}_{j}_{uuid.uuid4().hex[:8]}"
-                                        embedding = embedding_model.encode([chunk]).tolist()
-
-                                        collection.upsert(
-                                            embeddings=embedding,
-                                            documents=[chunk],
-                                            metadatas=[{'filename': filename, 'chunk_id': j}],
-                                            ids=[chunk_id]
-                                        )
-                                    except Exception as e:
-                                        st.warning(f"Warning: Could not process chunk {j} of {filename}")
-                                        continue
-
-                            st.success(f"✅ {filename}")
-
-                        except Exception as e:
-                            st.error(f"❌ Error processing {file.name}: {str(e)}")
-
-                        progress_bar.progress((i + 1) / len(valid_files))
-
-                    status_text.text("✅ Processing complete!")
-                    st.balloons()
-
-        # Document analysis section
-        if st.session_state.processed_docs:
-            st.markdown("---")
-            st.markdown("### 📊 Document Analysis Options")
-
-            # Select document
-            doc_names = list(st.session_state.processed_docs.keys())
-            selected_doc = st.selectbox("Select Document:", doc_names)
-
-            if selected_doc:
-                doc_info = st.session_state.processed_docs[selected_doc]
-
-                # Document overview
-                st.markdown("#### 📋 Document Overview")
-                analysis = doc_info['analysis']
-
-                col1, col2 = st.columns(2)
-                with col1:
-                    st.metric("Word Count", f"{analysis['word_count']:,}")
-                    st.metric("Pages (Est.)", analysis['estimated_pages'])
-
-                with col2:
-                    st.metric("Document Type", analysis['document_type'])
-                    financial_status = "✅ Yes" if analysis['has_financial_data'] else "❌ No"
-                    st.write(f"**Financial Data**: {financial_status}")

-                #
-
-
-                    st.write(", ".join(analysis['key_terms'][:10]))

-                #
-
-                analysis_type = st.selectbox(
-                    "Choose Analysis Type:",
-                    list(ANALYSIS_TYPES.keys()),
-                    format_func=lambda x: f"{ANALYSIS_TYPES[x]['icon']} {x.split(' ', 1)[1]}"
-                )

-                if
-
-
-
-                    # Display in main area
-                    st.session_state.current_analysis = st.session_state.analysis_cache[cache_key]
-                    st.session_state.current_analysis_type = analysis_type

-
-    col1, col2 = st.columns([2, 1])
-
-    with col1:
-        # Display analysis results if available
-        if hasattr(st.session_state, 'current_analysis'):
-            st.markdown(f"## {st.session_state.current_analysis_type}")
-            st.markdown(f'<div class="analysis-card">{st.session_state.current_analysis}</div>', unsafe_allow_html=True)
-
-            # Clear analysis button
-            if st.button("🗑️ Clear Analysis"):
-                if hasattr(st.session_state, 'current_analysis'):
-                    del st.session_state.current_analysis
-                if hasattr(st.session_state, 'current_analysis_type'):
-                    del st.session_state.current_analysis_type
-                st.rerun()
-
-        st.header("💬 Interactive Q&A")
-
-        # Smart question suggestions
-        if st.session_state.processed_docs:
-            with st.expander("💡 Smart Question Suggestions"):
-                # Generate context-aware questions
-                doc_types = set(doc['analysis']['document_type'] for doc in st.session_state.processed_docs.values())
-
-                smart_questions = []
-                if 'Financial Statement' in doc_types:
-                    smart_questions.extend([
-                        "What are the key financial ratios mentioned?",
-                        "Analyze the profitability trends",
-                        "What are the major expense categories?"
-                    ])
-                if 'Investment Document' in doc_types:
-                    smart_questions.extend([
-                        "What are the investment recommendations?",
-                        "What risks are associated with these investments?",
-                        "What is the expected return on investment?"
-                    ])
-                if 'Annual Report' in doc_types:
-                    smart_questions.extend([
-                        "Summarize the company's performance this year",
-                        "What are the future growth strategies?",
-                        "What challenges does the company face?"
-                    ])
-
-                # Default questions if no specific type detected
-                if not smart_questions:
-                    smart_questions = [
-                        "What are the key points in this document?",
-                        "Summarize the main findings",
-                        "What are the most important numbers mentioned?"
-                    ]
-
-                for question in smart_questions[:6]:
-                    if st.button(question, key=f"smart_{question}", use_container_width=True):
-                        st.session_state.query = question
-
-        # Query input
-        query = st.text_area(
-            "Enter your question:",
-            value=st.session_state.get('query', ''),
-            placeholder="e.g., What are the main financial risks identified in the documents?",
-            height=100
-        )
-
-        if st.button("🔍 Ask Question", type="primary", use_container_width=True):
-            if not query:
-                st.warning("⚠️ Please enter a question!")
-                return
-
-            if collection.count() == 0:
-                st.warning("⚠️ Please upload and process some documents first!")
-                return
-
-            with st.spinner("🤖 Analyzing documents and generating response..."):
-                try:
-                    search_results = search_documents(query, collection, embedding_model)
-
-                    if search_results:
-                        # Enhanced response generation
-                        context = ""
-                        source_files = set()
-
-                        for i, chunk in enumerate(search_results):
-                            filename = chunk['metadata'].get('filename', 'Unknown')
-                            source_files.add(filename)
-                            context += f"[Source {i+1}: {filename}]\n{chunk['content'][:400]}...\n\n"
-
-                        response = f"""
-### 🤖 AI Analysis Results

-
-
-
-
-                        # Enhanced source display
-                        st.markdown("### 📚 Detailed Sources")
-                        for i, result in enumerate(search_results):
-                            score_percent = f"{result['score']:.1%}"
-                            filename = result['metadata'].get('filename', 'Unknown')
-
-                            with st.expander(f"📄 Source {i+1}: {filename} (Relevance: {score_percent})"):
-                                st.markdown(f'<div class="source-box">{result["content"]}</div>', unsafe_allow_html=True)
-                    else:
-                        st.error("❌ No relevant information found in the uploaded documents.")
-
-                except Exception as e:
-                    st.error(f"❌ Error processing your question: {str(e)}")
-
-    with col2:
-        st.header("📊 Dashboard")

-        #
-
-
-        col_a, col_b = st.columns(2)
-        with col_a:
-            st.metric("📄 Documents", len(st.session_state.processed_docs))
-            st.metric("📊 Total Words", f"{total_words:,}")
-        with col_b:
-            st.metric("📑 Total Pages", total_pages)
-            st.metric("🗂️ Document Types", len(set(doc_types)))
-
-        # Document type breakdown
-        if doc_types:
-            st.markdown("**Document Types:**")
-            type_counts = {}
-            for doc_type in doc_types:
-                type_counts[doc_type] = type_counts.get(doc_type, 0) + 1
-
-            for doc_type, count in type_counts.items():
-                st.write(f"• {doc_type}: {count}")
-
-        # Project info
-        st.markdown("---")
-        st.header("🎯 Project Info")
-
-        st.markdown("""
-        ### **Enterprise AI Assistant**
-
-        **🔧 Technology Stack:**
-        - 🧠 Advanced AI Models
-        - 🔍 RAG (Retrieval-Augmented Generation)
-        - 📊 Streamlit UI
-        - 🗄️ ChromaDB Vector Database
-        - 🔒 Enterprise Security
-
-        **💼 Analysis Types:**
-        - 📊 Financial Summary
-        - ⚠️ Risk Analysis
-        - 📈 Market Trends
-        - ✅ Compliance Check
-        - 💡 Investment Insights
-        - 📋 Executive Summary
-        - 🔍 Detailed Analysis
-        - 📊 Data Extraction
-        """)
-
-        # Statistics
-        try:
-            doc_count = collection.count()
-            st.metric("🔗 Vector Chunks", doc_count)
-        except:
-            st.metric("🔗 Vector Chunks", 0)

 if __name__ == "__main__":
-    main()

 import PyPDF2
+import pdfplumber
+import fitz  # PyMuPDF
 import pandas as pd
 import re
+import logging
+import os
+from typing import Dict, List, Tuple, Optional
+from pathlib import Path

+# Set up logging
+logging.basicConfig(level=logging.INFO)
+logger = logging.getLogger(__name__)

+class PDFProcessorError(Exception):
+    """Custom exception for PDF processing errors"""
+    pass

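Note: these imports assume pdfplumber and PyMuPDF are installed alongside PyPDF2 and pandas; PyMuPDF is the distribution that provides the `fitz` module. A plausible dependency list for this file (an assumption, not pinned by the commit) is: PyPDF2, pdfplumber, PyMuPDF, pandas.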
+def enhanced_pdf_processor(file_path: str, timeout: int = 30) -> Dict:
+    """
+    Enhanced PDF processor with robust error handling and multiple extraction methods
+    for better handling of complex PDFs like IBM reports
+    """
+    results = {
+        'text': '',
+        'tables': [],
+        'metadata': {},
+        'extraction_method': 'unknown',
+        'success': False,
+        'error': None,
+        'file_info': {}
     }

+    # Validate file
+    if not validate_pdf_file(file_path):
+        results['error'] = "Invalid PDF file or file doesn't exist"
+        return results

+    # Get file info
+    results['file_info'] = get_file_info(file_path)

+    # Try different extraction methods in order of preference
+    extraction_methods = [
+        ('PyMuPDF', extract_with_pymupdf),
+        ('pdfplumber', extract_with_pdfplumber),
+        ('PyPDF2', extract_with_pypdf2)
+    ]

+    for method_name, method_func in extraction_methods:
+        try:
+            logger.info(f"Trying extraction method: {method_name}")
+
+            if method_name == 'pdfplumber':
+                text_result, tables = method_func(file_path)
+                if text_result and len(text_result.strip()) > 50:
+                    results['text'] = text_result
+                    results['tables'] = tables
+                    results['extraction_method'] = method_name
+                    results['success'] = True
+                    logger.info(f"Successfully extracted with {method_name}")
+                    return results
+            elif method_name == 'PyMuPDF':
+                text_result, metadata = method_func(file_path)
+                if text_result and len(text_result.strip()) > 50:
+                    results['text'] = text_result
+                    results['metadata'] = metadata
+                    results['extraction_method'] = method_name
+                    results['success'] = True
+                    logger.info(f"Successfully extracted with {method_name}")
+                    return results
+            else:  # PyPDF2
+                text_result = method_func(file_path)
+                if text_result and len(text_result.strip()) > 50:
+                    results['text'] = text_result
+                    results['extraction_method'] = method_name
+                    results['success'] = True
+                    logger.info(f"Successfully extracted with {method_name}")
+                    return results
+
+        except Exception as e:
+            error_msg = f"{method_name} failed: {str(e)}"
+            logger.warning(error_msg)
+            results['error'] = error_msg
+            continue

+    # If all methods failed
+    if not results['success']:
+        results['error'] = "All extraction methods failed"
+        logger.error("All PDF extraction methods failed")

+    return results
+
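A minimal calling sketch for the fallback chain above (the file name is a placeholder, not from this commit):

    result = enhanced_pdf_processor("annual_report.pdf")  # hypothetical path
    if result['success']:
        print(result['extraction_method'], len(result['text']), "chars")
    else:
        print("extraction failed:", result['error'])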
+def validate_pdf_file(file_path: str) -> bool:
+    """Validate PDF file exists and is accessible"""
+    try:
+        path = Path(file_path)
+        if not path.exists():
+            logger.error(f"File does not exist: {file_path}")
+            return False
+
+        if not path.is_file():
+            logger.error(f"Path is not a file: {file_path}")
+            return False
+
+        if path.stat().st_size == 0:
+            logger.error(f"File is empty: {file_path}")
+            return False
+
+        # Check if file is actually a PDF
+        with open(file_path, 'rb') as f:
+            header = f.read(5)
+            if not header.startswith(b'%PDF-'):
+                logger.error(f"File is not a valid PDF: {file_path}")
+                return False
+
+        return True
+
+    except Exception as e:
+        logger.error(f"Error validating PDF file: {e}")
+        return False

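The magic-byte check above is stricter than an extension check: any file whose first five bytes are not `%PDF-` is rejected no matter what it is named. For example (hypothetical paths):

    validate_pdf_file("report.pdf")  # True only if the file exists, is non-empty,
                                     # and starts with the %PDF- header
    validate_pdf_file("notes.txt")   # False: missing file or failed header check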
+def get_file_info(file_path: str) -> Dict:
+    """Get basic file information"""
+    try:
+        path = Path(file_path)
+        stat = path.stat()
+        return {
+            'name': path.name,
+            'size': stat.st_size,
+            'size_mb': round(stat.st_size / (1024 * 1024), 2),
+            'modified': stat.st_mtime
+        }
+    except Exception as e:
+        logger.warning(f"Could not get file info: {e}")
+        return {}
+
+def extract_with_pypdf2(file_path: str) -> str:
+    """Extract text using PyPDF2 - fastest method"""
+    text = ""
+    try:
+        with open(file_path, 'rb') as file:
+            reader = PyPDF2.PdfReader(file)
+
+            # Check if PDF is encrypted
+            if reader.is_encrypted:
+                raise PDFProcessorError("PDF is encrypted and cannot be processed")
+
+            for page_num, page in enumerate(reader.pages):
+                try:
+                    page_text = page.extract_text()
+                    if page_text:
+                        text += f"\n--- Page {page_num + 1} ---\n"
+                        text += page_text + "\n"
+                except Exception as e:
+                    logger.warning(f"Failed to extract text from page {page_num + 1}: {e}")
+                    continue
+
+        return clean_extracted_text(text)
+
+    except Exception as e:
+        raise PDFProcessorError(f"PyPDF2 extraction failed: {e}")
+
+def extract_with_pdfplumber(file_path: str) -> Tuple[str, List[Dict]]:
+    """Extract text and tables using pdfplumber - better for structured docs"""
+    text = ""
+    tables = []

    try:
+        with pdfplumber.open(file_path) as pdf:
+            for page_num, page in enumerate(pdf.pages):
+                try:
+                    # Extract text
+                    page_text = page.extract_text()
+                    if page_text:
+                        text += f"\n--- Page {page_num + 1} ---\n"
+                        text += page_text + "\n"
+
+                    # Extract tables
+                    page_tables = page.extract_tables()
+                    for table_num, table in enumerate(page_tables):
+                        if table and len(table) > 1 and any(any(cell for cell in row if cell) for row in table):
+                            tables.append({
+                                'page': page_num + 1,
+                                'table_number': table_num + 1,
+                                'data': table,
+                                'text_representation': table_to_text(table)
+                            })
+
+                except Exception as e:
+                    logger.warning(f"Failed to process page {page_num + 1}: {e}")
+                    continue
+
+        return clean_extracted_text(text), tables
+
    except Exception as e:
+        raise PDFProcessorError(f"pdfplumber extraction failed: {e}")
+
+def extract_with_pymupdf(file_path: str) -> Tuple[str, Dict]:
+    """Extract text using PyMuPDF - most robust method"""
+    text = ""
+    metadata = {}

    try:
+        doc = fitz.open(file_path)

+        # Check if document is valid
+        if doc.is_closed:
+            raise PDFProcessorError("Could not open PDF document")

+        # Extract metadata safely
+        try:
+            doc_metadata = doc.metadata or {}
+            metadata = {
+                'page_count': doc.page_count,
+                'title': doc_metadata.get('title', ''),
+                'author': doc_metadata.get('author', ''),
+                'subject': doc_metadata.get('subject', ''),
+                'creator': doc_metadata.get('creator', ''),
+                'producer': doc_metadata.get('producer', ''),
+                'creation_date': doc_metadata.get('creationDate', ''),
+                'modification_date': doc_metadata.get('modDate', '')
+            }
+        except Exception as e:
+            logger.warning(f"Could not extract metadata: {e}")
+            metadata = {'page_count': doc.page_count}
+
+        # Extract text
+        for page_num in range(doc.page_count):
            try:
+                page = doc[page_num]
+                page_text = page.get_text()
+                if page_text:
+                    text += f"\n--- Page {page_num + 1} ---\n"
+                    text += page_text + "\n"
            except Exception as e:
+                logger.warning(f"Failed to extract text from page {page_num + 1}: {e}")
+                continue

+        doc.close()
+        return clean_extracted_text(text), metadata

+    except Exception as e:
+        raise PDFProcessorError(f"PyMuPDF extraction failed: {e}")
+
+def clean_extracted_text(text: str) -> str:
+    """Clean and normalize extracted text"""
+    if not text:
+        return ""
+
+    try:
+        # Remove excessive whitespace
+        text = re.sub(r'\n\s*\n', '\n\n', text)
+        text = re.sub(r' +', ' ', text)

+        # Fix common PDF extraction issues
+        text = text.replace('\ufffd', '')  # Remove unicode replacement chars
+        text = text.replace('\x00', '')  # Remove null characters
+        text = text.replace('\u200b', '')  # Remove zero-width space

+        # Normalize line breaks
+        text = text.replace('\r\n', '\n')
+        text = text.replace('\r', '\n')

+        # Remove control characters except newlines and tabs
+        text = re.sub(r'[\x00-\x08\x0b\x0c\x0e-\x1f\x7f-\x9f]', '', text)

+        return text.strip()

+    except Exception as e:
+        logger.warning(f"Error cleaning text: {e}")
+        return text.strip() if text else ""

+def table_to_text(table: List[List]) -> str:
+    """Convert table data to readable text format"""
+    if not table:
+        return ""

+    try:
+        text_lines = []
+        for row in table:
+            if row:  # Skip empty rows
+                clean_row = [str(cell).strip() if cell else "" for cell in row]
+                if any(clean_row):  # Only add non-empty rows
+                    text_lines.append(" | ".join(clean_row))
+
+        return "\n".join(text_lines)

+    except Exception as e:
+        logger.warning(f"Error converting table to text: {e}")
+        return ""
+
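On a small input, the helper above emits one pipe-delimited line per non-empty row:

    table_to_text([["Year", "Revenue"], ["2023", "61,860"], [None, None]])
    # -> "Year | Revenue\n2023 | 61,860"  (the all-empty row is skipped)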
+def detect_ibm_document_type(text: str, metadata: Dict) -> str:
+    """Detect specific IBM document types"""
+    try:
+        text_lower = text.lower()
+        title_lower = metadata.get('title', '').lower()
+
+        # IBM-specific patterns
+        if any(term in text_lower for term in ['ibm annual report', 'international business machines']):
+            return 'IBM Annual Report'
+        elif any(term in text_lower for term in ['ibm research', 'watson', 'artificial intelligence']):
+            return 'IBM Research Document'
+        elif any(term in text_lower for term in ['red hat', 'openshift', 'kubernetes']):
+            return 'IBM Cloud/Red Hat Document'
+        elif any(term in text_lower for term in ['mainframe', 'z systems', 'power systems']):
+            return 'IBM Hardware Documentation'
+        elif any(term in text_lower for term in ['cognos', 'spss', 'analytics']):
+            return 'IBM Analytics Document'
+        elif 'ibm' in text_lower:
+            return 'IBM Business Document'
        else:
+            return 'General Document'

+    except Exception as e:
+        logger.warning(f"Error detecting document type: {e}")
+        return 'Unknown Document'
+
+def process_ibm_pdf(file_path: str) -> Dict:
+    """
+    Process IBM PDF with enhanced extraction and error handling
+    """
+    try:
+        result = enhanced_pdf_processor(file_path)

+        if result['success']:
+            # Detect IBM document type
+            doc_type = detect_ibm_document_type(result['text'], result['metadata'])
+            result['document_type'] = doc_type
+
+            # Extract IBM-specific metrics if it's a financial document
+            if 'annual report' in doc_type.lower():
+                result['financial_metrics'] = extract_ibm_financial_metrics(result['text'])
+
+            # Process tables for better analysis
+            if result['tables']:
+                result['structured_data'] = process_ibm_tables(result['tables'])

+        return result

+    except Exception as e:
+        logger.error(f"Error processing IBM PDF: {e}")
+        return {
+            'text': '',
+            'tables': [],
+            'metadata': {},
+            'extraction_method': 'unknown',
+            'success': False,
+            'error': str(e),
+            'document_type': 'Unknown'
+        }

+def extract_ibm_financial_metrics(text: str) -> Dict:
+    """Extract IBM-specific financial metrics"""
+    metrics = {}
+
    try:
+        # Revenue patterns (more comprehensive)
+        revenue_patterns = [
+            r'(?:total\s+)?revenue[:\s]+\$?([\d,]+(?:\.\d+)?)\s*(?:million|billion)?',
+            r'total\s+revenue[:\s]+\$?([\d,]+(?:\.\d+)?)',
+            r'net\s+revenue[:\s]+\$?([\d,]+(?:\.\d+)?)'
+        ]
+
+        for pattern in revenue_patterns:
+            revenue_match = re.search(pattern, text, re.IGNORECASE)
+            if revenue_match:
+                metrics['revenue'] = revenue_match.group(1)
+                break
+
+        # Net income patterns
+        income_patterns = [
+            r'net\s+income[:\s]+\$?([\d,]+(?:\.\d+)?)\s*(?:million|billion)?',
+            r'net\s+earnings[:\s]+\$?([\d,]+(?:\.\d+)?)',
+            r'income\s+from\s+continuing\s+operations[:\s]+\$?([\d,]+(?:\.\d+)?)'
+        ]
+
+        for pattern in income_patterns:
+            income_match = re.search(pattern, text, re.IGNORECASE)
+            if income_match:
+                metrics['net_income'] = income_match.group(1)
+                break
+
+        # Earnings per share
+        eps_patterns = [
+            r'earnings\s+per\s+share[:\s]+\$?([\d,]+(?:\.\d+)?)',
+            r'diluted\s+earnings\s+per\s+share[:\s]+\$?([\d,]+(?:\.\d+)?)',
+            r'basic\s+earnings\s+per\s+share[:\s]+\$?([\d,]+(?:\.\d+)?)'
+        ]
+
+        for pattern in eps_patterns:
+            eps_match = re.search(pattern, text, re.IGNORECASE)
+            if eps_match:
+                metrics['eps'] = eps_match.group(1)
+                break
+
+        return metrics
+
    except Exception as e:
+        logger.warning(f"Error extracting financial metrics: {e}")
+        return {}
+
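Each pattern captures only the number that follows its label, so on an invented sentence (not from any IBM filing) the extractor behaves like this:

    extract_ibm_financial_metrics("Total revenue: $61,860 million; net income: $7,502 million")
    # -> {'revenue': '61,860', 'net_income': '7,502'}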
+def process_ibm_tables(tables: List[Dict]) -> List[Dict]:
+    """Process IBM tables for better structure"""
+    processed_tables = []

+    for table in tables:
+        try:
+            # Convert table to DataFrame for better processing
+            if table.get('data') and len(table['data']) > 1:
+                df = pd.DataFrame(table['data'][1:], columns=table['data'][0])

+                # Clean and process
+                df = df.dropna(how='all')  # Remove empty rows
+                df = df.fillna('')  # Fill NaN with empty string

+                # Remove completely empty columns
+                df = df.loc[:, (df != '').any(axis=0)]

+                if not df.empty:
+                    processed_tables.append({
+                        'page': table.get('page', 0),
+                        'table_number': table.get('table_number', 0),
+                        'dataframe': df,
+                        'summary': f"Table with {len(df)} rows and {len(df.columns)} columns",
+                        'text': df.to_string(index=False)
+                    })
+        except Exception as e:
+            logger.warning(f"Error processing table: {e}")
+            # If DataFrame conversion fails, keep original
+            processed_tables.append(table)

+    return processed_tables

+# Additional utility functions for web integration
+def safe_process_pdf(file_path: str, max_file_size_mb: int = 50) -> Dict:
+    """
+    Safely process PDF with size and security checks
+    """
+    try:
+        # Check file size
+        if os.path.getsize(file_path) > max_file_size_mb * 1024 * 1024:
+            return {
+                'success': False,
+                'error': f'File too large. Maximum size: {max_file_size_mb}MB'
+            }

+        # Process the PDF
+        return process_ibm_pdf(file_path)
+
+    except Exception as e:
+        logger.error(f"Safe PDF processing failed: {e}")
+        return {
+            'success': False,
+            'error': f'Processing failed: {str(e)}'
+        }

 if __name__ == "__main__":
+    # Example usage
+    pdf_path = "demo.pdf"  # Replace with your PDF path
+
+    result = safe_process_pdf(pdf_path)
+
+    if result['success']:
+        print(f"Successfully processed PDF using {result['extraction_method']}")
+        print(f"Document type: {result.get('document_type', 'Unknown')}")
+        print(f"Text length: {len(result['text'])} characters")
+        print(f"Number of tables: {len(result['tables'])}")
+
+        if result.get('financial_metrics'):
+            print("Financial metrics found:")
+            for metric, value in result['financial_metrics'].items():
+                print(f"  {metric}: {value}")
+    else:
+        print(f"Failed to process PDF: {result['error']}")