Update app.py
Browse files
app.py
CHANGED
|
@@ -1,15 +1,15 @@
|
|
| 1 |
#!/usr/bin/env python3
|
| 2 |
"""
|
| 3 |
-
|
| 4 |
A comprehensive system with AI-powered extraction, semantic search, and analytics.
|
| 5 |
|
| 6 |
Author: AI Assistant
|
| 7 |
Date: 2024
|
| 8 |
-
Version: HuggingFace v1.0
|
| 9 |
"""
|
| 10 |
|
| 11 |
# ===============================================================================
|
| 12 |
-
# IMPORTS AND
|
| 13 |
# ===============================================================================
|
| 14 |
|
| 15 |
import os
|
|
@@ -25,8 +25,7 @@ from dataclasses import dataclass
|
|
| 25 |
from pathlib import Path
|
| 26 |
import time
|
| 27 |
import logging
|
| 28 |
-
|
| 29 |
-
|
| 30 |
|
| 31 |
# Check if running on Hugging Face Spaces
|
| 32 |
IS_HF_SPACE = os.getenv("SPACE_ID") is not None
|
|
@@ -39,26 +38,7 @@ import plotly.express as px
|
|
| 39 |
import plotly.graph_objects as go
|
| 40 |
import requests
|
| 41 |
|
| 42 |
-
#
|
| 43 |
-
st.set_page_config(
|
| 44 |
-
page_title="AI Invoice Processing System",
|
| 45 |
-
page_icon="📄",
|
| 46 |
-
layout="wide",
|
| 47 |
-
initial_sidebar_state="expanded",
|
| 48 |
-
menu_items={
|
| 49 |
-
'Get Help': 'https://huggingface.co/spaces',
|
| 50 |
-
'Report a bug': 'https://huggingface.co/spaces',
|
| 51 |
-
'About': """
|
| 52 |
-
# AI Invoice Processing System
|
| 53 |
-
Built for Hugging Face Spaces with AI-powered extraction and semantic search.
|
| 54 |
-
"""
|
| 55 |
-
}
|
| 56 |
-
)
|
| 57 |
-
|
| 58 |
-
|
| 59 |
-
|
| 60 |
-
|
| 61 |
-
# Vector storage and embeddings (HF compatible)
|
| 62 |
try:
|
| 63 |
import faiss
|
| 64 |
FAISS_AVAILABLE = True
|
|
@@ -80,41 +60,51 @@ except ImportError:
|
|
| 80 |
TORCH_AVAILABLE = False
|
| 81 |
|
| 82 |
# Document processing (simplified for HF)
|
| 83 |
-
try:
|
| 84 |
-
from docling.document_converter import DocumentConverter
|
| 85 |
-
from docling.datamodel.base_models import InputFormat
|
| 86 |
-
from docling.datamodel.pipeline_options import PdfPipelineOptions
|
| 87 |
-
from docling.document_converter import PdfFormatOption
|
| 88 |
-
DOCLING_AVAILABLE = True
|
| 89 |
-
except ImportError:
|
| 90 |
-
DOCLING_AVAILABLE = False
|
| 91 |
-
st.warning("⚠️ Docling not available. Using simplified document processing.")
|
| 92 |
-
|
| 93 |
-
# Alternative document processing for HF
|
| 94 |
try:
|
| 95 |
import pdfplumber
|
| 96 |
PDF_PROCESSING_AVAILABLE = True
|
|
|
|
| 97 |
except ImportError:
|
| 98 |
try:
|
| 99 |
import PyPDF2
|
| 100 |
PDF_PROCESSING_AVAILABLE = True
|
|
|
|
| 101 |
except ImportError:
|
| 102 |
PDF_PROCESSING_AVAILABLE = False
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 103 |
|
| 104 |
# ===============================================================================
|
| 105 |
-
#
|
| 106 |
# ===============================================================================
|
| 107 |
|
| 108 |
-
# Hugging Face Spaces configuration
|
| 109 |
HF_CONFIG = {
|
| 110 |
-
"max_file_size_mb": 10,
|
| 111 |
-
"max_concurrent_files": 3,
|
| 112 |
"timeout_seconds": 30,
|
| 113 |
-
"use_cpu_only": True,
|
| 114 |
-
"embedding_model": "all-MiniLM-L6-v2",
|
| 115 |
"cache_dir": "./cache",
|
| 116 |
"data_dir": "./data",
|
| 117 |
-
"enable_ollama": False,
|
| 118 |
}
|
| 119 |
|
| 120 |
# Create necessary directories
|
|
@@ -122,15 +112,12 @@ os.makedirs(HF_CONFIG["cache_dir"], exist_ok=True)
|
|
| 122 |
os.makedirs(HF_CONFIG["data_dir"], exist_ok=True)
|
| 123 |
|
| 124 |
# ===============================================================================
|
| 125 |
-
#
|
| 126 |
-
# ===============================================================================
|
| 127 |
-
# ===============================================================================
|
| 128 |
-
# SIMPLIFIED DATA STRUCTURES FOR HF
|
| 129 |
# ===============================================================================
|
| 130 |
|
| 131 |
@dataclass
|
| 132 |
class InvoiceData:
|
| 133 |
-
"""
|
| 134 |
supplier_name: str = ""
|
| 135 |
buyer_name: str = ""
|
| 136 |
invoice_number: str = ""
|
|
@@ -153,182 +140,10 @@ class VectorSearchResult:
|
|
| 153 |
metadata: Dict
|
| 154 |
|
| 155 |
# ===============================================================================
|
| 156 |
-
#
|
| 157 |
-
# ===============================================================================
|
| 158 |
-
|
| 159 |
-
class HuggingFaceVectorStore:
|
| 160 |
-
"""Simplified vector store compatible with Hugging Face Spaces"""
|
| 161 |
-
|
| 162 |
-
def __init__(self, embedding_model: str = "all-MiniLM-L6-v2"):
|
| 163 |
-
self.embedding_model_name = embedding_model
|
| 164 |
-
self.vector_store_path = os.path.join(HF_CONFIG["data_dir"], "vectors.pkl")
|
| 165 |
-
self.metadata_path = os.path.join(HF_CONFIG["data_dir"], "metadata.pkl")
|
| 166 |
-
self.embedding_model = None
|
| 167 |
-
self.vectors = []
|
| 168 |
-
self.document_metadata = []
|
| 169 |
-
self.embedding_dimension = None
|
| 170 |
-
|
| 171 |
-
self.setup_embedding_model()
|
| 172 |
-
self.load_vector_store()
|
| 173 |
-
|
| 174 |
-
def setup_embedding_model(self):
|
| 175 |
-
"""Initialize the sentence transformer model"""
|
| 176 |
-
if not SENTENCE_TRANSFORMERS_AVAILABLE:
|
| 177 |
-
st.warning("⚠️ Sentence Transformers not available. Vector search disabled.")
|
| 178 |
-
return
|
| 179 |
-
|
| 180 |
-
try:
|
| 181 |
-
with st.spinner(f"Loading embedding model: {self.embedding_model_name}..."):
|
| 182 |
-
self.embedding_model = SentenceTransformer(
|
| 183 |
-
self.embedding_model_name,
|
| 184 |
-
cache_folder=HF_CONFIG["cache_dir"]
|
| 185 |
-
)
|
| 186 |
-
|
| 187 |
-
# Get embedding dimension
|
| 188 |
-
test_embedding = self.embedding_model.encode(["test"])
|
| 189 |
-
self.embedding_dimension = test_embedding.shape[0]
|
| 190 |
-
|
| 191 |
-
st.success(f"✅ Embedding model loaded: {self.embedding_model_name}")
|
| 192 |
-
|
| 193 |
-
except Exception as e:
|
| 194 |
-
st.error(f"❌ Failed to load embedding model: {e}")
|
| 195 |
-
self.embedding_model = None
|
| 196 |
-
|
| 197 |
-
def load_vector_store(self):
|
| 198 |
-
"""Load existing vector store"""
|
| 199 |
-
try:
|
| 200 |
-
if os.path.exists(self.vector_store_path) and os.path.exists(self.metadata_path):
|
| 201 |
-
with open(self.vector_store_path, 'rb') as f:
|
| 202 |
-
self.vectors = pickle.load(f)
|
| 203 |
-
|
| 204 |
-
with open(self.metadata_path, 'rb') as f:
|
| 205 |
-
self.document_metadata = pickle.load(f)
|
| 206 |
-
|
| 207 |
-
st.success(f"✅ Vector store loaded: {len(self.document_metadata)} documents")
|
| 208 |
-
else:
|
| 209 |
-
self.vectors = []
|
| 210 |
-
self.document_metadata = []
|
| 211 |
-
st.info("📄 New vector store initialized")
|
| 212 |
-
|
| 213 |
-
except Exception as e:
|
| 214 |
-
st.error(f"❌ Error loading vector store: {e}")
|
| 215 |
-
self.vectors = []
|
| 216 |
-
self.document_metadata = []
|
| 217 |
-
|
| 218 |
-
def save_vector_store(self):
|
| 219 |
-
"""Save vector store to disk"""
|
| 220 |
-
try:
|
| 221 |
-
with open(self.vector_store_path, 'wb') as f:
|
| 222 |
-
pickle.dump(self.vectors, f)
|
| 223 |
-
|
| 224 |
-
with open(self.metadata_path, 'wb') as f:
|
| 225 |
-
pickle.dump(self.document_metadata, f)
|
| 226 |
-
|
| 227 |
-
return True
|
| 228 |
-
except Exception as e:
|
| 229 |
-
st.error(f"Error saving vector store: {e}")
|
| 230 |
-
return False
|
| 231 |
-
|
| 232 |
-
def create_document_text(self, invoice_data: dict, raw_text: str = "") -> str:
|
| 233 |
-
"""Create searchable text from invoice data"""
|
| 234 |
-
text_parts = []
|
| 235 |
-
|
| 236 |
-
for field, value in invoice_data.items():
|
| 237 |
-
if value and field != 'id':
|
| 238 |
-
text_parts.append(f"{field}: {value}")
|
| 239 |
-
|
| 240 |
-
if raw_text:
|
| 241 |
-
text_parts.append(f"content: {raw_text[:300]}")
|
| 242 |
-
|
| 243 |
-
return " | ".join(text_parts)
|
| 244 |
-
|
| 245 |
-
def add_document(self, invoice_data: dict, raw_text: str = "") -> bool:
|
| 246 |
-
"""Add a document to the vector store"""
|
| 247 |
-
if not self.embedding_model:
|
| 248 |
-
return False
|
| 249 |
-
|
| 250 |
-
try:
|
| 251 |
-
document_text = self.create_document_text(invoice_data, raw_text)
|
| 252 |
-
|
| 253 |
-
# Generate embedding
|
| 254 |
-
embedding = self.embedding_model.encode(document_text, normalize_embeddings=True)
|
| 255 |
-
|
| 256 |
-
# Create metadata
|
| 257 |
-
metadata = {
|
| 258 |
-
'invoice_id': invoice_data.get('id', ''),
|
| 259 |
-
'invoice_number': invoice_data.get('invoice_number', ''),
|
| 260 |
-
'supplier_name': invoice_data.get('supplier_name', ''),
|
| 261 |
-
'buyer_name': invoice_data.get('buyer_name', ''),
|
| 262 |
-
'amount': invoice_data.get('amount', 0),
|
| 263 |
-
'date': invoice_data.get('date', ''),
|
| 264 |
-
'file_name': invoice_data.get('file_info', {}).get('file_name', ''),
|
| 265 |
-
'document_text': document_text[:200],
|
| 266 |
-
'timestamp': datetime.now().isoformat()
|
| 267 |
-
}
|
| 268 |
-
|
| 269 |
-
# Add to store
|
| 270 |
-
self.vectors.append(embedding)
|
| 271 |
-
self.document_metadata.append(metadata)
|
| 272 |
-
|
| 273 |
-
return True
|
| 274 |
-
|
| 275 |
-
except Exception as e:
|
| 276 |
-
st.error(f"Error adding document to vector store: {e}")
|
| 277 |
-
return False
|
| 278 |
-
|
| 279 |
-
def semantic_search(self, query: str, top_k: int = 5) -> List[VectorSearchResult]:
|
| 280 |
-
"""Perform semantic search using cosine similarity"""
|
| 281 |
-
if not self.embedding_model or not self.vectors:
|
| 282 |
-
return []
|
| 283 |
-
|
| 284 |
-
try:
|
| 285 |
-
# Generate query embedding
|
| 286 |
-
query_embedding = self.embedding_model.encode(query, normalize_embeddings=True)
|
| 287 |
-
|
| 288 |
-
# Calculate similarities
|
| 289 |
-
similarities = []
|
| 290 |
-
for i, doc_embedding in enumerate(self.vectors):
|
| 291 |
-
similarity = np.dot(query_embedding, doc_embedding)
|
| 292 |
-
similarities.append((similarity, i))
|
| 293 |
-
|
| 294 |
-
# Sort by similarity
|
| 295 |
-
similarities.sort(reverse=True)
|
| 296 |
-
|
| 297 |
-
# Return top results
|
| 298 |
-
results = []
|
| 299 |
-
for similarity, idx in similarities[:top_k]:
|
| 300 |
-
if similarity > 0.1: # Relevance threshold
|
| 301 |
-
metadata = self.document_metadata[idx]
|
| 302 |
-
result = VectorSearchResult(
|
| 303 |
-
invoice_id=metadata.get('invoice_id', ''),
|
| 304 |
-
invoice_number=metadata.get('invoice_number', ''),
|
| 305 |
-
supplier_name=metadata.get('supplier_name', ''),
|
| 306 |
-
similarity_score=float(similarity),
|
| 307 |
-
content_preview=metadata.get('document_text', ''),
|
| 308 |
-
metadata=metadata
|
| 309 |
-
)
|
| 310 |
-
results.append(result)
|
| 311 |
-
|
| 312 |
-
return results
|
| 313 |
-
|
| 314 |
-
except Exception as e:
|
| 315 |
-
st.error(f"Error in semantic search: {e}")
|
| 316 |
-
return []
|
| 317 |
-
|
| 318 |
-
def get_stats(self) -> Dict:
|
| 319 |
-
"""Get vector store statistics"""
|
| 320 |
-
return {
|
| 321 |
-
'total_documents': len(self.document_metadata),
|
| 322 |
-
'embedding_dimension': self.embedding_dimension,
|
| 323 |
-
'model_name': self.embedding_model_name,
|
| 324 |
-
'vector_store_size': len(self.vectors)
|
| 325 |
-
}
|
| 326 |
-
|
| 327 |
-
# ===============================================================================
|
| 328 |
-
# SIMPLIFIED DOCUMENT PROCESSING FOR HF
|
| 329 |
# ===============================================================================
|
| 330 |
|
| 331 |
-
class
|
| 332 |
"""Simplified document processor for Hugging Face Spaces"""
|
| 333 |
|
| 334 |
def __init__(self):
|
|
@@ -340,23 +155,17 @@ class HuggingFaceDocumentProcessor:
|
|
| 340 |
|
| 341 |
# PDF processing
|
| 342 |
if PDF_PROCESSING_AVAILABLE:
|
| 343 |
-
|
| 344 |
-
import pdfplumber
|
| 345 |
self.processors['pdf'] = self.extract_with_pdfplumber
|
| 346 |
st.success("✅ PDF processing available (pdfplumber)")
|
| 347 |
-
|
| 348 |
-
|
| 349 |
-
|
| 350 |
-
|
| 351 |
-
|
| 352 |
-
except ImportError:
|
| 353 |
-
st.warning("⚠️ No PDF processor available")
|
| 354 |
|
| 355 |
# Text files
|
| 356 |
self.processors['txt'] = self.extract_text_file
|
| 357 |
-
|
| 358 |
-
# Images (basic OCR alternative)
|
| 359 |
-
self.processors['image'] = self.extract_image_text
|
| 360 |
|
| 361 |
def extract_with_pdfplumber(self, file_path: str) -> str:
|
| 362 |
"""Extract text using pdfplumber"""
|
|
@@ -396,11 +205,6 @@ class HuggingFaceDocumentProcessor:
|
|
| 396 |
st.error(f"Text file extraction failed: {e}")
|
| 397 |
return ""
|
| 398 |
|
| 399 |
-
def extract_image_text(self, file_path: str) -> str:
|
| 400 |
-
"""Basic image text extraction (placeholder for OCR)"""
|
| 401 |
-
st.warning("⚠️ OCR not available in this environment. Please use text-based documents.")
|
| 402 |
-
return ""
|
| 403 |
-
|
| 404 |
def extract_text_from_document(self, file_path: str) -> str:
|
| 405 |
"""Extract text from document based on file type"""
|
| 406 |
file_ext = Path(file_path).suffix.lower()
|
|
@@ -409,8 +213,6 @@ class HuggingFaceDocumentProcessor:
|
|
| 409 |
processor = self.processors.get('pdf')
|
| 410 |
elif file_ext == '.txt':
|
| 411 |
processor = self.processors.get('txt')
|
| 412 |
-
elif file_ext in ['.jpg', '.jpeg', '.png']:
|
| 413 |
-
processor = self.processors.get('image')
|
| 414 |
else:
|
| 415 |
st.warning(f"Unsupported file type: {file_ext}")
|
| 416 |
return ""
|
|
@@ -422,11 +224,11 @@ class HuggingFaceDocumentProcessor:
|
|
| 422 |
return ""
|
| 423 |
|
| 424 |
# ===============================================================================
|
| 425 |
-
#
|
| 426 |
# ===============================================================================
|
| 427 |
|
| 428 |
-
class
|
| 429 |
-
"""
|
| 430 |
|
| 431 |
def __init__(self):
|
| 432 |
self.use_transformers = self.setup_transformers()
|
|
@@ -434,16 +236,12 @@ class HuggingFaceAIExtractor:
|
|
| 434 |
def setup_transformers(self):
|
| 435 |
"""Try to setup Hugging Face transformers for NER"""
|
| 436 |
try:
|
| 437 |
-
from transformers import pipeline
|
| 438 |
-
|
| 439 |
-
# Use a lightweight NER model
|
| 440 |
-
model_name = "dbmdz/bert-large-cased-finetuned-conll03-english"
|
| 441 |
|
| 442 |
with st.spinner("Loading AI extraction model..."):
|
| 443 |
self.ner_pipeline = pipeline(
|
| 444 |
"ner",
|
| 445 |
-
model=
|
| 446 |
-
tokenizer=model_name,
|
| 447 |
aggregation_strategy="simple"
|
| 448 |
)
|
| 449 |
|
|
@@ -614,17 +412,180 @@ class HuggingFaceAIExtractor:
|
|
| 614 |
return date_str
|
| 615 |
|
| 616 |
# ===============================================================================
|
| 617 |
-
#
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 618 |
# ===============================================================================
|
| 619 |
|
| 620 |
-
class
|
| 621 |
-
"""Main invoice processor
|
| 622 |
|
| 623 |
def __init__(self):
|
| 624 |
self.setup_storage()
|
| 625 |
-
self.document_processor =
|
| 626 |
-
self.ai_extractor =
|
| 627 |
-
self.vector_store =
|
| 628 |
|
| 629 |
# Initialize stats
|
| 630 |
self.processing_stats = {
|
|
@@ -786,13 +747,13 @@ class HuggingFaceInvoiceProcessor:
|
|
| 786 |
data["metadata"]["total_invoices"] = len(invoices)
|
| 787 |
|
| 788 |
# ===============================================================================
|
| 789 |
-
#
|
| 790 |
# ===============================================================================
|
| 791 |
|
| 792 |
-
class
|
| 793 |
-
"""
|
| 794 |
|
| 795 |
-
def __init__(self, processor:
|
| 796 |
self.processor = processor
|
| 797 |
|
| 798 |
def query_database(self, query: str) -> str:
|
|
@@ -819,7 +780,6 @@ class HuggingFaceChatBot:
|
|
| 819 |
elif any(phrase in query_lower for phrase in ["supplier", "vendor", "company"]):
|
| 820 |
return self.handle_supplier_query(data, query)
|
| 821 |
|
| 822 |
-
|
| 823 |
elif self.processor.vector_store:
|
| 824 |
return self.handle_semantic_search(query)
|
| 825 |
|
|
@@ -1005,40 +965,19 @@ class HuggingFaceChatBot:
|
|
| 1005 |
return response
|
| 1006 |
|
| 1007 |
# ===============================================================================
|
| 1008 |
-
# STREAMLIT APPLICATION
|
| 1009 |
-
# ===============================================================================
|
| 1010 |
-
|
| 1011 |
-
# ===============================================================================
|
| 1012 |
-
# FIXED MAIN APPLICATION WITH PROPER CHAT INPUT PLACEMENT
|
| 1013 |
-
# ===============================================================================
|
| 1014 |
-
|
| 1015 |
-
# ===============================================================================
|
| 1016 |
-
# FIXED APPLICATION WITH UNIQUE WIDGET KEYS
|
| 1017 |
-
# ===============================================================================
|
| 1018 |
-
|
| 1019 |
-
# ===============================================================================
|
| 1020 |
-
# FIXED APPLICATION WITH DYNAMIC UNIQUE KEYS AND SESSION STATE
|
| 1021 |
# ===============================================================================
|
| 1022 |
|
| 1023 |
-
|
| 1024 |
-
|
| 1025 |
-
|
| 1026 |
-
|
| 1027 |
-
|
| 1028 |
-
|
| 1029 |
-
import uuid
|
| 1030 |
-
|
| 1031 |
-
# Generate unique session ID for this run
|
| 1032 |
-
if 'session_id' not in st.session_state:
|
| 1033 |
-
st.session_state.session_id = str(uuid.uuid4())[:8]
|
| 1034 |
-
|
| 1035 |
-
def create_huggingface_app():
|
| 1036 |
-
"""Main Streamlit application optimized for Hugging Face Spaces"""
|
| 1037 |
|
| 1038 |
-
# Get unique session ID
|
| 1039 |
session_id = st.session_state.session_id
|
| 1040 |
|
| 1041 |
-
# Custom CSS
|
| 1042 |
st.markdown("""
|
| 1043 |
<style>
|
| 1044 |
.main-header {
|
|
@@ -1073,40 +1012,35 @@ def create_huggingface_app():
|
|
| 1073 |
""", unsafe_allow_html=True)
|
| 1074 |
|
| 1075 |
# Initialize processor
|
| 1076 |
-
if '
|
| 1077 |
with st.spinner("🔧 Initializing AI Invoice Processor..."):
|
| 1078 |
try:
|
| 1079 |
-
|
| 1080 |
-
|
| 1081 |
-
)
|
| 1082 |
-
st.session_state.hf_processor = HuggingFaceInvoiceProcessor()
|
| 1083 |
-
st.session_state.hf_chatbot = HuggingFaceChatBot(st.session_state.hf_processor)
|
| 1084 |
st.session_state.chat_history = []
|
| 1085 |
st.success("✅ System initialized successfully!")
|
| 1086 |
except Exception as e:
|
| 1087 |
st.error(f"❌ Initialization failed: {e}")
|
| 1088 |
st.stop()
|
| 1089 |
|
| 1090 |
-
# Sidebar
|
| 1091 |
with st.sidebar:
|
| 1092 |
st.header("🎛️ System Status")
|
| 1093 |
|
| 1094 |
-
processor = st.session_state.
|
| 1095 |
|
| 1096 |
-
#
|
| 1097 |
-
if
|
| 1098 |
st.markdown('<span class="status-ok">✅ Document Processing</span>', unsafe_allow_html=True)
|
| 1099 |
else:
|
| 1100 |
st.markdown('<span class="status-error">❌ Document Processing</span>', unsafe_allow_html=True)
|
| 1101 |
|
| 1102 |
-
|
| 1103 |
-
if hasattr(processor, 'ai_extractor') and processor.ai_extractor.use_transformers:
|
| 1104 |
st.markdown('<span class="status-ok">✅ AI Extraction</span>', unsafe_allow_html=True)
|
| 1105 |
else:
|
| 1106 |
st.markdown('<span class="status-warning">⚠️ Regex Extraction</span>', unsafe_allow_html=True)
|
| 1107 |
|
| 1108 |
-
|
| 1109 |
-
if hasattr(processor, 'vector_store') and processor.vector_store and processor.vector_store.embedding_model:
|
| 1110 |
st.markdown('<span class="status-ok">✅ Semantic Search</span>', unsafe_allow_html=True)
|
| 1111 |
else:
|
| 1112 |
st.markdown('<span class="status-warning">⚠️ Keyword Search Only</span>', unsafe_allow_html=True)
|
|
@@ -1120,15 +1054,12 @@ def create_huggingface_app():
|
|
| 1120 |
|
| 1121 |
st.metric("Total Invoices", total_invoices)
|
| 1122 |
st.metric("Total Value", f"₹{total_amount:,.2f}")
|
| 1123 |
-
|
| 1124 |
-
if hasattr(processor, 'processing_stats'):
|
| 1125 |
-
success_rate = f"{processor.processing_stats['successful']}/{processor.processing_stats['total_processed']}"
|
| 1126 |
-
st.metric("Success Rate", success_rate)
|
| 1127 |
|
| 1128 |
except Exception as e:
|
| 1129 |
st.error(f"Stats error: {e}")
|
| 1130 |
|
| 1131 |
-
#
|
| 1132 |
st.header("⚙️ System Info")
|
| 1133 |
st.info(f"""
|
| 1134 |
**Session ID:** {session_id}
|
|
@@ -1181,10 +1112,9 @@ def create_huggingface_app():
|
|
| 1181 |
</div>
|
| 1182 |
""", unsafe_allow_html=True)
|
| 1183 |
|
| 1184 |
-
# File upload
|
| 1185 |
st.markdown("### 📁 Upload Your Invoices")
|
| 1186 |
|
| 1187 |
-
# Use timestamp to ensure unique keys
|
| 1188 |
timestamp = datetime.now().strftime("%H%M%S")
|
| 1189 |
|
| 1190 |
uploaded_files = st.file_uploader(
|
|
@@ -1220,7 +1150,7 @@ def create_huggingface_app():
|
|
| 1220 |
with st.chat_message(message["role"]):
|
| 1221 |
st.markdown(message["content"])
|
| 1222 |
|
| 1223 |
-
# Chat input
|
| 1224 |
st.markdown("### ✍️ Ask a Question")
|
| 1225 |
|
| 1226 |
col1, col2 = st.columns([4, 1])
|
|
@@ -1282,7 +1212,7 @@ def create_huggingface_app():
|
|
| 1282 |
st.header("📊 Analytics Dashboard")
|
| 1283 |
|
| 1284 |
try:
|
| 1285 |
-
data = st.session_state.
|
| 1286 |
invoices = data.get("invoices", [])
|
| 1287 |
|
| 1288 |
if not invoices:
|
|
@@ -1350,7 +1280,7 @@ def create_huggingface_app():
|
|
| 1350 |
st.header("📋 Data Explorer")
|
| 1351 |
|
| 1352 |
try:
|
| 1353 |
-
data = st.session_state.
|
| 1354 |
invoices = data.get("invoices", [])
|
| 1355 |
|
| 1356 |
if not invoices:
|
|
@@ -1444,13 +1374,12 @@ def create_huggingface_app():
|
|
| 1444 |
st.error(f"Data explorer error: {e}")
|
| 1445 |
|
| 1446 |
# -------------------------------------------------------------------------
|
| 1447 |
-
# GLOBAL CHAT INPUT
|
| 1448 |
# -------------------------------------------------------------------------
|
| 1449 |
|
| 1450 |
st.markdown("---")
|
| 1451 |
st.markdown("### 💬 Quick Chat (Works from any section)")
|
| 1452 |
|
| 1453 |
-
# Global chat input with unique key
|
| 1454 |
global_query = st.chat_input("Ask about your invoices...", key=f"global_chat_{session_id}")
|
| 1455 |
|
| 1456 |
if global_query:
|
|
@@ -1485,8 +1414,7 @@ def process_files(uploaded_files, session_id):
|
|
| 1485 |
st.info(f"Processing: {uploaded_file.name}")
|
| 1486 |
|
| 1487 |
try:
|
| 1488 |
-
|
| 1489 |
-
result = st.session_state.hf_processor.process_uploaded_file(uploaded_file)
|
| 1490 |
|
| 1491 |
with results_container:
|
| 1492 |
if result and result.invoice_number:
|
|
@@ -1510,7 +1438,6 @@ def process_files(uploaded_files, session_id):
|
|
| 1510 |
with results_container:
|
| 1511 |
st.error(f"❌ Error processing {uploaded_file.name}: {str(e)[:100]}")
|
| 1512 |
|
| 1513 |
-
# Final status
|
| 1514 |
with status_container:
|
| 1515 |
st.success(f"✅ Processing complete! {successful} successful, {failed} failed")
|
| 1516 |
|
|
@@ -1519,17 +1446,15 @@ def process_files(uploaded_files, session_id):
|
|
| 1519 |
|
| 1520 |
def handle_chat_query(query, show_response=False):
|
| 1521 |
"""Handle chat query"""
|
| 1522 |
-
# Add user message
|
| 1523 |
st.session_state.chat_history.append({
|
| 1524 |
"role": "user",
|
| 1525 |
"content": query,
|
| 1526 |
"timestamp": datetime.now()
|
| 1527 |
})
|
| 1528 |
|
| 1529 |
-
# Get AI response
|
| 1530 |
try:
|
| 1531 |
with st.spinner("🤖 AI is analyzing..."):
|
| 1532 |
-
response = st.session_state.
|
| 1533 |
|
| 1534 |
st.session_state.chat_history.append({
|
| 1535 |
"role": "assistant",
|
|
@@ -1537,7 +1462,6 @@ def handle_chat_query(query, show_response=False):
|
|
| 1537 |
"timestamp": datetime.now()
|
| 1538 |
})
|
| 1539 |
|
| 1540 |
-
# Show response if requested
|
| 1541 |
if show_response:
|
| 1542 |
with st.chat_message("assistant"):
|
| 1543 |
st.markdown(response)
|
|
@@ -1555,26 +1479,10 @@ def handle_chat_query(query, show_response=False):
|
|
| 1555 |
def main():
|
| 1556 |
"""Main entry point for Hugging Face Spaces"""
|
| 1557 |
try:
|
| 1558 |
-
# Import required classes
|
| 1559 |
-
from enhanced_invoice_system_part1 import IS_HF_SPACE
|
| 1560 |
-
|
| 1561 |
-
# Display environment info
|
| 1562 |
if IS_HF_SPACE:
|
| 1563 |
st.sidebar.info("🤗 Running on Hugging Face Spaces")
|
| 1564 |
|
| 1565 |
-
|
| 1566 |
-
create_huggingface_app()
|
| 1567 |
-
|
| 1568 |
-
except ImportError as e:
|
| 1569 |
-
st.error(f"""
|
| 1570 |
-
## 🚨 Import Error
|
| 1571 |
-
|
| 1572 |
-
Missing required modules: {e}
|
| 1573 |
-
|
| 1574 |
-
Please ensure all files are uploaded to your Hugging Face Space:
|
| 1575 |
-
- enhanced_invoice_system_part1.py
|
| 1576 |
-
- enhanced_invoice_system_part2.py (this file)
|
| 1577 |
-
""")
|
| 1578 |
|
| 1579 |
except Exception as e:
|
| 1580 |
st.error(f"""
|
|
@@ -1585,103 +1493,5 @@ def main():
|
|
| 1585 |
Please refresh the page or check the logs for more details.
|
| 1586 |
""")
|
| 1587 |
|
| 1588 |
-
if __name__ == "__main__":
|
| 1589 |
-
main()
|
| 1590 |
-
# ===============================================================================
|
| 1591 |
-
# MAIN APPLICATION ENTRY POINT
|
| 1592 |
-
# ===============================================================================
|
| 1593 |
-
|
| 1594 |
-
def main():
|
| 1595 |
-
"""Main entry point for Hugging Face Spaces"""
|
| 1596 |
-
try:
|
| 1597 |
-
# Display Hugging Face info if running on HF Spaces
|
| 1598 |
-
if IS_HF_SPACE:
|
| 1599 |
-
st.sidebar.info("🤗 Running on Hugging Face Spaces")
|
| 1600 |
-
|
| 1601 |
-
# Create and run the app
|
| 1602 |
-
create_huggingface_app()
|
| 1603 |
-
|
| 1604 |
-
except Exception as e:
|
| 1605 |
-
st.error(f"Application error: {e}")
|
| 1606 |
-
st.info("Please refresh the page or contact support if the error persists.")
|
| 1607 |
-
|
| 1608 |
-
if __name__ == "__main__":
|
| 1609 |
-
main()
|
| 1610 |
-
|
| 1611 |
-
# ===============================================================================
|
| 1612 |
-
# MAIN APPLICATION ENTRY POINT
|
| 1613 |
-
# ===============================================================================
|
| 1614 |
-
|
| 1615 |
-
def main():
|
| 1616 |
-
"""Main entry point for Hugging Face Spaces"""
|
| 1617 |
-
try:
|
| 1618 |
-
# Display Hugging Face info if running on HF Spaces
|
| 1619 |
-
if IS_HF_SPACE:
|
| 1620 |
-
st.sidebar.info("🤗 Running on Hugging Face Spaces")
|
| 1621 |
-
|
| 1622 |
-
# Create and run the app
|
| 1623 |
-
create_huggingface_app()
|
| 1624 |
-
|
| 1625 |
-
except Exception as e:
|
| 1626 |
-
st.error(f"Application error: {e}")
|
| 1627 |
-
st.info("Please refresh the page or contact support if the error persists.")
|
| 1628 |
-
|
| 1629 |
-
if __name__ == "__main__":
|
| 1630 |
-
main()
|
| 1631 |
-
|
| 1632 |
-
# ===============================================================================
|
| 1633 |
-
# HUGGING FACE REQUIREMENTS AND CONFIGURATION
|
| 1634 |
-
# ===============================================================================
|
| 1635 |
-
|
| 1636 |
-
def generate_hf_requirements():
|
| 1637 |
-
"""Generate requirements.txt optimized for Hugging Face Spaces"""
|
| 1638 |
-
requirements = """streamlit>=1.28.0
|
| 1639 |
-
pandas>=1.5.0
|
| 1640 |
-
numpy>=1.21.0
|
| 1641 |
-
plotly>=5.0.0
|
| 1642 |
-
sentence-transformers>=2.2.0
|
| 1643 |
-
transformers>=4.21.0
|
| 1644 |
-
torch>=1.13.0
|
| 1645 |
-
faiss-cpu>=1.7.0
|
| 1646 |
-
pdfplumber>=0.7.0
|
| 1647 |
-
requests>=2.28.0
|
| 1648 |
-
python-dateutil>=2.8.0
|
| 1649 |
-
Pillow>=9.0.0
|
| 1650 |
-
"""
|
| 1651 |
-
return requirements.strip()
|
| 1652 |
-
|
| 1653 |
-
def generate_hf_config():
|
| 1654 |
-
"""Generate app configuration for Hugging Face Spaces"""
|
| 1655 |
-
config = {
|
| 1656 |
-
"title": "AI Invoice Processing System",
|
| 1657 |
-
"emoji": "📄",
|
| 1658 |
-
"colorFrom": "blue",
|
| 1659 |
-
"colorTo": "purple",
|
| 1660 |
-
"sdk": "streamlit",
|
| 1661 |
-
"sdk_version": "1.28.0",
|
| 1662 |
-
"app_file": "app.py",
|
| 1663 |
-
"pinned": False,
|
| 1664 |
-
"python_version": "3.9"
|
| 1665 |
-
}
|
| 1666 |
-
return config
|
| 1667 |
-
|
| 1668 |
-
# ===============================================================================
|
| 1669 |
-
# MAIN APPLICATION ENTRY POINT
|
| 1670 |
-
# ===============================================================================
|
| 1671 |
-
|
| 1672 |
-
def main():
|
| 1673 |
-
"""Main entry point for Hugging Face Spaces"""
|
| 1674 |
-
try:
|
| 1675 |
-
# Display Hugging Face info if running on HF Spaces
|
| 1676 |
-
if IS_HF_SPACE:
|
| 1677 |
-
st.sidebar.info("🤗 Running on Hugging Face Spaces")
|
| 1678 |
-
|
| 1679 |
-
# Create and run the app
|
| 1680 |
-
create_huggingface_app()
|
| 1681 |
-
|
| 1682 |
-
except Exception as e:
|
| 1683 |
-
st.error(f"Application error: {e}")
|
| 1684 |
-
st.info("Please refresh the page or contact support if the error persists.")
|
| 1685 |
-
|
| 1686 |
if __name__ == "__main__":
|
| 1687 |
main()
|
|
|
|
| 1 |
#!/usr/bin/env python3
|
| 2 |
"""
|
| 3 |
+
AI Invoice Processing System - Complete Single File for Hugging Face Spaces
|
| 4 |
A comprehensive system with AI-powered extraction, semantic search, and analytics.
|
| 5 |
|
| 6 |
Author: AI Assistant
|
| 7 |
Date: 2024
|
| 8 |
+
Version: HuggingFace Single File v1.0
|
| 9 |
"""
|
| 10 |
|
| 11 |
# ===============================================================================
|
| 12 |
+
# IMPORTS AND COMPATIBILITY CHECKS
|
| 13 |
# ===============================================================================
|
| 14 |
|
| 15 |
import os
|
|
|
|
| 25 |
from pathlib import Path
|
| 26 |
import time
|
| 27 |
import logging
|
| 28 |
+
import uuid
|
|
|
|
| 29 |
|
| 30 |
# Check if running on Hugging Face Spaces
|
| 31 |
IS_HF_SPACE = os.getenv("SPACE_ID") is not None
|
|
|
|
| 38 |
import plotly.graph_objects as go
|
| 39 |
import requests
|
| 40 |
|
| 41 |
+
# Vector storage and embeddings (with fallbacks)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 42 |
try:
|
| 43 |
import faiss
|
| 44 |
FAISS_AVAILABLE = True
|
|
|
|
| 60 |
TORCH_AVAILABLE = False
|
| 61 |
|
| 62 |
# Document processing (simplified for HF)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 63 |
try:
|
| 64 |
import pdfplumber
|
| 65 |
PDF_PROCESSING_AVAILABLE = True
|
| 66 |
+
PDF_PROCESSOR = "pdfplumber"
|
| 67 |
except ImportError:
|
| 68 |
try:
|
| 69 |
import PyPDF2
|
| 70 |
PDF_PROCESSING_AVAILABLE = True
|
| 71 |
+
PDF_PROCESSOR = "PyPDF2"
|
| 72 |
except ImportError:
|
| 73 |
PDF_PROCESSING_AVAILABLE = False
|
| 74 |
+
PDF_PROCESSOR = None
|
| 75 |
+
|
| 76 |
+
# ===============================================================================
# STREAMLIT CONFIGURATION
# ===============================================================================

# Page configuration has to be the first Streamlit command the script runs.
# Collected in a dict first so the settings read as one declarative table.
_page_settings = dict(
    page_title="AI Invoice Processing System",
    page_icon="📄",
    layout="wide",
    initial_sidebar_state="expanded",
    menu_items={
        'Get Help': 'https://huggingface.co/spaces',
        'Report a bug': 'https://huggingface.co/spaces',
        'About': """
        # AI Invoice Processing System
        Built for Hugging Face Spaces with AI-powered extraction and semantic search.
        """
    },
)
st.set_page_config(**_page_settings)
|
| 94 |
|
| 95 |
# ===============================================================================
# CONFIGURATION
# ===============================================================================

# Resource limits and filesystem paths sized for the Hugging Face Spaces runtime.
HF_CONFIG = dict(
    max_file_size_mb=10,                   # per-upload size cap
    max_concurrent_files=3,                # how many files to process at once
    timeout_seconds=30,
    use_cpu_only=True,                     # CPU-only inference flag
    embedding_model="all-MiniLM-L6-v2",    # sentence-transformers model name
    cache_dir="./cache",                   # model download cache
    data_dir="./data",                     # persisted vectors and JSON data
    enable_ollama=False,                   # Ollama integration disabled
)
|
| 109 |
|
| 110 |
# Create necessary directories
|
|
|
|
| 112 |
os.makedirs(HF_CONFIG["data_dir"], exist_ok=True)
|
| 113 |
|
| 114 |
# ===============================================================================
|
| 115 |
+
# DATA STRUCTURES
|
|
|
|
|
|
|
|
|
|
| 116 |
# ===============================================================================
|
| 117 |
|
| 118 |
@dataclass
|
| 119 |
class InvoiceData:
|
| 120 |
+
"""Data structure for extracted invoice information"""
|
| 121 |
supplier_name: str = ""
|
| 122 |
buyer_name: str = ""
|
| 123 |
invoice_number: str = ""
|
|
|
|
| 140 |
metadata: Dict
|
| 141 |
|
| 142 |
# ===============================================================================
|
| 143 |
+
# DOCUMENT PROCESSING CLASSES
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 144 |
# ===============================================================================
|
| 145 |
|
| 146 |
+
class DocumentProcessor:
|
| 147 |
"""Simplified document processor for Hugging Face Spaces"""
|
| 148 |
|
| 149 |
def __init__(self):
|
|
|
|
| 155 |
|
| 156 |
# PDF processing
|
| 157 |
if PDF_PROCESSING_AVAILABLE:
|
| 158 |
+
if PDF_PROCESSOR == "pdfplumber":
|
|
|
|
| 159 |
self.processors['pdf'] = self.extract_with_pdfplumber
|
| 160 |
st.success("✅ PDF processing available (pdfplumber)")
|
| 161 |
+
elif PDF_PROCESSOR == "PyPDF2":
|
| 162 |
+
self.processors['pdf'] = self.extract_with_pypdf2
|
| 163 |
+
st.success("✅ PDF processing available (PyPDF2)")
|
| 164 |
+
else:
|
| 165 |
+
st.warning("⚠️ No PDF processor available")
|
|
|
|
|
|
|
| 166 |
|
| 167 |
# Text files
|
| 168 |
self.processors['txt'] = self.extract_text_file
|
|
|
|
|
|
|
|
|
|
| 169 |
|
| 170 |
def extract_with_pdfplumber(self, file_path: str) -> str:
|
| 171 |
"""Extract text using pdfplumber"""
|
|
|
|
| 205 |
st.error(f"Text file extraction failed: {e}")
|
| 206 |
return ""
|
| 207 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 208 |
def extract_text_from_document(self, file_path: str) -> str:
|
| 209 |
"""Extract text from document based on file type"""
|
| 210 |
file_ext = Path(file_path).suffix.lower()
|
|
|
|
| 213 |
processor = self.processors.get('pdf')
|
| 214 |
elif file_ext == '.txt':
|
| 215 |
processor = self.processors.get('txt')
|
|
|
|
|
|
|
| 216 |
else:
|
| 217 |
st.warning(f"Unsupported file type: {file_ext}")
|
| 218 |
return ""
|
|
|
|
| 224 |
return ""
|
| 225 |
|
| 226 |
# ===============================================================================
|
| 227 |
+
# AI EXTRACTION CLASS
|
| 228 |
# ===============================================================================
|
| 229 |
|
| 230 |
+
class AIExtractor:
|
| 231 |
+
"""AI extraction for Hugging Face Spaces"""
|
| 232 |
|
| 233 |
def __init__(self):
|
| 234 |
self.use_transformers = self.setup_transformers()
|
|
|
|
| 236 |
def setup_transformers(self):
|
| 237 |
"""Try to setup Hugging Face transformers for NER"""
|
| 238 |
try:
|
| 239 |
+
from transformers import pipeline
|
|
|
|
|
|
|
|
|
|
| 240 |
|
| 241 |
with st.spinner("Loading AI extraction model..."):
|
| 242 |
self.ner_pipeline = pipeline(
|
| 243 |
"ner",
|
| 244 |
+
model="dbmdz/bert-large-cased-finetuned-conll03-english",
|
|
|
|
| 245 |
aggregation_strategy="simple"
|
| 246 |
)
|
| 247 |
|
|
|
|
| 412 |
return date_str
|
| 413 |
|
| 414 |
# ===============================================================================
# VECTOR STORE CLASS
# ===============================================================================

class VectorStore:
    """Simplified vector store for Hugging Face Spaces.

    Embeddings are kept in a plain Python list (no FAISS index) and persisted
    to disk with pickle under HF_CONFIG["data_dir"]. Embeddings are normalized
    at encode time, so cosine similarity reduces to a dot product and search
    is a brute-force linear scan — adequate for the small document counts
    expected on a Space.
    """

    def __init__(self, embedding_model: str = "all-MiniLM-L6-v2"):
        """Set up storage paths, load the embedding model, and restore any saved store.

        Args:
            embedding_model: Sentence-Transformers model name to load.
        """
        self.embedding_model_name = embedding_model
        self.vector_store_path = os.path.join(HF_CONFIG["data_dir"], "vectors.pkl")
        self.metadata_path = os.path.join(HF_CONFIG["data_dir"], "metadata.pkl")
        self.embedding_model = None
        self.vectors = []              # 1-D normalized embeddings, parallel to document_metadata
        self.document_metadata = []    # one metadata dict per stored document
        self.embedding_dimension = None

        self.setup_embedding_model()
        self.load_vector_store()

    def setup_embedding_model(self):
        """Initialize the sentence transformer model; leave search disabled on failure."""
        if not SENTENCE_TRANSFORMERS_AVAILABLE:
            st.warning("⚠️ Sentence Transformers not available. Vector search disabled.")
            return

        try:
            with st.spinner(f"Loading embedding model: {self.embedding_model_name}..."):
                self.embedding_model = SentenceTransformer(
                    self.embedding_model_name,
                    cache_folder=HF_CONFIG["cache_dir"]
                )

            # Fix: ask the model for its output width directly. The previous
            # code encoded ["test"] and read .shape[0], which is the batch
            # size (always 1), not the embedding dimension.
            self.embedding_dimension = self.embedding_model.get_sentence_embedding_dimension()

            st.success(f"✅ Embedding model loaded: {self.embedding_model_name}")

        except Exception as e:
            st.error(f"❌ Failed to load embedding model: {e}")
            self.embedding_model = None

    def load_vector_store(self):
        """Load an existing vector store from disk, falling back to an empty one."""
        try:
            if os.path.exists(self.vector_store_path) and os.path.exists(self.metadata_path):
                # NOTE: pickle is acceptable here only because these files are
                # written by this app itself; never point these paths at
                # untrusted data.
                with open(self.vector_store_path, 'rb') as f:
                    self.vectors = pickle.load(f)

                with open(self.metadata_path, 'rb') as f:
                    self.document_metadata = pickle.load(f)

                st.success(f"✅ Vector store loaded: {len(self.document_metadata)} documents")
            else:
                self.vectors = []
                self.document_metadata = []
                st.info("📄 New vector store initialized")

        except Exception as e:
            st.error(f"❌ Error loading vector store: {e}")
            self.vectors = []
            self.document_metadata = []

    def save_vector_store(self):
        """Persist vectors and metadata to disk.

        Returns:
            bool: True on success, False if either file could not be written.
        """
        try:
            with open(self.vector_store_path, 'wb') as f:
                pickle.dump(self.vectors, f)

            with open(self.metadata_path, 'wb') as f:
                pickle.dump(self.document_metadata, f)

            return True
        except Exception as e:
            st.error(f"Error saving vector store: {e}")
            return False

    def create_document_text(self, invoice_data: dict, raw_text: str = "") -> str:
        """Create searchable text from invoice data.

        Joins every truthy field (except 'id') as "field: value" pairs, plus
        the first 300 characters of the raw document text, with " | ".
        """
        text_parts = []

        for field, value in invoice_data.items():
            if value and field != 'id':
                text_parts.append(f"{field}: {value}")

        if raw_text:
            text_parts.append(f"content: {raw_text[:300]}")

        return " | ".join(text_parts)

    def add_document(self, invoice_data: dict, raw_text: str = "") -> bool:
        """Embed one invoice and append it to the in-memory store.

        The caller is responsible for calling save_vector_store() afterwards
        to persist the addition.

        Returns:
            bool: True if the document was added, False otherwise (including
            when no embedding model is loaded).
        """
        if not self.embedding_model:
            return False

        try:
            document_text = self.create_document_text(invoice_data, raw_text)

            # Normalized embeddings let semantic_search use a plain dot product.
            embedding = self.embedding_model.encode(document_text, normalize_embeddings=True)

            metadata = {
                'invoice_id': invoice_data.get('id', ''),
                'invoice_number': invoice_data.get('invoice_number', ''),
                'supplier_name': invoice_data.get('supplier_name', ''),
                'buyer_name': invoice_data.get('buyer_name', ''),
                'amount': invoice_data.get('amount', 0),
                'date': invoice_data.get('date', ''),
                'file_name': invoice_data.get('file_info', {}).get('file_name', ''),
                'document_text': document_text[:200],  # preview only, keeps metadata small
                'timestamp': datetime.now().isoformat()
            }

            self.vectors.append(embedding)
            self.document_metadata.append(metadata)

            return True

        except Exception as e:
            st.error(f"Error adding document to vector store: {e}")
            return False

    def semantic_search(self, query: str, top_k: int = 5) -> "List[VectorSearchResult]":
        """Perform semantic search using cosine similarity.

        Args:
            query: Free-text search query.
            top_k: Maximum number of results to return.

        Returns:
            Results with similarity above 0.1, best first; empty list when the
            model is unavailable or the store is empty.
        """
        if not self.embedding_model or not self.vectors:
            return []

        try:
            query_embedding = self.embedding_model.encode(query, normalize_embeddings=True)

            # Brute-force scan: dot product of normalized vectors == cosine similarity.
            similarities = []
            for i, doc_embedding in enumerate(self.vectors):
                similarity = np.dot(query_embedding, doc_embedding)
                similarities.append((similarity, i))

            similarities.sort(reverse=True)

            results = []
            for similarity, idx in similarities[:top_k]:
                if similarity > 0.1:  # relevance threshold: drop near-orthogonal matches
                    metadata = self.document_metadata[idx]
                    result = VectorSearchResult(
                        invoice_id=metadata.get('invoice_id', ''),
                        invoice_number=metadata.get('invoice_number', ''),
                        supplier_name=metadata.get('supplier_name', ''),
                        similarity_score=float(similarity),
                        content_preview=metadata.get('document_text', ''),
                        metadata=metadata
                    )
                    results.append(result)

            return results

        except Exception as e:
            st.error(f"Error in semantic search: {e}")
            return []
|
| 576 |
+
|
| 577 |
+
# ===============================================================================
|
| 578 |
+
# MAIN PROCESSOR CLASS
|
| 579 |
# ===============================================================================
|
| 580 |
|
| 581 |
+
class InvoiceProcessor:
|
| 582 |
+
"""Main invoice processor for Hugging Face Spaces"""
|
| 583 |
|
| 584 |
def __init__(self):
|
| 585 |
self.setup_storage()
|
| 586 |
+
self.document_processor = DocumentProcessor()
|
| 587 |
+
self.ai_extractor = AIExtractor()
|
| 588 |
+
self.vector_store = VectorStore() if SENTENCE_TRANSFORMERS_AVAILABLE else None
|
| 589 |
|
| 590 |
# Initialize stats
|
| 591 |
self.processing_stats = {
|
|
|
|
| 747 |
data["metadata"]["total_invoices"] = len(invoices)
|
| 748 |
|
| 749 |
# ===============================================================================
|
| 750 |
+
# CHATBOT CLASS
|
| 751 |
# ===============================================================================
|
| 752 |
|
| 753 |
+
class ChatBot:
|
| 754 |
+
"""Chatbot for invoice queries"""
|
| 755 |
|
| 756 |
+
def __init__(self, processor: InvoiceProcessor):
|
| 757 |
self.processor = processor
|
| 758 |
|
| 759 |
def query_database(self, query: str) -> str:
|
|
|
|
| 780 |
elif any(phrase in query_lower for phrase in ["supplier", "vendor", "company"]):
|
| 781 |
return self.handle_supplier_query(data, query)
|
| 782 |
|
|
|
|
| 783 |
elif self.processor.vector_store:
|
| 784 |
return self.handle_semantic_search(query)
|
| 785 |
|
|
|
|
| 965 |
return response
|
| 966 |
|
| 967 |
# ===============================================================================
|
| 968 |
+
# STREAMLIT APPLICATION
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 969 |
# ===============================================================================
|
| 970 |
|
| 971 |
+
def create_app():
|
| 972 |
+
"""Main Streamlit application"""
|
| 973 |
+
|
| 974 |
+
# Generate unique session ID for this run
|
| 975 |
+
if 'session_id' not in st.session_state:
|
| 976 |
+
st.session_state.session_id = str(uuid.uuid4())[:8]
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 977 |
|
|
|
|
| 978 |
session_id = st.session_state.session_id
|
| 979 |
|
| 980 |
+
# Custom CSS
|
| 981 |
st.markdown("""
|
| 982 |
<style>
|
| 983 |
.main-header {
|
|
|
|
| 1012 |
""", unsafe_allow_html=True)
|
| 1013 |
|
| 1014 |
# Initialize processor
|
| 1015 |
+
if 'processor' not in st.session_state:
|
| 1016 |
with st.spinner("🔧 Initializing AI Invoice Processor..."):
|
| 1017 |
try:
|
| 1018 |
+
st.session_state.processor = InvoiceProcessor()
|
| 1019 |
+
st.session_state.chatbot = ChatBot(st.session_state.processor)
|
|
|
|
|
|
|
|
|
|
| 1020 |
st.session_state.chat_history = []
|
| 1021 |
st.success("✅ System initialized successfully!")
|
| 1022 |
except Exception as e:
|
| 1023 |
st.error(f"❌ Initialization failed: {e}")
|
| 1024 |
st.stop()
|
| 1025 |
|
| 1026 |
+
# Sidebar
|
| 1027 |
with st.sidebar:
|
| 1028 |
st.header("🎛️ System Status")
|
| 1029 |
|
| 1030 |
+
processor = st.session_state.processor
|
| 1031 |
|
| 1032 |
+
# Component status
|
| 1033 |
+
if processor.document_processor.processors:
|
| 1034 |
st.markdown('<span class="status-ok">✅ Document Processing</span>', unsafe_allow_html=True)
|
| 1035 |
else:
|
| 1036 |
st.markdown('<span class="status-error">❌ Document Processing</span>', unsafe_allow_html=True)
|
| 1037 |
|
| 1038 |
+
if processor.ai_extractor.use_transformers:
|
|
|
|
| 1039 |
st.markdown('<span class="status-ok">✅ AI Extraction</span>', unsafe_allow_html=True)
|
| 1040 |
else:
|
| 1041 |
st.markdown('<span class="status-warning">⚠️ Regex Extraction</span>', unsafe_allow_html=True)
|
| 1042 |
|
| 1043 |
+
if processor.vector_store and processor.vector_store.embedding_model:
|
|
|
|
| 1044 |
st.markdown('<span class="status-ok">✅ Semantic Search</span>', unsafe_allow_html=True)
|
| 1045 |
else:
|
| 1046 |
st.markdown('<span class="status-warning">⚠️ Keyword Search Only</span>', unsafe_allow_html=True)
|
|
|
|
| 1054 |
|
| 1055 |
st.metric("Total Invoices", total_invoices)
|
| 1056 |
st.metric("Total Value", f"₹{total_amount:,.2f}")
|
| 1057 |
+
st.metric("Success Rate", f"{processor.processing_stats['successful']}/{processor.processing_stats['total_processed']}")
|
|
|
|
|
|
|
|
|
|
| 1058 |
|
| 1059 |
except Exception as e:
|
| 1060 |
st.error(f"Stats error: {e}")
|
| 1061 |
|
| 1062 |
+
# System info
|
| 1063 |
st.header("⚙️ System Info")
|
| 1064 |
st.info(f"""
|
| 1065 |
**Session ID:** {session_id}
|
|
|
|
| 1112 |
</div>
|
| 1113 |
""", unsafe_allow_html=True)
|
| 1114 |
|
| 1115 |
+
# File upload
|
| 1116 |
st.markdown("### 📁 Upload Your Invoices")
|
| 1117 |
|
|
|
|
| 1118 |
timestamp = datetime.now().strftime("%H%M%S")
|
| 1119 |
|
| 1120 |
uploaded_files = st.file_uploader(
|
|
|
|
| 1150 |
with st.chat_message(message["role"]):
|
| 1151 |
st.markdown(message["content"])
|
| 1152 |
|
| 1153 |
+
# Chat input
|
| 1154 |
st.markdown("### ✍️ Ask a Question")
|
| 1155 |
|
| 1156 |
col1, col2 = st.columns([4, 1])
|
|
|
|
| 1212 |
st.header("📊 Analytics Dashboard")
|
| 1213 |
|
| 1214 |
try:
|
| 1215 |
+
data = st.session_state.processor.load_json_data()
|
| 1216 |
invoices = data.get("invoices", [])
|
| 1217 |
|
| 1218 |
if not invoices:
|
|
|
|
| 1280 |
st.header("📋 Data Explorer")
|
| 1281 |
|
| 1282 |
try:
|
| 1283 |
+
data = st.session_state.processor.load_json_data()
|
| 1284 |
invoices = data.get("invoices", [])
|
| 1285 |
|
| 1286 |
if not invoices:
|
|
|
|
| 1374 |
st.error(f"Data explorer error: {e}")
|
| 1375 |
|
| 1376 |
# -------------------------------------------------------------------------
|
| 1377 |
+
# GLOBAL CHAT INPUT
|
| 1378 |
# -------------------------------------------------------------------------
|
| 1379 |
|
| 1380 |
st.markdown("---")
|
| 1381 |
st.markdown("### 💬 Quick Chat (Works from any section)")
|
| 1382 |
|
|
|
|
| 1383 |
global_query = st.chat_input("Ask about your invoices...", key=f"global_chat_{session_id}")
|
| 1384 |
|
| 1385 |
if global_query:
|
|
|
|
| 1414 |
st.info(f"Processing: {uploaded_file.name}")
|
| 1415 |
|
| 1416 |
try:
|
| 1417 |
+
result = st.session_state.processor.process_uploaded_file(uploaded_file)
|
|
|
|
| 1418 |
|
| 1419 |
with results_container:
|
| 1420 |
if result and result.invoice_number:
|
|
|
|
| 1438 |
with results_container:
|
| 1439 |
st.error(f"❌ Error processing {uploaded_file.name}: {str(e)[:100]}")
|
| 1440 |
|
|
|
|
| 1441 |
with status_container:
|
| 1442 |
st.success(f"✅ Processing complete! {successful} successful, {failed} failed")
|
| 1443 |
|
|
|
|
| 1446 |
|
| 1447 |
def handle_chat_query(query, show_response=False):
|
| 1448 |
"""Handle chat query"""
|
|
|
|
| 1449 |
st.session_state.chat_history.append({
|
| 1450 |
"role": "user",
|
| 1451 |
"content": query,
|
| 1452 |
"timestamp": datetime.now()
|
| 1453 |
})
|
| 1454 |
|
|
|
|
| 1455 |
try:
|
| 1456 |
with st.spinner("🤖 AI is analyzing..."):
|
| 1457 |
+
response = st.session_state.chatbot.query_database(query)
|
| 1458 |
|
| 1459 |
st.session_state.chat_history.append({
|
| 1460 |
"role": "assistant",
|
|
|
|
| 1462 |
"timestamp": datetime.now()
|
| 1463 |
})
|
| 1464 |
|
|
|
|
| 1465 |
if show_response:
|
| 1466 |
with st.chat_message("assistant"):
|
| 1467 |
st.markdown(response)
|
|
|
|
| 1479 |
def main():
|
| 1480 |
"""Main entry point for Hugging Face Spaces"""
|
| 1481 |
try:
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1482 |
if IS_HF_SPACE:
|
| 1483 |
st.sidebar.info("🤗 Running on Hugging Face Spaces")
|
| 1484 |
|
| 1485 |
+
create_app()
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1486 |
|
| 1487 |
except Exception as e:
|
| 1488 |
st.error(f"""
|
|
|
|
| 1493 |
Please refresh the page or check the logs for more details.
|
| 1494 |
""")
|
| 1495 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1496 |
if __name__ == "__main__":
|
| 1497 |
main()
|