diff --git "a/streamlit_app.py" "b/streamlit_app.py" --- "a/streamlit_app.py" +++ "b/streamlit_app.py" @@ -1,5392 +1,1506 @@ -#!/usr/bin/env python3 -""" -Enhanced Invoice Processing & Analysis System with Vector Storage -A comprehensive system with Docling, Mistral AI, JSON storage, and semantic search capabilities. - -Author: AI Assistant -Date: 2024 -""" - -# =============================================================================== -# IMPORTS -# =============================================================================== - -# Standard library imports -import os -import json -import re -import tempfile -import shutil -import pickle -import numpy as np -from datetime import datetime -from typing import Dict, List, Optional, Tuple -from dataclasses import dataclass -from pathlib import Path - -# Third-party imports -import streamlit as st -import sqlite3 -import pandas as pd -import plotly.express as px -import plotly.graph_objects as go -import requests -import ollama - -# Vector storage and embeddings -import faiss -from sentence_transformers import SentenceTransformer -import torch -import importlib - -# Docling imports -from docling.document_converter import DocumentConverter -from docling.datamodel.base_models import InputFormat -from docling.datamodel.pipeline_options import PdfPipelineOptions -from docling.document_converter import PdfFormatOption - -# =============================================================================== -# STREAMLIT CONFIGURATION -# =============================================================================== - -st.set_page_config( - page_title="Enhanced Invoice Processing & Analysis System", - page_icon="šŸ“„", - layout="wide", - initial_sidebar_state="expanded" -) - -# =============================================================================== -# DATA STRUCTURES -# =============================================================================== - -@dataclass -class InvoiceData: - """Data structure for extracted 
invoice information""" - supplier_name: str = "" - buyer_name: str = "" - invoice_number: str = "" - date: str = "" - amount: float = 0.0 - quantity: int = 0 - product_description: str = "" - file_path: str = "" - extraction_confidence: float = 0.0 - -@dataclass -class VectorSearchResult: - """Data structure for vector search results""" - invoice_id: str - invoice_number: str - supplier_name: str - similarity_score: float - content_preview: str - metadata: Dict - -# =============================================================================== -# VECTOR STORAGE CLASS -# =============================================================================== - -class InvoiceVectorStore: - """Handles vector embeddings and semantic search for invoices""" - - def __init__(self, - embedding_model: str = "all-MiniLM-L6-v2", - vector_store_path: str = "invoice_vectors.faiss", - metadata_path: str = "vector_metadata.pkl"): - - self.embedding_model_name = embedding_model - self.vector_store_path = vector_store_path - self.metadata_path = metadata_path - self.embedding_model = None - self.vector_store = None - self.document_metadata = [] - self.embedding_dimension = None - - self.setup_embedding_model() - self.load_vector_store() - - def setup_embedding_model(self): - """Initialize the sentence transformer model""" - try: - self.embedding_model = SentenceTransformer(self.embedding_model_name) - # Get embedding dimension - test_embedding = self.embedding_model.encode(["test"]) - self.embedding_dimension = test_embedding.shape[1] - st.success(f"āœ… Embedding model loaded: {self.embedding_model_name} (dim: {self.embedding_dimension})") - except Exception as e: - st.error(f"āŒ Failed to load embedding model: {e}") - self.embedding_model = None - - def load_vector_store(self): - """Load existing FAISS vector store and metadata""" - try: - if os.path.exists(self.vector_store_path) and os.path.exists(self.metadata_path): - # Load FAISS index - self.vector_store = 
faiss.read_index(self.vector_store_path) - - # Load metadata - with open(self.metadata_path, 'rb') as f: - self.document_metadata = pickle.load(f) - - st.success(f"āœ… Vector store loaded: {len(self.document_metadata)} documents") - else: - # Initialize new vector store - if self.embedding_dimension: - self.vector_store = faiss.IndexFlatIP(self.embedding_dimension) - self.document_metadata = [] - st.info("šŸ“„ New vector store initialized") - except Exception as e: - st.error(f"āŒ Error loading vector store: {e}") - if self.embedding_dimension: - self.vector_store = faiss.IndexFlatIP(self.embedding_dimension) - self.document_metadata = [] - - def save_vector_store(self): - """Save FAISS vector store and metadata to disk""" - try: - if self.vector_store: - faiss.write_index(self.vector_store, self.vector_store_path) - - with open(self.metadata_path, 'wb') as f: - pickle.dump(self.document_metadata, f) - - return True - except Exception as e: - st.error(f"Error saving vector store: {e}") - return False - - def create_document_text(self, invoice_data: dict, raw_text: str = "") -> str: - """Create searchable text from invoice data""" - text_parts = [] - - # Basic information - if invoice_data.get('invoice_number'): - text_parts.append(f"Invoice Number: {invoice_data['invoice_number']}") - - if invoice_data.get('supplier_name'): - text_parts.append(f"Supplier: {invoice_data['supplier_name']}") - - if invoice_data.get('buyer_name'): - text_parts.append(f"Buyer: {invoice_data['buyer_name']}") - - if invoice_data.get('product_description'): - text_parts.append(f"Description: {invoice_data['product_description']}") - - if invoice_data.get('amount'): - text_parts.append(f"Amount: {invoice_data['amount']}") - - if invoice_data.get('date'): - text_parts.append(f"Date: {invoice_data['date']}") - - # Add raw text preview for additional context - if raw_text: - text_parts.append(f"Content: {raw_text[:500]}") - - return " | ".join(text_parts) - - def add_document(self, 
invoice_data: dict, raw_text: str = "") -> bool: - """Add a document to the vector store""" - if not self.embedding_model or not self.vector_store: - return False - - try: - # Create searchable text - document_text = self.create_document_text(invoice_data, raw_text) - - # Generate embedding - embedding = self.embedding_model.encode([document_text], normalize_embeddings=True) - - # Create metadata - metadata = { - 'invoice_id': invoice_data.get('id', ''), - 'invoice_number': invoice_data.get('invoice_number', ''), - 'supplier_name': invoice_data.get('supplier_name', ''), - 'buyer_name': invoice_data.get('buyer_name', ''), - 'amount': invoice_data.get('amount', 0), - 'date': invoice_data.get('date', ''), - 'file_name': invoice_data.get('file_info', {}).get('file_name', ''), - 'extraction_confidence': invoice_data.get('extraction_info', {}).get('confidence', 0), - 'document_text': document_text[:200], # Preview - 'timestamp': datetime.now().isoformat() - } - - # Check if document already exists (by invoice_number) - existing_idx = None - for i, meta in enumerate(self.document_metadata): - if meta.get('invoice_number') == metadata['invoice_number'] and metadata['invoice_number']: - existing_idx = i - break - - if existing_idx is not None: - # Update existing document - self.vector_store.remove_ids(np.array([existing_idx])) - self.document_metadata[existing_idx] = metadata - self.vector_store.add(embedding.astype('float32')) - else: - # Add new document - self.vector_store.add(embedding.astype('float32')) - self.document_metadata.append(metadata) - - return True - - except Exception as e: - st.error(f"Error adding document to vector store: {e}") - return False - - def semantic_search(self, query: str, top_k: int = 5) -> List[VectorSearchResult]: - """Perform semantic search on the vector store""" - if not self.embedding_model or not self.vector_store or len(self.document_metadata) == 0: - return [] - - try: - # Generate query embedding - query_embedding = 
self.embedding_model.encode([query], normalize_embeddings=True) - - # Search in vector store - scores, indices = self.vector_store.search( - query_embedding.astype('float32'), - min(top_k, len(self.document_metadata)) - ) - - results = [] - for score, idx in zip(scores[0], indices[0]): - if idx < len(self.document_metadata) and score > 0.1: # Relevance threshold - metadata = self.document_metadata[idx] - result = VectorSearchResult( - invoice_id=metadata.get('invoice_id', ''), - invoice_number=metadata.get('invoice_number', ''), - supplier_name=metadata.get('supplier_name', ''), - similarity_score=float(score), - content_preview=metadata.get('document_text', ''), - metadata=metadata - ) - results.append(result) - - return results - - except Exception as e: - st.error(f"Error in semantic search: {e}") - return [] - - def rebuild_vector_store(self, json_data: dict) -> bool: - """Rebuild vector store from JSON data""" - if not self.embedding_model: - return False - - try: - # Clear existing store - if self.embedding_dimension: - self.vector_store = faiss.IndexFlatIP(self.embedding_dimension) - self.document_metadata = [] - - invoices = json_data.get('invoices', []) - if not invoices: - return True - - st.info(f"Rebuilding vector store with {len(invoices)} invoices...") - progress_bar = st.progress(0) - - for i, invoice in enumerate(invoices): - # Get raw text if available - raw_text = invoice.get('extraction_info', {}).get('raw_text_preview', '') - - # Add to vector store - self.add_document(invoice, raw_text) - - # Update progress - progress_bar.progress((i + 1) / len(invoices)) - - # Save to disk - self.save_vector_store() - st.success(f"āœ… Vector store rebuilt with {len(self.document_metadata)} documents") - return True - - except Exception as e: - st.error(f"Error rebuilding vector store: {e}") - return False - - def get_stats(self) -> Dict: - """Get vector store statistics""" - return { - 'total_documents': len(self.document_metadata), - 'embedding_dimension': 
self.embedding_dimension, - 'model_name': self.embedding_model_name, - 'vector_store_size': self.vector_store.ntotal if self.vector_store else 0 - } - -# =============================================================================== -# ENHANCED INVOICE PROCESSOR -# =============================================================================== - -class EnhancedInvoiceProcessor: - """Enhanced invoice processor with vector storage capabilities""" - - def __init__(self, - db_path: str = "invoices.db", - json_path: str = "invoices_data.json", - model_name: str = "mistral:7b", - embedding_model: str = "all-MiniLM-L6-v2"): - - self.db_path = db_path - self.json_path = json_path - self.model_name = model_name - - # Initialize components - self.setup_database() - self.setup_ollama() - self.setup_docling() - self.setup_json_storage() - - # Initialize vector store - self.vector_store = InvoiceVectorStore(embedding_model=embedding_model) - - # Copy all the existing setup methods from your original class - def setup_database(self): - """Initialize SQLite database with enhanced schema""" - conn = sqlite3.connect(self.db_path) - cursor = conn.cursor() - - # Main invoices table - cursor.execute(''' - CREATE TABLE IF NOT EXISTS invoices ( - id INTEGER PRIMARY KEY AUTOINCREMENT, - supplier_name TEXT, - buyer_name TEXT, - invoice_number TEXT UNIQUE, - date TEXT, - amount REAL, - quantity INTEGER, - product_description TEXT, - file_path TEXT, - file_name TEXT, - file_size INTEGER, - file_type TEXT, - extraction_confidence REAL, - raw_text TEXT, - created_at TIMESTAMP DEFAULT CURRENT_TIMESTAMP, - updated_at TIMESTAMP DEFAULT CURRENT_TIMESTAMP - ) - ''') - - # Processing summary table - cursor.execute(''' - CREATE TABLE IF NOT EXISTS processing_summary ( - id INTEGER PRIMARY KEY AUTOINCREMENT, - session_date DATE, - total_files_processed INTEGER, - successful_extractions INTEGER, - failed_extractions INTEGER, - total_amount_processed REAL, - unique_suppliers INTEGER, - unique_buyers 
INTEGER, - processing_time_seconds REAL, - created_at TIMESTAMP DEFAULT CURRENT_TIMESTAMP - ) - ''') - - # File processing log table - cursor.execute(''' - CREATE TABLE IF NOT EXISTS file_processing_log ( - id INTEGER PRIMARY KEY AUTOINCREMENT, - file_name TEXT, - file_path TEXT, - file_size INTEGER, - processing_status TEXT, - error_message TEXT, - processing_time_seconds REAL, - created_at TIMESTAMP DEFAULT CURRENT_TIMESTAMP - ) - ''') - - conn.commit() - conn.close() - - def setup_ollama(self): - """Setup Ollama for local LLM processing""" - try: - response = requests.get('http://localhost:11434/api/tags', timeout=5) - if response.status_code == 200: - models = response.json().get('models', []) - model_names = [model['name'] for model in models] - - if self.model_name not in model_names: - st.warning(f"Model {self.model_name} not found. Available: {model_names}") - st.info(f"Run: `ollama pull {self.model_name}`") - self.use_ai = False - else: - self.use_ai = True - st.success(f"Using {self.model_name} for processing") - else: - st.error("Ollama not responding") - self.use_ai = False - except Exception as e: - st.error(f"Ollama setup error: {e}") - st.info("Start Ollama with: `ollama serve`") - self.use_ai = False - - def setup_docling(self): - """Initialize Docling document converter""" - try: - pipeline_options = PdfPipelineOptions() - pipeline_options.do_ocr = True - pipeline_options.do_table_structure = True - pipeline_options.table_structure_options.do_cell_matching = True - - self.doc_converter = DocumentConverter( - format_options={ - InputFormat.PDF: PdfFormatOption(pipeline_options=pipeline_options), - InputFormat.IMAGE: PdfFormatOption(pipeline_options=pipeline_options), - } - ) - - self.docling_available = True - st.success("Docling initialized successfully") - - except Exception as e: - st.error(f"Docling initialization failed: {e}") - self.docling_available = False - - def setup_json_storage(self): - """Initialize JSON storage file with proper 
structure""" - if not os.path.exists(self.json_path): - initial_data = { - "metadata": { - "created_at": datetime.now().isoformat(), - "last_updated": datetime.now().isoformat(), - "total_invoices": 0, - "version": "1.0", - "vector_store_enabled": True - }, - "invoices": [], - "summary": { - "total_amount": 0.0, - "total_quantity": 0, - "unique_suppliers": [], - "unique_buyers": [], - "processing_stats": { - "successful": 0, - "failed": 0, - "total_processed": 0 - } - } - } - self.save_json_data(initial_data) - - # Copy all existing JSON methods from your original class - def load_json_data(self) -> dict: - """Load invoice data from JSON file""" - try: - with open(self.json_path, 'r', encoding='utf-8') as f: - return json.load(f) - except (FileNotFoundError, json.JSONDecodeError): - self.setup_json_storage() - return self.load_json_data() - - def save_json_data(self, data: dict): - """Save invoice data to JSON file""" - try: - with open(self.json_path, 'w', encoding='utf-8') as f: - json.dump(data, f, indent=2, ensure_ascii=False) - except Exception as e: - st.error(f"Error saving JSON data: {e}") - - def update_json_summary(self, data: dict): - """Update summary statistics in JSON data""" - invoices = data.get("invoices", []) - - # Calculate totals - total_amount = sum(inv.get("amount", 0) for inv in invoices) - total_quantity = sum(inv.get("quantity", 0) for inv in invoices) - - # Get unique suppliers and buyers - unique_suppliers = list(set(inv.get("supplier_name", "") for inv in invoices if inv.get("supplier_name"))) - unique_buyers = list(set(inv.get("buyer_name", "") for inv in invoices if inv.get("buyer_name"))) - - # Update summary - data["summary"] = { - "total_amount": total_amount, - "total_quantity": total_quantity, - "unique_suppliers": unique_suppliers, - "unique_buyers": unique_buyers, - "processing_stats": data.get("summary", {}).get("processing_stats", { - "successful": len([inv for inv in invoices if inv.get("invoice_number")]), - "failed": 0, - 
"total_processed": len(invoices) - }) - } - - # Update metadata - data["metadata"]["last_updated"] = datetime.now().isoformat() - data["metadata"]["total_invoices"] = len(invoices) - - return data - - # Copy all existing document processing methods - def extract_text_from_document(self, file_path: str) -> str: - """Extract text using Docling""" - if not self.docling_available: - return "" - - try: - result = self.doc_converter.convert(file_path) - - if not result.document: - return "" - - markdown_text = result.document.export_to_markdown() - - # Extract tables - tables_text = "" - if hasattr(result.document, 'tables') and result.document.tables: - tables_text = "\n\nTABLES:\n" - for i, table in enumerate(result.document.tables): - tables_text += f"\nTable {i+1}:\n" - tables_text += str(table.export_to_markdown()) - - return markdown_text + tables_text - - except Exception as e: - st.error(f"Document extraction failed: {e}") - return "" - - def extract_invoice_info_with_ai(self, text: str, file_path: str) -> InvoiceData: - """Use Mistral to extract structured information""" - if not self.use_ai: - return self.extract_invoice_info_regex(text) - - try: - prompt = f""" - Extract invoice information from this document and return ONLY valid JSON: - - {{ - "supplier_name": "company providing goods/services", - "buyer_name": "company receiving goods/services", - "invoice_number": "invoice/bill number", - "date": "date in YYYY-MM-DD format", - "amount": "total amount as number", - "quantity": "total quantity as integer", - "product_description": "description of items/services" - }} - - Document: {text[:3000]} - - Return only JSON: - """ - - response = ollama.chat( - model=self.model_name, - messages=[{'role': 'user', 'content': prompt}], - options={'temperature': 0.1, 'top_p': 0.9, 'num_predict': 300} - ) - - response_text = response['message']['content'].strip() - json_start = response_text.find('{') - json_end = response_text.rfind('}') + 1 - - if json_start != -1 and 
json_end > json_start: - json_str = response_text[json_start:json_end] - data = json.loads(json_str) - - invoice_data = InvoiceData() - invoice_data.supplier_name = str(data.get('supplier_name', '')).strip() - invoice_data.buyer_name = str(data.get('buyer_name', '')).strip() - invoice_data.invoice_number = str(data.get('invoice_number', '')).strip() - invoice_data.date = self.parse_date(str(data.get('date', ''))) - - # Parse amount - try: - amount_val = data.get('amount', 0) - if isinstance(amount_val, str): - amount_clean = re.sub(r'[^\d.]', '', amount_val) - invoice_data.amount = float(amount_clean) if amount_clean else 0.0 - else: - invoice_data.amount = float(amount_val) - except: - invoice_data.amount = 0.0 - - # Parse quantity - try: - qty_val = data.get('quantity', 0) - invoice_data.quantity = int(float(str(qty_val).replace(',', ''))) - except: - invoice_data.quantity = 0 - - invoice_data.product_description = str(data.get('product_description', '')).strip() - invoice_data.extraction_confidence = 0.9 - invoice_data.file_path = file_path - - return invoice_data - else: - raise ValueError("No valid JSON in response") - - except Exception as e: - st.error(f"AI extraction failed: {e}") - return self.extract_invoice_info_regex(text) - - def extract_invoice_info_regex(self, text: str) -> InvoiceData: - """Fallback regex extraction""" - invoice_data = InvoiceData() - - patterns = { - 'invoice_number': [ - r'invoice\s*#?\s*:?\s*([A-Z0-9\-_]+)', - r'bill\s*#?\s*:?\s*([A-Z0-9\-_]+)', - r'inv\s*#?\s*:?\s*([A-Z0-9\-_]+)' - ], - 'date': [ - r'date\s*:?\s*(\d{1,2}[/\-]\d{1,2}[/\-]\d{2,4})', - r'(\d{1,2}[/\-]\d{1,2}[/\-]\d{2,4})', - r'(\d{4}[/\-]\d{1,2}[/\-]\d{1,2})' - ], - 'amount': [ - r'total\s*:?\s*\₹?([0-9,]+\.?\d*)', - r'amount\s*:?\s*\₹?([0-9,]+\.?\d*)', - r'\₹([0-9,]+\.?\d*)' - ], - 'quantity': [ - r'qty\s*:?\s*(\d+)', - r'quantity\s*:?\s*(\d+)', - r'(\d+)\s*units?' 
- ] - } - - text_lower = text.lower() - - # Extract using patterns - for pattern in patterns['invoice_number']: - match = re.search(pattern, text_lower) - if match: - invoice_data.invoice_number = match.group(1).upper() - break - - for pattern in patterns['date']: - match = re.search(pattern, text) - if match: - invoice_data.date = self.parse_date(match.group(1)) - break - - for pattern in patterns['amount']: - match = re.search(pattern, text_lower) - if match: - try: - amount_str = match.group(1).replace(',', '') - invoice_data.amount = float(amount_str) - break - except ValueError: - continue - - for pattern in patterns['quantity']: - match = re.search(pattern, text_lower) - if match: - try: - invoice_data.quantity = int(match.group(1)) - break - except ValueError: - continue - - invoice_data.extraction_confidence = 0.6 - return invoice_data - - def parse_date(self, date_str: str) -> str: - """Parse date to YYYY-MM-DD format""" - if not date_str: - return "" - - formats = ['%Y-%m-%d', '%m/%d/%Y', '%d/%m/%Y', '%m-%d-%Y', '%d-%m-%Y', '%Y/%m/%d'] - - for fmt in formats: - try: - parsed_date = datetime.strptime(date_str, fmt) - return parsed_date.strftime('%Y-%m-%d') - except ValueError: - continue - - return date_str - - # Enhanced data storage with vector integration - def save_invoice_data(self, invoice_data: InvoiceData, raw_text: str = "", file_size: int = 0, file_type: str = ""): - """Save to database, JSON, and vector store""" - # Save to database (existing functionality) - conn = sqlite3.connect(self.db_path) - cursor = conn.cursor() - - try: - cursor.execute(''' - INSERT OR REPLACE INTO invoices - (supplier_name, buyer_name, invoice_number, date, amount, - quantity, product_description, file_path, file_name, file_size, - file_type, extraction_confidence, raw_text, updated_at) - VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, CURRENT_TIMESTAMP) - ''', ( - invoice_data.supplier_name, - invoice_data.buyer_name, - invoice_data.invoice_number, - invoice_data.date, 
- invoice_data.amount, - invoice_data.quantity, - invoice_data.product_description, - invoice_data.file_path, - Path(invoice_data.file_path).name if invoice_data.file_path else "", - file_size, - file_type, - invoice_data.extraction_confidence, - raw_text[:5000] # Store first 5000 chars of raw text - )) - - conn.commit() - db_success = True - - except sqlite3.IntegrityError as e: - st.error(f"Database error: {e}") - db_success = False - finally: - conn.close() - - # Save to JSON file - json_success = self.save_invoice_to_json(invoice_data, raw_text, file_size, file_type) - - return db_success and json_success - - def save_invoice_to_json(self, invoice_data: InvoiceData, raw_text: str = "", file_size: int = 0, file_type: str = "") -> bool: - """Save invoice data to JSON file and vector store""" - try: - # Load existing data - data = self.load_json_data() - - # Create invoice record - invoice_record = { - "id": len(data["invoices"]) + 1, - "invoice_number": invoice_data.invoice_number, - "supplier_name": invoice_data.supplier_name, - "buyer_name": invoice_data.buyer_name, - "date": invoice_data.date, - "amount": invoice_data.amount, - "quantity": invoice_data.quantity, - "product_description": invoice_data.product_description, - "file_info": { - "file_name": Path(invoice_data.file_path).name if invoice_data.file_path else "", - "file_path": invoice_data.file_path, - "file_size": file_size, - "file_type": file_type - }, - "extraction_info": { - "confidence": invoice_data.extraction_confidence, - "extraction_method": "AI" if self.use_ai else "Regex", - "raw_text_preview": raw_text[:500] if raw_text else "" - }, - "timestamps": { - "created_at": datetime.now().isoformat(), - "updated_at": datetime.now().isoformat() - } - } - - # Check for duplicates and update if exists - existing_index = None - for i, inv in enumerate(data["invoices"]): - if inv.get("invoice_number") == invoice_data.invoice_number: - existing_index = i - break - - if existing_index is not None: - # 
Update existing record - invoice_record["id"] = data["invoices"][existing_index]["id"] - invoice_record["timestamps"]["created_at"] = data["invoices"][existing_index]["timestamps"]["created_at"] - data["invoices"][existing_index] = invoice_record - else: - # Add new record - data["invoices"].append(invoice_record) - - # Update summary statistics - data = self.update_json_summary(data) - - # Save updated data - self.save_json_data(data) - - # Add to vector store - vector_success = self.vector_store.add_document(invoice_record, raw_text) - if vector_success: - self.vector_store.save_vector_store() - return True - except Exception as e: - st.error(f"Error saving to JSON: {e}") - return False - - def process_file(self, file_path: str, file_size: int = 0) -> InvoiceData: - """Process single file with enhanced logging""" - start_time = datetime.now() - file_name = Path(file_path).name - file_type = Path(file_path).suffix.lower() - - try: - text = self.extract_text_from_document(file_path) - if not text.strip(): - return InvoiceData() - - invoice_data = self.extract_invoice_info_with_ai(text, file_path) - - # Save with additional metadata - self.save_invoice_data(invoice_data, text, file_size, file_type) - - return invoice_data - - except Exception as e: - return InvoiceData() - - -# =============================================================================== -# ENHANCED CHATBOT WITH SEMANTIC SEARCH -# =============================================================================== - -class EnhancedInvoiceChatBot: - """Enhanced chat interface with hybrid search (SQL + Vector)""" - - def __init__(self, - db_path: str = "invoices.db", - json_path: str = "invoices_data.json", - model_name: str = "mistral:7b", - vector_store: InvoiceVectorStore = None): - - self.db_path = db_path - self.json_path = json_path - self.model_name = model_name - self.vector_store = vector_store - self.setup_ollama() - - def setup_ollama(self): - """Setup Ollama""" - try: - response = 
requests.get('http://localhost:11434/api/tags', timeout=5) - self.use_ai = response.status_code == 200 - except: - self.use_ai = False - - def load_json_data(self) -> dict: - """Load invoice data from JSON file""" - try: - with open(self.json_path, 'r', encoding='utf-8') as f: - return json.load(f) - except (FileNotFoundError, json.JSONDecodeError): - return {"invoices": [], "summary": {}, "metadata": {}} - - def get_invoice_data(self) -> pd.DataFrame: - """Get all invoice data from database (for compatibility)""" - conn = sqlite3.connect(self.db_path) - df = pd.read_sql_query("SELECT * FROM invoices", conn) - conn.close() - return df - - def hybrid_search(self, query: str, top_k: int = 5) -> Tuple[List[Dict], List[VectorSearchResult]]: - """Perform hybrid search combining SQL and vector search""" - # SQL search - sql_results = self.sql_search(query) - - # Vector search - vector_results = [] - if self.vector_store: - vector_results = self.vector_store.semantic_search(query, top_k) - - return sql_results, vector_results - - def sql_search(self, query: str) -> List[Dict]: - """Traditional SQL search based on keywords""" - query_lower = query.lower() - - try: - conn = sqlite3.connect(self.db_path) - - # Determine query type and build SQL - if any(phrase in query_lower for phrase in ["amount", "value", "cost", "price"]): - # Amount-based queries - sql = """ - SELECT invoice_number, supplier_name, buyer_name, amount, date, 'amount_search' as search_type - FROM invoices - WHERE amount IS NOT NULL AND amount > 0 - ORDER BY amount DESC - LIMIT 10 - """ - elif any(phrase in query_lower for phrase in ["supplier", "vendor", "company"]): - # Supplier-based queries - sql = """ - SELECT invoice_number, supplier_name, buyer_name, amount, date, 'supplier_search' as search_type - FROM invoices - WHERE supplier_name IS NOT NULL AND supplier_name != '' - ORDER BY supplier_name - LIMIT 10 - """ - elif any(phrase in query_lower for phrase in ["recent", "latest", "new"]): - # Recent 
invoices - sql = """ - SELECT invoice_number, supplier_name, buyer_name, amount, date, 'recent_search' as search_type - FROM invoices - ORDER BY created_at DESC - LIMIT 10 - """ - else: - # General search - sql = """ - SELECT invoice_number, supplier_name, buyer_name, amount, date, 'general_search' as search_type - FROM invoices - ORDER BY created_at DESC - LIMIT 10 - """ - - df = pd.read_sql_query(sql, conn) - conn.close() - - return df.to_dict('records') - - except Exception as e: - st.error(f"SQL search error: {e}") - return [] - - def query_database(self, query: str) -> str: - """Enhanced query processing with hybrid search""" - json_data = self.load_json_data() - - if not json_data.get("invoices"): - return "No invoice data found. Please process some invoices first." - - query_lower = query.lower() - - try: - # Handle basic queries with JSON data (fast responses) - if any(phrase in query_lower for phrase in ["summary", "overview", "report", "all invoices"]): - return self.generate_json_summary(json_data) - - elif "how many" in query_lower and "invoice" in query_lower: - return self.handle_invoice_count_query(json_data) - - elif "total amount" in query_lower or "total value" in query_lower: - return self.handle_amount_query(json_data) - - # For complex queries, use hybrid search - else: - return self.hybrid_search_response(query, json_data) - - except Exception as e: - return f"Error processing query: {e}" - - def hybrid_search_response(self, query: str, json_data: dict) -> str: - """Generate response using hybrid search results""" - # Perform hybrid search - sql_results, vector_results = self.hybrid_search(query) - - response_parts = [] - - # Add vector search results if available - if vector_results: - response_parts.append("šŸ” **Semantic Search Results:**") - for i, result in enumerate(vector_results[:3], 1): - response_parts.append(f"{i}. 
**{result.invoice_number}** - {result.supplier_name}") - response_parts.append(f" Similarity: {result.similarity_score:.3f}") - response_parts.append(f" Preview: {result.content_preview[:100]}...") - response_parts.append("") - - # Add SQL search results - if sql_results: - response_parts.append("šŸ“Š **Database Search Results:**") - for i, result in enumerate(sql_results[:3], 1): - invoice_num = result.get('invoice_number', 'N/A') - supplier = result.get('supplier_name', 'N/A') - amount = result.get('amount', 0) - date = result.get('date', 'N/A') - - response_parts.append(f"{i}. **{invoice_num}** - {supplier}") - response_parts.append(f" Amount: ₹{amount:.2f} | Date: {date}") - response_parts.append("") - - # Use AI for intelligent synthesis if available - if self.use_ai and (vector_results or sql_results): - ai_summary = self.ai_synthesize_results(query, vector_results, sql_results, json_data) - if ai_summary: - response_parts.insert(0, "šŸ¤– **AI Analysis:**") - response_parts.insert(1, ai_summary) - response_parts.insert(2, "") - - if not response_parts: - return "No relevant results found for your query. Try rephrasing or being more specific." 
- - return "\n".join(response_parts) - - def ai_synthesize_results(self, query: str, vector_results: List[VectorSearchResult], - sql_results: List[Dict], json_data: dict) -> str: - """Use AI to synthesize search results into intelligent response""" - if not self.use_ai: - return "" - - try: - # Prepare context for AI - context_parts = [] - - # Add vector search context - if vector_results: - context_parts.append("SEMANTIC SEARCH RESULTS:") - for result in vector_results[:3]: - context_parts.append(f"- Invoice {result.invoice_number}: {result.supplier_name} (Similarity: {result.similarity_score:.3f})") - - # Add SQL search context - if sql_results: - context_parts.append("\nDATABASE SEARCH RESULTS:") - for result in sql_results[:3]: - context_parts.append(f"- Invoice {result.get('invoice_number', 'N/A')}: {result.get('supplier_name', 'N/A')} (₹{result.get('amount', 0):.2f})") - - # Add summary statistics - summary = json_data.get("summary", {}) - context_parts.append(f"\nDATABASE SUMMARY:") - context_parts.append(f"- Total invoices: {len(json_data.get('invoices', []))}") - context_parts.append(f"- Total amount: ₹{summary.get('total_amount', 0):,.2f}") - context_parts.append(f"- Unique suppliers: {len(summary.get('unique_suppliers', []))}") - - context = "\n".join(context_parts) - - prompt = f""" -You are an AI assistant analyzing invoice data. Based on the search results and database summary, provide a helpful and insightful answer to the user's question. - -USER QUESTION: {query} - -SEARCH RESULTS AND CONTEXT: -{context} - -Provide a concise, informative response that: -1. Directly answers the user's question -2. Highlights the most relevant findings -3. Provides useful insights from the data -4. 
Uses clear formatting with markdown - -Response: -""" - - response = ollama.chat( - model=self.model_name, - messages=[{'role': 'user', 'content': prompt}], - options={'temperature': 0.3, 'num_predict': 400} - ) - - return response['message']['content'] - - except Exception as e: - st.error(f"AI synthesis error: {e}") - return "" - - # Copy existing helper methods from original chatbot - def handle_invoice_count_query(self, json_data: dict) -> str: - """Handle invoice counting queries""" - total_invoices = len(json_data["invoices"]) - unique_invoices = len(set(inv.get("invoice_number", "") for inv in json_data["invoices"] if inv.get("invoice_number"))) - - result = f"**Invoice Numbers Summary:**\n" - result += f"• Total unique invoice numbers: **{unique_invoices}**\n" - result += f"• Total records in database: **{total_invoices}**\n\n" - - if unique_invoices < total_invoices: - result += f"āš ļø Note: {total_invoices - unique_invoices} duplicate records found\n\n" - - result += "**Recent Invoice Numbers:**\n" - recent_invoices = sorted(json_data["invoices"], key=lambda x: x.get("timestamps", {}).get("created_at", ""), reverse=True)[:10] - for i, inv in enumerate(recent_invoices, 1): - if inv.get("invoice_number"): - result += f"{i}. 
{inv['invoice_number']}\n" - - return result - - def handle_amount_query(self, json_data: dict) -> str: - """Handle amount-related queries""" - total_amount = json_data.get("summary", {}).get("total_amount", 0) - avg_amount = total_amount / len(json_data["invoices"]) if json_data["invoices"] else 0 - return f"**Financial Summary:**\n• Total amount: **₹{total_amount:,.2f}**\n• Average per invoice: **₹{avg_amount:,.2f}**" - - def generate_json_summary(self, json_data: dict) -> str: - """Generate comprehensive summary using JSON data""" - try: - metadata = json_data.get("metadata", {}) - summary = json_data.get("summary", {}) - invoices = json_data.get("invoices", []) - - # Build comprehensive summary - report = "# šŸ“Š **COMPREHENSIVE INVOICE SUMMARY REPORT**\n\n" - - report += "## šŸ”¢ **Overall Statistics**\n" - report += f"• **Total Records**: {len(invoices):,}\n" - report += f"• **Unique Invoice Numbers**: {len(set(inv.get('invoice_number', '') for inv in invoices if inv.get('invoice_number'))):,}\n" - report += f"• **Unique Suppliers**: {len(summary.get('unique_suppliers', [])):,}\n" - report += f"• **Total Invoice Value**: ₹{summary.get('total_amount', 0):,.2f}\n" - report += f"• **Average Invoice Amount**: ₹{(summary.get('total_amount', 0) / len(invoices) if invoices else 0):,.2f}\n" - report += f"• **Total Quantity**: {summary.get('total_quantity', 0):,}\n\n" - - # Vector store statistics if available - if self.vector_store: - vector_stats = self.vector_store.get_stats() - report += "## šŸ” **Semantic Search Capabilities**\n" - report += f"• **Vector Store**: {vector_stats['total_documents']} documents indexed\n" - report += f"• **Embedding Model**: {vector_stats['model_name']}\n" - report += f"• **Embedding Dimension**: {vector_stats['embedding_dimension']}\n\n" - - report += "## šŸ“… **Processing Timeline**\n" - if metadata.get("created_at"): - report += f"• **First Processing Session**: {metadata['created_at'][:19]}\n" - report += f"• **Last Updated**: 
{metadata['last_updated'][:19]}\n\n" - - # Top suppliers analysis - if summary.get("unique_suppliers"): - supplier_counts = {} - supplier_amounts = {} - for inv in invoices: - supplier = inv.get("supplier_name", "") - if supplier: - supplier_counts[supplier] = supplier_counts.get(supplier, 0) + 1 - supplier_amounts[supplier] = supplier_amounts.get(supplier, 0) + inv.get("amount", 0) - - top_suppliers = sorted(supplier_counts.items(), key=lambda x: x[1], reverse=True)[:5] - report += "## šŸ¢ **Top 5 Suppliers by Volume**\n" - for supplier, count in top_suppliers: - amount = supplier_amounts.get(supplier, 0) - report += f"• **{supplier}**: {count} invoices (₹{amount:,.2f})\n" - report += "\n" - - # Processing statistics - processing_stats = summary.get("processing_stats", {}) - if processing_stats: - report += "## āš™ļø **Processing Statistics**\n" - report += f"• āœ… **Successful**: {processing_stats.get('successful', 0)} files\n" - report += f"• āŒ **Failed**: {processing_stats.get('failed', 0)} files\n" - report += f"• šŸ“„ **Total Processed**: {processing_stats.get('total_processed', 0)} files\n" - - return report - - except Exception as e: - return f"Error generating summary: {e}" - - -# =============================================================================== -# ENHANCED VISUALIZATION FUNCTIONS -# =============================================================================== - -def create_enhanced_visualizations(df: pd.DataFrame, vector_store: InvoiceVectorStore = None): - """Create enhanced dashboard visualizations with vector store insights""" - if df.empty: - st.info("No data available for visualization") - return - - # Original visualizations - col1, col2 = st.columns(2) - - with col1: - # Suppliers chart - if not df['supplier_name'].isna().all(): - supplier_counts = df['supplier_name'].value_counts().head(10) - fig_suppliers = px.bar( - x=supplier_counts.values, - y=supplier_counts.index, - orientation='h', - title="Top Suppliers by Delivery 
Count", - labels={'x': 'Number of Deliveries', 'y': 'Supplier'} - ) - fig_suppliers.update_layout(height=400) - st.plotly_chart(fig_suppliers, use_container_width=True) - - with col2: - # Amount by supplier - if not df['supplier_name'].isna().all() and not df['amount'].isna().all(): - supplier_amounts = df.groupby('supplier_name')['amount'].sum().sort_values(ascending=False).head(10) - fig_amounts = px.bar( - x=supplier_amounts.values, - y=supplier_amounts.index, - orientation='h', - title="Top Suppliers by Total Amount", - labels={'x': 'Total Amount (₹)', 'y': 'Supplier'} - ) - fig_amounts.update_layout(height=400) - st.plotly_chart(fig_amounts, use_container_width=True) - - # Vector store insights - if vector_store: - st.subheader("šŸ” Semantic Search Analytics") - - col1, col2 = st.columns(2) - - with col1: - # Vector store statistics - vector_stats = vector_store.get_stats() - - st.metric("Documents in Vector Store", vector_stats['total_documents']) - st.metric("Embedding Dimension", vector_stats['embedding_dimension']) - st.write(f"**Model**: {vector_stats['model_name']}") - - with col2: - # Test semantic search - st.write("**Test Semantic Search:**") - test_query = st.text_input("Enter search query:", placeholder="high value invoices") - - if test_query and st.button("šŸ” Search"): - results = vector_store.semantic_search(test_query, top_k=3) - - if results: - for i, result in enumerate(results, 1): - with st.expander(f"Result {i}: {result.invoice_number} (Score: {result.similarity_score:.3f})"): - st.write(f"**Supplier**: {result.supplier_name}") - st.write(f"**Content**: {result.content_preview}") - else: - st.info("No results found") - - -# =============================================================================== -# ENHANCED STREAMLIT APPLICATION -# =============================================================================== - -def enhanced_main(): - """Enhanced main Streamlit application with vector capabilities""" - - # Header - st.title("šŸ“„ 
Enhanced Invoice Processing & Analysis System") - st.markdown("Upload invoices, extract data with AI, and perform semantic search on your data!") - - # Initialize session state - if 'enhanced_processor' not in st.session_state: - st.session_state.enhanced_processor = EnhancedInvoiceProcessor() - - if 'enhanced_chatbot' not in st.session_state: - st.session_state.enhanced_chatbot = EnhancedInvoiceChatBot( - vector_store=st.session_state.enhanced_processor.vector_store - ) - - if 'chat_history' not in st.session_state: - st.session_state.chat_history = [] - - # ------------------------------------------------------------------------- - # ENHANCED SIDEBAR - # ------------------------------------------------------------------------- - - with st.sidebar: - st.header("šŸ› ļø Enhanced System Status") - - # Original system checks - if hasattr(st.session_state.enhanced_processor, 'docling_available') and st.session_state.enhanced_processor.docling_available: - st.success("āœ… Docling Ready") - else: - st.error("āŒ Docling Not Available") - - if hasattr(st.session_state.enhanced_processor, 'use_ai') and st.session_state.enhanced_processor.use_ai: - st.success("āœ… AI Processing Ready") - else: - st.warning("āš ļø Using Regex Fallback") - - # Vector store status - vector_store = st.session_state.enhanced_processor.vector_store - if vector_store and vector_store.embedding_model: - st.success("āœ… Vector Search Ready") - vector_stats = vector_store.get_stats() - st.metric("Indexed Documents", vector_stats['total_documents']) - else: - st.error("āŒ Vector Search Not Available") - - # JSON file status - json_path = st.session_state.enhanced_processor.json_path - if os.path.exists(json_path): - file_size = os.path.getsize(json_path) / 1024 # KB - st.success(f"āœ… JSON Data Ready ({file_size:.1f} KB)") - else: - st.info("šŸ“„ JSON file will be created") - - st.header("šŸ“Š Enhanced Quick Stats") - - # Load stats from JSON - try: - json_data = 
st.session_state.enhanced_chatbot.load_json_data() - total_invoices = len(json_data.get("invoices", [])) - total_amount = json_data.get("summary", {}).get("total_amount", 0) - unique_suppliers = len(json_data.get("summary", {}).get("unique_suppliers", [])) - - if total_invoices > 0: - st.metric("Total Invoices", total_invoices) - st.metric("Total Amount", f"₹{total_amount:,.2f}") - st.metric("Unique Suppliers", unique_suppliers) - else: - st.info("No data yet - upload some invoices!") - except Exception as e: - st.error(f"Error loading stats: {e}") - - # Vector store management - st.header("šŸ” Vector Store Management") - - if st.button("šŸ”„ Rebuild Vector Store", use_container_width=True, key="rebuild_vector_sidebar"): - try: - json_data = st.session_state.enhanced_chatbot.load_json_data() - with st.spinner("Rebuilding vector store..."): - success = vector_store.rebuild_vector_store(json_data) - if success: - st.success("āœ… Vector store rebuilt!") - st.rerun() - except Exception as e: - st.error(f"Error rebuilding vector store: {e}") - - if st.button("šŸ’¾ Save Vector Store", use_container_width=True, key="save_vector_store_sidebar"): - if vector_store.save_vector_store(): - st.success("āœ… Vector store saved!") - else: - st.error("āŒ Failed to save vector store") - - # ------------------------------------------------------------------------- - # ENHANCED MAIN TABS - # ------------------------------------------------------------------------- - - tab1, tab2, tab3, tab4, tab5, tab6 = st.tabs([ - "šŸ“¤ Upload & Process", - "šŸ’¬ Enhanced Chat", - "šŸ“Š Enhanced Dashboard", - "šŸ“‹ Data View", - "šŸ”§ Vector Manager", - "šŸ” Semantic Search" - ]) - - # ------------------------------------------------------------------------- - # TAB 1: UPLOAD & PROCESS (Enhanced) - # ------------------------------------------------------------------------- - - with tab1: - st.header("Upload Invoice Documents") - st.info("šŸš€ Now with automatic vector indexing for semantic 
search!") - - uploaded_files = st.file_uploader( - "Drop invoice files here", - type=['pdf', 'jpg', 'jpeg', 'png', 'docx', 'txt'], - accept_multiple_files=True, - help="Supported formats: PDF, Images (JPG, PNG), Word documents, Text files" - ) - - if uploaded_files: - if st.button("šŸš€ Process All Files with Vector Indexing", type="primary", key="process_all_files"): - progress_bar = st.progress(0) - status_text = st.empty() - results_container = st.container() - - processed_count = 0 - total_files = len(uploaded_files) - - for i, uploaded_file in enumerate(uploaded_files): - # Save uploaded file temporarily - with tempfile.NamedTemporaryFile(delete=False, suffix=f".{uploaded_file.name.split('.')[-1]}") as tmp_file: - file_content = uploaded_file.getvalue() - tmp_file.write(file_content) - tmp_file_path = tmp_file.name - file_size = len(file_content) - - try: - status_text.text(f"Processing: {uploaded_file.name}") - - # Process the file - invoice_data = st.session_state.enhanced_processor.process_file(tmp_file_path, file_size) - - if invoice_data.invoice_number: - processed_count += 1 - - with results_container: - with st.expander(f"āœ… {uploaded_file.name}", expanded=False): - col1, col2 = st.columns(2) - with col1: - st.write(f"**Invoice #:** {invoice_data.invoice_number}") - st.write(f"**Supplier:** {invoice_data.supplier_name}") - st.write(f"**Buyer:** {invoice_data.buyer_name}") - with col2: - st.write(f"**Date:** {invoice_data.date}") - st.write(f"**Amount:** ₹{invoice_data.amount:.2f}") - st.write(f"**Quantity:** {invoice_data.quantity}") - st.write(f"**Description:** {invoice_data.product_description}") - st.write(f"**Confidence:** {invoice_data.extraction_confidence:.1%}") - st.success("šŸ” Added to vector store for semantic search") - else: - with results_container: - st.warning(f"āš ļø Limited data extracted from {uploaded_file.name}") - - except Exception as e: - with results_container: - st.error(f"āŒ Error processing {uploaded_file.name}: {e}") - 
- finally: - # Clean up temp file - os.unlink(tmp_file_path) - - # Update progress - progress_bar.progress((i + 1) / total_files) - - status_text.text(f"āœ… Processing complete! Successfully processed {processed_count}/{total_files} files") - - # Show vector store update confirmation - if processed_count > 0: - st.success(f"šŸ“„ JSON data and vector store updated with {processed_count} new invoices!") - # Save vector store - vector_store.save_vector_store() - - # ------------------------------------------------------------------------- - # TAB 2: ENHANCED CHAT - # ------------------------------------------------------------------------- - - with tab2: - st.header("šŸ’¬ Enhanced Chat with Semantic Search") - st.info("šŸ’” Now powered by hybrid search: SQL database + Vector similarity!") - - # Chat input - user_query = st.chat_input("Ask about your invoices using natural language...") - - if user_query: - # Add user message to history - st.session_state.chat_history.append({"role": "user", "content": user_query}) - - # Get enhanced bot response - with st.spinner("Analyzing with AI and semantic search..."): - bot_response = st.session_state.enhanced_chatbot.query_database(user_query) - - # Add bot response to history - st.session_state.chat_history.append({"role": "assistant", "content": bot_response}) - - # Display chat history - for message in st.session_state.chat_history: - with st.chat_message(message["role"]): - st.markdown(message["content"]) - - # Enhanced suggested queries - if not st.session_state.chat_history: - st.subheader("šŸ’” Try these enhanced AI questions:") - - col1, col2, col3 = st.columns(3) - - with col1: - st.markdown("**šŸ“Š Basic Queries:**") - basic_queries = [ - "Get comprehensive summary", - "How many invoices do we have?", - "What's the total amount?", - "List all suppliers" - ] - for i, query in enumerate(basic_queries): - if st.button(query, key=f"basic_query_{i}"): - st.session_state.chat_history.append({"role": "user", "content": 
query}) - bot_response = st.session_state.enhanced_chatbot.query_database(query) - st.session_state.chat_history.append({"role": "assistant", "content": bot_response}) - st.rerun() - - with col2: - st.markdown("**šŸ” Semantic Queries:**") - semantic_queries = [ - "Find high value transactions", - "Show me technology related invoices", - "Find invoices with office supplies", - "Search for consulting services" - ] - for i, query in enumerate(semantic_queries): - if st.button(query, key=f"semantic_query_{i}"): - st.session_state.chat_history.append({"role": "user", "content": query}) - bot_response = st.session_state.enhanced_chatbot.query_database(query) - st.session_state.chat_history.append({"role": "assistant", "content": bot_response}) - st.rerun() - - with col3: - st.markdown("**šŸ¤– AI Analysis:**") - ai_queries = [ - "Analyze spending patterns", - "Identify potential cost savings", - "Compare supplier performance", - "Find unusual invoice patterns"] - for i, query in enumerate(ai_queries): - if st.button(query, key=f"ai_query_{i}"): - st.session_state.chat_history.append({"role": "user", "content": query}) - bot_response = st.session_state.enhanced_chatbot.query_database(query) - st.session_state.chat_history.append({"role": "assistant", "content": bot_response}) - st.rerun() - - # ------------------------------------------------------------------------- - # TAB 3: ENHANCED DASHBOARD - # ------------------------------------------------------------------------- - - with tab3: - st.header("šŸ“Š Enhanced Analytics Dashboard") - - # Load data from JSON for faster processing - json_data = st.session_state.enhanced_chatbot.load_json_data() - invoices = json_data.get("invoices", []) - - if invoices: - # Convert JSON data to DataFrame for visualizations - df_data = [] - for inv in invoices: - df_data.append({ - 'invoice_number': inv.get('invoice_number', ''), - 'supplier_name': inv.get('supplier_name', ''), - 'buyer_name': inv.get('buyer_name', ''), - 'amount': 
inv.get('amount', 0), - 'quantity': inv.get('quantity', 0), - 'date': inv.get('date', ''), - 'extraction_confidence': inv.get('extraction_info', {}).get('confidence', 0), - 'created_at': inv.get('timestamps', {}).get('created_at', '') - }) - - df = pd.DataFrame(df_data) - - # Enhanced key metrics row - col1, col2, col3, col4, col5 = st.columns(5) - - with col1: - total_invoices = len(df) - unique_invoices = df['invoice_number'].nunique() - st.metric("Total Records", total_invoices, help="Total number of processed invoices") - st.metric("Unique Invoice Numbers", unique_invoices) - - with col2: - total_amount = df['amount'].sum() - avg_amount = df['amount'].mean() - st.metric("Total Amount", f"₹{total_amount:,.2f}") - st.metric("Average Amount", f"₹{avg_amount:,.2f}") - - with col3: - unique_suppliers = df['supplier_name'].nunique() - unique_buyers = df['buyer_name'].nunique() - st.metric("Unique Suppliers", unique_suppliers) - st.metric("Unique Buyers", unique_buyers) - - with col4: - total_quantity = df['quantity'].sum() - avg_confidence = df['extraction_confidence'].mean() - st.metric("Total Quantity", f"{total_quantity:,}") - st.metric("Avg Confidence", f"{avg_confidence:.1%}") - - with col5: - # Vector store metrics - vector_store = st.session_state.enhanced_processor.vector_store - if vector_store: - vector_stats = vector_store.get_stats() - st.metric("Vector Documents", vector_stats['total_documents']) - st.metric("Embedding Dim", vector_stats['embedding_dimension']) - else: - st.metric("Vector Store", "Not Available") - st.metric("Semantic Search", "Disabled") - - # Create enhanced visualizations - create_enhanced_visualizations(df, vector_store) - - else: - st.info("No data available for visualization. 
Upload and process some invoices first!") - - # ------------------------------------------------------------------------- - # TAB 4: DATA VIEW (Enhanced) - # ------------------------------------------------------------------------- - - with tab4: - st.header("šŸ“‹ Enhanced Invoice Data View") - - # Load from JSON for enhanced data view - json_data = st.session_state.enhanced_chatbot.load_json_data() - invoices = json_data.get("invoices", []) - - if invoices: - # Convert to DataFrame - df_data = [] - for inv in invoices: - df_data.append({ - 'ID': inv.get('id', ''), - 'Invoice Number': inv.get('invoice_number', ''), - 'Supplier': inv.get('supplier_name', ''), - 'Buyer': inv.get('buyer_name', ''), - 'Date': inv.get('date', ''), - 'Amount': inv.get('amount', 0), - 'Quantity': inv.get('quantity', 0), - 'Description': inv.get('product_description', ''), - 'Confidence': inv.get('extraction_info', {}).get('confidence', 0), - 'Method': inv.get('extraction_info', {}).get('extraction_method', ''), - 'File Type': inv.get('file_info', {}).get('file_type', ''), - 'Vector Indexed': 'Yes' if vector_store and any(meta.get('invoice_number') == inv.get('invoice_number') for meta in vector_store.document_metadata) else 'No', - 'Created': inv.get('timestamps', {}).get('created_at', '')[:19] if inv.get('timestamps', {}).get('created_at') else '' - }) - - df = pd.DataFrame(df_data) - - # Enhanced filters - col1, col2, col3, col4, col5 = st.columns(5) - - with col1: - suppliers = ['All'] + list(df['Supplier'].dropna().unique()) - selected_supplier = st.selectbox("Filter by Supplier", suppliers) - - with col2: - buyers = ['All'] + list(df['Buyer'].dropna().unique()) - selected_buyer = st.selectbox("Filter by Buyer", buyers) - - with col3: - methods = ['All'] + list(df['Method'].dropna().unique()) - selected_method = st.selectbox("Filter by Extraction Method", methods) - - with col4: - confidence_filter = st.selectbox("Confidence Filter", - ["All", "High (>80%)", "Medium (50-80%)", "Low 
(<50%)"]) - - with col5: - vector_filter = st.selectbox("Vector Indexed", ["All", "Yes", "No"]) - - # Apply filters - filtered_df = df.copy() - if selected_supplier != 'All': - filtered_df = filtered_df[filtered_df['Supplier'] == selected_supplier] - if selected_buyer != 'All': - filtered_df = filtered_df[filtered_df['Buyer'] == selected_buyer] - if selected_method != 'All': - filtered_df = filtered_df[filtered_df['Method'] == selected_method] - if vector_filter != 'All': - filtered_df = filtered_df[filtered_df['Vector Indexed'] == vector_filter] - - if confidence_filter == "High (>80%)": - filtered_df = filtered_df[filtered_df['Confidence'] > 0.8] - elif confidence_filter == "Medium (50-80%)": - filtered_df = filtered_df[(filtered_df['Confidence'] >= 0.5) & (filtered_df['Confidence'] <= 0.8)] - elif confidence_filter == "Low (<50%)": - filtered_df = filtered_df[filtered_df['Confidence'] < 0.5] - - # Display summary - if len(filtered_df) != len(df): - st.info(f"Showing {len(filtered_df)} of {len(df)} records") - - # Display data with enhanced columns - st.dataframe( - filtered_df, - use_container_width=True, - column_config={ - "Amount": st.column_config.NumberColumn("Amount", format="₹%.2f"), - "Confidence": st.column_config.ProgressColumn("Confidence", min_value=0, max_value=1), - "Vector Indexed": st.column_config.SelectboxColumn("Vector Indexed", options=["Yes", "No"]), - } - ) - - # Enhanced export options - col1, col2, col3, col4 = st.columns(4) - - with col1: - csv = filtered_df.to_csv(index=False) - st.download_button( - label="šŸ“„ Download CSV", - data=csv, - file_name=f"filtered_invoices_{datetime.now().strftime('%Y%m%d_%H%M')}.csv", - mime="text/csv" - ) - - with col2: - # Export filtered JSON data - filtered_invoices = [inv for inv in invoices if inv.get('id') in filtered_df['ID'].values] - filtered_json = { - "metadata": json_data.get("metadata", {}), - "invoices": filtered_invoices, - "filter_applied": { - "supplier": selected_supplier if 
selected_supplier != 'All' else None, - "buyer": selected_buyer if selected_buyer != 'All' else None, - "method": selected_method if selected_method != 'All' else None, - "confidence": confidence_filter if confidence_filter != 'All' else None, - "vector_indexed": vector_filter if vector_filter != 'All' else None - } - } - - st.download_button( - label="šŸ“„ Download JSON", - data=json.dumps(filtered_json, indent=2, ensure_ascii=False), - file_name=f"filtered_invoices_{datetime.now().strftime('%Y%m%d_%H%M')}.json", - mime="application/json" - ) - - with col3: - # Export vector embeddings metadata - if vector_store and vector_store.document_metadata: - vector_metadata = [] - for meta in vector_store.document_metadata: - if any(inv.get('invoice_number') == meta.get('invoice_number') for inv in filtered_invoices): - vector_metadata.append(meta) - - st.download_button( - label="šŸ” Download Vector Metadata", - data=json.dumps(vector_metadata, indent=2, ensure_ascii=False), - file_name=f"vector_metadata_{datetime.now().strftime('%Y%m%d_%H%M')}.json", - mime="application/json" - ) - - with col4: - if st.button("šŸ—‘ļø Clear All Data", type="secondary"): - if st.button("āš ļø Confirm Delete", type="secondary"): - # Clear database - conn = sqlite3.connect(st.session_state.enhanced_processor.db_path) - cursor = conn.cursor() - cursor.execute("DELETE FROM invoices") - cursor.execute("DELETE FROM file_processing_log") - cursor.execute("DELETE FROM processing_summary") - conn.commit() - conn.close() - - # Reset JSON file - st.session_state.enhanced_processor.setup_json_storage() - - # Clear vector store - if vector_store: - vector_store.vector_store = faiss.IndexFlatIP(vector_store.embedding_dimension) - vector_store.document_metadata = [] - vector_store.save_vector_store() - - st.success("All data cleared!") - st.rerun() - else: - st.info("No invoice data available. 
Upload and process some invoices first!") - - # ------------------------------------------------------------------------- - # TAB 5: VECTOR MANAGER - # ------------------------------------------------------------------------- - - with tab5: - st.header("šŸ” Vector Store Manager") - st.info("Manage your semantic search capabilities and vector embeddings") - - vector_store = st.session_state.enhanced_processor.vector_store - - if vector_store: - # Vector store information - col1, col2 = st.columns(2) - - with col1: - st.subheader("šŸ“Š Vector Store Info") - vector_stats = vector_store.get_stats() - st.write(f"**Total Documents:** {vector_stats['total_documents']}") - st.write(f"**Embedding Model:** {vector_stats['model_name']}") - st.write(f"**Embedding Dimension:** {vector_stats['embedding_dimension']}") - st.write(f"**Vector Store Size:** {vector_stats['vector_store_size']}") - - # File status - if os.path.exists(vector_store.vector_store_path): - file_size = os.path.getsize(vector_store.vector_store_path) / 1024 # KB - st.write(f"**Vector File Size:** {file_size:.2f} KB") - else: - st.write("**Vector File:** Not saved yet") - - with col2: - st.subheader("šŸ› ļø Management Tools") - - if st.button("šŸ”„ Rebuild Vector Store", use_container_width=True, key="rebuild_vector_admin"): - json_data = st.session_state.enhanced_chatbot.load_json_data() - with st.spinner("Rebuilding vector store..."): - success = vector_store.rebuild_vector_store(json_data) - if success: - st.success("āœ… Vector store rebuilt successfully!") - st.rerun() - - if st.button("šŸ’¾ Save Vector Store", use_container_width=True, key="save_vector_sidebar"): - if vector_store.save_vector_store(): - st.success("āœ… Vector store saved to disk!") - else: - st.error("āŒ Failed to save vector store") - - if st.button("šŸ“Š Validate Vector Store", use_container_width=True): - try: - # Validation checks - issues = [] - - if not vector_store.embedding_model: - issues.append("āŒ Embedding model not 
loaded") - - if not vector_store.vector_store: - issues.append("āŒ FAISS index not initialized") - - if len(vector_store.document_metadata) == 0: - issues.append("āš ļø No documents in vector store") - - if vector_store.vector_store and vector_store.vector_store.ntotal != len(vector_store.document_metadata): - issues.append("āš ļø Mismatch between vectors and metadata") - - if not issues: - st.success("āœ… Vector store validation passed!") - else: - for issue in issues: - st.warning(issue) - - except Exception as e: - st.error(f"Validation error: {e}") - - # Document metadata viewer - st.subheader("šŸ“‹ Document Metadata") - - if vector_store.document_metadata: - # Create DataFrame from metadata - metadata_df = pd.DataFrame(vector_store.document_metadata) - - # Display metadata - st.dataframe( - metadata_df, - use_container_width=True, - column_config={ - "extraction_confidence": st.column_config.ProgressColumn("Confidence", min_value=0, max_value=1), - "amount": st.column_config.NumberColumn("Amount", format="₹%.2f"), - } - ) - - # Export metadata - if st.button("šŸ“„ Export Metadata CSV"): - csv_data = metadata_df.to_csv(index=False) - st.download_button( - label="šŸ’¾ Download Metadata", - data=csv_data, - file_name=f"vector_metadata_{datetime.now().strftime('%Y%m%d_%H%M')}.csv", - mime="text/csv" - ) - else: - st.info("No documents in vector store yet.") - - # Vector store configuration - st.subheader("āš™ļø Configuration") - - col1, col2 = st.columns(2) - - with col1: - st.write("**Current Settings:**") - st.code(f""" -Embedding Model: {vector_store.embedding_model_name} -Vector Store Path: {vector_store.vector_store_path} -Metadata Path: {vector_store.metadata_path} -Embedding Dimension: {vector_store.embedding_dimension} - """) - - with col2: - st.write("**Change Embedding Model:**") - new_model = st.selectbox( - "Select new model:", - [ - "all-MiniLM-L6-v2", - "all-mpnet-base-v2", - "multi-qa-mpnet-base-dot-v1", - "all-distilroberta-v1" - ], - index=0 - 
) - - if st.button("šŸ”„ Switch Model"): - if new_model != vector_store.embedding_model_name: - with st.spinner(f"Loading {new_model}..."): - try: - # Create new vector store with different model - new_vector_store = InvoiceVectorStore(embedding_model=new_model) - - # Rebuild with new embeddings - json_data = st.session_state.enhanced_chatbot.load_json_data() - if new_vector_store.rebuild_vector_store(json_data): - st.session_state.enhanced_processor.vector_store = new_vector_store - st.session_state.enhanced_chatbot.vector_store = new_vector_store - st.success(f"āœ… Switched to {new_model}!") - st.rerun() - else: - st.error("Failed to rebuild with new model") - except Exception as e: - st.error(f"Error switching model: {e}") - else: - st.error("Vector store not available. Please check the system status.") - - # ------------------------------------------------------------------------- - # TAB 6: SEMANTIC SEARCH - # ------------------------------------------------------------------------- - - with tab6: - st.header("šŸ” Semantic Search Interface") - st.info("Search your invoices using natural language and semantic similarity") - - vector_store = st.session_state.enhanced_processor.vector_store - - if vector_store and vector_store.document_metadata: - # Search interface - col1, col2 = st.columns([3, 1]) - - with col1: - search_query = st.text_input( - "Enter your search query:", - placeholder="e.g., high value technology purchases, office supplies, consulting services", - help="Use natural language to describe what you're looking for" - ) - - with col2: - top_k = st.number_input("Number of results:", min_value=1, max_value=20, value=5) - - if search_query: - with st.spinner("Performing semantic search..."): - results = vector_store.semantic_search(search_query, top_k) - - if results: - st.subheader(f"šŸŽÆ Found {len(results)} similar documents:") - - for i, result in enumerate(results, 1): - with st.expander( - f"{i}. 
Invoice {result.invoice_number} - {result.supplier_name} " - f"(Similarity: {result.similarity_score:.3f})", - expanded=i <= 3 - ): - col1, col2 = st.columns(2) - - with col1: - st.write(f"**Invoice Number:** {result.invoice_number}") - st.write(f"**Supplier:** {result.supplier_name}") - st.write(f"**Amount:** ₹{result.metadata.get('amount', 0):,.2f}") - st.write(f"**Date:** {result.metadata.get('date', 'N/A')}") - - with col2: - st.write(f"**Similarity Score:** {result.similarity_score:.4f}") - st.write(f"**File:** {result.metadata.get('file_name', 'N/A')}") - st.write(f"**Confidence:** {result.metadata.get('extraction_confidence', 0):.1%}") - st.write(f"**Timestamp:** {result.metadata.get('timestamp', 'N/A')[:19]}") - - st.write("**Content Preview:**") - st.text_area( - "Document content:", - value=result.content_preview, - height=100, - key=f"content_{i}", - disabled=True - ) - else: - st.warning("No similar documents found. Try rephrasing your query.") - - # Search suggestions and examples - st.subheader("šŸ’” Search Examples") - - col1, col2, col3 = st.columns(3) - - with col1: - st.markdown("**By Product/Service:**") - product_queries = [ - "office supplies and stationery", - "technology equipment purchases", - "consulting and professional services", - "software licenses and subscriptions" - ] - for i, query in enumerate(product_queries): - if st.button(query, key=f"product_query_{i}"): - st.rerun() - - with col2: - st.markdown("**By Amount/Value:**") - amount_queries = [ - "high value transactions above 50000", - "small purchases under 5000", - "medium range invoices", - "expensive equipment purchases" - ] - for i, query in enumerate(amount_queries): - if st.button(query, key=f"amount_query_{i}"): - st.rerun() - - with col3: - st.markdown("**By Pattern/Type:**") - pattern_queries = [ - "recurring monthly services", - "one-time large purchases", - "maintenance and support", - "travel and expenses" - ] - for i, query in enumerate(pattern_queries): - if 
st.button(query, key=f"pattern_query_{i}"): - st.rerun() - - # Advanced search options - with st.expander("šŸ”§ Advanced Search Options"): - col1, col2 = st.columns(2) - - with col1: - similarity_threshold = st.slider( - "Similarity Threshold:", - min_value=0.0, - max_value=1.0, - value=0.1, - step=0.05, - help="Filter results below this similarity score" - ) - - date_range = st.date_input( - "Date Range Filter:", - value=None, - help="Filter results by date range" - ) - - with col2: - amount_range = st.slider( - "Amount Range (₹):", - min_value=0, - max_value=100000, - value=(0, 100000), - help="Filter results by amount range" - ) - - supplier_filter = st.multiselect( - "Filter by Suppliers:", - options=[meta.get('supplier_name', '') for meta in vector_store.document_metadata if meta.get('supplier_name')], - help="Select specific suppliers to search within" - ) - - if st.button("šŸŽÆ Apply Advanced Search"): - if search_query: - # Apply advanced filters to results - filtered_results = [] - all_results = vector_store.semantic_search(search_query, 20) - - for result in all_results: - # Apply filters - if result.similarity_score < similarity_threshold: - continue - - if supplier_filter and result.supplier_name not in supplier_filter: - continue - - amount = result.metadata.get('amount', 0) - if not (amount_range[0] <= amount <= amount_range[1]): - continue - - filtered_results.append(result) - - st.success(f"Found {len(filtered_results)} results matching advanced criteria") - - # Display filtered results - for i, result in enumerate(filtered_results[:top_k], 1): - st.write(f"{i}. **{result.invoice_number}** - {result.supplier_name} (Score: {result.similarity_score:.3f})") - - else: - st.warning("No documents in vector store. 
Please upload and process some invoices first.") - - if st.button("šŸš€ Process Sample Data"): - st.info("Upload some invoices in the 'Upload & Process' tab to enable semantic search.") - - -# =============================================================================== -# UTILITY FUNCTIONS -# =============================================================================== - -def export_complete_system_data(): - """Export all system data for backup or migration""" - try: - # Get all data - json_data = st.session_state.enhanced_chatbot.load_json_data() - vector_stats = st.session_state.enhanced_processor.vector_store.get_stats() - - # Create comprehensive backup - backup_data = { - "export_timestamp": datetime.now().isoformat(), - "system_info": { - "version": "enhanced_v1.0", - "vector_model": vector_stats['model_name'], - "embedding_dimension": vector_stats['embedding_dimension'] - }, - "invoice_data": json_data, - "vector_metadata": st.session_state.enhanced_processor.vector_store.document_metadata, - "system_stats": vector_stats - } - - return json.dumps(backup_data, indent=2, ensure_ascii=False) - - except Exception as e: - st.error(f"Error creating backup: {e}") - return None - -def import_system_data(backup_data: str): - """Import system data from backup""" - try: - data = json.loads(backup_data) - - # Restore JSON data - invoice_data = data.get("invoice_data", {}) - st.session_state.enhanced_processor.save_json_data(invoice_data) - - # Rebuild vector store - vector_store = st.session_state.enhanced_processor.vector_store - if vector_store: - vector_store.rebuild_vector_store(invoice_data) - - st.success("āœ… System data imported successfully!") - return True - - except Exception as e: - st.error(f"Error importing data: {e}") - return False - - -# =============================================================================== -# MAIN APPLICATION ENTRY POINT -# =============================================================================== - -if __name__ == 
"__main__": - enhanced_main() - - -# =============================================================================== -# ADDITIONAL CONFIGURATION AND SETUP -# =============================================================================== - -# Configuration for different deployment environments -DEPLOYMENT_CONFIG = { - "local": { - "embedding_model": "all-MiniLM-L6-v2", - "ollama_url": "http://localhost:11434", - "vector_store_path": "invoice_vectors.faiss", - "batch_size": 10 - }, - "cloud": { - "embedding_model": "all-MiniLM-L6-v2", - "ollama_url": "http://ollama-service:11434", - "vector_store_path": "/data/invoice_vectors.faiss", - "batch_size": 20 - } -} - -# Performance optimization settings -PERFORMANCE_CONFIG = { - "max_text_length": 5000, - "max_embedding_batch_size": 32, - "vector_search_timeout": 30, - "similarity_threshold": 0.1, - "max_results_per_query": 20 -} - -# Security settings -SECURITY_CONFIG = { - "max_file_size_mb": 50, - "allowed_file_types": ['.pdf', '.jpg', '.jpeg', '.png', '.docx', '.txt'], - "sanitize_uploads": True, - "max_concurrent_uploads": 5 -} - -# Monitoring and logging configuration -MONITORING_CONFIG = { - "log_level": "INFO", - "enable_performance_metrics": True, - "track_search_queries": True, - "enable_error_reporting": True -} - - -# =============================================================================== -# HELPER FUNCTIONS FOR EXTENDED FUNCTIONALITY -# =============================================================================== - -def setup_advanced_logging(): - """Setup advanced logging for the enhanced system""" - import logging - - # Create formatters - detailed_formatter = logging.Formatter( - '%(asctime)s - %(name)s - %(levelname)s - %(funcName)s:%(lineno)d - %(message)s' - ) - - # File handler for detailed logs - file_handler = logging.FileHandler('enhanced_invoice_system.log') - file_handler.setFormatter(detailed_formatter) - file_handler.setLevel(logging.DEBUG) - - # Console handler for important messages 
- console_handler = logging.StreamHandler() - console_handler.setFormatter(logging.Formatter('%(levelname)s: %(message)s')) - console_handler.setLevel(logging.INFO) - - # Setup loggers - logger = logging.getLogger('enhanced_invoice_system') - logger.setLevel(logging.DEBUG) - logger.addHandler(file_handler) - logger.addHandler(console_handler) - - return logger - -def validate_system_requirements(): - """Enhanced system validation that works with your compatible packages""" - requirements_met = True - issues = [] - - # Define required packages with their actual import names - required_packages = [ - { - 'import_name': 'faiss', - 'package_name': 'faiss-cpu', - 'display_name': 'FAISS (Vector Search)', - 'required': True - }, - { - 'import_name': 'sentence_transformers', - 'package_name': 'sentence-transformers', - 'display_name': 'Sentence Transformers', - 'required': True - }, - { - 'import_name': 'torch', - 'package_name': 'torch', - 'display_name': 'PyTorch', - 'required': True - }, - { - 'import_name': 'streamlit', - 'package_name': 'streamlit', - 'display_name': 'Streamlit', - 'required': True - }, - { - 'import_name': 'pandas', - 'package_name': 'pandas', - 'display_name': 'Pandas', - 'required': True - }, - { - 'import_name': 'numpy', - 'package_name': 'numpy', - 'display_name': 'NumPy', - 'required': True - }, - { - 'import_name': 'sklearn', - 'package_name': 'scikit-learn', - 'display_name': 'Scikit-learn', - 'required': False - }, - { - 'import_name': 'requests', - 'package_name': 'requests', - 'display_name': 'Requests', - 'required': False - } - ] - - st.markdown("### šŸ” System Requirements Check") - - for pkg in required_packages: - try: - # Try to import the package - module = importlib.import_module(pkg['import_name']) - - # Get version if available - version = getattr(module, '__version__', 'Unknown') - - # Success - st.success(f"āœ… {pkg['display_name']}: {version}") - - except ImportError as e: - # Package missing - if pkg['required']: - 
st.error(f"āŒ {pkg['display_name']}: Missing") - issues.append(f"Missing required package: {pkg['package_name']}") - requirements_met = False - else: - st.warning(f"āš ļø {pkg['display_name']}: Missing (optional)") - - except Exception as e: - # Other error - st.error(f"āŒ {pkg['display_name']}: Error - {str(e)[:50]}...") - if pkg['required']: - issues.append(f"Error with package {pkg['package_name']}: {str(e)}") - requirements_met = False - - # Test critical functionality - st.markdown("### 🧪 Functionality Tests") - - try: - # Test FAISS basic functionality - import faiss - import numpy as np - - # Create a simple index - index = faiss.IndexFlatL2(64) - vectors = np.random.random((5, 64)).astype('float32') - index.add(vectors) - - # Test search - query = np.random.random((1, 64)).astype('float32') - distances, indices = index.search(query, 3) - - st.success("āœ… FAISS: Vector search working") - - except Exception as e: - st.error(f"āŒ FAISS: Functionality test failed - {str(e)[:100]}...") - issues.append(f"FAISS functionality error: {str(e)}") - requirements_met = False - - try: - # Test Sentence Transformers - from sentence_transformers import SentenceTransformer - - # Don't actually load a model (takes time), just test import - st.success("āœ… Sentence Transformers: Import successful") - - except Exception as e: - st.error(f"āŒ Sentence Transformers: Import failed - {str(e)[:100]}...") - issues.append(f"Sentence Transformers error: {str(e)}") - requirements_met = False - - # Check Ollama availability - try: - response = requests.get('http://localhost:11434/api/tags', timeout=5) - if response.status_code != 200: - issues.append("āš ļø Ollama service not responding") - except: - issues.append("āš ļø Ollama not available") - - # Check disk space (basic check) - import shutil - free_space_gb = shutil.disk_usage('.').free / (1024**3) - if free_space_gb < 1: - issues.append(f"āš ļø Low disk space: {free_space_gb:.2f} GB available") - - return requirements_met, 
issues - -def create_system_health_dashboard(): - """Create a system health monitoring dashboard""" - st.subheader("šŸ„ System Health Monitor") - - requirements_met, issues = validate_system_requirements() - - if requirements_met: - st.success("āœ… All system requirements met") - else: - st.error("āŒ System requirements issues detected") - for issue in issues: - st.write(issue) - # System metrics - col1, col2, col3, col4 = st.columns(4) - - with col1: - # Memory usage - import psutil - memory_percent = psutil.virtual_memory().percent - st.metric("Memory Usage", f"{memory_percent:.1f}%") - - with col2: - # Disk usage - disk_percent = psutil.disk_usage('.').used / psutil.disk_usage('.').total * 100 - st.metric("Disk Usage", f"{disk_percent:.1f}%") - - with col3: - # Vector store health - vector_store = st.session_state.enhanced_processor.vector_store - if vector_store and vector_store.document_metadata: - vector_health = "Healthy" - else: - vector_health = "Not Ready" - st.metric("Vector Store", vector_health) - - with col4: - # Database health - try: - conn = sqlite3.connect(st.session_state.enhanced_processor.db_path) - cursor = conn.cursor() - cursor.execute("SELECT COUNT(*) FROM invoices") - db_records = cursor.fetchone()[0] - conn.close() - st.metric("DB Records", db_records) - except: - st.metric("Database", "Error") - -def performance_benchmark(): - """Run performance benchmarks on the system""" - st.subheader("⚔ Performance Benchmark") - - if st.button("šŸš€ Run Benchmark"): - with st.spinner("Running performance tests..."): - results = {} - - # Test embedding generation speed - vector_store = st.session_state.enhanced_processor.vector_store - if vector_store and vector_store.embedding_model: - start_time = datetime.now() - test_texts = [ - "Invoice number INV-2024-001 from ABC Corp for office supplies", - "Technology equipment purchase from XYZ Ltd for $5000", - "Consulting services invoice for project management" - ] - embeddings = 
vector_store.embedding_model.encode(test_texts) - embedding_time = (datetime.now() - start_time).total_seconds() - results['embedding_speed'] = f"{len(test_texts)/embedding_time:.2f} docs/sec" - - # Test vector search speed - if vector_store and vector_store.document_metadata: - start_time = datetime.now() - search_results = vector_store.semantic_search("office supplies", top_k=5) - search_time = (datetime.now() - start_time).total_seconds() - results['search_speed'] = f"{search_time*1000:.2f} ms" - - # Test database query speed - try: - start_time = datetime.now() - conn = sqlite3.connect(st.session_state.enhanced_processor.db_path) - df = pd.read_sql_query("SELECT * FROM invoices LIMIT 100", conn) - conn.close() - db_time = (datetime.now() - start_time).total_seconds() - results['db_query_speed'] = f"{db_time*1000:.2f} ms" - except: - results['db_query_speed'] = "Error" - - # Display results - for metric, value in results.items(): - st.metric(metric.replace('_', ' ').title(), value) - -def create_backup_system(): - """Create comprehensive backup functionality""" - st.subheader("šŸ’¾ Backup & Restore System") - - col1, col2 = st.columns(2) - - with col1: - st.markdown("**Create Backup:**") - - backup_options = st.multiselect( - "Select data to backup:", - ["JSON Data", "Vector Store", "Database", "System Config"], - default=["JSON Data", "Vector Store", "Database"] - ) - - if st.button("šŸ“¦ Create Backup"): - with st.spinner("Creating backup..."): - backup_data = {} - - if "JSON Data" in backup_options: - json_data = st.session_state.enhanced_chatbot.load_json_data() - backup_data["json_data"] = json_data - - if "Vector Store" in backup_options: - vector_store = st.session_state.enhanced_processor.vector_store - if vector_store: - backup_data["vector_metadata"] = vector_store.document_metadata - backup_data["vector_config"] = { - "model_name": vector_store.embedding_model_name, - "dimension": vector_store.embedding_dimension - } - - if "Database" in 
backup_options: - try: - conn = sqlite3.connect(st.session_state.enhanced_processor.db_path) - df = pd.read_sql_query("SELECT * FROM invoices", conn) - conn.close() - backup_data["database_data"] = df.to_dict('records') - except: - st.error("Failed to backup database") - - if "System Config" in backup_options: - backup_data["system_config"] = { - "deployment_config": DEPLOYMENT_CONFIG, - "performance_config": PERFORMANCE_CONFIG, - "security_config": SECURITY_CONFIG - } - - backup_data["backup_timestamp"] = datetime.now().isoformat() - backup_data["backup_version"] = "enhanced_v1.0" - - # Create downloadable backup - backup_json = json.dumps(backup_data, indent=2, ensure_ascii=False) - - st.download_button( - label="šŸ’¾ Download Backup", - data=backup_json, - file_name=f"invoice_system_backup_{datetime.now().strftime('%Y%m%d_%H%M%S')}.json", - mime="application/json" - ) - - st.success("āœ… Backup created successfully!") - - with col2: - st.markdown("**Restore from Backup:**") - - uploaded_backup = st.file_uploader( - "Upload backup file:", - type=['json'], - help="Select a backup file created by this system" - ) - - if uploaded_backup: - try: - backup_content = json.loads(uploaded_backup.getvalue().decode('utf-8')) - - st.write("**Backup Information:**") - st.write(f"Created: {backup_content.get('backup_timestamp', 'Unknown')}") - st.write(f"Version: {backup_content.get('backup_version', 'Unknown')}") - - restore_options = [] - if "json_data" in backup_content: - restore_options.append("JSON Data") - if "vector_metadata" in backup_content: - restore_options.append("Vector Store") - if "database_data" in backup_content: - restore_options.append("Database") - if "system_config" in backup_content: - restore_options.append("System Config") - - selected_restore = st.multiselect( - "Select data to restore:", - restore_options, - default=restore_options - ) - - if st.button("šŸ”„ Restore Data"): - with st.spinner("Restoring data..."): - try: - if "JSON Data" in 
selected_restore and "json_data" in backup_content: - st.session_state.enhanced_processor.save_json_data(backup_content["json_data"]) - st.success("āœ… JSON data restored") - - if "Vector Store" in selected_restore and "vector_metadata" in backup_content: - vector_store = st.session_state.enhanced_processor.vector_store - if vector_store: - vector_store.document_metadata = backup_content["vector_metadata"] - vector_store.save_vector_store() - st.success("āœ… Vector store metadata restored") - - if "Database" in selected_restore and "database_data" in backup_content: - # Clear existing data and restore - conn = sqlite3.connect(st.session_state.enhanced_processor.db_path) - cursor = conn.cursor() - cursor.execute("DELETE FROM invoices") - - # Insert restored data - for record in backup_content["database_data"]: - cursor.execute(''' - INSERT INTO invoices - (supplier_name, buyer_name, invoice_number, date, amount, - quantity, product_description, file_path, file_name, - extraction_confidence, raw_text) - VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?) - ''', ( - record.get('supplier_name', ''), - record.get('buyer_name', ''), - record.get('invoice_number', ''), - record.get('date', ''), - record.get('amount', 0), - record.get('quantity', 0), - record.get('product_description', ''), - record.get('file_path', ''), - record.get('file_name', ''), - record.get('extraction_confidence', 0), - record.get('raw_text', '') - )) - - conn.commit() - conn.close() - st.success("āœ… Database restored") - - st.success("šŸŽ‰ Restore completed! 
Please refresh the page.") - - except Exception as e: - st.error(f"Restore failed: {e}") - - except Exception as e: - st.error(f"Invalid backup file: {e}") - -def create_deployment_guide(): - """Create deployment configuration guide""" - st.subheader("šŸš€ Deployment Guide") - - deployment_type = st.selectbox( - "Select deployment environment:", - ["Local Development", "Docker Container", "Cloud Deployment", "Production Server"] - ) - - if deployment_type == "Local Development": - st.markdown(""" - ### šŸ“‹ Local Development Setup - - **Prerequisites:** - ```bash - # Install Python 3.8+ - python --version - - # Install Ollama - curl -fsSL https://ollama.com/install.sh | sh - ollama serve - ollama pull mistral:7b - ``` - - **Installation:** - ```bash - # Clone repository - git clone - cd enhanced-invoice-pipeline - - # Install dependencies - pip install -r requirements.txt - - # Run the application - streamlit run enhanced_main.py - ``` - """) - - elif deployment_type == "Docker Container": - st.markdown(""" - ### 🐳 Docker Deployment - - **Dockerfile:** - ```dockerfile - FROM python:3.9-slim - - WORKDIR /app - - # Install system dependencies - RUN apt-get update && apt-get install -y \\ - curl \\ - && rm -rf /var/lib/apt/lists/* - - # Install Ollama - RUN curl -fsSL https://ollama.com/install.sh | sh - - # Copy requirements and install Python dependencies - COPY requirements.txt . - RUN pip install -r requirements.txt - - # Copy application code - COPY . . - - # Expose ports - EXPOSE 8501 11434 - - # Start script - CMD ["sh", "-c", "ollama serve & streamlit run enhanced_main.py --server.port=8501 --server.address=0.0.0.0"] - ``` - - **Docker Compose:** - ```yaml - version: '3.8' - services: - invoice-app: - build: . 
- ports: - - "8501:8501" - - "11434:11434" - volumes: - - ./data:/app/data - - ./models:/app/models - environment: - - STREAMLIT_SERVER_ADDRESS=0.0.0.0 - ``` - """) - - elif deployment_type == "Cloud Deployment": - st.markdown(""" - ### ā˜ļø Cloud Deployment (AWS/GCP/Azure) - - **Environment Variables:** - ```bash - export OLLAMA_HOST=0.0.0.0:11434 - export STREAMLIT_SERVER_PORT=8501 - export VECTOR_STORE_PATH=/data/vectors - export JSON_DATA_PATH=/data/invoices.json - ``` - - **Cloud-specific considerations:** - - Use managed storage for persistence (S3, GCS, Azure Blob) - - Configure load balancing for high availability - - Set up monitoring and logging - - Implement proper security groups/firewalls - - Consider using managed vector databases (Pinecone, Weaviate) - """) - - elif deployment_type == "Production Server": - st.markdown(""" - ### šŸ­ Production Server Setup - - **System Requirements:** - - CPU: 4+ cores - - RAM: 8GB+ (16GB recommended) - - Storage: 50GB+ SSD - - GPU: Optional (for faster embeddings) - - **Production Configuration:** - ```python - PRODUCTION_CONFIG = { - "max_workers": 4, - "embedding_batch_size": 64, - "vector_store_backup_interval": 3600, # 1 hour - "log_level": "WARNING", - "enable_metrics": True, - "secure_mode": True - } - ``` - - **Security Checklist:** - - [ ] Enable HTTPS/TLS - - [ ] Configure authentication - - [ ] Set up firewall rules - - [ ] Regular security updates - - [ ] Data encryption at rest - - [ ] Backup automation - """) - -def create_api_documentation(): - """Create API documentation for programmatic access""" - st.subheader("šŸ“” API Documentation") - - st.markdown(""" - ### REST API Endpoints - - The enhanced invoice system can be extended with REST API endpoints: - """) - - api_sections = st.tabs(["Upload API", "Search API", "Analytics API", "Management API"]) - - with api_sections[0]: - st.markdown(""" - #### šŸ“¤ Upload & Processing API - - **POST /api/upload** - ```python - import requests - - # Upload 
single file - with open('invoice.pdf', 'rb') as f: - response = requests.post( - 'http://localhost:8501/api/upload', - files={'file': f}, - data={'process_immediately': True} - ) - - # Response - { - "success": true, - "invoice_id": "inv_123", - "extracted_data": {...}, - "vector_indexed": true - } - ``` - - **GET /api/status/{job_id}** - ```python - # Check processing status - response = requests.get('http://localhost:8501/api/status/job_123') - - # Response - { - "status": "completed", - "progress": 100, - "result": {...} - } - ``` - """) - - with api_sections[1]: - st.markdown(""" - #### šŸ” Search API - - **POST /api/search** - ```python - # Semantic search - response = requests.post( - 'http://localhost:8501/api/search', - json={ - "query": "high value technology purchases", - "type": "semantic", - "top_k": 5, - "filters": { - "amount_min": 1000, - "date_after": "2024-01-01" - } - } - ) - - # Response - { - "results": [ - { - "invoice_number": "INV-2024-001", - "similarity_score": 0.89, - "metadata": {...} - } - ], - "total_found": 15 - } - ``` - - **GET /api/search/suggestions** - ```python - # Get search suggestions - response = requests.get( - 'http://localhost:8501/api/search/suggestions', - params={"partial_query": "office"} - ) - - # Response - { - "suggestions": [ - "office supplies", - "office equipment", - "office rent" - ] - } - ``` - """) - - with api_sections[2]: - st.markdown(""" - #### šŸ“Š Analytics API - - **GET /api/analytics/summary** - ```python - # Get system summary - response = requests.get('http://localhost:8501/api/analytics/summary') - - # Response - { - "total_invoices": 1250, - "total_amount": 450000.50, - "unique_suppliers": 85, - "processing_stats": {...} - } - ``` - - **GET /api/analytics/trends** - ```python - # Get spending trends - response = requests.get( - 'http://localhost:8501/api/analytics/trends', - params={ - "period": "monthly", - "start_date": "2024-01-01", - "end_date": "2024-12-31" - } - ) - - # Response - { - 
"trends": [ - {"month": "2024-01", "amount": 12500.00, "count": 45}, - {"month": "2024-02", "amount": 15200.00, "count": 52} - ] - } - ``` - """) - - with api_sections[3]: - st.markdown(""" - #### āš™ļø Management API - - **POST /api/vector/rebuild** - ```python - # Rebuild vector store - response = requests.post( - 'http://localhost:8501/api/vector/rebuild', - json={"force": true} - ) - - # Response - { - "status": "rebuilding", - "job_id": "rebuild_456", - "estimated_time": 120 - } - ``` - - **GET /api/system/health** - ```python - # System health check - response = requests.get('http://localhost:8501/api/system/health') - - # Response - { - "status": "healthy", - "components": { - "database": "ok", - "vector_store": "ok", - "ollama": "ok", - "embedding_model": "ok" - }, - "metrics": { - "memory_usage": 45.2, - "disk_usage": 23.1, - "uptime": 86400 - } - } - ``` - """) - -def create_troubleshooting_guide(): - """Create comprehensive troubleshooting guide""" - st.subheader("šŸ”§ Troubleshooting Guide") - - issue_categories = st.tabs([ - "Installation Issues", - "Processing Errors", - "Vector Store Problems", - "Performance Issues", - "Integration Problems" - ]) - - with issue_categories[0]: - st.markdown(""" - ### šŸ› ļø Installation Issues - - **Problem: ModuleNotFoundError for required packages** - ```bash - # Solution: Install missing packages - pip install -r requirements.txt - - # For development environment - pip install -r requirements-dev.txt - - # For specific packages - pip install sentence-transformers faiss-cpu streamlit - ``` - - **Problem: Ollama not found or not responding** - ```bash - # Install Ollama - curl -fsSL https://ollama.com/install.sh | sh - - # Start Ollama service - ollama serve - - # Pull required model - ollama pull mistral:7b - - # Check if running - curl http://localhost:11434/api/tags - ``` - - **Problem: CUDA/GPU issues with embeddings** - ```bash - # For CPU-only deployment - pip install torch --index-url 
https://download.pytorch.org/whl/cpu - - # For GPU support - pip install torch torchvision torchaudio --index-url https://download.pytorch.org/whl/cu118 - ``` - """) - - with issue_categories[1]: - st.markdown(""" - ### āš ļø Processing Errors - - **Problem: PDF extraction fails** - - Check file permissions and size - - Ensure PDF is not password protected - - Try converting PDF to images first - - Check Docling installation - - **Problem: AI extraction returns empty results** - - Verify Ollama is running - - Check model availability - - Increase timeout settings - - Fall back to regex extraction - - **Problem: JSON serialization errors** - ```python - # Debug extracted data - print(f"Extracted data type: {type(extracted_data)}") - print(f"Data content: {extracted_data}") - - # Handle non-serializable data - cleaned_data = {k: str(v) for k, v in extracted_data.items()} - ``` - """) - - with issue_categories[2]: - st.markdown(""" - ### šŸ” Vector Store Problems - - **Problem: Vector store fails to load** - ```python - # Check file permissions - import os - print(f"Vector file exists: {os.path.exists('invoice_vectors.faiss')}") - print(f"Metadata file exists: {os.path.exists('vector_metadata.pkl')}") - - # Rebuild if corrupted - vector_store.rebuild_vector_store(json_data) - ``` - - **Problem: Embedding model fails to load** - ```python - # Try different models - models_to_try = [ - "all-MiniLM-L6-v2", - "all-mpnet-base-v2", - "paraphrase-MiniLM-L6-v2" - ] - - for model in models_to_try: - try: - embedding_model = SentenceTransformer(model) - break - except Exception as e: - print(f"Failed to load {model}: {e}") - ``` - - **Problem: Search returns no results** - - Check if documents are indexed - - Verify similarity threshold - - Try different query formulations - - Check embedding model compatibility - """) - - with issue_categories[3]: - st.markdown(""" - ### ⚔ Performance Issues - - **Problem: Slow processing speed** - ```python - # Optimize batch processing - 
BATCH_SIZE = 10 # Reduce if memory issues - - # Use CPU-optimized models - embedding_model = "all-MiniLM-L6-v2" # Fastest - - # Limit text length - text = text[:5000] # Truncate long documents - ``` - - **Problem: High memory usage** - ```python - # Monitor memory - import psutil - process = psutil.Process() - print(f"Memory usage: {process.memory_info().rss / 1024 / 1024:.2f} MB") - - # Optimize settings - torch.set_num_threads(2) # Limit CPU threads - os.environ['OMP_NUM_THREADS'] = '2' - ``` - - **Problem: Slow vector search** - - Use FAISS optimization - - Implement result caching - - Reduce embedding dimensions - - Use approximate search methods - """) - - with issue_categories[4]: - st.markdown(""" - ### šŸ”— Integration Problems - - **Problem: Database connection issues** - ```python - # Check database file - import sqlite3 - try: - conn = sqlite3.connect('invoices.db') - cursor = conn.cursor() - cursor.execute("SELECT name FROM sqlite_master WHERE type='table';") - tables = cursor.fetchall() - print(f"Available tables: {tables}") - conn.close() - except Exception as e: - print(f"Database error: {e}") - ``` - - **Problem: Streamlit deployment issues** - ```bash - # Check Streamlit version - streamlit --version - - # Run with debug mode - streamlit run app.py --logger.level=debug - - # Check port availability - netstat -tulpn | grep :8501 - ``` - - **Problem: Cross-platform compatibility** - - Use pathlib instead of os.path - - Handle file encoding explicitly - - Test on target platform - - Use platform-specific configurations - """) - -# =============================================================================== -# FINAL SYSTEM INTEGRATION -# =============================================================================== - -def create_system_dashboard(): - """Create comprehensive system dashboard""" - st.title("šŸŽ›ļø Enhanced Invoice System Dashboard") - - # System overview tabs - main_tabs = st.tabs([ - "šŸ“Š Overview", - "šŸ„ Health", - "⚔ 
Performance", - "šŸ’¾ Backup", - "šŸš€ Deploy", - "šŸ”§ Troubleshoot" - ]) - - with main_tabs[0]: - # System overview - col1, col2, col3 = st.columns(3) - - with col1: - st.metric("System Status", "āœ… Operational") - st.metric("Uptime", "24h 35m") - - with col2: - st.metric("Total Processing", "1,234 files") - st.metric("Success Rate", "96.8%") - - with col3: - st.metric("Vector Store", "1,190 docs") - st.metric("Storage Used", "2.3 GB") - - with main_tabs[1]: - create_system_health_dashboard() - - with main_tabs[2]: - performance_benchmark() - - with main_tabs[3]: - create_backup_system() - - with main_tabs[4]: - create_deployment_guide() - - with main_tabs[5]: - create_troubleshooting_guide() - -# =============================================================================== -# REQUIREMENTS AND DEPENDENCIES -# =============================================================================== - -def generate_requirements_file(): - """Generate requirements.txt file content""" - requirements = """ -# Core dependencies -streamlit>=1.28.0 -pandas>=1.5.0 -numpy>=1.21.0 -sqlite3 - -# Document processing -pdfplumber>=0.7.0 -python-docx>=0.8.11 -Pillow>=9.0.0 - -# AI and ML -sentence-transformers>=2.2.0 -torch>=1.13.0 -transformers>=4.21.0 -ollama>=0.1.0 - -# Vector storage -faiss-cpu>=1.7.0 -# faiss-gpu>=1.7.0 # Uncomment for GPU support - -# Web and API -requests>=2.28.0 -streamlit-chat>=0.1.0 - -# Visualization -plotly>=5.0.0 -matplotlib>=3.5.0 -seaborn>=0.11.0 - -# Utilities -python-dateutil>=2.8.0 -pytz>=2022.1 -tqdm>=4.64.0 -psutil>=5.9.0 - -# Development and testing (optional) -pytest>=7.0.0 -black>=22.0.0 -flake8>=5.0.0 -mypy>=0.991 - -# Production deployment (optional) -gunicorn>=20.1.0 -nginx-python>=1.0.0 -supervisor>=4.2.0 -""" - return requirements.strip() - -def generate_docker_files(): - """Generate Docker configuration files""" - - dockerfile_content = """ -FROM python:3.9-slim - -# Set working directory -WORKDIR /app - -# Install system dependencies -RUN 
apt-get update && apt-get install -y \\ - curl \\ - gcc \\ - g++ \\ - && rm -rf /var/lib/apt/lists/* - -# Install Ollama -RUN curl -fsSL https://ollama.com/install.sh | sh - -# Copy requirements and install Python dependencies -COPY requirements.txt . -RUN pip install --no-cache-dir -r requirements.txt - -# Copy application code -COPY . . - -# Create data directory -RUN mkdir -p /app/data - -# Expose ports -EXPOSE 8501 11434 - -# Health check -HEALTHCHECK --interval=30s --timeout=30s --start-period=5s --retries=3 \\ - CMD curl -f http://localhost:8501/_stcore/health || exit 1 - -# Start script -COPY start.sh . -RUN chmod +x start.sh -CMD ["./start.sh"] -""" - - docker_compose_content = """ -version: '3.8' - -services: - invoice-app: - build: . - ports: - - "8501:8501" - - "11434:11434" - volumes: - - ./data:/app/data - - ./models:/app/models - - ./backups:/app/backups - environment: - - STREAMLIT_SERVER_ADDRESS=0.0.0.0 - - STREAMLIT_SERVER_PORT=8501 - - OLLAMA_HOST=0.0.0.0:11434 - restart: unless-stopped - healthcheck: - test: ["CMD", "curl", "-f", "http://localhost:8501/_stcore/health"] - interval: 30s - timeout: 10s - retries: 3 - start_period: 40s - - # Optional: Add a reverse proxy - nginx: - image: nginx:alpine - ports: - - "80:80" - - "443:443" - volumes: - - ./nginx.conf:/etc/nginx/nginx.conf - - ./ssl:/etc/ssl - depends_on: - - invoice-app - restart: unless-stopped -""" - - start_script_content = """#!/bin/bash -set -e - -# Start Ollama in background -ollama serve & -OLLAMA_PID=$! - -# Wait for Ollama to be ready -echo "Waiting for Ollama to start..." -sleep 10 - -# Pull required model -ollama pull mistral:7b - -# Start Streamlit -echo "Starting Streamlit application..." 
-streamlit run enhanced_main.py \\ - --server.port=8501 \\ - --server.address=0.0.0.0 \\ - --server.enableCORS=false \\ - --server.enableXsrfProtection=false - -# Keep the script running -wait $OLLAMA_PID -""" - - return { - "Dockerfile": dockerfile_content.strip(), - "docker-compose.yml": docker_compose_content.strip(), - "start.sh": start_script_content.strip() - } - -# =============================================================================== -# =============================================================================== -# FINAL MAIN FUNCTION WITH ALL FEATURES -# =============================================================================== - -def ultimate_enhanced_main(): - # Custom CSS for better UI - st.markdown(""" - - """, unsafe_allow_html=True) - - # Main header - st.markdown('

šŸš€ Ultimate Invoice Processing System

', unsafe_allow_html=True) - st.markdown(""" -
-

- AI-Powered Document Processing • Semantic Search • Advanced Analytics • Production Ready -

-
- """, unsafe_allow_html=True) - - # Initialize session state with error handling - try: - if 'ultimate_processor' not in st.session_state: - with st.spinner("šŸ”§ Initializing Ultimate Invoice Processor..."): - st.session_state.ultimate_processor = EnhancedInvoiceProcessor() - - if 'ultimate_chatbot' not in st.session_state: - with st.spinner("šŸ¤– Setting up Enhanced AI Chatbot..."): - st.session_state.ultimate_chatbot = EnhancedInvoiceChatBot( - vector_store=st.session_state.ultimate_processor.vector_store - ) - - if 'chat_history' not in st.session_state: - st.session_state.chat_history = [] - - if 'system_initialized' not in st.session_state: - st.session_state.system_initialized = True - st.success("āœ… System initialized successfully!") - - except Exception as e: - st.error(f"āŒ System initialization failed: {e}") - st.stop() - - # ------------------------------------------------------------------------- - # ULTIMATE SIDEBAR WITH COMPREHENSIVE STATUS - # ------------------------------------------------------------------------- - - with st.sidebar: - st.markdown("## šŸŽ›ļø System Control Center") - - # System status overview - with st.expander("šŸ„ System Health", expanded=True): - try: - # Check all components - components_status = { - "Docling": hasattr(st.session_state.ultimate_processor, 'docling_available') and st.session_state.ultimate_processor.docling_available, - "AI Processing": hasattr(st.session_state.ultimate_processor, 'use_ai') and st.session_state.ultimate_processor.use_ai, - "Vector Store": st.session_state.ultimate_processor.vector_store and st.session_state.ultimate_processor.vector_store.embedding_model, - "Database": os.path.exists(st.session_state.ultimate_processor.db_path), - "JSON Storage": os.path.exists(st.session_state.ultimate_processor.json_path) - } - - for component, status in components_status.items(): - status_icon = "āœ…" if status else "āŒ" - status_class = "status-ok" if status else "status-error" - 
st.markdown(f'{status_icon} {component}', unsafe_allow_html=True) - - except Exception as e: - st.error(f"Status check failed: {e}") - - # Quick stats - with st.expander("šŸ“Š Quick Statistics", expanded=True): - try: - json_data = st.session_state.ultimate_chatbot.load_json_data() - total_invoices = len(json_data.get("invoices", [])) - total_amount = json_data.get("summary", {}).get("total_amount", 0) - unique_suppliers = len(json_data.get("summary", {}).get("unique_suppliers", [])) - vector_docs = len(st.session_state.ultimate_processor.vector_store.document_metadata) if st.session_state.ultimate_processor.vector_store else 0 - - st.metric("šŸ“„ Total Invoices", f"{total_invoices:,}") - st.metric("šŸ’° Total Value", f"₹{total_amount:,.2f}") - st.metric("šŸ¢ Suppliers", f"{unique_suppliers:,}") - st.metric("šŸ” Vector Docs", f"{vector_docs:,}") - - except Exception as e: - st.error(f"Stats loading failed: {e}") - - # Quick actions - st.markdown("### ⚔ Quick Actions") - - col1, col2 = st.columns(2) - - with col1: - if st.button("šŸ”„ Refresh", use_container_width=True, key="sidebar_refresh"): - st.rerun() - - if st.button("šŸ’¾ Backup", use_container_width=True,key="sidebar_backup"): - try: - backup_data = export_complete_system_data() - if backup_data: - st.download_button( - "šŸ“„ Download", - backup_data, - f"system_backup_{datetime.now().strftime('%Y%m%d_%H%M')}.json", - "application/json" - ) - except Exception as e: - st.error(f"Backup failed: {e}") - - with col2: - if st.button("🧹 Clean", use_container_width=True, key="sidebar_clean"): - # Clean temporary files - try: - import tempfile - import glob - temp_files = glob.glob(os.path.join(tempfile.gettempdir(), "tmp*")) - for file in temp_files[:10]: # Limit to avoid issues - try: - os.unlink(file) - except: - pass - st.success("āœ… Cleaned temp files") - except: - st.warning("āš ļø Cleanup partially failed") - - if st.button("šŸ“Š Dashboard", use_container_width=True, key="sidebar_dashboard"): - 
st.session_state.show_dashboard = not st.session_state.get('show_dashboard', False) - st.rerun() - - # Advanced settings - with st.expander("āš™ļø Advanced Settings"): - # Performance settings - st.markdown("**Performance:**") - batch_size = st.slider("Batch Size", 1, 50, 10) - max_text_length = st.slider("Max Text Length", 1000, 10000, 5000) - - # Vector settings - st.markdown("**Vector Search:**") - similarity_threshold = st.slider("Similarity Threshold", 0.0, 1.0, 0.1, 0.05) - max_results = st.slider("Max Results", 1, 50, 10) - - # Save settings - if st.button("šŸ’¾ Save Settings", key="save_settings_admin"): - settings = { - "batch_size": batch_size, - "max_text_length": max_text_length, - "similarity_threshold": similarity_threshold, - "max_results": max_results - } - # Save to session state or file - st.session_state.user_settings = settings - st.success("āœ… Settings saved!") - - # ------------------------------------------------------------------------- - # MAIN NAVIGATION TABS - # ------------------------------------------------------------------------- - - tab_names = [ - "šŸ  Home", - "šŸ“¤ Upload", - "šŸ’¬ AI Chat", - "šŸ“Š Analytics", - "šŸ” Search", - "šŸ“‹ Data", - "šŸŽ›ļø Admin" - ] - - if st.session_state.get('show_dashboard', False): - tab_names.append("šŸ“ˆ Dashboard") - - tabs = st.tabs(tab_names) - - # ------------------------------------------------------------------------- - # TAB: HOME - SYSTEM OVERVIEW - # ------------------------------------------------------------------------- - - with tabs[0]: - st.markdown("## šŸ  Welcome to the Ultimate Invoice Processing System") - - # Feature highlights - col1, col2, col3 = st.columns(3) - - with col1: - st.markdown(""" -
-

šŸ¤– AI-Powered Extraction

-

Advanced AI models extract structured data from any invoice format with high accuracy.

-
- """, unsafe_allow_html=True) - - with col2: - st.markdown(""" -
-

šŸ” Semantic Search

-

Find invoices using natural language queries with vector similarity search.

-
- """, unsafe_allow_html=True) - - with col3: - st.markdown(""" -
-

šŸ“Š Advanced Analytics

-

Comprehensive insights, trends, and patterns in your invoice data.

-
- """, unsafe_allow_html=True) - - # Getting started guide - st.markdown("### šŸš€ Getting Started") - - steps = [ - ("1ļøāƒ£", "Upload Documents", "Go to Upload tab and drop your invoice files"), - ("2ļøāƒ£", "AI Processing", "Watch as AI extracts structured data automatically"), - ("3ļøāƒ£", "Search & Analyze", "Use natural language to search and analyze your data"), - ("4ļøāƒ£", "Export & Integrate", "Download results or integrate with your systems") - ] - - for icon, title, description in steps: - st.markdown(f""" -
-
{icon}
-
- {title}
- {description} -
-
- """, unsafe_allow_html=True) - - # Recent activity - st.markdown("### šŸ“ˆ Recent Activity") - - try: - json_data = st.session_state.ultimate_chatbot.load_json_data() - recent_invoices = sorted( - json_data.get("invoices", []), - key=lambda x: x.get("timestamps", {}).get("created_at", ""), - reverse=True - )[:5] - - if recent_invoices: - for i, invoice in enumerate(recent_invoices, 1): - with st.expander(f"šŸ“„ {invoice.get('invoice_number', f'Invoice {i}')} - {invoice.get('supplier_name', 'Unknown Supplier')}"): - col1, col2, col3 = st.columns(3) - with col1: - st.write(f"**Amount:** ₹{invoice.get('amount', 0):,.2f}") - st.write(f"**Date:** {invoice.get('date', 'N/A')}") - with col2: - st.write(f"**Buyer:** {invoice.get('buyer_name', 'N/A')}") - st.write(f"**Confidence:** {invoice.get('extraction_info', {}).get('confidence', 0):.1%}") - with col3: - st.write(f"**File:** {invoice.get('file_info', {}).get('file_name', 'N/A')}") - st.write(f"**Processed:** {invoice.get('timestamps', {}).get('created_at', 'N/A')[:19]}") - else: - st.info("No invoices processed yet. Upload some documents to get started!") - - except Exception as e: - st.error(f"Error loading recent activity: {e}") - - # ------------------------------------------------------------------------- - # TAB: UPLOAD - ENHANCED FILE PROCESSING - # ------------------------------------------------------------------------- - - with tabs[1]: - st.markdown("## šŸ“¤ Upload & Process Documents") - - # Upload interface with drag & drop - st.markdown(""" -
-

šŸ“ Drag & Drop Your Invoice Files Here

-

Supported formats: PDF, JPG, PNG, DOCX, TXT

-
- """, unsafe_allow_html=True) - - uploaded_files = st.file_uploader( - "Choose invoice files", - type=['pdf', 'jpg', 'jpeg', 'png', 'docx', 'txt'], - accept_multiple_files=True, - label_visibility="collapsed" - ) - - if uploaded_files: - # Processing options - col1, col2, col3 = st.columns(3) - - with col1: - auto_extract = st.checkbox("šŸ¤– AI Extraction", value=True) - vector_index = st.checkbox("šŸ” Vector Indexing", value=True) - - with col2: - batch_process = st.checkbox("⚔ Batch Processing", value=True) - save_originals = st.checkbox("šŸ’¾ Save Originals", value=False) - - with col3: - notify_completion = st.checkbox("šŸ”” Notify on Completion", value=True) - auto_backup = st.checkbox("šŸ’¾ Auto Backup", value=False) - - # File preview - st.markdown("### šŸ“‹ Files to Process") - - total_size = sum(len(f.getvalue()) for f in uploaded_files) - st.info(f"šŸ“Š {len(uploaded_files)} files selected • Total size: {total_size / 1024 / 1024:.2f} MB") - - # Process files - if st.button("šŸš€ Process All Files", type="primary", use_container_width=True, key="process_all_files_ultimate"): - # Create processing container - progress_container = st.container() - results_container = st.container() - - with progress_container: - progress_bar = st.progress(0) - status_text = st.empty() - - # Processing metrics - metrics_col1, metrics_col2, metrics_col3, metrics_col4 = st.columns(4) - processed_metric = metrics_col1.empty() - success_metric = metrics_col2.empty() - failed_metric = metrics_col3.empty() - time_metric = metrics_col4.empty() - - # Process files with enhanced error handling - start_time = datetime.now() - processed_count = 0 - success_count = 0 - failed_count = 0 - - for i, uploaded_file in enumerate(uploaded_files): - current_progress = (i + 1) / len(uploaded_files) - progress_bar.progress(current_progress) - status_text.text(f"Processing: {uploaded_file.name}") - - # Update metrics - processed_count = i + 1 - processed_metric.metric("Processed", 
f"{processed_count}/{len(uploaded_files)}") - success_metric.metric("Success", success_count) - failed_metric.metric("Failed", failed_count) - - elapsed_time = (datetime.now() - start_time).total_seconds() - time_metric.metric("Time", f"{elapsed_time:.1f}s") - - # Process file - try: - # Save temporarily - with tempfile.NamedTemporaryFile(delete=False, suffix=f".{uploaded_file.name.split('.')[-1]}") as tmp_file: - file_content = uploaded_file.getvalue() - tmp_file.write(file_content) - tmp_file_path = tmp_file.name - file_size = len(file_content) - - # Enhanced processing - invoice_data = st.session_state.ultimate_processor.process_file(tmp_file_path, file_size) - - if invoice_data.invoice_number: - success_count += 1 - - # Show success in results - with results_container: - with st.expander(f"āœ… {uploaded_file.name}", expanded=False): - col1, col2 = st.columns(2) - with col1: - st.write(f"**Invoice #:** {invoice_data.invoice_number}") - st.write(f"**Supplier:** {invoice_data.supplier_name}") - st.write(f"**Amount:** ₹{invoice_data.amount:.2f}") - with col2: - st.write(f"**Date:** {invoice_data.date}") - st.write(f"**Confidence:** {invoice_data.extraction_confidence:.1%}") - st.write(f"**Vector Indexed:** {'Yes' if vector_index else 'No'}") - else: - failed_count += 1 - with results_container: - st.warning(f"āš ļø Limited data extracted from {uploaded_file.name}") - - except Exception as e: - failed_count += 1 - with results_container: - st.error(f"āŒ Error processing {uploaded_file.name}: {str(e)[:100]}...") - - finally: - # Cleanup - try: - os.unlink(tmp_file_path) - except: - pass - - # Final summary - total_time = (datetime.now() - start_time).total_seconds() - status_text.success(f"āœ… Processing complete! 
{success_count}/{len(uploaded_files)} successful in {total_time:.1f}s") - - # Show completion notification - if notify_completion: - st.balloons() - - # Auto backup if enabled - if auto_backup and success_count > 0: - try: - backup_data = export_complete_system_data() - if backup_data: - st.download_button( - "šŸ“„ Download Auto-Backup", - backup_data, - f"auto_backup_{datetime.now().strftime('%Y%m%d_%H%M')}.json", - "application/json" - ) - except: - st.warning("Auto-backup failed") - - # ------------------------------------------------------------------------- - # TAB: AI CHAT - ENHANCED CONVERSATIONAL INTERFACE - # ------------------------------------------------------------------------- - - with tabs[2]: - st.markdown("## šŸ’¬ AI Chat Interface") - - # Chat configuration - col1, col2, col3 = st.columns([2, 1, 1]) - - with col1: - st.markdown("**Ask anything about your invoices using natural language**") - - with col2: - search_mode = st.selectbox("Search Mode", ["Hybrid", "Semantic Only", "SQL Only"]) - - with col3: - if st.button("šŸ—‘ļø Clear Chat", key="clear_chat_history"): - st.session_state.chat_history = [] - st.rerun() - - # Chat interface - chat_container = st.container() - - # Chat input - user_query = st.chat_input("Ask about your invoices... (e.g., 'Show me high-value technology purchases')") - - if user_query: - # Add user message - st.session_state.chat_history.append({"role": "user", "content": user_query, "timestamp": datetime.now()}) - - # Process query with enhanced AI - with st.spinner("šŸ¤– AI is analyzing your request..."): - try: - if search_mode == "Semantic Only": - # Pure vector search - vector_results = st.session_state.ultimate_processor.vector_store.semantic_search(user_query, 5) - bot_response = f"šŸ” **Semantic Search Results:**\n\n" - for i, result in enumerate(vector_results, 1): - bot_response += f"{i}. 
**{result.invoice_number}** - {result.supplier_name} (Score: {result.similarity_score:.3f})\n" - - elif search_mode == "SQL Only": - # Pure SQL search - sql_results = st.session_state.ultimate_chatbot.sql_search(user_query) - bot_response = f"šŸ“Š **Database Search Results:**\n\n" - for i, result in enumerate(sql_results, 1): - bot_response += f"{i}. **{result.get('invoice_number', 'N/A')}** - {result.get('supplier_name', 'N/A')} (₹{result.get('amount', 0):,.2f})\n" - - else: - # Hybrid search (default) - bot_response = st.session_state.ultimate_chatbot.query_database(user_query) - - # Add bot response - st.session_state.chat_history.append({ - "role": "assistant", - "content": bot_response, - "timestamp": datetime.now(), - "search_mode": search_mode - }) - - except Exception as e: - error_response = f"āŒ Sorry, I encountered an error: {str(e)[:100]}..." - st.session_state.chat_history.append({ - "role": "assistant", - "content": error_response, - "timestamp": datetime.now() - }) - - # Display chat history with enhanced formatting - with chat_container: - for i, message in enumerate(st.session_state.chat_history): - with st.chat_message(message["role"]): - st.markdown(message["content"]) - - # Show metadata for assistant messages - if message["role"] == "assistant" and message.get("search_mode"): - st.caption(f"šŸ”§ Mode: {message['search_mode']} • {message['timestamp'].strftime('%H:%M:%S')}") - - # Suggested queries with categories - if not st.session_state.chat_history: - st.markdown("### šŸ’” Try These Smart Queries") - - query_categories = st.tabs(["šŸ“Š Analytics", "šŸ” Search", "šŸ¤– AI Insights", "šŸ“ˆ Trends"]) - - with query_categories[0]: - analytics_queries = [ - "What's our total spending this year?", - "Which supplier do we pay the most?", - "Show me invoices over ₹50,000", - "How many invoices do we process monthly?" 
- ] - for i, query in enumerate(analytics_queries): - if st.button(query, key=f"analytics_query{i}"): - st.session_state.chat_history.append({"role": "user", "content": query, "timestamp": datetime.now()}) - st.rerun() - - with query_categories[1]: - search_queries = [ - "Find technology equipment purchases", - "Show me office supplies invoices", - "Search for consulting services", - "Find invoices from last quarter" - ] - for i, query in enumerate(search_queries): - if st.button(query, key=f"search_ai_query_{i}"): - st.session_state.chat_history.append({"role": "user", "content": query, "timestamp": datetime.now()}) - st.rerun() - - with query_categories[2]: - ai_queries = [ - "Analyze our spending patterns", - "Identify cost-saving opportunities", - "Compare supplier performance", - "Find unusual invoice patterns" - ] - for i, query in enumerate(ai_queries): - if st.button(query, key=f"chat_ai_query_{i}"): - - st.session_state.chat_history.append({"role": "user", "content": query, "timestamp": datetime.now()}) - st.rerun() - - with query_categories[3]: - trend_queries = [ - "Show spending trends over time", - "Which months have highest expenses?", - "How has our supplier diversity changed?", - "Predict next month's spending" - ] - for i, query in enumerate(trend_queries): - if st.button(query, key=f"trend_query_{i}"): - st.session_state.chat_history.append({"role": "user", "content": query, "timestamp": datetime.now()}) - st.rerun() - - # ------------------------------------------------------------------------- - # TAB: ANALYTICS - COMPREHENSIVE BUSINESS INTELLIGENCE - # ------------------------------------------------------------------------- - - with tabs[3]: - st.markdown("## šŸ“Š Advanced Analytics Dashboard") - - # Load data - try: - json_data = st.session_state.ultimate_chatbot.load_json_data() - invoices = json_data.get("invoices", []) - - if not invoices: - st.warning("šŸ“Š No invoice data available. 
Upload and process some invoices first!") - return - - # Convert to DataFrame - df_data = [] - for inv in invoices: - df_data.append({ - 'invoice_number': inv.get('invoice_number', ''), - 'supplier_name': inv.get('supplier_name', ''), - 'buyer_name': inv.get('buyer_name', ''), - 'amount': inv.get('amount', 0), - 'quantity': inv.get('quantity', 0), - 'date': inv.get('date', ''), - 'extraction_confidence': inv.get('extraction_info', {}).get('confidence', 0), - 'created_at': inv.get('timestamps', {}).get('created_at', ''), - 'product_description': inv.get('product_description', '') - }) - - df = pd.DataFrame(df_data) - - # KPI Dashboard - st.markdown("### šŸŽÆ Key Performance Indicators") - - kpi_col1, kpi_col2, kpi_col3, kpi_col4, kpi_col5 = st.columns(5) - - with kpi_col1: - total_invoices = len(df) - st.metric("šŸ“„ Total Invoices", f"{total_invoices:,}") - - with kpi_col2: - total_amount = df['amount'].sum() - avg_amount = df['amount'].mean() - st.metric("šŸ’° Total Value", f"₹{total_amount:,.0f}", f"Avg: ₹{avg_amount:,.0f}") - - with kpi_col3: - unique_suppliers = df['supplier_name'].nunique() - top_supplier_pct = (df['supplier_name'].value_counts().iloc[0] / len(df) * 100) if len(df) > 0 else 0 - st.metric("šŸ¢ Suppliers", f"{unique_suppliers:,}", f"Top: {top_supplier_pct:.1f}%") - - with kpi_col4: - avg_confidence = df['extraction_confidence'].mean() - high_confidence = (df['extraction_confidence'] > 0.8).sum() - st.metric("šŸŽÆ Avg Confidence", f"{avg_confidence:.1%}", f"High: {high_confidence}") - - with kpi_col5: - # Processing efficiency - success_rate = (df['invoice_number'].notna()).mean() - vector_indexed = len(st.session_state.ultimate_processor.vector_store.document_metadata) if st.session_state.ultimate_processor.vector_store else 0 - st.metric("⚔ Success Rate", f"{success_rate:.1%}", f"Indexed: {vector_indexed}") - - # Advanced Analytics Tabs - analytics_tabs = st.tabs([ - "šŸ“ˆ Trends", - "šŸ¢ Suppliers", - "šŸ’° Financial", - "šŸ” Quality", - 
"šŸ“Š Patterns" - ]) - - with analytics_tabs[0]: - st.markdown("#### šŸ“ˆ Spending Trends") - - # Time series analysis - if 'date' in df.columns and df['date'].notna().any(): - df['date_parsed'] = pd.to_datetime(df['date'], errors='coerce') - df_dated = df.dropna(subset=['date_parsed']) - - if len(df_dated) > 0: - # Monthly trends - monthly_data = df_dated.groupby(df_dated['date_parsed'].dt.to_period('M')).agg({ - 'amount': ['sum', 'mean', 'count'] - }).round(2) - - monthly_data.columns = ['Total Amount', 'Average Amount', 'Invoice Count'] - monthly_data.index = monthly_data.index.astype(str) - # Create trend charts - fig_trend = px.line( - x=monthly_data.index, - y=monthly_data['Total Amount'], - title="Monthly Spending Trend", - labels={'x': 'Month', 'y': 'Total Amount (₹)'} - ) - st.plotly_chart(fig_trend, use_container_width=True) - - # Show data table - st.dataframe(monthly_data, use_container_width=True) - else: - st.info("No valid dates found for trend analysis") - else: - st.info("Date information not available for trend analysis") - - with analytics_tabs[1]: - st.markdown("#### šŸ¢ Supplier Analysis") - - # Supplier performance metrics - supplier_analysis = df.groupby('supplier_name').agg({ - 'amount': ['sum', 'mean', 'count'], - 'extraction_confidence': 'mean' - }).round(2) - - supplier_analysis.columns = ['Total Spent', 'Avg Invoice', 'Invoice Count', 'Avg Confidence'] - supplier_analysis = supplier_analysis.sort_values('Total Spent', ascending=False) - - # Top suppliers visualization - top_suppliers = supplier_analysis.head(10) - - col1, col2 = st.columns(2) - - with col1: - fig_suppliers = px.bar( - x=top_suppliers['Total Spent'], - y=top_suppliers.index, - orientation='h', - title="Top 10 Suppliers by Total Spending", - labels={'x': 'Total Spent (₹)', 'y': 'Supplier'} - ) - st.plotly_chart(fig_suppliers, use_container_width=True) - - with col2: - fig_count = px.bar( - x=top_suppliers['Invoice Count'], - y=top_suppliers.index, - orientation='h', - 
title="Top 10 Suppliers by Invoice Volume", - labels={'x': 'Invoice Count', 'y': 'Supplier'} - ) - st.plotly_chart(fig_count, use_container_width=True) - - # Supplier performance table - st.markdown("**Supplier Performance Summary:**") - st.dataframe( - supplier_analysis.head(20), - column_config={ - "Total Spent": st.column_config.NumberColumn("Total Spent", format="₹%.2f"), - "Avg Invoice": st.column_config.NumberColumn("Avg Invoice", format="₹%.2f"), - "Avg Confidence": st.column_config.ProgressColumn("Avg Confidence", min_value=0, max_value=1), - } - ) - - with analytics_tabs[2]: - st.markdown("#### šŸ’° Financial Analysis") - - # Amount distribution - col1, col2 = st.columns(2) - - with col1: - # Amount histogram - fig_hist = px.histogram( - df, - x='amount', - nbins=30, - title="Invoice Amount Distribution", - labels={'x': 'Amount (₹)', 'y': 'Frequency'} - ) - st.plotly_chart(fig_hist, use_container_width=True) - - with col2: - # Box plot for amount ranges - fig_box = px.box( - df, - y='amount', - title="Invoice Amount Range Analysis", - labels={'y': 'Amount (₹)'} - ) - st.plotly_chart(fig_box, use_container_width=True) - - # Financial summary statistics - st.markdown("**Financial Statistics:**") - - financial_stats = { - "Total Value": f"₹{df['amount'].sum():,.2f}", - "Average Invoice": f"₹{df['amount'].mean():,.2f}", - "Median Invoice": f"₹{df['amount'].median():,.2f}", - "Largest Invoice": f"₹{df['amount'].max():,.2f}", - "Smallest Invoice": f"₹{df['amount'].min():,.2f}", - "Standard Deviation": f"₹{df['amount'].std():,.2f}" - } - - stat_cols = st.columns(3) - for i, (stat, value) in enumerate(financial_stats.items()): - with stat_cols[i % 3]: - st.metric(stat, value) - - # High-value invoice analysis - high_value_threshold = df['amount'].quantile(0.9) - high_value_invoices = df[df['amount'] >= high_value_threshold] - - if len(high_value_invoices) > 0: - st.markdown(f"**High-Value Invoices (Top 10% - Above ₹{high_value_threshold:,.2f}):**") - st.dataframe( 
- high_value_invoices[['invoice_number', 'supplier_name', 'amount', 'date']].sort_values('amount', ascending=False), - column_config={ - "amount": st.column_config.NumberColumn("Amount", format="₹%.2f") - } - ) - - with analytics_tabs[3]: - st.markdown("#### šŸ” Data Quality Analysis") - - # Data completeness analysis - completeness = {} - for col in ['invoice_number', 'supplier_name', 'buyer_name', 'amount', 'date']: - if col in df.columns: - completeness[col] = (df[col].notna() & (df[col] != '')).mean() - - # Quality metrics - col1, col2 = st.columns(2) - - with col1: - st.markdown("**Data Completeness:**") - for field, percentage in completeness.items(): - st.progress(percentage, text=f"{field}: {percentage:.1%}") - - with col2: - # Confidence distribution - fig_confidence = px.histogram( - df, - x='extraction_confidence', - nbins=20, - title="Extraction Confidence Distribution", - labels={'x': 'Confidence Score', 'y': 'Count'} - ) - st.plotly_chart(fig_confidence, use_container_width=True) - - # Quality issues identification - st.markdown("**Quality Issues:**") - - issues = [] - - # Missing data - missing_invoice_numbers = (df['invoice_number'].isna() | (df['invoice_number'] == '')).sum() - if missing_invoice_numbers > 0: - issues.append(f"āŒ {missing_invoice_numbers} invoices missing invoice numbers") - - missing_amounts = (df['amount'].isna() | (df['amount'] == 0)).sum() - if missing_amounts > 0: - issues.append(f"āŒ {missing_amounts} invoices missing amounts") - - low_confidence = (df['extraction_confidence'] < 0.5).sum() - if low_confidence > 0: - issues.append(f"āš ļø {low_confidence} invoices with low confidence (<50%)") - - # Display issues - if issues: - for issue in issues: - st.write(issue) - else: - st.success("āœ… No major quality issues detected!") - - with analytics_tabs[4]: - st.markdown("#### šŸ“Š Pattern Analysis") - - # Advanced pattern analysis - col1, col2 = st.columns(2) - - with col1: - # Amount vs Confidence correlation - if len(df) > 
1: - correlation = df['amount'].corr(df['extraction_confidence']) - - fig_scatter = px.scatter( - df, - x='extraction_confidence', - y='amount', - title=f"Amount vs Confidence (Correlation: {correlation:.2f})", - labels={'x': 'Extraction Confidence', 'y': 'Amount (₹)'} - ) - st.plotly_chart(fig_scatter, use_container_width=True) - - with col2: - # Supplier diversity over time - if 'date' in df.columns: - try: - df['date_parsed'] = pd.to_datetime(df['date'], errors='coerce') - df_dated = df.dropna(subset=['date_parsed']) - - if len(df_dated) > 0: - monthly_suppliers = df_dated.groupby(df_dated['date_parsed'].dt.to_period('M'))['supplier_name'].nunique() - - fig_diversity = px.line( - x=monthly_suppliers.index.astype(str), - y=monthly_suppliers.values, - title="Supplier Diversity Over Time", - labels={'x': 'Month', 'y': 'Unique Suppliers'} - ) - st.plotly_chart(fig_diversity, use_container_width=True) - except: - st.info("Could not analyze supplier diversity over time") - - # Pattern insights - st.markdown("**Pattern Insights:**") - - insights = [] - - # Most common amount ranges - amount_ranges = pd.cut(df['amount'], bins=5, labels=['Very Low', 'Low', 'Medium', 'High', 'Very High']) - most_common_range = amount_ranges.value_counts().index[0] - insights.append(f"šŸ“Š Most invoices fall in the '{most_common_range}' amount range") - - # Supplier concentration - top_supplier_share = (df['supplier_name'].value_counts().iloc[0] / len(df)) if len(df) > 0 else 0 - if top_supplier_share > 0.3: - insights.append(f"āš ļø High supplier concentration: Top supplier represents {top_supplier_share:.1%} of invoices") - else: - insights.append(f"āœ… Good supplier diversity: Top supplier represents {top_supplier_share:.1%} of invoices") - - # Confidence patterns - avg_confidence = df['extraction_confidence'].mean() - if avg_confidence > 0.8: - insights.append(f"āœ… High extraction quality: Average confidence is {avg_confidence:.1%}") - elif avg_confidence > 0.6: - 
insights.append(f"āš ļø Moderate extraction quality: Average confidence is {avg_confidence:.1%}") - else: - insights.append(f"āŒ Low extraction quality: Average confidence is {avg_confidence:.1%}") - - for insight in insights: - st.write(insight) - - except Exception as e: - st.error(f"Error in analytics: {e}") - - # ------------------------------------------------------------------------- - # TAB: SEARCH - ADVANCED SEMANTIC SEARCH - # ------------------------------------------------------------------------- - - with tabs[4]: - st.markdown("## šŸ” Advanced Semantic Search") - - vector_store = st.session_state.ultimate_processor.vector_store - - if not vector_store or not vector_store.document_metadata: - st.warning("šŸ” No documents in vector store. Please upload and process some invoices first.") - return - - # Search interface - st.markdown("### šŸŽÆ Natural Language Search") - - col1, col2, col3 = st.columns([3, 1, 1]) - - with col1: - search_query = st.text_input( - "Search Query:", - placeholder="e.g., expensive technology equipment, office supplies under 5000, consulting services from last quarter", - help="Use natural language to describe what you're looking for" - ) - - with col2: - top_k = st.number_input("Results", min_value=1, max_value=50, value=10) - - with col3: - similarity_threshold = st.slider("Min Similarity", 0.0, 1.0, 0.1, 0.05) - - # Advanced filters - with st.expander("šŸ”§ Advanced Filters"): - filter_col1, filter_col2, filter_col3 = st.columns(3) - - with filter_col1: - # Amount filter - amount_range = st.slider( - "Amount Range (₹):", - min_value=0, - max_value=100000, - value=(0, 100000), - step=1000 - ) - - with filter_col2: - # Date filter - date_filter = st.date_input( - "Date Range:", - value=None, - help="Filter by invoice date range" - ) - - with filter_col3: - # Supplier filter - all_suppliers = [meta.get('supplier_name', '') for meta in vector_store.document_metadata if meta.get('supplier_name')] - supplier_filter = 
st.multiselect( - "Suppliers:", - options=list(set(all_suppliers)), - help="Filter by specific suppliers" - ) - - # Perform search - if search_query: - with st.spinner("šŸ” Searching with AI..."): - try: - # Get search results - results = vector_store.semantic_search(search_query, top_k * 2) # Get more for filtering - - # Apply filters - filtered_results = [] - for result in results: - # Similarity filter - if result.similarity_score < similarity_threshold: - continue - - # Amount filter - amount = result.metadata.get('amount', 0) - if not (amount_range[0] <= amount <= amount_range[1]): - continue - - # Supplier filter - if supplier_filter and result.supplier_name not in supplier_filter: - continue - - # Date filter (if implemented) - # Add date filtering logic here if needed - - filtered_results.append(result) - - # Display results - if filtered_results: - st.success(f"šŸŽÆ Found {len(filtered_results)} matching documents") - - # Results summary - col1, col2, col3 = st.columns(3) - with col1: - avg_similarity = sum(r.similarity_score for r in filtered_results) / len(filtered_results) - st.metric("Avg Similarity", f"{avg_similarity:.3f}") - with col2: - total_value = sum(r.metadata.get('amount', 0) for r in filtered_results) - st.metric("Total Value", f"₹{total_value:,.2f}") - with col3: - unique_suppliers = len(set(r.supplier_name for r in filtered_results)) - st.metric("Unique Suppliers", unique_suppliers) - - # Display individual results - for i, result in enumerate(filtered_results[:top_k], 1): - with st.expander( - f"{i}. 
{result.invoice_number} - {result.supplier_name} " - f"(Similarity: {result.similarity_score:.3f})", - expanded=i <= 3 - ): - col1, col2 = st.columns(2) - - with col1: - st.write(f"**Invoice Number:** {result.invoice_number}") - st.write(f"**Supplier:** {result.supplier_name}") - st.write(f"**Amount:** ₹{result.metadata.get('amount', 0):,.2f}") - st.write(f"**Date:** {result.metadata.get('date', 'N/A')}") - - with col2: - st.write(f"**Similarity Score:** {result.similarity_score:.4f}") - st.write(f"**File:** {result.metadata.get('file_name', 'N/A')}") - st.write(f"**Confidence:** {result.metadata.get('extraction_confidence', 0):.1%}") - st.write(f"**Indexed:** {result.metadata.get('timestamp', 'N/A')[:19]}") - - st.write("**Content Preview:**") - st.text_area( - "Document content:", - value=result.content_preview, - height=80, - key=f"content_{i}", - disabled=True - ) - - # Export results - if st.button("šŸ“„ Export Search Results"): - export_data = [] - for result in filtered_results[:top_k]: - export_data.append({ - 'search_query': search_query, - 'invoice_number': result.invoice_number, - 'supplier_name': result.supplier_name, - 'similarity_score': result.similarity_score, - 'amount': result.metadata.get('amount', 0), - 'date': result.metadata.get('date', ''), - 'file_name': result.metadata.get('file_name', ''), - 'content_preview': result.content_preview - }) - - export_df = pd.DataFrame(export_data) - csv_data = export_df.to_csv(index=False) - - st.download_button( - "šŸ“„ Download CSV", - csv_data, - f"search_results_{datetime.now().strftime('%Y%m%d_%H%M')}.csv", - "text/csv" - ) - - else: - st.warning("šŸ” No results found matching your criteria. 
Try:") - st.write("• Broadening your search terms") - st.write("• Lowering the similarity threshold") - st.write("• Removing some filters") - st.write("• Using different keywords") - - except Exception as e: - st.error(f"Search error: {e}") - - # Search suggestions and examples - st.markdown("### šŸ’” Search Examples & Tips") - - example_tabs = st.tabs(["šŸŽÆ By Content", "šŸ’° By Amount", "šŸ¢ By Supplier", "šŸ“… By Time"]) - - with example_tabs[0]: - content_examples = [ - "office supplies and stationery items", - "technology equipment and software", - "consulting and professional services", - "travel and transportation expenses", - "maintenance and repair services" - ] - st.write("**Search by product/service type:**") - for i, example in enumerate(content_examples): - if st.button(f"šŸ” {example}", key=f"content_example_{i}"): - st.text_input("Search Query:", value=example, key="auto_fill_content") - - with example_tabs[1]: - amount_examples = [ - "high value purchases over 50000", - "small expenses under 5000", - "medium range invoices between 10000 and 30000", - "expensive equipment purchases" - ] - st.write("**Search by amount range:**") - for i, example in enumerate(amount_examples): - if st.button(f"šŸ’° {example}", key=f"amount_example_{i}"): - st.text_input("Search Query:", value=example, key="auto_fill_amount") - - with example_tabs[2]: - supplier_examples = [ - "invoices from technology vendors", - "services from consulting companies", - "purchases from office supply stores", - "payments to maintenance contractors" - ] - st.write("**Search by supplier type:**") - for i, example in enumerate(supplier_examples): - if st.button(f"šŸ¢ {example}", key=f"supplier_example_{i}"): - st.text_input("Search Query:", value=example, key="auto_fill_supplier") - - with example_tabs[3]: - time_examples = [ - "recent invoices from this month", - "quarterly expenses and spending", - "annual contract payments", - "recurring monthly services" - ] - st.write("**Search by 
time period:**") - for i, example in enumerate(time_examples): - if st.button(f"šŸ“… {example}", key=f"time_example_{i}"): - st.text_input("Search Query:", value=example, key="auto_fill_time") - - # ------------------------------------------------------------------------- - # TAB: DATA - COMPREHENSIVE DATA MANAGEMENT - # ------------------------------------------------------------------------- - - with tabs[5]: - st.markdown("## šŸ“‹ Data Management & Export") - - # Data overview - try: - json_data = st.session_state.ultimate_chatbot.load_json_data() - invoices = json_data.get("invoices", []) - - if not invoices: - st.warning("šŸ“Š No invoice data available.") - return - - # Convert to DataFrame for display - df_data = [] - for inv in invoices: - df_data.append({ - 'ID': inv.get('id', ''), - 'Invoice Number': inv.get('invoice_number', ''), - 'Supplier': inv.get('supplier_name', ''), - 'Buyer': inv.get('buyer_name', ''), - 'Date': inv.get('date', ''), - 'Amount': inv.get('amount', 0), - 'Quantity': inv.get('quantity', 0), - 'Description': inv.get('product_description', ''), - 'Confidence': inv.get('extraction_info', {}).get('confidence', 0), - 'Method': inv.get('extraction_info', {}).get('extraction_method', ''), - 'File Type': inv.get('file_info', {}).get('file_type', ''), - 'File Size': inv.get('file_info', {}).get('file_size', 0), - 'Vector Indexed': 'Yes' if any(meta.get('invoice_number') == inv.get('invoice_number') - for meta in st.session_state.ultimate_processor.vector_store.document_metadata) else 'No', - 'Created': inv.get('timestamps', {}).get('created_at', '')[:19] - }) - - df = pd.DataFrame(df_data) - - # Data summary - st.markdown("### šŸ“Š Data Summary") - - summary_col1, summary_col2, summary_col3, summary_col4 = st.columns(4) - - with summary_col1: - st.metric("Total Records", len(df)) - st.metric("Complete Records", (df['Invoice Number'].notna() & (df['Invoice Number'] != '')).sum()) - - with summary_col2: - st.metric("Total Value", 
f"₹{df['Amount'].sum():,.2f}") - st.metric("Avg Value", f"₹{df['Amount'].mean():,.2f}") - - with summary_col3: - st.metric("Unique Suppliers", df['Supplier'].nunique()) - st.metric("Unique Buyers", df['Buyer'].nunique()) - - with summary_col4: - st.metric("Avg Confidence", f"{df['Confidence'].mean():.1%}") - st.metric("Vector Indexed", (df['Vector Indexed'] == 'Yes').sum()) - - # Filtering interface - st.markdown("### šŸ” Filter & View Data") - - filter_col1, filter_col2, filter_col3, filter_col4, filter_col5 = st.columns(5) - - with filter_col1: - suppliers = ['All'] + sorted(df['Supplier'].dropna().unique().tolist()) - selected_supplier = st.selectbox("Supplier", suppliers) - - with filter_col2: - buyers = ['All'] + sorted(df['Buyer'].dropna().unique().tolist()) - selected_buyer = st.selectbox("Buyer", buyers) - - with filter_col3: - methods = ['All'] + sorted(df['Method'].dropna().unique().tolist()) - selected_method = st.selectbox("Method", methods) - - with filter_col4: - confidence_options = ["All", "High (>80%)", "Medium (50-80%)", "Low (<50%)"] - confidence_filter = st.selectbox("Confidence", confidence_options) - - with filter_col5: - vector_options = ["All", "Yes", "No"] - vector_filter = st.selectbox("Vector Indexed", vector_options) - - # Apply filters - filtered_df = df.copy() - - if selected_supplier != 'All': - filtered_df = filtered_df[filtered_df['Supplier'] == selected_supplier] - if selected_buyer != 'All': - filtered_df = filtered_df[filtered_df['Buyer'] == selected_buyer] - if selected_method != 'All': - filtered_df = filtered_df[filtered_df['Method'] == selected_method] - if vector_filter != 'All': - filtered_df = filtered_df[filtered_df['Vector Indexed'] == vector_filter] - - if confidence_filter == "High (>80%)": - filtered_df = filtered_df[filtered_df['Confidence'] > 0.8] - elif confidence_filter == "Medium (50-80%)": - filtered_df = filtered_df[(filtered_df['Confidence'] >= 0.5) & (filtered_df['Confidence'] <= 0.8)] - elif 
confidence_filter == "Low (<50%)": - filtered_df = filtered_df[filtered_df['Confidence'] < 0.5] - - # Display filtered data - if len(filtered_df) != len(df): - st.info(f"Showing {len(filtered_df)} of {len(df)} records") - - # Data table with enhanced configuration - st.dataframe( - filtered_df, - use_container_width=True, - column_config={ - "Amount": st.column_config.NumberColumn("Amount", format="₹%.2f"), - "Confidence": st.column_config.ProgressColumn("Confidence", min_value=0, max_value=1), - "File Size": st.column_config.NumberColumn("File Size", format="%d bytes"), - "Vector Indexed": st.column_config.SelectboxColumn("Vector Indexed", options=["Yes", "No"]), - }, - height=400 - ) - - # Export options - st.markdown("### šŸ“„ Export Options") - - export_col1, export_col2, export_col3, export_col4 = st.columns(4) - - with export_col1: - # CSV Export - csv_data = filtered_df.to_csv(index=False) - st.download_button( - "šŸ“Š Export CSV", - csv_data, - f"invoice_data_{datetime.now().strftime('%Y%m%d_%H%M')}.csv", - "text/csv", - use_container_width=True - ) - - with export_col2: - # JSON Export - filtered_invoices = [inv for inv in invoices if inv.get('id') in filtered_df['ID'].values] - filtered_json = { - "metadata": json_data.get("metadata", {}), - "invoices": filtered_invoices, - "export_info": { - "exported_at": datetime.now().isoformat(), - "total_records": len(filtered_invoices), - "filters_applied": { - "supplier": selected_supplier if selected_supplier != 'All' else None, - "buyer": selected_buyer if selected_buyer != 'All' else None, - "method": selected_method if selected_method != 'All' else None, - "confidence": confidence_filter if confidence_filter != 'All' else None, - "vector_indexed": vector_filter if vector_filter != 'All' else None - } - } - } - - st.download_button( - "šŸ“„ Export JSON", - json.dumps(filtered_json, indent=2, ensure_ascii=False), - f"invoice_data_{datetime.now().strftime('%Y%m%d_%H%M')}.json", - "application/json", - 
use_container_width=True - ) - - with export_col3: - # Excel Export (if openpyxl available) - try: - import io - excel_buffer = io.BytesIO() - with pd.ExcelWriter(excel_buffer, engine='openpyxl') as writer: - filtered_df.to_excel(writer, sheet_name='Invoice Data', index=False) - - # Add summary sheet - summary_data = { - 'Metric': ['Total Records', 'Total Value', 'Unique Suppliers', 'Avg Confidence'], - 'Value': [len(filtered_df), f"₹{filtered_df['Amount'].sum():,.2f}", - filtered_df['Supplier'].nunique(), f"{filtered_df['Confidence'].mean():.1%}"] - } - pd.DataFrame(summary_data).to_excel(writer, sheet_name='Summary', index=False) - - st.download_button( - "šŸ“ˆ Export Excel", - excel_buffer.getvalue(), - f"invoice_data_{datetime.now().strftime('%Y%m%d_%H%M')}.xlsx", - "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet", - use_container_width=True - ) - except ImportError: - st.button("šŸ“ˆ Excel Export", disabled=True, help="Install openpyxl for Excel export", use_container_width=True) - - with export_col4: - # Vector Metadata Export - if st.session_state.ultimate_processor.vector_store: - vector_metadata = [meta for meta in st.session_state.ultimate_processor.vector_store.document_metadata - if any(inv.get('invoice_number') == meta.get('invoice_number') for inv in filtered_invoices)] - - st.download_button( - "šŸ” Export Vector Metadata", - json.dumps(vector_metadata, indent=2, ensure_ascii=False), - f"vector_metadata_{datetime.now().strftime('%Y%m%d_%H%M')}.json", - "application/json", - use_container_width=True - ) - # Data management actions - st.markdown("### šŸ› ļø Data Management Actions") - - action_col1, action_col2, action_col3, action_col4 = st.columns(4) - - with action_col1: - if st.button("šŸ”„ Refresh Data", use_container_width=True, key="refresh_data_main"): - st.rerun() - - with action_col2: - if st.button("🧹 Clean Duplicates", use_container_width=True, key="clean_duplicates_main"): - # Find and remove duplicates based on 
invoice number - duplicates = df[df.duplicated('Invoice Number', keep=False)] - if len(duplicates) > 0: - st.warning(f"Found {len(duplicates)} duplicate records") - # Show duplicates for review - st.dataframe(duplicates[['Invoice Number', 'Supplier', 'Amount', 'Created']]) - else: - st.success("No duplicates found!") - - with action_col3: - if st.button("šŸ“Š Validate Data", use_container_width=True,key="validate_data_main"): - # Data validation - validation_results = [] - - # Check for missing critical fields - missing_invoice_numbers = (df['Invoice Number'].isna() | (df['Invoice Number'] == '')).sum() - if missing_invoice_numbers > 0: - validation_results.append(f"āŒ {missing_invoice_numbers} records missing invoice numbers") - - missing_amounts = (df['Amount'].isna() | (df['Amount'] == 0)).sum() - if missing_amounts > 0: - validation_results.append(f"āŒ {missing_amounts} records missing amounts") - - low_confidence = (df['Confidence'] < 0.5).sum() - if low_confidence > 0: - validation_results.append(f"āš ļø {low_confidence} records with low confidence") - - # Check for unusual patterns - very_high_amounts = (df['Amount'] > df['Amount'].quantile(0.99)).sum() - if very_high_amounts > 0: - validation_results.append(f"šŸ” {very_high_amounts} unusually high amounts detected") - - # Display results - if validation_results: - for result in validation_results: - st.write(result) - else: - st.success("āœ… All validations passed!") - - with action_col4: - if st.button("āš ļø Clear All Data", use_container_width=True,key="clear_all_data_main"): - # Confirmation dialog - if st.button("šŸ—‘ļø Confirm Delete All", type="secondary"): - try: - # Clear database - conn = sqlite3.connect(st.session_state.ultimate_processor.db_path) - cursor = conn.cursor() - cursor.execute("DELETE FROM invoices") - cursor.execute("DELETE FROM file_processing_log") - cursor.execute("DELETE FROM processing_summary") - conn.commit() - conn.close() - - # Reset JSON file - 
st.session_state.ultimate_processor.setup_json_storage() - - # Clear vector store - vector_store = st.session_state.ultimate_processor.vector_store - if vector_store: - vector_store.vector_store = faiss.IndexFlatIP(vector_store.embedding_dimension) - vector_store.document_metadata = [] - vector_store.save_vector_store() - - st.success("āœ… All data cleared successfully!") - st.rerun() - except Exception as e: - st.error(f"Error clearing data: {e}") - - except Exception as e: - st.error(f"Error in data management: {e}") - # ------------------------------------------------------------------------- - # TAB: ADMIN - SYSTEM ADMINISTRATION - # ------------------------------------------------------------------------- - - with tabs[6]: - st.markdown("## šŸŽ›ļø System Administration") - - # Admin tabs - admin_tabs = st.tabs([ - "šŸ„ Health Monitor", - "⚔ Performance", - "šŸ’¾ Backup & Restore", - "šŸš€ Deployment", - "šŸ”§ Settings", - "šŸ“Š Logs" - ]) - - with admin_tabs[0]: - create_system_health_dashboard() - - with admin_tabs[1]: - performance_benchmark() - - with admin_tabs[2]: - create_backup_system() - - with admin_tabs[3]: - create_deployment_guide() - - with admin_tabs[4]: - st.markdown("### āš™ļø System Settings") - - # Model settings - st.markdown("#### šŸ¤– AI Model Configuration") - - current_embedding_model = st.session_state.ultimate_processor.vector_store.embedding_model_name - - new_embedding_model = st.selectbox( - "Embedding Model:", - [ - "all-MiniLM-L6-v2", - "all-mpnet-base-v2", - "multi-qa-mpnet-base-dot-v1", - "all-distilroberta-v1", - "paraphrase-multilingual-mpnet-base-v2" - ], - index=0 if current_embedding_model == "all-MiniLM-L6-v2" else 0 - ) - - if st.button("šŸ”„ Update Embedding Model"): - if new_embedding_model != current_embedding_model: - with st.spinner(f"Switching to {new_embedding_model}..."): - try: - # Create new vector store - new_vector_store = InvoiceVectorStore(embedding_model=new_embedding_model) - - # Rebuild with existing 
data - json_data = st.session_state.ultimate_chatbot.load_json_data() - if new_vector_store.rebuild_vector_store(json_data): - st.session_state.ultimate_processor.vector_store = new_vector_store - st.session_state.ultimate_chatbot.vector_store = new_vector_store - st.success(f"āœ… Switched to {new_embedding_model}") - st.rerun() - else: - st.error("Failed to rebuild vector store") - except Exception as e: - st.error(f"Error switching model: {e}") - - # Processing settings - st.markdown("#### āš™ļø Processing Configuration") - - col1, col2 = st.columns(2) - - with col1: - batch_size = st.number_input("Batch Size", min_value=1, max_value=100, value=10) - max_text_length = st.number_input("Max Text Length", min_value=1000, max_value=20000, value=5000) - - with col2: - similarity_threshold = st.slider("Default Similarity Threshold", 0.0, 1.0, 0.1, 0.05) - auto_backup_enabled = st.checkbox("Enable Auto Backup", value=False) - - if st.button("šŸ’¾ Save Settings", key="save_settings_user"): - settings = { - "batch_size": batch_size, - "max_text_length": max_text_length, - "similarity_threshold": similarity_threshold, - "auto_backup_enabled": auto_backup_enabled, - "embedding_model": new_embedding_model, - "updated_at": datetime.now().isoformat() - } - - # Save to session state and file - st.session_state.system_settings = settings - - try: - with open("system_settings.json", "w") as f: - json.dump(settings, f, indent=2) - st.success("āœ… Settings saved successfully!") - except Exception as e: - st.warning(f"Settings saved to session but file save failed: {e}") - - with admin_tabs[5]: - st.markdown("### šŸ“Š System Logs") - - # Log viewer - log_type = st.selectbox("Log Type", ["Application", "Processing", "Errors", "Performance"]) - - # Simulated logs (in real implementation, read from log files) - if log_type == "Application": - logs = [ - f"{datetime.now().strftime('%Y-%m-%d %H:%M:%S')} - INFO - System initialized successfully", - f"{datetime.now().strftime('%Y-%m-%d 
%H:%M:%S')} - INFO - Vector store loaded with 150 documents", - f"{datetime.now().strftime('%Y-%m-%d %H:%M:%S')} - INFO - User query processed: 'show me all invoices'", - f"{datetime.now().strftime('%Y-%m-%d %H:%M:%S')} - INFO - Analytics dashboard accessed", - ] - elif log_type == "Processing": - logs = [ - f"{datetime.now().strftime('%Y-%m-%d %H:%M:%S')} - INFO - Processing invoice_001.pdf", - f"{datetime.now().strftime('%Y-%m-%d %H:%M:%S')} - SUCCESS - Extracted data from invoice_001.pdf", - f"{datetime.now().strftime('%Y-%m-%d %H:%M:%S')} - INFO - Added to vector store", - f"{datetime.now().strftime('%Y-%m-%d %H:%M:%S')} - INFO - Processing complete", - ] - elif log_type == "Errors": - logs = [ - f"{datetime.now().strftime('%Y-%m-%d %H:%M:%S')} - ERROR - Failed to process corrupt_file.pdf", - f"{datetime.now().strftime('%Y-%m-%d %H:%M:%S')} - WARNING - Low extraction confidence for invoice_045.pdf", - f"{datetime.now().strftime('%Y-%m-%d %H:%M:%S')} - ERROR - Ollama connection timeout", - ] - else: # Performance - logs = [ - f"{datetime.now().strftime('%Y-%m-%d %H:%M:%S')} - PERF - Query response time: 1.2s", - f"{datetime.now().strftime('%Y-%m-%d %H:%M:%S')} - PERF - Vector search: 0.8s", - f"{datetime.now().strftime('%Y-%m-%d %H:%M:%S')} - PERF - Memory usage: 75%", - ] - - # Display logs - log_container = st.container() - with log_container: - for log in logs[-50:]: # Show last 50 entries - if "ERROR" in log: - st.error(log) - elif "WARNING" in log: - st.warning(log) - elif "SUCCESS" in log: - st.success(log) - else: - st.info(log) - - # Log controls - col1, col2, col3 = st.columns(3) - - with col1: - if st.button("šŸ”„ Refresh Logs"): - st.rerun() - - with col2: - if st.button("šŸ“„ Export Logs"): - log_data = "\n".join(logs) - st.download_button( - "Download Log File", - log_data, - f"{log_type.lower()}_logs_{datetime.now().strftime('%Y%m%d_%H%M')}.txt", - "text/plain" - ) - - with col3: - if st.button("🧹 Clear Logs"): - st.info("Logs cleared (simulated)") 
- - # ------------------------------------------------------------------------- - # OPTIONAL: DASHBOARD TAB - # ------------------------------------------------------------------------- - - if st.session_state.get('show_dashboard', False) and len(tabs) > 6: - with tabs[7]: - create_system_dashboard() - - # ------------------------------------------------------------------------- - # FOOTER - # ------------------------------------------------------------------------- - - st.markdown("---") - st.markdown(""" -
-

šŸš€ Ultimate Invoice Processing System - Powered by AI & Vector Search

-

Built with ā¤ļø using Streamlit, spaCy, FAISS, and Ollama

-
- """, unsafe_allow_html=True) - - -# =============================================================================== -# MAIN ENTRY POINT AND CONFIGURATION -# =============================================================================== - -def main(): - """Main entry point - choose between original or ultimate version""" - - # Check if we should run ultimate version - if st.query_params.get("version") == "ultimate" or st.session_state.get("use_ultimate", False): - ultimate_enhanced_main() - else: - # Show version selector - st.title("šŸ“„ Invoice Processing System") - - col1, col2 = st.columns(2) - - with col1: - st.markdown(""" - ### šŸ”§ Standard Version - - Basic invoice processing - - Simple analytics - - Regular features - """) - if st.button("Use Standard Version", use_container_width=True): - enhanced_main() # From previous parts - - with col2: - st.markdown(""" - ### šŸš€ Ultimate Version - - AI-powered extraction - - Semantic search - - Advanced analytics - - Production features - """) - if st.button("Use Ultimate Version", use_container_width=True, type="primary"): - st.session_state.use_ultimate = True - st.rerun() - - -# =============================================================================== -# UTILITY FUNCTIONS AND HELPERS -# =============================================================================== - -def initialize_system(): - """Initialize the complete system with all components""" - try: - # Check dependencies - requirements_met, issues = validate_system_requirements() - if not requirements_met: - st.error("System requirements not met:") - for issue in issues: - st.write(issue) - return False - - # Initialize logging - logger = setup_advanced_logging() - logger.info("System initialization started") - - # Load configuration - try: - with open("system_settings.json", "r") as f: - settings = json.load(f) - st.session_state.system_settings = settings - except FileNotFoundError: - # Use default settings - st.session_state.system_settings 
= { - "batch_size": 10, - "max_text_length": 5000, - "similarity_threshold": 0.1, - "embedding_model": "all-MiniLM-L6-v2" - } - - logger.info("System initialization completed successfully") - return True - - except Exception as e: - st.error(f"System initialization failed: {e}") - return False - -def create_installation_guide(): - """Create comprehensive installation guide""" - st.markdown(""" - ## šŸ› ļø Installation Guide - - ### Quick Start (5 minutes) - - ```bash - # 1. Clone the repository - git clone https://github.com/your-repo/enhanced-invoice-system - cd enhanced-invoice-system - - # 2. Install Python dependencies - pip install -r requirements.txt - - # 3. Install and start Ollama - curl -fsSL https://ollama.com/install.sh | sh - ollama serve - ollama pull mistral:7b - - # 4. Run the application - streamlit run enhanced_main.py - ``` - - ### System Requirements - - Python 3.8+ - - 8GB RAM (16GB recommended) - - 10GB disk space - - Internet connection (for initial setup) - - ### Optional GPU Setup - ```bash - # For GPU acceleration (NVIDIA) - pip install torch torchvision torchaudio --index-url https://download.pytorch.org/whl/cu118 - pip install faiss-gpu - ``` - - ### Docker Setup - ```bash - # Build and run with Docker - docker-compose up --build - ``` - - ### Troubleshooting - - Ensure Ollama is running: `curl http://localhost:11434/api/tags` - - Check Python version: `python --version` - - Verify dependencies: `pip list` - """) - -def generate_api_endpoints(): - """Generate FastAPI endpoints for the system""" - - api_code = ''' -from fastapi import FastAPI, UploadFile, File, HTTPException -from pydantic import BaseModel -from typing import List, Optional -import uvicorn - -app = FastAPI(title="Enhanced Invoice Processing API", version="1.0.0") - -# Initialize the invoice processor -processor = EnhancedInvoiceProcessor() - -class SearchRequest(BaseModel): - query: str - top_k: int = 5 - similarity_threshold: float = 0.1 - -class 
SearchResponse(BaseModel): - results: List[dict] - total_found: int - query_time: float - -@app.post("/api/upload", response_model=dict) -async def upload_invoice(file: UploadFile = File(...)): - """Upload and process an invoice""" - try: - # Save uploaded file - file_content = await file.read() - - # Process with the enhanced processor - result = processor.process_file(file.filename, len(file_content)) - - return { - "success": True, - "invoice_data": result.__dict__, - "message": "Invoice processed successfully" - } - except Exception as e: - raise HTTPException(status_code=500, detail=str(e)) - -@app.post("/api/search", response_model=SearchResponse) -async def semantic_search(request: SearchRequest): - """Perform semantic search on invoices""" - try: - import time - start_time = time.time() - - # Perform search - results = processor.vector_store.semantic_search( - request.query, - request.top_k - ) - - query_time = time.time() - start_time - - return SearchResponse( - results=[r.__dict__ for r in results], - total_found=len(results), - query_time=query_time - ) - except Exception as e: - raise HTTPException(status_code=500, detail=str(e)) - -@app.get("/api/analytics/summary") -async def get_analytics_summary(): - """Get system analytics summary""" - try: - json_data = processor.load_json_data() - summary = json_data.get("summary", {}) - - return { - "total_invoices": len(json_data.get("invoices", [])), - "total_amount": summary.get("total_amount", 0), - "unique_suppliers": len(summary.get("unique_suppliers", [])), - "processing_stats": summary.get("processing_stats", {}) - } - except Exception as e: - raise HTTPException(status_code=500, detail=str(e)) - -@app.get("/api/health") -async def health_check(): - """System health check""" - return { - "status": "healthy", - "timestamp": datetime.now().isoformat(), - "components": { - "vector_store": "ok" if processor.vector_store else "error", - "database": "ok" if os.path.exists(processor.db_path) else "error", - 
"embedding_model": "ok" if processor.vector_store.embedding_model else "error" - } - } - -if __name__ == "__main__": - uvicorn.run(app, host="0.0.0.0", port=8000) -''' - - return api_code - -def create_readme_documentation(): - """Generate comprehensive README.md content""" - - readme_content = f''' -# šŸš€ Enhanced Invoice Processing System - -A comprehensive AI-powered invoice processing system with semantic search capabilities, advanced analytics, and production-ready features. - -## ✨ Features - -- **šŸ¤– AI-Powered Extraction**: Uses advanced NLP models for accurate data extraction -- **šŸ” Semantic Search**: Natural language search with vector similarity -- **šŸ“Š Advanced Analytics**: Comprehensive business intelligence dashboards -- **šŸ’¾ Hybrid Storage**: SQLite + JSON + Vector embeddings -- **šŸŽ›ļø Admin Interface**: Complete system management tools -- **šŸš€ Production Ready**: Docker support, monitoring, backup/restore - -## šŸ—ļø Architecture - -``` -ā”Œā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā” ā”Œā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā” ā”Œā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā” -│ PDF Upload │───▶│ AI Extraction │───▶│ Data Storage │ -ā””ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”˜ ā””ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”˜ ā””ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”˜ - │ -ā”Œā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā” ā”Œā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā” ā”Œā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā” -│ Search & Chat │◀───│ Vector Embeddings│◀───│ JSON + SQLite │ -ā””ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”˜ ā””ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”˜ ā””ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”˜ -``` - -## šŸš€ Quick Start - -### Option 1: Local Installation - -1. 
**Install Dependencies** - ```bash - git clone https://github.com/your-repo/enhanced-invoice-system - cd enhanced-invoice-system - pip install -r requirements.txt - ``` - -2. **Setup Ollama** - ```bash - curl -fsSL https://ollama.com/install.sh | sh - ollama serve - ollama pull mistral:7b - ``` - -3. **Run the Application** - ```bash - streamlit run enhanced_main.py - ``` - -### Option 2: Docker - -```bash -docker-compose up --build -``` - -## šŸ“Š Usage Examples - -### Basic Upload and Processing -```python -from enhanced_invoice_pipeline import EnhancedInvoiceProcessor - -# Initialize processor -processor = EnhancedInvoiceProcessor() - -# Process invoice -result = processor.process_file("invoice.pdf") -print(f"Extracted: {{result.invoice_number}}") -``` - -### Semantic Search -```python -# Search invoices -results = processor.vector_store.semantic_search( - "high value technology purchases", - top_k=5 -) - -for result in results: - print(f"Found: {{result.invoice_number}} (Score: {{result.similarity_score}})") -``` - -### Analytics Query -```python -# Get comprehensive summary -chatbot = EnhancedInvoiceChatBot() -response = chatbot.query_database("Show me spending trends over time") -print(response) -``` - -## šŸ› ļø Configuration - -### Environment Variables -```bash -export OLLAMA_HOST=localhost:11434 -export EMBEDDING_MODEL=all-MiniLM-L6-v2 -export VECTOR_STORE_PATH=./data/vectors -export DATABASE_PATH=./data/invoices.db -``` - -### Custom Settings -```json -{{ - "batch_size": 10, - "max_text_length": 5000, - "similarity_threshold": 0.1, - "auto_backup_enabled": true -}} -``` - -## šŸ“” API Reference - -### REST Endpoints - -- `POST /api/upload` - Upload and process invoice -- `POST /api/search` - Semantic search -- `GET /api/analytics/summary` - Get analytics summary -- `GET /api/health` - Health check - -### Python API - -```python -# Core classes -from enhanced_invoice_pipeline import ( - EnhancedInvoiceProcessor, - InvoiceVectorStore, - 
EnhancedInvoiceChatBot -) - -# Initialize components -processor = EnhancedInvoiceProcessor() -vector_store = InvoiceVectorStore() -chatbot = EnhancedInvoiceChatBot() -``` - -## šŸ”§ Advanced Features - -### Custom NER Models -Train your own spaCy NER model for domain-specific extraction: - -```python -# Training data format -TRAINING_DATA = [ - ("Invoice Number: INV-2024-001", {{"entities": [(16, 27, "INVOICE_NUMBER")]}}), - ("Total Amount: $1,250.00", {{"entities": [(14, 23, "TOTAL_AMOUNT")]}}), -] - -# Train and use custom model -nlp = train_custom_ner_model(TRAINING_DATA) -processor = EnhancedInvoiceProcessor(ner_model_path="./custom_model") -``` - -### Vector Store Customization -```python -# Use different embedding models -vector_store = InvoiceVectorStore( - embedding_model="all-mpnet-base-v2" # Higher quality -) - -# Custom similarity search -results = vector_store.semantic_search( - query="office supplies", - top_k=10, - similarity_threshold=0.3 -) -``` - -## šŸ“Š Monitoring & Analytics - -### Built-in Dashboards -- **šŸ“ˆ Processing Analytics**: Success rates, processing times -- **šŸ’° Financial Analytics**: Spending trends, supplier analysis -- **šŸ” Search Analytics**: Query patterns, result quality -- **āš™ļø System Health**: Resource usage, component status - -### Custom Metrics -```python -# Get system statistics -stats = processor.get_system_stats() -print(f"Total processed: {{stats['total_invoices']}}") -print(f"Success rate: {{stats['success_rate']}}") -``` - -## šŸ”’ Security & Compliance - -### Data Protection -- Encrypted storage options -- Access control and authentication -- Audit logging -- GDPR compliance features - -### Deployment Security -```yaml -# docker-compose.yml security settings -services: - invoice-app: - environment: - - SECURE_MODE=true - - SSL_CERT_PATH=/certs/cert.pem - - SSL_KEY_PATH=/certs/key.pem -``` - -## šŸš€ Production Deployment - -### Performance Optimization -- GPU acceleration support -- Batch processing 
optimization -- Caching strategies -- Load balancing ready - -### Scaling Options -- Horizontal scaling with container orchestration -- Database clustering -- Distributed vector storage -- Microservices architecture - -## šŸ¤ Contributing - -1. Fork the repository -2. Create feature branch (`git checkout -b feature/amazing-feature`) -3. Commit changes (`git commit -m 'Add amazing feature'`) -4. Push to branch (`git push origin feature/amazing-feature`) -5. Open a Pull Request - -## šŸ“„ License - -This project is licensed under the MIT License - see the [LICENSE](LICENSE) file for details. - -## šŸ™ Acknowledgments - -- [spaCy](https://spacy.io/) for NLP capabilities -- [Sentence Transformers](https://www.sbert.net/) for embeddings -- [FAISS](https://faiss.ai/) for vector similarity search -- [Streamlit](https://streamlit.io/) for the web interface -- [Ollama](https://ollama.ai/) for local LLM support - -## šŸ“ž Support - -- šŸ“§ Email: support@your-company.com -- šŸ’¬ Discord: [Your Discord Server] -- šŸ“– Documentation: [Full Documentation] -- šŸ› Issues: [GitHub Issues] - ---- - -**Made with ā¤ļø for the AI community** -''' - - return readme_content - -# =============================================================================== -# FINAL SYSTEM INTEGRATION AND STARTUP -# =============================================================================== - -if __name__ == "__main__": - # Initialize system - if initialize_system(): - # Run main application - main() - else: - st.error("āŒ System initialization failed. Please check the logs and try again.") +#!/usr/bin/env python3 +""" +Enhanced Invoice Processing & Analysis System - Hugging Face Spaces Compatible +A comprehensive system with AI-powered extraction, semantic search, and analytics. 
+ +Author: AI Assistant +Date: 2024 +Version: HuggingFace v1.0 +""" + +# =============================================================================== +# IMPORTS AND HUGGING FACE COMPATIBILITY +# =============================================================================== + +import os +import json +import re +import tempfile +import shutil +import pickle +import numpy as np +from datetime import datetime +from typing import Dict, List, Optional, Tuple +from dataclasses import dataclass +from pathlib import Path +import time +import logging + +# Check if running on Hugging Face Spaces +IS_HF_SPACE = os.getenv("SPACE_ID") is not None + +# Streamlit and core libraries +import streamlit as st +import sqlite3 +import pandas as pd +import plotly.express as px +import plotly.graph_objects as go +import requests + +# Vector storage and embeddings (HF compatible) +try: + import faiss + FAISS_AVAILABLE = True +except ImportError: + FAISS_AVAILABLE = False + st.warning("āš ļø FAISS not available. Vector search will be disabled.") + +try: + from sentence_transformers import SentenceTransformer + SENTENCE_TRANSFORMERS_AVAILABLE = True +except ImportError: + SENTENCE_TRANSFORMERS_AVAILABLE = False + st.warning("āš ļø Sentence Transformers not available. Using fallback methods.") + +try: + import torch + TORCH_AVAILABLE = True +except ImportError: + TORCH_AVAILABLE = False + +# Document processing (simplified for HF) +try: + from docling.document_converter import DocumentConverter + from docling.datamodel.base_models import InputFormat + from docling.datamodel.pipeline_options import PdfPipelineOptions + from docling.document_converter import PdfFormatOption + DOCLING_AVAILABLE = True +except ImportError: + DOCLING_AVAILABLE = False + st.warning("āš ļø Docling not available. 
Using simplified document processing.") + +# Alternative document processing for HF +try: + import pdfplumber + PDF_PROCESSING_AVAILABLE = True +except ImportError: + try: + import PyPDF2 + PDF_PROCESSING_AVAILABLE = True + except ImportError: + PDF_PROCESSING_AVAILABLE = False + +# =============================================================================== +# HUGGING FACE CONFIGURATION +# =============================================================================== + +# Hugging Face Spaces configuration +HF_CONFIG = { + "max_file_size_mb": 10, # Reduced for HF Spaces + "max_concurrent_files": 3, # Reduced for HF Spaces + "timeout_seconds": 30, + "use_cpu_only": True, # Force CPU for HF Spaces + "embedding_model": "all-MiniLM-L6-v2", # Lightweight model + "cache_dir": "./cache", + "data_dir": "./data", + "enable_ollama": False, # Disable Ollama for HF Spaces +} + +# Create necessary directories +os.makedirs(HF_CONFIG["cache_dir"], exist_ok=True) +os.makedirs(HF_CONFIG["data_dir"], exist_ok=True) + +# =============================================================================== +# STREAMLIT CONFIGURATION FOR HUGGING FACE +# =============================================================================== + +st.set_page_config( + page_title="AI Invoice Processing System", + page_icon="šŸ“„", + layout="wide", + initial_sidebar_state="expanded", + menu_items={ + 'Get Help': 'https://huggingface.co/spaces/your-space/help', + 'Report a bug': 'https://huggingface.co/spaces/your-space/issues', + 'About': """ + # AI Invoice Processing System + Built for Hugging Face Spaces with AI-powered extraction and semantic search. 
+ """ + } +) + +# =============================================================================== +# SIMPLIFIED DATA STRUCTURES FOR HF +# =============================================================================== + +@dataclass +class InvoiceData: + """Simplified data structure for extracted invoice information""" + supplier_name: str = "" + buyer_name: str = "" + invoice_number: str = "" + date: str = "" + amount: float = 0.0 + quantity: int = 0 + product_description: str = "" + file_path: str = "" + extraction_confidence: float = 0.0 + processing_method: str = "regex" + +@dataclass +class VectorSearchResult: + """Data structure for vector search results""" + invoice_id: str + invoice_number: str + supplier_name: str + similarity_score: float + content_preview: str + metadata: Dict + +# =============================================================================== +# HUGGING FACE COMPATIBLE VECTOR STORE +# =============================================================================== + +class HuggingFaceVectorStore: + """Simplified vector store compatible with Hugging Face Spaces""" + + def __init__(self, embedding_model: str = "all-MiniLM-L6-v2"): + self.embedding_model_name = embedding_model + self.vector_store_path = os.path.join(HF_CONFIG["data_dir"], "vectors.pkl") + self.metadata_path = os.path.join(HF_CONFIG["data_dir"], "metadata.pkl") + self.embedding_model = None + self.vectors = [] + self.document_metadata = [] + self.embedding_dimension = None + + self.setup_embedding_model() + self.load_vector_store() + + def setup_embedding_model(self): + """Initialize the sentence transformer model""" + if not SENTENCE_TRANSFORMERS_AVAILABLE: + st.warning("āš ļø Sentence Transformers not available. 
Vector search disabled.")
+            return
+
+        try:
+            with st.spinner(f"Loading embedding model: {self.embedding_model_name}..."):
+                self.embedding_model = SentenceTransformer(
+                    self.embedding_model_name,
+                    cache_folder=HF_CONFIG["cache_dir"]
+                )
+
+            # Get embedding dimension (encode() on a list returns shape (1, dim))
+            test_embedding = self.embedding_model.encode(["test"])
+            self.embedding_dimension = test_embedding.shape[1]
+
+            st.success(f"āœ… Embedding model loaded: {self.embedding_model_name}")
+
+        except Exception as e:
+            st.error(f"āŒ Failed to load embedding model: {e}")
+            self.embedding_model = None
+
+    def load_vector_store(self):
+        """Load existing vector store"""
+        try:
+            if os.path.exists(self.vector_store_path) and os.path.exists(self.metadata_path):
+                with open(self.vector_store_path, 'rb') as f:
+                    self.vectors = pickle.load(f)
+
+                with open(self.metadata_path, 'rb') as f:
+                    self.document_metadata = pickle.load(f)
+
+                st.success(f"āœ… Vector store loaded: {len(self.document_metadata)} documents")
+            else:
+                self.vectors = []
+                self.document_metadata = []
+                st.info("šŸ“„ New vector store initialized")
+
+        except Exception as e:
+            st.error(f"āŒ Error loading vector store: {e}")
+            self.vectors = []
+            self.document_metadata = []
+
+    def save_vector_store(self):
+        """Save vector store to disk"""
+        try:
+            with open(self.vector_store_path, 'wb') as f:
+                pickle.dump(self.vectors, f)
+
+            with open(self.metadata_path, 'wb') as f:
+                pickle.dump(self.document_metadata, f)
+
+            return True
+        except Exception as e:
+            st.error(f"Error saving vector store: {e}")
+            return False
+
+    def create_document_text(self, invoice_data: dict, raw_text: str = "") -> str:
+        """Create searchable text from invoice data"""
+        text_parts = []
+
+        for field, value in invoice_data.items():
+            if value and field != 'id':
+                text_parts.append(f"{field}: {value}")
+
+        if raw_text:
+            text_parts.append(f"content: {raw_text[:300]}")
+
+        return " | ".join(text_parts)
+
+    def add_document(self, invoice_data: dict, raw_text: str = "") -> bool:
+        
"""Add a document to the vector store""" + if not self.embedding_model: + return False + + try: + document_text = self.create_document_text(invoice_data, raw_text) + + # Generate embedding + embedding = self.embedding_model.encode(document_text, normalize_embeddings=True) + + # Create metadata + metadata = { + 'invoice_id': invoice_data.get('id', ''), + 'invoice_number': invoice_data.get('invoice_number', ''), + 'supplier_name': invoice_data.get('supplier_name', ''), + 'buyer_name': invoice_data.get('buyer_name', ''), + 'amount': invoice_data.get('amount', 0), + 'date': invoice_data.get('date', ''), + 'file_name': invoice_data.get('file_info', {}).get('file_name', ''), + 'document_text': document_text[:200], + 'timestamp': datetime.now().isoformat() + } + + # Add to store + self.vectors.append(embedding) + self.document_metadata.append(metadata) + + return True + + except Exception as e: + st.error(f"Error adding document to vector store: {e}") + return False + + def semantic_search(self, query: str, top_k: int = 5) -> List[VectorSearchResult]: + """Perform semantic search using cosine similarity""" + if not self.embedding_model or not self.vectors: + return [] + + try: + # Generate query embedding + query_embedding = self.embedding_model.encode(query, normalize_embeddings=True) + + # Calculate similarities + similarities = [] + for i, doc_embedding in enumerate(self.vectors): + similarity = np.dot(query_embedding, doc_embedding) + similarities.append((similarity, i)) + + # Sort by similarity + similarities.sort(reverse=True) + + # Return top results + results = [] + for similarity, idx in similarities[:top_k]: + if similarity > 0.1: # Relevance threshold + metadata = self.document_metadata[idx] + result = VectorSearchResult( + invoice_id=metadata.get('invoice_id', ''), + invoice_number=metadata.get('invoice_number', ''), + supplier_name=metadata.get('supplier_name', ''), + similarity_score=float(similarity), + content_preview=metadata.get('document_text', ''), + 
metadata=metadata + ) + results.append(result) + + return results + + except Exception as e: + st.error(f"Error in semantic search: {e}") + return [] + + def get_stats(self) -> Dict: + """Get vector store statistics""" + return { + 'total_documents': len(self.document_metadata), + 'embedding_dimension': self.embedding_dimension, + 'model_name': self.embedding_model_name, + 'vector_store_size': len(self.vectors) + } + +# =============================================================================== +# SIMPLIFIED DOCUMENT PROCESSING FOR HF +# =============================================================================== + +class HuggingFaceDocumentProcessor: + """Simplified document processor for Hugging Face Spaces""" + + def __init__(self): + self.setup_processors() + + def setup_processors(self): + """Setup available document processors""" + self.processors = {} + + # PDF processing + if PDF_PROCESSING_AVAILABLE: + try: + import pdfplumber + self.processors['pdf'] = self.extract_with_pdfplumber + st.success("āœ… PDF processing available (pdfplumber)") + except ImportError: + try: + import PyPDF2 + self.processors['pdf'] = self.extract_with_pypdf2 + st.success("āœ… PDF processing available (PyPDF2)") + except ImportError: + st.warning("āš ļø No PDF processor available") + + # Text files + self.processors['txt'] = self.extract_text_file + + # Images (basic OCR alternative) + self.processors['image'] = self.extract_image_text + + def extract_with_pdfplumber(self, file_path: str) -> str: + """Extract text using pdfplumber""" + try: + import pdfplumber + text = "" + with pdfplumber.open(file_path) as pdf: + for page in pdf.pages: + page_text = page.extract_text() + if page_text: + text += page_text + "\n" + return text + except Exception as e: + st.error(f"PDF extraction failed: {e}") + return "" + + def extract_with_pypdf2(self, file_path: str) -> str: + """Extract text using PyPDF2""" + try: + import PyPDF2 + text = "" + with open(file_path, 'rb') as file: + 
pdf_reader = PyPDF2.PdfReader(file) + for page in pdf_reader.pages: + text += page.extract_text() + "\n" + return text + except Exception as e: + st.error(f"PDF extraction failed: {e}") + return "" + + def extract_text_file(self, file_path: str) -> str: + """Extract text from text files""" + try: + with open(file_path, 'r', encoding='utf-8') as f: + return f.read() + except Exception as e: + st.error(f"Text file extraction failed: {e}") + return "" + + def extract_image_text(self, file_path: str) -> str: + """Basic image text extraction (placeholder for OCR)""" + st.warning("āš ļø OCR not available in this environment. Please use text-based documents.") + return "" + + def extract_text_from_document(self, file_path: str) -> str: + """Extract text from document based on file type""" + file_ext = Path(file_path).suffix.lower() + + if file_ext == '.pdf': + processor = self.processors.get('pdf') + elif file_ext == '.txt': + processor = self.processors.get('txt') + elif file_ext in ['.jpg', '.jpeg', '.png']: + processor = self.processors.get('image') + else: + st.warning(f"Unsupported file type: {file_ext}") + return "" + + if processor: + return processor(file_path) + else: + st.error(f"No processor available for {file_ext}") + return "" + +# =============================================================================== +# SIMPLIFIED AI EXTRACTION FOR HF +# =============================================================================== + +class HuggingFaceAIExtractor: + """Simplified AI extraction for Hugging Face Spaces""" + + def __init__(self): + self.use_transformers = self.setup_transformers() + + def setup_transformers(self): + """Try to setup Hugging Face transformers for NER""" + try: + from transformers import pipeline, AutoTokenizer, AutoModelForTokenClassification + + # Use a lightweight NER model + model_name = "dbmdz/bert-large-cased-finetuned-conll03-english" + + with st.spinner("Loading AI extraction model..."): + self.ner_pipeline = pipeline( + "ner", 
+ model=model_name, + tokenizer=model_name, + aggregation_strategy="simple" + ) + + st.success("āœ… AI extraction model loaded") + return True + + except Exception as e: + st.warning(f"āš ļø AI extraction not available: {e}") + return False + + def extract_with_ai(self, text: str) -> InvoiceData: + """Extract invoice data using AI""" + if not self.use_transformers: + return self.extract_with_regex(text) + + try: + # Use NER to extract entities + entities = self.ner_pipeline(text[:512]) # Limit text length + + invoice_data = InvoiceData() + invoice_data.processing_method = "ai_ner" + + # Extract specific entities (simplified) + for entity in entities: + entity_text = entity['word'].replace('##', '') + + # Simple mapping based on entity types + if entity['entity_group'] == 'ORG': + if not invoice_data.supplier_name: + invoice_data.supplier_name = entity_text + elif not invoice_data.buyer_name: + invoice_data.buyer_name = entity_text + + elif entity['entity_group'] == 'MISC': + if not invoice_data.invoice_number and any(c.isdigit() for c in entity_text): + invoice_data.invoice_number = entity_text + + # Fall back to regex for missing fields + regex_data = self.extract_with_regex(text) + + # Combine results + if not invoice_data.invoice_number: + invoice_data.invoice_number = regex_data.invoice_number + if not invoice_data.amount: + invoice_data.amount = regex_data.amount + if not invoice_data.date: + invoice_data.date = regex_data.date + if not invoice_data.quantity: + invoice_data.quantity = regex_data.quantity + + invoice_data.extraction_confidence = 0.8 + + return invoice_data + + except Exception as e: + st.error(f"AI extraction failed: {e}") + return self.extract_with_regex(text) + + def extract_with_regex(self, text: str) -> InvoiceData: + """Fallback regex extraction""" + invoice_data = InvoiceData() + invoice_data.processing_method = "regex" + + # Enhanced regex patterns + patterns = { + 'invoice_number': [ + r'invoice[#\s]*:?\s*([A-Z0-9\-_]+)', + 
r'inv[#\s]*:?\s*([A-Z0-9\-_]+)', + r'bill[#\s]*:?\s*([A-Z0-9\-_]+)', + r'#([A-Z0-9\-_]{3,})' + ], + 'amount': [ + r'total[:\s]*[\$₹]?([0-9,]+\.?\d*)', + r'amount[:\s]*[\$₹]?([0-9,]+\.?\d*)', + r'[\$₹]([0-9,]+\.?\d*)', + r'([0-9,]+\.?\d*)\s*(?:dollars?|₹|USD|INR)' + ], + 'date': [ + r'date[:\s]*(\d{1,2}[/\-]\d{1,2}[/\-]\d{2,4})', + r'(\d{1,2}[/\-]\d{1,2}[/\-]\d{2,4})', + r'(\d{4}[/\-]\d{1,2}[/\-]\d{1,2})' + ], + 'quantity': [ + r'qty[:\s]*(\d+)', + r'quantity[:\s]*(\d+)', + r'(\d+)\s*(?:pcs?|units?|items?)' + ] + } + + text_lower = text.lower() + + # Extract using patterns + for pattern_list in patterns['invoice_number']: + match = re.search(pattern_list, text_lower, re.IGNORECASE) + if match: + invoice_data.invoice_number = match.group(1).upper() + break + + for pattern in patterns['amount']: + match = re.search(pattern, text_lower, re.IGNORECASE) + if match: + try: + amount_str = match.group(1).replace(',', '') + invoice_data.amount = float(amount_str) + break + except ValueError: + continue + + for pattern in patterns['date']: + match = re.search(pattern, text, re.IGNORECASE) + if match: + invoice_data.date = self.parse_date(match.group(1)) + break + + for pattern in patterns['quantity']: + match = re.search(pattern, text_lower, re.IGNORECASE) + if match: + try: + invoice_data.quantity = int(match.group(1)) + break + except ValueError: + continue + + # Extract company names (basic) + company_patterns = [ + r'(?:from|to|vendor|supplier)[:]\s*([A-Z][A-Za-z\s&,\.]{2,30})', + r'([A-Z][A-Za-z\s&,\.]{3,30})\s*(?:Ltd|Inc|Corp|LLC|Co\.|Company)', + ] + + for pattern in company_patterns: + matches = re.findall(pattern, text) + if matches: + if not invoice_data.supplier_name: + invoice_data.supplier_name = matches[0].strip() + elif len(matches) > 1 and not invoice_data.buyer_name: + invoice_data.buyer_name = matches[1].strip() + + # Extract product description (basic) + desc_patterns = [ + r'description[:]\s*([A-Za-z0-9\s,.-]{10,100})', + 
r'item[:]\s*([A-Za-z0-9\s,.-]{10,100})', + r'service[:]\s*([A-Za-z0-9\s,.-]{10,100})' + ] + + for pattern in desc_patterns: + match = re.search(pattern, text, re.IGNORECASE) + if match: + invoice_data.product_description = match.group(1).strip() + break + + invoice_data.extraction_confidence = 0.6 + return invoice_data + + def parse_date(self, date_str: str) -> str: + """Parse date to YYYY-MM-DD format""" + if not date_str: + return "" + + formats = ['%Y-%m-%d', '%m/%d/%Y', '%d/%m/%Y', '%m-%d-%Y', '%d-%m-%Y', '%Y/%m/%d'] + + for fmt in formats: + try: + parsed_date = datetime.strptime(date_str, fmt) + return parsed_date.strftime('%Y-%m-%d') + except ValueError: + continue + + return date_str + +# =============================================================================== +# MAIN PROCESSOR FOR HUGGING FACE +# =============================================================================== + +class HuggingFaceInvoiceProcessor: + """Main invoice processor optimized for Hugging Face Spaces""" + + def __init__(self): + self.setup_storage() + self.document_processor = HuggingFaceDocumentProcessor() + self.ai_extractor = HuggingFaceAIExtractor() + self.vector_store = HuggingFaceVectorStore() if SENTENCE_TRANSFORMERS_AVAILABLE else None + + # Initialize stats + self.processing_stats = { + 'total_processed': 0, + 'successful': 0, + 'failed': 0, + 'start_time': datetime.now() + } + + def setup_storage(self): + """Setup storage paths""" + self.data_dir = HF_CONFIG["data_dir"] + self.json_path = os.path.join(self.data_dir, "invoices.json") + + # Initialize JSON storage + if not os.path.exists(self.json_path): + initial_data = { + "metadata": { + "created_at": datetime.now().isoformat(), + "version": "hf_v1.0", + "total_invoices": 0 + }, + "invoices": [], + "summary": { + "total_amount": 0.0, + "unique_suppliers": [], + "processing_stats": {"successful": 0, "failed": 0} + } + } + self.save_json_data(initial_data) + + def load_json_data(self) -> dict: + """Load invoice data 
from JSON""" + try: + with open(self.json_path, 'r', encoding='utf-8') as f: + return json.load(f) + except (FileNotFoundError, json.JSONDecodeError): + self.setup_storage() + return self.load_json_data() + + def save_json_data(self, data: dict): + """Save invoice data to JSON""" + try: + with open(self.json_path, 'w', encoding='utf-8') as f: + json.dump(data, f, indent=2, ensure_ascii=False) + except Exception as e: + st.error(f"Error saving data: {e}") + + def process_uploaded_file(self, uploaded_file) -> InvoiceData: + """Process a single uploaded file""" + self.processing_stats['total_processed'] += 1 + + try: + # Check file size + file_size = len(uploaded_file.getvalue()) + if file_size > HF_CONFIG["max_file_size_mb"] * 1024 * 1024: + st.error(f"File too large: {file_size / 1024 / 1024:.2f}MB > {HF_CONFIG['max_file_size_mb']}MB") + self.processing_stats['failed'] += 1 + return InvoiceData() + + # Save temporarily + with tempfile.NamedTemporaryFile(delete=False, suffix=f".{uploaded_file.name.split('.')[-1]}") as tmp_file: + tmp_file.write(uploaded_file.getvalue()) + tmp_file_path = tmp_file.name + + try: + # Extract text + text = self.document_processor.extract_text_from_document(tmp_file_path) + + if not text.strip(): + st.warning(f"No text extracted from {uploaded_file.name}") + self.processing_stats['failed'] += 1 + return InvoiceData() + + # Extract invoice data + invoice_data = self.ai_extractor.extract_with_ai(text) + invoice_data.file_path = uploaded_file.name + + # Save to storage + self.save_invoice_data(invoice_data, text, file_size) + + self.processing_stats['successful'] += 1 + return invoice_data + + finally: + # Cleanup + os.unlink(tmp_file_path) + + except Exception as e: + st.error(f"Error processing {uploaded_file.name}: {e}") + self.processing_stats['failed'] += 1 + return InvoiceData() + + def save_invoice_data(self, invoice_data: InvoiceData, raw_text: str, file_size: int): + """Save invoice data to JSON and vector store""" + try: + # Load 
existing data + data = self.load_json_data() + + # Create invoice record + invoice_record = { + "id": len(data["invoices"]) + 1, + "invoice_number": invoice_data.invoice_number, + "supplier_name": invoice_data.supplier_name, + "buyer_name": invoice_data.buyer_name, + "date": invoice_data.date, + "amount": invoice_data.amount, + "quantity": invoice_data.quantity, + "product_description": invoice_data.product_description, + "file_info": { + "file_name": invoice_data.file_path, + "file_size": file_size + }, + "extraction_info": { + "confidence": invoice_data.extraction_confidence, + "method": invoice_data.processing_method, + "raw_text_preview": raw_text[:300] + }, + "timestamps": { + "created_at": datetime.now().isoformat() + } + } + + # Add to invoices + data["invoices"].append(invoice_record) + + # Update summary + self.update_summary(data) + + # Save JSON + self.save_json_data(data) + + # Add to vector store + if self.vector_store: + self.vector_store.add_document(invoice_record, raw_text) + self.vector_store.save_vector_store() + + except Exception as e: + st.error(f"Error saving invoice data: {e}") + + def update_summary(self, data: dict): + """Update summary statistics""" + invoices = data["invoices"] + + total_amount = sum(inv.get("amount", 0) for inv in invoices) + unique_suppliers = list(set(inv.get("supplier_name", "") for inv in invoices if inv.get("supplier_name"))) + + data["summary"] = { + "total_amount": total_amount, + "unique_suppliers": unique_suppliers, + "processing_stats": { + "successful": self.processing_stats['successful'], + "failed": self.processing_stats['failed'], + "total_processed": self.processing_stats['total_processed'] + } + } + + data["metadata"]["last_updated"] = datetime.now().isoformat() + data["metadata"]["total_invoices"] = len(invoices) + +# =============================================================================== +# SIMPLIFIED CHATBOT FOR HF +# 
=============================================================================== + +class HuggingFaceChatBot: + """Simplified chatbot for Hugging Face Spaces""" + + def __init__(self, processor: HuggingFaceInvoiceProcessor): + self.processor = processor + + def query_database(self, query: str) -> str: + """Process user query and return response""" + try: + data = self.processor.load_json_data() + invoices = data.get("invoices", []) + + if not invoices: + return "No invoice data found. Please upload some invoices first." + + query_lower = query.lower() + + # Handle different query types + if any(phrase in query_lower for phrase in ["summary", "overview", "total"]): + return self.generate_summary(data) + + elif "count" in query_lower or "how many" in query_lower: + return self.handle_count_query(data) + + elif any(phrase in query_lower for phrase in ["amount", "value", "money", "cost"]): + return self.handle_amount_query(data) + + elif any(phrase in query_lower for phrase in ["supplier", "vendor", "company"]): + return self.handle_supplier_query(data, query) + + + elif self.processor.vector_store: + return self.handle_semantic_search(query) + + else: + return self.handle_general_query(data, query) + + except Exception as e: + return f"Error processing query: {e}" + + def generate_summary(self, data: dict) -> str: + """Generate comprehensive summary""" + invoices = data.get("invoices", []) + summary = data.get("summary", {}) + + if not invoices: + return "No invoices found in the system." 
+ + total_amount = summary.get("total_amount", 0) + avg_amount = total_amount / len(invoices) if invoices else 0 + unique_suppliers = len(summary.get("unique_suppliers", [])) + + response = f""" +**šŸ“Š Invoice System Summary** + +• **Total Invoices**: {len(invoices):,} +• **Total Value**: ₹{total_amount:,.2f} +• **Average Invoice**: ₹{avg_amount:,.2f} +• **Unique Suppliers**: {unique_suppliers} + +**šŸ“ˆ Processing Stats** +• **Successful**: {summary.get('processing_stats', {}).get('successful', 0)} +• **Failed**: {summary.get('processing_stats', {}).get('failed', 0)} + +**šŸ” Recent Invoices** +""" + + # Show recent invoices + recent = sorted(invoices, key=lambda x: x.get('timestamps', {}).get('created_at', ''), reverse=True)[:5] + for i, inv in enumerate(recent, 1): + response += f"\n{i}. **{inv.get('invoice_number', 'N/A')}** - {inv.get('supplier_name', 'Unknown')} (₹{inv.get('amount', 0):,.2f})" + + return response + + def handle_count_query(self, data: dict) -> str: + """Handle count-related queries""" + invoices = data.get("invoices", []) + total = len(invoices) + unique_numbers = len(set(inv.get('invoice_number', '') for inv in invoices if inv.get('invoice_number'))) + + return f""" +**šŸ“Š Invoice Count Summary** + +• **Total Records**: {total} +• **Unique Invoice Numbers**: {unique_numbers} +• **Duplicates**: {total - unique_numbers if total > unique_numbers else 0} + +**šŸ“… Processing Timeline** +• **First Invoice**: {invoices[0].get('timestamps', {}).get('created_at', 'N/A')[:10] if invoices else 'N/A'} +• **Latest Invoice**: {invoices[-1].get('timestamps', {}).get('created_at', 'N/A')[:10] if invoices else 'N/A'} +""" + + def handle_amount_query(self, data: dict) -> str: + """Handle amount-related queries""" + invoices = data.get("invoices", []) + amounts = [inv.get('amount', 0) for inv in invoices if inv.get('amount', 0) > 0] + + if not amounts: + return "No amount information found in invoices." 
+ + total_amount = sum(amounts) + avg_amount = total_amount / len(amounts) + max_amount = max(amounts) + min_amount = min(amounts) + + # Find high-value invoices + high_value_threshold = sorted(amounts, reverse=True)[min(4, len(amounts)-1)] if len(amounts) > 5 else max_amount + high_value_invoices = [inv for inv in invoices if inv.get('amount', 0) >= high_value_threshold] + + response = f""" +**šŸ’° Financial Analysis** + +• **Total Amount**: ₹{total_amount:,.2f} +• **Average Amount**: ₹{avg_amount:,.2f} +• **Highest Invoice**: ₹{max_amount:,.2f} +• **Lowest Invoice**: ₹{min_amount:,.2f} + +**šŸŽÆ High-Value Invoices (₹{high_value_threshold:,.2f}+)** +""" + + for i, inv in enumerate(high_value_invoices[:5], 1): + response += f"\n{i}. **{inv.get('invoice_number', 'N/A')}** - {inv.get('supplier_name', 'Unknown')} (₹{inv.get('amount', 0):,.2f})" + + return response + + def handle_supplier_query(self, data: dict, query: str) -> str: + """Handle supplier-related queries""" + invoices = data.get("invoices", []) + + # Count invoices by supplier + supplier_counts = {} + supplier_amounts = {} + + for inv in invoices: + supplier = inv.get('supplier_name', '').strip() + if supplier: + supplier_counts[supplier] = supplier_counts.get(supplier, 0) + 1 + supplier_amounts[supplier] = supplier_amounts.get(supplier, 0) + inv.get('amount', 0) + + if not supplier_counts: + return "No supplier information found in invoices." 
+ + # Sort suppliers by amount + top_suppliers = sorted(supplier_amounts.items(), key=lambda x: x[1], reverse=True)[:10] + + response = f""" +**šŸ¢ Supplier Analysis** + +• **Total Unique Suppliers**: {len(supplier_counts)} +• **Most Active**: {max(supplier_counts, key=supplier_counts.get)} ({supplier_counts[max(supplier_counts, key=supplier_counts.get)]} invoices) + +**šŸ’° Top Suppliers by Amount** +""" + + for i, (supplier, amount) in enumerate(top_suppliers, 1): + count = supplier_counts[supplier] + avg = amount / count if count > 0 else 0 + response += f"\n{i}. **{supplier}** - ₹{amount:,.2f} ({count} invoices, avg: ₹{avg:,.2f})" + + return response + + def handle_semantic_search(self, query: str) -> str: + """Handle semantic search queries""" + try: + results = self.processor.vector_store.semantic_search(query, top_k=5) + + if not results: + return f"No relevant results found for '{query}'. Try different keywords." + + response = f"šŸ” **Semantic Search Results for '{query}'**\n\n" + + for i, result in enumerate(results, 1): + response += f"{i}. 
**{result.invoice_number}** - {result.supplier_name}\n" + response += f" • Similarity: {result.similarity_score:.3f}\n" + response += f" • Amount: ₹{result.metadata.get('amount', 0):,.2f}\n" + response += f" • Preview: {result.content_preview[:100]}...\n\n" + + return response + + except Exception as e: + return f"Semantic search error: {e}" + + def handle_general_query(self, data: dict, query: str) -> str: + """Handle general queries with keyword search""" + invoices = data.get("invoices", []) + query_words = query.lower().split() + + # Simple keyword matching + matching_invoices = [] + for inv in invoices: + text_to_search = ( + inv.get('supplier_name', '') + ' ' + + inv.get('buyer_name', '') + ' ' + + inv.get('product_description', '') + ' ' + + inv.get('extraction_info', {}).get('raw_text_preview', '') + ).lower() + + if any(word in text_to_search for word in query_words): + matching_invoices.append(inv) + + if not matching_invoices: + return f"No invoices found matching '{query}'. Try different keywords or check the summary." + + response = f"šŸ” **Found {len(matching_invoices)} invoices matching '{query}'**\n\n" + + for i, inv in enumerate(matching_invoices[:5], 1): + response += f"{i}. **{inv.get('invoice_number', 'N/A')}** - {inv.get('supplier_name', 'Unknown')}\n" + response += f" • Amount: ₹{inv.get('amount', 0):,.2f}\n" + response += f" • Date: {inv.get('date', 'N/A')}\n\n" + + if len(matching_invoices) > 5: + response += f"... and {len(matching_invoices) - 5} more results." + + return response + +# =============================================================================== +# STREAMLIT APPLICATION FOR HUGGING FACE +# =============================================================================== + +def create_huggingface_app(): + """Main Streamlit application optimized for Hugging Face Spaces""" + + # Custom CSS for better UI + st.markdown(""" + + """, unsafe_allow_html=True) + + # Header + st.markdown('

šŸ“„ AI Invoice Processing System

', unsafe_allow_html=True) + st.markdown(""" +
+

+ AI-Powered Document Processing • Semantic Search • Smart Analytics • Hugging Face Spaces +

+
+ """, unsafe_allow_html=True) + + # Initialize processor + if 'hf_processor' not in st.session_state: + with st.spinner("šŸ”§ Initializing AI Invoice Processor..."): + st.session_state.hf_processor = HuggingFaceInvoiceProcessor() + st.session_state.hf_chatbot = HuggingFaceChatBot(st.session_state.hf_processor) + st.session_state.chat_history = [] + + # Sidebar with system status + with st.sidebar: + st.header("šŸŽ›ļø System Status") + + # Check component status + processor = st.session_state.hf_processor + + # Document processing + if processor.document_processor.processors: + st.markdown('āœ… Document Processing', unsafe_allow_html=True) + else: + st.markdown('āŒ Document Processing', unsafe_allow_html=True) + + # AI extraction + if processor.ai_extractor.use_transformers: + st.markdown('āœ… AI Extraction', unsafe_allow_html=True) + else: + st.markdown('āš ļø Regex Extraction', unsafe_allow_html=True) + + # Vector search + if processor.vector_store and processor.vector_store.embedding_model: + st.markdown('āœ… Semantic Search', unsafe_allow_html=True) + else: + st.markdown('āš ļø Keyword Search Only', unsafe_allow_html=True) + + # Quick stats + st.header("šŸ“Š Quick Stats") + try: + data = processor.load_json_data() + total_invoices = len(data.get("invoices", [])) + total_amount = data.get("summary", {}).get("total_amount", 0) + + st.metric("Total Invoices", total_invoices) + st.metric("Total Value", f"₹{total_amount:,.2f}") + st.metric("Success Rate", f"{processor.processing_stats['successful']}/{processor.processing_stats['total_processed']}") + + except Exception as e: + st.error(f"Stats error: {e}") + + # Processing info + st.header("āš™ļø Processing Info") + st.info(f""" + **Limits for Hugging Face Spaces:** + • Max file size: {HF_CONFIG['max_file_size_mb']}MB + • Max concurrent files: {HF_CONFIG['max_concurrent_files']} + • Timeout: {HF_CONFIG['timeout_seconds']}s + """) + + # Main tabs + tab1, tab2, tab3, tab4 = st.tabs([ + "šŸ“¤ Upload & Process", + 
"šŸ’¬ AI Chat", + "šŸ“Š Analytics", + "šŸ“‹ Data Explorer" + ]) + + # ------------------------------------------------------------------------- + # TAB 1: UPLOAD & PROCESS + # ------------------------------------------------------------------------- + + with tab1: + st.header("šŸ“¤ Upload Invoice Documents") + + # Feature highlights + col1, col2, col3 = st.columns(3) + + with col1: + st.markdown(""" +
+

šŸ¤– AI Extraction

+

Advanced NLP models extract structured data automatically

+
+ """, unsafe_allow_html=True) + + with col2: + st.markdown(""" +
+

šŸ” Smart Search

+

Semantic search finds invoices using natural language

+
+ """, unsafe_allow_html=True) + + with col3: + st.markdown(""" +
+

šŸ“Š Analytics

+

Comprehensive insights and visualizations

+
+ """, unsafe_allow_html=True) + + # File upload interface + st.markdown("### šŸ“ Upload Your Invoices") + + uploaded_files = st.file_uploader( + "Choose invoice files (PDF, TXT supported)", + type=['pdf', 'txt'], + accept_multiple_files=True, + help=f"Maximum file size: {HF_CONFIG['max_file_size_mb']}MB per file" + ) + + if uploaded_files: + # Limit concurrent processing for HF Spaces + if len(uploaded_files) > HF_CONFIG['max_concurrent_files']: + st.warning(f"āš ļø Too many files selected. Processing first {HF_CONFIG['max_concurrent_files']} files.") + uploaded_files = uploaded_files[:HF_CONFIG['max_concurrent_files']] + + st.info(f"šŸ“Š {len(uploaded_files)} files selected") + + if st.button("šŸš€ Process Files", type="primary", use_container_width=True): + progress_bar = st.progress(0) + status_container = st.container() + results_container = st.container() + + successful = 0 + failed = 0 + + for i, uploaded_file in enumerate(uploaded_files): + progress_bar.progress((i + 1) / len(uploaded_files)) + + with status_container: + st.info(f"Processing: {uploaded_file.name}") + + # Process file + result = st.session_state.hf_processor.process_uploaded_file(uploaded_file) + + with results_container: + if result.invoice_number: + successful += 1 + with st.expander(f"āœ… {uploaded_file.name}", expanded=False): + col1, col2 = st.columns(2) + with col1: + st.write(f"**Invoice #:** {result.invoice_number}") + st.write(f"**Supplier:** {result.supplier_name}") + st.write(f"**Amount:** ₹{result.amount:.2f}") + with col2: + st.write(f"**Date:** {result.date}") + st.write(f"**Method:** {result.processing_method}") + st.write(f"**Confidence:** {result.extraction_confidence:.1%}") + else: + failed += 1 + st.warning(f"āš ļø Could not extract data from {uploaded_file.name}") + + # Final status + with status_container: + st.success(f"āœ… Processing complete! 
{successful} successful, {failed} failed") + + if successful > 0: + st.balloons() + + # ------------------------------------------------------------------------- + # TAB 2: AI CHAT + # ------------------------------------------------------------------------- + + with tab2: + st.header("šŸ’¬ AI Chat Interface") + + # Chat interface + user_query = st.chat_input("Ask about your invoices... (e.g., 'show me total spending')") + + if user_query: + # Add user message + st.session_state.chat_history.append({ + "role": "user", + "content": user_query, + "timestamp": datetime.now() + }) + + # Get AI response + with st.spinner("šŸ¤– AI is analyzing..."): + response = st.session_state.hf_chatbot.query_database(user_query) + + st.session_state.chat_history.append({ + "role": "assistant", + "content": response, + "timestamp": datetime.now() + }) + + # Display chat history + for message in st.session_state.chat_history: + with st.chat_message(message["role"]): + st.markdown(message["content"]) + + # Suggested queries + if not st.session_state.chat_history: + st.markdown("### šŸ’” Try These Queries") + + col1, col2 = st.columns(2) + + with col1: + queries = [ + "Show me a summary of all invoices", + "How much have we spent in total?", + "Who are our top suppliers?", + "Find invoices with high amounts" + ] + for i, query in enumerate(queries): + if st.button(query, key=f"query_{i}"): + st.session_state.chat_history.append({"role": "user", "content": query, "timestamp": datetime.now()}) + response = st.session_state.hf_chatbot.query_database(query) + st.session_state.chat_history.append({"role": "assistant", "content": response, "timestamp": datetime.now()}) + st.rerun() + + with col2: + if st.session_state.hf_processor.vector_store: + semantic_queries = [ + "Find technology equipment purchases", + "Show me office supplies", + "Search for consulting services", + "Find maintenance contracts" + ] + for i, query in enumerate(semantic_queries): + if st.button(query, 
key=f"semantic_{i}"): + st.session_state.chat_history.append({"role": "user", "content": query, "timestamp": datetime.now()}) + response = st.session_state.hf_chatbot.query_database(query) + st.session_state.chat_history.append({"role": "assistant", "content": response, "timestamp": datetime.now()}) + st.rerun() + + # ------------------------------------------------------------------------- + # TAB 3: ANALYTICS + # ------------------------------------------------------------------------- + + with tab3: + st.header("šŸ“Š Analytics Dashboard") + + try: + data = st.session_state.hf_processor.load_json_data() + invoices = data.get("invoices", []) + + if not invoices: + st.info("šŸ“Š No data available. Upload some invoices to see analytics.") + return + + # Convert to DataFrame + df_data = [] + for inv in invoices: + df_data.append({ + 'invoice_number': inv.get('invoice_number', ''), + 'supplier_name': inv.get('supplier_name', ''), + 'amount': inv.get('amount', 0), + 'date': inv.get('date', ''), + 'confidence': inv.get('extraction_info', {}).get('confidence', 0) + }) + + df = pd.DataFrame(df_data) + + # Key metrics + col1, col2, col3, col4 = st.columns(4) + + with col1: + st.metric("Total Invoices", len(df)) + with col2: + st.metric("Total Amount", f"₹{df['amount'].sum():,.2f}") + with col3: + st.metric("Avg Amount", f"₹{df['amount'].mean():,.2f}") + with col4: + st.metric("Unique Suppliers", df['supplier_name'].nunique()) + + # Visualizations + if len(df) > 0: + # Amount distribution + fig_hist = px.histogram( + df, + x='amount', + title="Invoice Amount Distribution", + labels={'amount': 'Amount (₹)', 'count': 'Number of Invoices'} + ) + st.plotly_chart(fig_hist, use_container_width=True) + + # Top suppliers + if df['supplier_name'].notna().any(): + supplier_amounts = df.groupby('supplier_name')['amount'].sum().sort_values(ascending=False).head(10) + + fig_suppliers = px.bar( + x=supplier_amounts.values, + y=supplier_amounts.index, + orientation='h', + title="Top 10 
Suppliers by Total Amount", + labels={'x': 'Total Amount (₹)', 'y': 'Supplier'} + ) + st.plotly_chart(fig_suppliers, use_container_width=True) + + # Confidence analysis + fig_confidence = px.histogram( + df, + x='confidence', + title="Extraction Confidence Distribution", + labels={'confidence': 'Confidence Score', 'count': 'Number of Invoices'} + ) + st.plotly_chart(fig_confidence, use_container_width=True) + + except Exception as e: + st.error(f"Analytics error: {e}") + + # ------------------------------------------------------------------------- + # TAB 4: DATA EXPLORER + # ------------------------------------------------------------------------- + + with tab4: + st.header("šŸ“‹ Data Explorer") + + try: + data = st.session_state.hf_processor.load_json_data() + invoices = data.get("invoices", []) + + if not invoices: + st.info("šŸ“Š No data available. Upload some invoices first.") + return + + # Convert to DataFrame for display + df_data = [] + for inv in invoices: + df_data.append({ + 'Invoice Number': inv.get('invoice_number', ''), + 'Supplier': inv.get('supplier_name', ''), + 'Buyer': inv.get('buyer_name', ''), + 'Amount': inv.get('amount', 0), + 'Date': inv.get('date', ''), + 'Confidence': inv.get('extraction_info', {}).get('confidence', 0), + 'Method': inv.get('extraction_info', {}).get('method', ''), + 'File': inv.get('file_info', {}).get('file_name', ''), + 'Created': inv.get('timestamps', {}).get('created_at', '')[:19] + }) + + df = pd.DataFrame(df_data) + + # Filters + col1, col2, col3 = st.columns(3) + + with col1: + suppliers = ['All'] + sorted(df['Supplier'].dropna().unique().tolist()) + selected_supplier = st.selectbox("Filter by Supplier", suppliers) + + with col2: + methods = ['All'] + sorted(df['Method'].dropna().unique().tolist()) + selected_method = st.selectbox("Filter by Method", methods) + + with col3: + min_amount = st.number_input("Min Amount", min_value=0.0, value=0.0) + + # Apply filters + filtered_df = df.copy() + if selected_supplier != 
'All': + filtered_df = filtered_df[filtered_df['Supplier'] == selected_supplier] + if selected_method != 'All': + filtered_df = filtered_df[filtered_df['Method'] == selected_method] + if min_amount > 0: + filtered_df = filtered_df[filtered_df['Amount'] >= min_amount] + + # Display filtered data + st.dataframe( + filtered_df, + use_container_width=True, + column_config={ + "Amount": st.column_config.NumberColumn("Amount", format="₹%.2f"), + "Confidence": st.column_config.ProgressColumn("Confidence", min_value=0, max_value=1) + } + ) + + # Export options + col1, col2 = st.columns(2) + + with col1: + if st.button("šŸ“„ Export CSV", use_container_width=True): + csv_data = filtered_df.to_csv(index=False) + st.download_button( + "Download CSV", + csv_data, + f"invoices_{datetime.now().strftime('%Y%m%d_%H%M')}.csv", + "text/csv" + ) + + with col2: + if st.button("šŸ“„ Export JSON", use_container_width=True): + filtered_invoices = [inv for inv in invoices + if inv.get('invoice_number') in filtered_df['Invoice Number'].values] + + export_data = { + "exported_at": datetime.now().isoformat(), + "total_records": len(filtered_invoices), + "invoices": filtered_invoices + } + + st.download_button( + "Download JSON", + json.dumps(export_data, indent=2), + f"invoices_{datetime.now().strftime('%Y%m%d_%H%M')}.json", + "application/json" + ) + + except Exception as e: + st.error(f"Data explorer error: {e}") + + # Footer + st.markdown("---") + st.markdown(""" +
+

šŸš€ AI Invoice Processing System - Optimized for Hugging Face Spaces

+

Built with ā¤ļø using Streamlit, Transformers, and AI

+
+ """, unsafe_allow_html=True) + +# =============================================================================== +# HUGGING FACE REQUIREMENTS AND CONFIGURATION +# =============================================================================== + +def generate_hf_requirements(): + """Generate requirements.txt optimized for Hugging Face Spaces""" + requirements = """streamlit>=1.28.0 +pandas>=1.5.0 +numpy>=1.21.0 +plotly>=5.0.0 +sentence-transformers>=2.2.0 +transformers>=4.21.0 +torch>=1.13.0 +faiss-cpu>=1.7.0 +pdfplumber>=0.7.0 +requests>=2.28.0 +python-dateutil>=2.8.0 +Pillow>=9.0.0 +""" + return requirements.strip() + +def generate_hf_config(): + """Generate app configuration for Hugging Face Spaces""" + config = { + "title": "AI Invoice Processing System", + "emoji": "šŸ“„", + "colorFrom": "blue", + "colorTo": "purple", + "sdk": "streamlit", + "sdk_version": "1.28.0", + "app_file": "app.py", + "pinned": False, + "python_version": "3.9" + } + return config + +# =============================================================================== +# MAIN APPLICATION ENTRY POINT +# =============================================================================== + +def main(): + """Main entry point for Hugging Face Spaces""" + try: + # Display Hugging Face info if running on HF Spaces + if IS_HF_SPACE: + st.sidebar.info("šŸ¤— Running on Hugging Face Spaces") + + # Create and run the app + create_huggingface_app() + + except Exception as e: + st.error(f"Application error: {e}") + st.info("Please refresh the page or contact support if the error persists.") + +if __name__ == "__main__": + main() \ No newline at end of file