| | |
| | """ |
| | AI Invoice Processing System - Complete Single File for Hugging Face Spaces |
| | A comprehensive system with AI-powered extraction, semantic search, and analytics. |
| | |
| | Author: AI Assistant |
| | Date: 2024 |
| | Version: HuggingFace Single File v1.0 |
| | """ |
| |
|
| | |
| | |
| | |
| |
|
| | import os |
| | import json |
| | import re |
| | import tempfile |
| | import shutil |
| | import pickle |
| | import numpy as np |
| | from datetime import datetime |
| | from typing import Dict, List, Optional, Tuple |
| | from dataclasses import dataclass |
| | from pathlib import Path |
| | import time |
| | import logging |
| | import uuid |
| |
|
| | |
# Hugging Face Spaces injects SPACE_ID into the environment; its mere
# presence tells us we are running inside a Space.
IS_HF_SPACE = "SPACE_ID" in os.environ
| |
|
| | |
# Resolve the Hugging Face API token.
#
# Bug fix: the original read `st.secrets` *before* `import streamlit as st`
# runs below, so a NameError was always raised and silently swallowed by a
# bare `except:` — secrets were never actually consulted.  We now check the
# environment first (works everywhere) and fall back to Streamlit secrets
# only if streamlit is importable, catching narrow failures.
HF_TOKEN = os.getenv("HF_TOKEN")
if HF_TOKEN is None:
    try:
        import streamlit as st  # re-imported below at module level; harmless
        HF_TOKEN = st.secrets.get("HF_TOKEN", None)
    except Exception:
        # streamlit missing or secrets not configured — run without a token
        HF_TOKEN = None
| |
|
| | |
| | import streamlit as st |
| | import sqlite3 |
| | import pandas as pd |
| | import plotly.express as px |
| | import plotly.graph_objects as go |
| | import requests |
| |
|
| | |
# Optional FAISS backend; record availability in a feature flag so the
# rest of the app can degrade gracefully instead of crashing on import.
try:
    import faiss
except ImportError:
    FAISS_AVAILABLE = False
    st.warning("β οΈ FAISS not available. Vector search will be disabled.")
else:
    FAISS_AVAILABLE = True
| |
|
# Optional sentence-transformers backend for semantic embeddings.
try:
    from sentence_transformers import SentenceTransformer
except ImportError:
    SENTENCE_TRANSFORMERS_AVAILABLE = False
    st.warning("β οΈ Sentence Transformers not available. Using fallback methods.")
else:
    SENTENCE_TRANSFORMERS_AVAILABLE = True
| |
|
# Optional torch backend (used for fp16 / device placement with Mistral).
try:
    import torch
except ImportError:
    TORCH_AVAILABLE = False
else:
    TORCH_AVAILABLE = True
| |
|
| | |
# Probe for a PDF text-extraction backend, preferring pdfplumber over
# PyPDF2.  PDF_PROCESSOR records which one (if any) is usable.
PDF_PROCESSING_AVAILABLE = False
PDF_PROCESSOR = None
try:
    import pdfplumber
    PDF_PROCESSING_AVAILABLE = True
    PDF_PROCESSOR = "pdfplumber"
except ImportError:
    try:
        import PyPDF2
        PDF_PROCESSING_AVAILABLE = True
        PDF_PROCESSOR = "PyPDF2"
    except ImportError:
        pass
| |
|
| | |
| | |
| | |
| |
|
# Must be the first Streamlit call executed in the script: configures the
# browser tab title/icon, the wide layout, and the hamburger-menu links.
st.set_page_config(
    page_title="AI Invoice Processing System",
    page_icon="π",
    layout="wide",
    initial_sidebar_state="expanded",
    menu_items={
        'Get Help': 'https://huggingface.co/spaces',
        'Report a bug': 'https://huggingface.co/spaces',
        'About': """
        # AI Invoice Processing System
        Built for Hugging Face Spaces with AI-powered extraction and semantic search.
        """
    }
)
| |
|
| | |
| | |
| | |
| |
|
# Runtime limits and paths tuned for the (CPU-only) Hugging Face Spaces tier.
HF_CONFIG = {
    "max_file_size_mb": 10,        # reject uploads larger than this
    "max_concurrent_files": 3,     # cap on files handled per batch
    "timeout_seconds": 30,         # per-operation time budget
    "use_cpu_only": True,          # Spaces free tier has no GPU
    "embedding_model": "all-MiniLM-L6-v2",  # sentence-transformers model id
    "cache_dir": "./cache",        # model/tokenizer download cache
    "data_dir": "./data",          # JSON store + pickled vector files
    "enable_ollama": False,        # local Ollama is not available on Spaces
}
| |
|
| | |
# Make sure both working directories exist before anything touches them.
for _required_dir in (HF_CONFIG["cache_dir"], HF_CONFIG["data_dir"]):
    os.makedirs(_required_dir, exist_ok=True)
| |
|
| | |
| | |
| | |
| |
|
@dataclass
class InvoiceData:
    """Data structure for extracted invoice information.

    All fields default to "empty" values so a blank instance can signal a
    failed extraction without raising.
    """
    supplier_name: str = ""              # company issuing the invoice
    buyer_name: str = ""                 # company being billed
    invoice_number: str = ""             # invoice / bill / reference number
    date: str = ""                       # normalised to YYYY-MM-DD when parseable
    amount: float = 0.0                  # total invoice amount (currency-agnostic)
    quantity: int = 0                    # total item quantity
    product_description: str = ""        # short description of goods/services
    file_path: str = ""                  # original uploaded file name
    extraction_confidence: float = 0.0   # heuristic confidence in [0, 1]
    processing_method: str = "regex"     # backend used: regex / ai_ner / mistral_7b
| |
|
@dataclass
class VectorSearchResult:
    """Data structure for vector search results (one hit per instance)."""
    invoice_id: str           # id of the matching invoice record
    invoice_number: str       # its invoice number, for display
    supplier_name: str        # its supplier, for display
    similarity_score: float   # cosine similarity to the query (higher = closer)
    content_preview: str      # truncated document text for the results list
    metadata: Dict            # full metadata dict stored alongside the vector
| |
|
| | |
| | |
| | |
| |
|
class DocumentProcessor:
    """Simplified document processor for Hugging Face Spaces.

    Dispatches text extraction by file extension.  Supported types: PDF
    (via whichever of pdfplumber / PyPDF2 imported at module load) and
    plain text.  All failures are reported via Streamlit and mapped to
    an empty string rather than raised.
    """

    def __init__(self):
        # Build the extension -> extractor dispatch table once.
        self.setup_processors()

    def setup_processors(self):
        """Setup available document processors.

        Populates ``self.processors`` keyed by file type ('pdf', 'txt')
        and emits status messages so the user can see active backends.
        """
        self.processors = {}

        # Register whichever PDF backend survived the module-level probe
        # (pdfplumber is preferred over PyPDF2).
        if PDF_PROCESSING_AVAILABLE:
            if PDF_PROCESSOR == "pdfplumber":
                self.processors['pdf'] = self.extract_with_pdfplumber
                st.success("β PDF processing available (pdfplumber)")
            elif PDF_PROCESSOR == "PyPDF2":
                self.processors['pdf'] = self.extract_with_pypdf2
                st.success("β PDF processing available (PyPDF2)")
        else:
            st.warning("β οΈ No PDF processor available")

        # Plain text needs no third-party backend.
        self.processors['txt'] = self.extract_text_file

    def extract_with_pdfplumber(self, file_path: str) -> str:
        """Extract text using pdfplumber.

        Returns the concatenated text of all pages, or "" on failure.
        """
        try:
            import pdfplumber
            text = ""
            with pdfplumber.open(file_path) as pdf:
                for page in pdf.pages:
                    page_text = page.extract_text()
                    # extract_text() returns None for image-only pages.
                    if page_text:
                        text += page_text + "\n"
            return text
        except Exception as e:
            st.error(f"PDF extraction failed: {e}")
            return ""

    def extract_with_pypdf2(self, file_path: str) -> str:
        """Extract text using PyPDF2 (fallback PDF backend).

        Returns the concatenated text of all pages, or "" on failure.
        """
        try:
            import PyPDF2
            text = ""
            with open(file_path, 'rb') as file:
                pdf_reader = PyPDF2.PdfReader(file)
                for page in pdf_reader.pages:
                    text += page.extract_text() + "\n"
            return text
        except Exception as e:
            st.error(f"PDF extraction failed: {e}")
            return ""

    def extract_text_file(self, file_path: str) -> str:
        """Extract text from text files (assumes UTF-8; "" on failure)."""
        try:
            with open(file_path, 'r', encoding='utf-8') as f:
                return f.read()
        except Exception as e:
            st.error(f"Text file extraction failed: {e}")
            return ""

    def extract_text_from_document(self, file_path: str) -> str:
        """Extract text from document based on file type.

        Dispatches on the lowercase extension; returns "" for unsupported
        types or when no backend is registered for the type.
        """
        file_ext = Path(file_path).suffix.lower()

        if file_ext == '.pdf':
            processor = self.processors.get('pdf')
        elif file_ext == '.txt':
            processor = self.processors.get('txt')
        else:
            st.warning(f"Unsupported file type: {file_ext}")
            return ""

        if processor:
            return processor(file_path)
        else:
            st.error(f"No processor available for {file_ext}")
            return ""
| |
|
| | |
| | |
| | |
| |
|
| | class AIExtractor: |
| | """AI extraction for Hugging Face Spaces with Mistral 7B support""" |
| | |
| | def __init__(self): |
| | self.use_mistral = self.setup_mistral() |
| | self.use_transformers = self.setup_transformers() if not self.use_mistral else False |
| | |
    def setup_mistral(self):
        """Try to setup Mistral 7B model with proper authentication.

        Returns True only when the HF token, the memory budget, and all
        heavy dependencies are present and the model loads; every failure
        path degrades gracefully and returns False.
        """
        try:
            # Gated model: without a token there is no point trying.
            if not HF_TOKEN:
                st.warning("β οΈ Hugging Face token not found. Add HF_TOKEN to secrets for Mistral access.")
                return False

            # Rough capacity check — Mistral 7B in 8-bit needs ~8 GB RAM.
            import psutil
            memory_gb = psutil.virtual_memory().total / (1024**3)

            if memory_gb < 8:
                st.warning("β οΈ Insufficient memory for Mistral 7B. Using lighter models.")
                return False

            from transformers import AutoModelForCausalLM, AutoTokenizer, pipeline
            from huggingface_hub import login

            # Authenticate so the gated weights can be downloaded.
            login(token=HF_TOKEN)

            with st.spinner("π Loading Mistral 7B model (this may take a few minutes)..."):
                model_name = "mistralai/Mistral-7B-Instruct-v0.1"

                self.mistral_tokenizer = AutoTokenizer.from_pretrained(
                    model_name,
                    cache_dir=HF_CONFIG["cache_dir"],
                    token=HF_TOKEN
                )

                # NOTE(review): load_in_8bit requires bitsandbytes; if it is
                # missing this raises and we fall into the except below.
                self.mistral_model = AutoModelForCausalLM.from_pretrained(
                    model_name,
                    torch_dtype=torch.float16 if TORCH_AVAILABLE else None,
                    device_map="auto" if TORCH_AVAILABLE else None,
                    load_in_8bit=True,
                    cache_dir=HF_CONFIG["cache_dir"],
                    token=HF_TOKEN
                )

                self.mistral_pipeline = pipeline(
                    "text-generation",
                    model=self.mistral_model,
                    tokenizer=self.mistral_tokenizer,
                    torch_dtype=torch.float16 if TORCH_AVAILABLE else None,
                    device_map="auto" if TORCH_AVAILABLE else None
                )

            st.success("β Mistral 7B model loaded successfully!")
            return True

        except ImportError as e:
            st.warning(f"β οΈ Missing dependencies for Mistral 7B: {e}")
            return False
        except Exception as e:
            st.warning(f"β οΈ Mistral 7B not available: {e}")
            st.info("π‘ To use Mistral 7B: Add your Hugging Face token to secrets as 'HF_TOKEN'")
            return False
| | |
    def setup_transformers(self):
        """Fallback to lighter NER model.

        Loads a CoNLL-03 BERT NER pipeline used to pull ORG/MISC entities
        out of invoice text when Mistral is unavailable.  Returns True on
        success, False otherwise.
        """
        try:
            from transformers import pipeline

            with st.spinner("Loading fallback AI model..."):
                self.ner_pipeline = pipeline(
                    "ner",
                    model="dbmdz/bert-large-cased-finetuned-conll03-english",
                    aggregation_strategy="simple"
                )

            st.success("β Fallback AI extraction model loaded")
            return True

        except Exception as e:
            st.warning(f"β οΈ AI extraction not available: {e}")
            return False
| | |
| | def extract_with_mistral(self, text: str) -> InvoiceData: |
| | """Extract invoice data using Mistral 7B""" |
| | try: |
| | |
| | prompt = f"""<s>[INST] You are an expert at extracting structured information from invoices. |
| | |
| | Extract the following information from this invoice text and respond ONLY with valid JSON: |
| | |
| | {{ |
| | "invoice_number": "invoice or bill number", |
| | "supplier_name": "company providing goods/services", |
| | "buyer_name": "company receiving goods/services", |
| | "date": "date in YYYY-MM-DD format", |
| | "amount": "total amount as number only", |
| | "quantity": "total quantity as integer", |
| | "product_description": "brief description of items/services" |
| | }} |
| | |
| | Invoice text: |
| | {text[:2000]} |
| | |
| | Respond with JSON only: [/INST]""" |
| |
|
| | |
| | response = self.mistral_pipeline( |
| | prompt, |
| | max_new_tokens=300, |
| | temperature=0.1, |
| | do_sample=True, |
| | pad_token_id=self.mistral_tokenizer.eos_token_id |
| | ) |
| | |
| | |
| | generated_text = response[0]['generated_text'] |
| | |
| | |
| | json_start = generated_text.find('{') |
| | json_end = generated_text.rfind('}') + 1 |
| | |
| | if json_start != -1 and json_end > json_start: |
| | json_str = generated_text[json_start:json_end] |
| | |
| | |
| | import json |
| | data = json.loads(json_str) |
| | |
| | |
| | invoice_data = InvoiceData() |
| | invoice_data.supplier_name = str(data.get('supplier_name', '')).strip() |
| | invoice_data.buyer_name = str(data.get('buyer_name', '')).strip() |
| | invoice_data.invoice_number = str(data.get('invoice_number', '')).strip() |
| | invoice_data.date = self.parse_date(str(data.get('date', ''))) |
| | |
| | |
| | try: |
| | amount_val = data.get('amount', 0) |
| | if isinstance(amount_val, str): |
| | amount_clean = re.sub(r'[^\d.]', '', amount_val) |
| | invoice_data.amount = float(amount_clean) if amount_clean else 0.0 |
| | else: |
| | invoice_data.amount = float(amount_val) |
| | except: |
| | invoice_data.amount = 0.0 |
| | |
| | |
| | try: |
| | qty_val = data.get('quantity', 0) |
| | invoice_data.quantity = int(float(str(qty_val).replace(',', ''))) |
| | except: |
| | invoice_data.quantity = 0 |
| | |
| | invoice_data.product_description = str(data.get('product_description', '')).strip() |
| | invoice_data.extraction_confidence = 0.95 |
| | invoice_data.processing_method = "mistral_7b" |
| | |
| | return invoice_data |
| | else: |
| | st.warning("β οΈ Mistral response didn't contain valid JSON, falling back to regex") |
| | return self.extract_with_regex(text) |
| | |
| | except Exception as e: |
| | st.error(f"Mistral extraction failed: {e}") |
| | return self.extract_with_regex(text) |
| | |
| | def extract_with_ai(self, text: str) -> InvoiceData: |
| | """Extract invoice data using available AI method""" |
| | if self.use_mistral: |
| | st.info("π€ Using Mistral 7B for extraction...") |
| | return self.extract_with_mistral(text) |
| | elif self.use_transformers: |
| | st.info("π€ Using NER model for extraction...") |
| | return self.extract_with_ner(text) |
| | else: |
| | st.info("π§ Using regex extraction...") |
| | return self.extract_with_regex(text) |
| | |
    def extract_with_ner(self, text: str) -> InvoiceData:
        """Extract using NER model (fallback method).

        Pulls ORG entities (first seen -> supplier, second -> buyer) and
        digit-bearing MISC entities (-> invoice number) from the first
        512 characters, then backfills missing numeric/date fields from
        the regex extractor.  Falls back to pure regex on any error.
        """
        try:
            # BERT-based pipeline: keep input within its context window.
            entities = self.ner_pipeline(text[:512])

            invoice_data = InvoiceData()
            invoice_data.processing_method = "ai_ner"

            for entity in entities:
                # Strip WordPiece continuation markers from sub-tokens.
                entity_text = entity['word'].replace('##', '')

                if entity['entity_group'] == 'ORG':
                    # Heuristic: first organisation is the supplier, the
                    # next distinct one is the buyer.
                    if not invoice_data.supplier_name:
                        invoice_data.supplier_name = entity_text
                    elif not invoice_data.buyer_name:
                        invoice_data.buyer_name = entity_text

                elif entity['entity_group'] == 'MISC':
                    # A MISC entity containing digits is likely the number.
                    if not invoice_data.invoice_number and any(c.isdigit() for c in entity_text):
                        invoice_data.invoice_number = entity_text

            # Regex pass fills whatever NER could not find.
            regex_data = self.extract_with_regex(text)

            if not invoice_data.invoice_number:
                invoice_data.invoice_number = regex_data.invoice_number
            if not invoice_data.amount:
                invoice_data.amount = regex_data.amount
            if not invoice_data.date:
                invoice_data.date = regex_data.date
            if not invoice_data.quantity:
                invoice_data.quantity = regex_data.quantity

            invoice_data.extraction_confidence = 0.8

            return invoice_data

        except Exception as e:
            st.error(f"NER extraction failed: {e}")
            return self.extract_with_regex(text)
| | |
| | def extract_with_regex(self, text: str) -> InvoiceData: |
| | """Enhanced regex extraction with better amount detection""" |
| | invoice_data = InvoiceData() |
| | invoice_data.processing_method = "regex" |
| | |
| | |
| | patterns = { |
| | 'invoice_number': [ |
| | r'invoice\s*(?:no|number|#)?\s*:?\s*([A-Z0-9\-_/]+)', |
| | r'bill\s*(?:no|number|#)?\s*:?\s*([A-Z0-9\-_/]+)', |
| | r'inv\s*(?:no|number|#)?\s*:?\s*([A-Z0-9\-_/]+)', |
| | r'ref\s*(?:no|number|#)?\s*:?\s*([A-Z0-9\-_/]+)', |
| | r'#\s*([A-Z0-9\-_/]{3,})', |
| | r'(?:^|\s)([A-Z]{2,}\d{3,}|\d{3,}[A-Z]{2,})', |
| | ], |
| | 'amount': [ |
| | |
| | r'total\s*(?:amount)?\s*:?\s*[\$βΉΒ£β¬]?\s*([0-9,]+\.?\d*)', |
| | r'amount\s*(?:due|paid|total)?\s*:?\s*[\$βΉΒ£β¬]?\s*([0-9,]+\.?\d*)', |
| | r'grand\s*total\s*:?\s*[\$βΉΒ£β¬]?\s*([0-9,]+\.?\d*)', |
| | r'net\s*(?:amount|total)\s*:?\s*[\$βΉΒ£β¬]?\s*([0-9,]+\.?\d*)', |
| | r'sub\s*total\s*:?\s*[\$βΉΒ£β¬]?\s*([0-9,]+\.?\d*)', |
| | |
| | |
| | r'[\$βΉΒ£β¬]\s*([0-9,]+\.?\d*)', |
| | |
| | |
| | r'([0-9,]+\.?\d*)\s*[\$βΉΒ£β¬]?\s* |
| | |
| | def parse_date(self, date_str: str) -> str: |
| | """Parse date to YYYY-MM-DD format""" |
| | if not date_str: |
| | return "" |
| | |
| | formats = ['%Y-%m-%d', '%m/%d/%Y', '%d/%m/%Y', '%m-%d-%Y', '%d-%m-%Y', '%Y/%m/%d'] |
| | |
| | for fmt in formats: |
| | try: |
| | parsed_date = datetime.strptime(date_str, fmt) |
| | return parsed_date.strftime('%Y-%m-%d') |
| | except ValueError: |
| | continue |
| | |
| | return date_str |
| | |
| | # =============================================================================== |
| | # VECTOR STORE CLASS |
| | # =============================================================================== |
| | |
class VectorStore:
    """Simplified vector store for Hugging Face Spaces.

    Embeddings are kept in a plain Python list (parallel to a metadata
    list) and persisted with pickle; search is a brute-force scan.
    Because embeddings are normalised at encode time, a dot product
    equals cosine similarity.

    Bug fix vs. original: the embedding-dimension probe read
    ``encode(["test"]).shape[0]``, which is the *batch size* (always 1)
    for list input — ``shape[-1]`` is the actual embedding dimension.
    """

    def __init__(self, embedding_model: str = "all-MiniLM-L6-v2"):
        self.embedding_model_name = embedding_model
        self.vector_store_path = os.path.join(HF_CONFIG["data_dir"], "vectors.pkl")
        self.metadata_path = os.path.join(HF_CONFIG["data_dir"], "metadata.pkl")
        self.embedding_model = None
        self.vectors = []              # one embedding per document
        self.document_metadata = []    # dicts, parallel to self.vectors
        self.embedding_dimension = None

        self.setup_embedding_model()
        self.load_vector_store()

    def setup_embedding_model(self):
        """Initialize the sentence transformer model (no-op when the
        sentence-transformers package is unavailable)."""
        if not SENTENCE_TRANSFORMERS_AVAILABLE:
            st.warning("β οΈ Sentence Transformers not available. Vector search disabled.")
            return

        try:
            with st.spinner(f"Loading embedding model: {self.embedding_model_name}..."):
                self.embedding_model = SentenceTransformer(
                    self.embedding_model_name,
                    cache_folder=HF_CONFIG["cache_dir"]
                )

                # Probe the model for its output dimensionality.  List
                # input yields a (batch, dim) array, so shape[-1] — not
                # shape[0] — is the embedding dimension.
                test_embedding = self.embedding_model.encode(["test"])
                self.embedding_dimension = int(test_embedding.shape[-1])

            st.success(f"β Embedding model loaded: {self.embedding_model_name}")

        except Exception as e:
            st.error(f"β Failed to load embedding model: {e}")
            self.embedding_model = None

    def load_vector_store(self):
        """Load existing vector store from disk, or start empty.

        NOTE(review): pickle.load is only safe because these files live
        in our own data dir; never point these paths at untrusted input.
        """
        try:
            if os.path.exists(self.vector_store_path) and os.path.exists(self.metadata_path):
                with open(self.vector_store_path, 'rb') as f:
                    self.vectors = pickle.load(f)

                with open(self.metadata_path, 'rb') as f:
                    self.document_metadata = pickle.load(f)

                st.success(f"β Vector store loaded: {len(self.document_metadata)} documents")
            else:
                self.vectors = []
                self.document_metadata = []
                st.info("π New vector store initialized")

        except Exception as e:
            st.error(f"β Error loading vector store: {e}")
            self.vectors = []
            self.document_metadata = []

    def save_vector_store(self):
        """Save vector store to disk; returns True on success."""
        try:
            with open(self.vector_store_path, 'wb') as f:
                pickle.dump(self.vectors, f)

            with open(self.metadata_path, 'wb') as f:
                pickle.dump(self.document_metadata, f)

            return True
        except Exception as e:
            st.error(f"Error saving vector store: {e}")
            return False

    def create_document_text(self, invoice_data: dict, raw_text: str = "") -> str:
        """Flatten an invoice record into one searchable string.

        Every truthy field except 'id' is rendered as "field: value";
        up to 300 chars of the raw document text are appended.
        """
        text_parts = []

        for field, value in invoice_data.items():
            if value and field != 'id':
                text_parts.append(f"{field}: {value}")

        if raw_text:
            text_parts.append(f"content: {raw_text[:300]}")

        return " | ".join(text_parts)

    def add_document(self, invoice_data: dict, raw_text: str = "") -> bool:
        """Embed one invoice and append it to the in-memory store.

        Returns False when no embedding model is loaded or on error.
        Callers must invoke save_vector_store() to persist the addition.
        """
        if not self.embedding_model:
            return False

        try:
            document_text = self.create_document_text(invoice_data, raw_text)

            # Normalised embeddings let semantic_search use a plain dot
            # product as the cosine score.
            embedding = self.embedding_model.encode(document_text, normalize_embeddings=True)

            metadata = {
                'invoice_id': invoice_data.get('id', ''),
                'invoice_number': invoice_data.get('invoice_number', ''),
                'supplier_name': invoice_data.get('supplier_name', ''),
                'buyer_name': invoice_data.get('buyer_name', ''),
                'amount': invoice_data.get('amount', 0),
                'date': invoice_data.get('date', ''),
                'file_name': invoice_data.get('file_info', {}).get('file_name', ''),
                'document_text': document_text[:200],
                'timestamp': datetime.now().isoformat()
            }

            self.vectors.append(embedding)
            self.document_metadata.append(metadata)

            return True

        except Exception as e:
            st.error(f"Error adding document to vector store: {e}")
            return False

    def semantic_search(self, query: str, top_k: int = 5) -> List[VectorSearchResult]:
        """Perform semantic search using cosine similarity.

        Returns up to ``top_k`` results scoring above a 0.1 relevance
        threshold, ordered best-first.  Empty list when the model or the
        store is unavailable.
        """
        if not self.embedding_model or not self.vectors:
            return []

        try:
            query_embedding = self.embedding_model.encode(query, normalize_embeddings=True)

            # Brute-force scan — fine at this scale (hundreds of docs).
            similarities = []
            for i, doc_embedding in enumerate(self.vectors):
                similarity = np.dot(query_embedding, doc_embedding)
                similarities.append((similarity, i))

            similarities.sort(reverse=True)

            results = []
            for similarity, idx in similarities[:top_k]:
                if similarity > 0.1:  # relevance threshold
                    metadata = self.document_metadata[idx]
                    result = VectorSearchResult(
                        invoice_id=metadata.get('invoice_id', ''),
                        invoice_number=metadata.get('invoice_number', ''),
                        supplier_name=metadata.get('supplier_name', ''),
                        similarity_score=float(similarity),
                        content_preview=metadata.get('document_text', ''),
                        metadata=metadata
                    )
                    results.append(result)

            return results

        except Exception as e:
            st.error(f"Error in semantic search: {e}")
            return []
| | |
| | # =============================================================================== |
| | # MAIN PROCESSOR CLASS |
| | # =============================================================================== |
| | |
class InvoiceProcessor:
    """Main invoice processor for Hugging Face Spaces.

    Pipeline: spill upload to a temp file -> extract raw text ->
    AI/regex field extraction -> persist to the JSON store -> index in
    the optional vector store.  Progress and failures are surfaced via
    Streamlit widgets.

    Bug fixes vs. original: load_json_data() no longer recurses forever
    when the JSON file exists but is corrupt, and the temp-file cleanup
    catches OSError instead of a bare ``except:``.
    """

    def __init__(self):
        self.setup_storage()
        self.document_processor = DocumentProcessor()
        self.ai_extractor = AIExtractor()
        # Vector search is optional — only when embeddings are importable.
        self.vector_store = VectorStore() if SENTENCE_TRANSFORMERS_AVAILABLE else None

        # Per-session counters (not persisted across restarts).
        self.processing_stats = {
            'total_processed': 0,
            'successful': 0,
            'failed': 0,
            'start_time': datetime.now()
        }

    def setup_storage(self):
        """Resolve storage paths and seed the JSON store if missing."""
        self.data_dir = HF_CONFIG["data_dir"]
        self.json_path = os.path.join(self.data_dir, "invoices.json")

        if not os.path.exists(self.json_path):
            initial_data = {
                "metadata": {
                    "created_at": datetime.now().isoformat(),
                    "version": "hf_v1.0",
                    "total_invoices": 0
                },
                "invoices": [],
                "summary": {
                    "total_amount": 0.0,
                    "unique_suppliers": [],
                    "processing_stats": {"successful": 0, "failed": 0}
                }
            }
            self.save_json_data(initial_data)

    def load_json_data(self) -> dict:
        """Load invoice data from JSON, re-initialising on failure.

        Bug fix: on a corrupt-but-existing file the original recursed
        forever, because setup_storage() only re-seeds when the file is
        *absent*.  The unreadable file is now removed first, so the
        single recursive retry is guaranteed to succeed.
        """
        try:
            with open(self.json_path, 'r', encoding='utf-8') as f:
                return json.load(f)
        except (FileNotFoundError, json.JSONDecodeError):
            try:
                os.remove(self.json_path)  # drop a corrupt store, if any
            except OSError:
                pass
            self.setup_storage()
            return self.load_json_data()

    def save_json_data(self, data: dict):
        """Save invoice data to JSON (UTF-8, human-readable indent)."""
        try:
            with open(self.json_path, 'w', encoding='utf-8') as f:
                json.dump(data, f, indent=2, ensure_ascii=False)
        except Exception as e:
            st.error(f"Error saving data: {e}")

    def process_uploaded_file(self, uploaded_file) -> InvoiceData:
        """Process a single uploaded file with enhanced debugging.

        Returns the populated InvoiceData, or an empty InvoiceData on any
        validation/processing failure (errors are shown in the UI rather
        than raised).
        """
        self.processing_stats['total_processed'] += 1

        try:
            file_size = len(uploaded_file.getvalue())
            file_extension = uploaded_file.name.split('.')[-1].lower() if '.' in uploaded_file.name else 'unknown'

            st.info(f"π Processing: {uploaded_file.name} ({file_size/1024:.1f} KB, .{file_extension})")

            # Reject oversize uploads up front.
            if file_size > HF_CONFIG["max_file_size_mb"] * 1024 * 1024:
                error_msg = f"File too large: {file_size / 1024 / 1024:.2f}MB > {HF_CONFIG['max_file_size_mb']}MB"
                st.error(error_msg)
                self.processing_stats['failed'] += 1
                return InvoiceData()

            # Only PDF and plain text are supported.
            if file_extension not in ['pdf', 'txt']:
                error_msg = f"Unsupported file type: .{file_extension} (supported: PDF, TXT)"
                st.warning(error_msg)
                self.processing_stats['failed'] += 1
                return InvoiceData()

            # The extractors work on file paths, so spill the upload to disk.
            with tempfile.NamedTemporaryFile(delete=False, suffix=f".{file_extension}") as tmp_file:
                file_content = uploaded_file.getvalue()
                tmp_file.write(file_content)
                tmp_file_path = tmp_file.name

            st.info(f"πΎ Saved temporarily to: {tmp_file_path}")

            try:
                st.info("π Extracting text from document...")
                text = self.document_processor.extract_text_from_document(tmp_file_path)

                if not text or not text.strip():
                    st.warning(f"β No text extracted from {uploaded_file.name}")
                    self.processing_stats['failed'] += 1
                    return InvoiceData()

                text_length = len(text)
                st.info(f"π Extracted {text_length} characters of text")

                if text_length > 0:
                    with st.expander("π Text Preview (First 500 characters)", expanded=False):
                        st.text(text[:500] + "..." if len(text) > 500 else text)

                st.info("π€ Extracting invoice data using AI/Regex...")
                invoice_data = self.ai_extractor.extract_with_ai(text)
                invoice_data.file_path = uploaded_file.name

                st.info(f"π Extraction completed with {invoice_data.extraction_confidence:.1%} confidence")

                st.info("πΎ Saving extracted data...")
                self.save_invoice_data(invoice_data, text, file_size)

                self.processing_stats['successful'] += 1
                st.success(f"β Successfully processed {uploaded_file.name}")

                return invoice_data

            finally:
                # Best-effort temp-file cleanup; OSError only (the original
                # bare except could hide unrelated bugs).
                try:
                    os.unlink(tmp_file_path)
                    st.info("π§Ή Cleaned up temporary file")
                except OSError:
                    pass

        except Exception as e:
            error_msg = f"Error processing {uploaded_file.name}: {str(e)}"
            st.error(error_msg)
            self.processing_stats['failed'] += 1

            # Full traceback tucked away for debugging.
            with st.expander("π Error Details", expanded=False):
                st.code(str(e))
                import traceback
                st.code(traceback.format_exc())

            return InvoiceData()

    def save_invoice_data(self, invoice_data: InvoiceData, raw_text: str, file_size: int):
        """Append one invoice to the JSON store and (optionally) index it."""
        try:
            data = self.load_json_data()

            # Sequential ids: 1 + current record count.
            invoice_record = {
                "id": len(data["invoices"]) + 1,
                "invoice_number": invoice_data.invoice_number,
                "supplier_name": invoice_data.supplier_name,
                "buyer_name": invoice_data.buyer_name,
                "date": invoice_data.date,
                "amount": invoice_data.amount,
                "quantity": invoice_data.quantity,
                "product_description": invoice_data.product_description,
                "file_info": {
                    "file_name": invoice_data.file_path,
                    "file_size": file_size
                },
                "extraction_info": {
                    "confidence": invoice_data.extraction_confidence,
                    "method": invoice_data.processing_method,
                    "raw_text_preview": raw_text[:300]
                },
                "timestamps": {
                    "created_at": datetime.now().isoformat()
                }
            }

            data["invoices"].append(invoice_record)
            self.update_summary(data)
            self.save_json_data(data)

            # Keep the vector index in sync with the JSON store.
            if self.vector_store:
                self.vector_store.add_document(invoice_record, raw_text)
                self.vector_store.save_vector_store()

        except Exception as e:
            st.error(f"Error saving invoice data: {e}")

    def update_summary(self, data: dict):
        """Recompute the aggregate summary block of the store in-place."""
        invoices = data["invoices"]

        total_amount = sum(inv.get("amount", 0) for inv in invoices)
        unique_suppliers = list(set(inv.get("supplier_name", "") for inv in invoices if inv.get("supplier_name")))

        data["summary"] = {
            "total_amount": total_amount,
            "unique_suppliers": unique_suppliers,
            "processing_stats": {
                "successful": self.processing_stats['successful'],
                "failed": self.processing_stats['failed'],
                "total_processed": self.processing_stats['total_processed']
            }
        }

        data["metadata"]["last_updated"] = datetime.now().isoformat()
        data["metadata"]["total_invoices"] = len(invoices)
| | |
| | # =============================================================================== |
| | # CHATBOT CLASS |
| | # =============================================================================== |
| | |
| | class ChatBot: |
| | """Chatbot for invoice queries""" |
| | |
    def __init__(self, processor: InvoiceProcessor):
        # Keep a handle on the processor so queries can reach the JSON
        # store and the optional vector store.
        self.processor = processor
| | |
| | def query_database(self, query: str) -> str: |
| | """Process user query and return response""" |
| | try: |
| | data = self.processor.load_json_data() |
| | invoices = data.get("invoices", []) |
| | |
| | if not invoices: |
| | return "No invoice data found. Please upload some invoices first." |
| | |
| | query_lower = query.lower() |
| | |
| | # Handle different query types |
| | if any(phrase in query_lower for phrase in ["summary", "overview", "total"]): |
| | return self.generate_summary(data) |
| | |
| | elif "count" in query_lower or "how many" in query_lower: |
| | return self.handle_count_query(data) |
| | |
| | elif any(phrase in query_lower for phrase in ["amount", "value", "money", "cost"]): |
| | return self.handle_amount_query(data) |
| | |
| | elif any(phrase in query_lower for phrase in ["supplier", "vendor", "company"]): |
| | return self.handle_supplier_query(data, query) |
| | |
| | elif self.processor.vector_store: |
| | return self.handle_semantic_search(query) |
| | |
| | else: |
| | return self.handle_general_query(data, query) |
| | |
| | except Exception as e: |
| | return f"Error processing query: {e}" |
| | |
    def generate_summary(self, data: dict) -> str:
        """Generate comprehensive summary.

        Builds a markdown report of totals, averages, supplier count,
        processing stats, and the five most recently ingested invoices.
        """
        invoices = data.get("invoices", [])
        summary = data.get("summary", {})

        if not invoices:
            return "No invoices found in the system."

        total_amount = summary.get("total_amount", 0)
        avg_amount = total_amount / len(invoices) if invoices else 0
        unique_suppliers = len(summary.get("unique_suppliers", []))

        response = f"""
**π Invoice System Summary**

β’ **Total Invoices**: {len(invoices):,}
β’ **Total Value**: βΉ{total_amount:,.2f}
β’ **Average Invoice**: βΉ{avg_amount:,.2f}
β’ **Unique Suppliers**: {unique_suppliers}

**π Processing Stats**
β’ **Successful**: {summary.get('processing_stats', {}).get('successful', 0)}
β’ **Failed**: {summary.get('processing_stats', {}).get('failed', 0)}

**π Recent Invoices**
"""

        # Newest first by ingestion timestamp; show the latest five.
        recent = sorted(invoices, key=lambda x: x.get('timestamps', {}).get('created_at', ''), reverse=True)[:5]
        for i, inv in enumerate(recent, 1):
            response += f"\n{i}. **{inv.get('invoice_number', 'N/A')}** - {inv.get('supplier_name', 'Unknown')} (βΉ{inv.get('amount', 0):,.2f})"

        return response
| | |
    def handle_count_query(self, data: dict) -> str:
        """Handle count-related queries.

        Reports total records, distinct invoice numbers, the implied
        duplicate count, and first/last ingestion dates.
        """
        invoices = data.get("invoices", [])
        total = len(invoices)
        # Blank invoice numbers are excluded from the uniqueness count.
        unique_numbers = len(set(inv.get('invoice_number', '') for inv in invoices if inv.get('invoice_number')))

        return f"""
**π Invoice Count Summary**

β’ **Total Records**: {total}
β’ **Unique Invoice Numbers**: {unique_numbers}
β’ **Duplicates**: {total - unique_numbers if total > unique_numbers else 0}

**π Processing Timeline**
β’ **First Invoice**: {invoices[0].get('timestamps', {}).get('created_at', 'N/A')[:10] if invoices else 'N/A'}
β’ **Latest Invoice**: {invoices[-1].get('timestamps', {}).get('created_at', 'N/A')[:10] if invoices else 'N/A'}
"""
| | |
    def handle_amount_query(self, data: dict) -> str:
        """Handle amount-related queries.

        Summarises total/average/min/max across invoices with a positive
        amount, then lists the top high-value invoices.
        """
        invoices = data.get("invoices", [])
        # Zero/absent amounts are treated as "no amount information".
        amounts = [inv.get('amount', 0) for inv in invoices if inv.get('amount', 0) > 0]

        if not amounts:
            return "No amount information found in invoices."

        total_amount = sum(amounts)
        avg_amount = total_amount / len(amounts)
        max_amount = max(amounts)
        min_amount = min(amounts)

        # "High value" = the 5th-largest amount when there are more than
        # five invoices; otherwise just the maximum.
        high_value_threshold = sorted(amounts, reverse=True)[min(4, len(amounts)-1)] if len(amounts) > 5 else max_amount
        high_value_invoices = [inv for inv in invoices if inv.get('amount', 0) >= high_value_threshold]

        response = f"""
**π° Financial Analysis**

β’ **Total Amount**: βΉ{total_amount:,.2f}
β’ **Average Amount**: βΉ{avg_amount:,.2f}
β’ **Highest Invoice**: βΉ{max_amount:,.2f}
β’ **Lowest Invoice**: βΉ{min_amount:,.2f}

**π― High-Value Invoices (βΉ{high_value_threshold:,.2f}+)**
"""

        for i, inv in enumerate(high_value_invoices[:5], 1):
            response += f"\n{i}. **{inv.get('invoice_number', 'N/A')}** - {inv.get('supplier_name', 'Unknown')} (βΉ{inv.get('amount', 0):,.2f})"

        return response
| | |
| | def handle_supplier_query(self, data: dict, query: str) -> str: |
| | """Handle supplier-related queries""" |
| | invoices = data.get("invoices", []) |
| | |
| | # Count invoices by supplier |
| | supplier_counts = {} |
| | supplier_amounts = {} |
| | |
| | for inv in invoices: |
| | supplier = inv.get('supplier_name', '').strip() |
| | if supplier: |
| | supplier_counts[supplier] = supplier_counts.get(supplier, 0) + 1 |
| | supplier_amounts[supplier] = supplier_amounts.get(supplier, 0) + inv.get('amount', 0) |
| | |
| | if not supplier_counts: |
| | return "No supplier information found in invoices." |
| | |
| | # Sort suppliers by amount |
| | top_suppliers = sorted(supplier_amounts.items(), key=lambda x: x[1], reverse=True)[:10] |
| | |
| | response = f""" |
| | **π’ Supplier Analysis** |
| | |
| | β’ **Total Unique Suppliers**: {len(supplier_counts)} |
| | β’ **Most Active**: {max(supplier_counts, key=supplier_counts.get)} ({supplier_counts[max(supplier_counts, key=supplier_counts.get)]} invoices) |
| | |
| | **π° Top Suppliers by Amount** |
| | """ |
| | |
| | for i, (supplier, amount) in enumerate(top_suppliers, 1): |
| | count = supplier_counts[supplier] |
| | avg = amount / count if count > 0 else 0 |
| | response += f"\n{i}. **{supplier}** - βΉ{amount:,.2f} ({count} invoices, avg: βΉ{avg:,.2f})" |
| | |
| | return response |
| | |
| | def handle_semantic_search(self, query: str) -> str: |
| | """Handle semantic search queries""" |
| | try: |
| | results = self.processor.vector_store.semantic_search(query, top_k=5) |
| | |
| | if not results: |
| | return f"No relevant results found for '{query}'. Try different keywords." |
| | |
| | response = f"π **Semantic Search Results for '{query}'**\n\n" |
| | |
| | for i, result in enumerate(results, 1): |
| | response += f"{i}. **{result.invoice_number}** - {result.supplier_name}\n" |
| | response += f" β’ Similarity: {result.similarity_score:.3f}\n" |
| | response += f" β’ Amount: βΉ{result.metadata.get('amount', 0):,.2f}\n" |
| | response += f" β’ Preview: {result.content_preview[:100]}...\n\n" |
| | |
| | return response |
| | |
| | except Exception as e: |
| | return f"Semantic search error: {e}" |
| | |
| | def handle_general_query(self, data: dict, query: str) -> str: |
| | """Handle general queries with keyword search""" |
| | invoices = data.get("invoices", []) |
| | query_words = query.lower().split() |
| | |
| | # Simple keyword matching |
| | matching_invoices = [] |
| | for inv in invoices: |
| | text_to_search = ( |
| | inv.get('supplier_name', '') + ' ' + |
| | inv.get('buyer_name', '') + ' ' + |
| | inv.get('product_description', '') + ' ' + |
| | inv.get('extraction_info', {}).get('raw_text_preview', '') |
| | ).lower() |
| | |
| | if any(word in text_to_search for word in query_words): |
| | matching_invoices.append(inv) |
| | |
| | if not matching_invoices: |
| | return f"No invoices found matching '{query}'. Try different keywords or check the summary." |
| | |
| | response = f"π **Found {len(matching_invoices)} invoices matching '{query}'**\n\n" |
| | |
| | for i, inv in enumerate(matching_invoices[:5], 1): |
| | response += f"{i}. **{inv.get('invoice_number', 'N/A')}** - {inv.get('supplier_name', 'Unknown')}\n" |
| | response += f" β’ Amount: βΉ{inv.get('amount', 0):,.2f}\n" |
| | response += f" β’ Date: {inv.get('date', 'N/A')}\n\n" |
| | |
| | if len(matching_invoices) > 5: |
| | response += f"... and {len(matching_invoices) - 5} more results." |
| | |
| | return response |
| | |
| | # =============================================================================== |
| | # STREAMLIT APPLICATION |
| | # =============================================================================== |
| | |
| | def create_app(): |
| | """Main Streamlit application""" |
| | |
| | # Generate unique session ID for this run |
| | if 'session_id' not in st.session_state: |
| | st.session_state.session_id = str(uuid.uuid4())[:8] |
| | |
| | session_id = st.session_state.session_id |
| | |
| | # Custom CSS |
| | st.markdown(""" |
| | <style> |
| | .main-header { |
| | font-size: 2.5rem; |
| | font-weight: bold; |
| | text-align: center; |
| | color: #FF6B35; |
| | margin-bottom: 1rem; |
| | } |
| | .feature-box { |
| | background: linear-gradient(135deg, #667eea 0%, #764ba2 100%); |
| | padding: 1rem; |
| | border-radius: 10px; |
| | color: white; |
| | margin: 0.5rem 0; |
| | text-align: center; |
| | } |
| | .status-ok { color: #28a745; font-weight: bold; } |
| | .status-warning { color: #ffc107; font-weight: bold; } |
| | .status-error { color: #dc3545; font-weight: bold; } |
| | </style> |
| | """, unsafe_allow_html=True) |
| | |
| | # Header |
| | st.markdown('<h1 class="main-header">π AI Invoice Processing System</h1>', unsafe_allow_html=True) |
| | st.markdown(""" |
| | <div style="text-align: center; margin-bottom: 2rem;"> |
| | <p style="font-size: 1.1rem; color: #666;"> |
| | AI-Powered Document Processing β’ Semantic Search β’ Smart Analytics β’ Hugging Face Spaces |
| | </p> |
| | </div> |
| | """, unsafe_allow_html=True) |
| | |
| | # Initialize processor |
| | if 'processor' not in st.session_state: |
| | with st.spinner("π§ Initializing AI Invoice Processor..."): |
| | try: |
| | st.session_state.processor = InvoiceProcessor() |
| | st.session_state.chatbot = ChatBot(st.session_state.processor) |
| | st.session_state.chat_history = [] |
| | st.success("β
System initialized successfully!") |
| | except Exception as e: |
| | st.error(f"β Initialization failed: {e}") |
| | st.stop() |
| | |
| | # Sidebar |
| | with st.sidebar: |
| | st.header("ποΈ System Status") |
| | |
| | processor = st.session_state.processor |
| | |
| | # Component status |
| | if processor.document_processor.processors: |
| | st.markdown('<span class="status-ok">β
Document Processing</span>', unsafe_allow_html=True) |
| | else: |
| | st.markdown('<span class="status-error">β Document Processing</span>', unsafe_allow_html=True) |
| | |
| | if processor.ai_extractor.use_transformers: |
| | st.markdown('<span class="status-ok">β
AI Extraction</span>', unsafe_allow_html=True) |
| | else: |
| | st.markdown('<span class="status-warning">β οΈ Regex Extraction</span>', unsafe_allow_html=True) |
| | |
| | if processor.vector_store and processor.vector_store.embedding_model: |
| | st.markdown('<span class="status-ok">β
Semantic Search</span>', unsafe_allow_html=True) |
| | else: |
| | st.markdown('<span class="status-warning">β οΈ Keyword Search Only</span>', unsafe_allow_html=True) |
| | |
| | # Quick stats |
| | st.header("π Quick Stats") |
| | try: |
| | data = processor.load_json_data() |
| | total_invoices = len(data.get("invoices", [])) |
| | total_amount = data.get("summary", {}).get("total_amount", 0) |
| | |
| | st.metric("Total Invoices", total_invoices) |
| | st.metric("Total Value", f"βΉ{total_amount:,.2f}") |
| | st.metric("Success Rate", f"{processor.processing_stats['successful']}/{processor.processing_stats['total_processed']}") |
| | |
| | except Exception as e: |
| | st.error(f"Stats error: {e}") |
| | |
| | # System info |
| | st.header("βοΈ System Info") |
| | st.info(f""" |
| | **Session ID:** {session_id} |
| | |
| | **Limits:** |
| | β’ Max file size: 10MB |
| | β’ Max concurrent files: 3 |
| | β’ Timeout: 30s |
| | """) |
| | |
| | # Main navigation |
| | selected_tab = st.radio( |
| | "Choose a section:", |
| | ["π€ Upload & Process", "π¬ AI Chat", "π Analytics", "π Data Explorer"], |
| | horizontal=True, |
| | key=f"main_navigation_{session_id}" |
| | ) |
| | |
| | # ------------------------------------------------------------------------- |
| | # UPLOAD & PROCESS SECTION |
| | # ------------------------------------------------------------------------- |
| | |
| | if selected_tab == "π€ Upload & Process": |
| | st.header("π€ Upload Invoice Documents") |
| | |
| | # Feature highlights |
| | col1, col2, col3 = st.columns(3) |
| | |
| | with col1: |
| | st.markdown(""" |
| | <div class="feature-box"> |
| | <h4>π€ AI Extraction</h4> |
| | <p>Advanced NLP models extract structured data automatically</p> |
| | </div> |
| | """, unsafe_allow_html=True) |
| | |
| | with col2: |
| | st.markdown(""" |
| | <div class="feature-box"> |
| | <h4>π Smart Search</h4> |
| | <p>Semantic search finds invoices using natural language</p> |
| | </div> |
| | """, unsafe_allow_html=True) |
| | |
| | with col3: |
| | st.markdown(""" |
| | <div class="feature-box"> |
| | <h4>π Analytics</h4> |
| | <p>Comprehensive insights and visualizations</p> |
| | </div> |
| | """, unsafe_allow_html=True) |
| | |
| | # File upload |
| | st.markdown("### π Upload Your Invoices") |
| | |
| | # Initialize session state for files if not exists |
| | if f'uploaded_files_{session_id}' not in st.session_state: |
| | st.session_state[f'uploaded_files_{session_id}'] = None |
| | if f'processing_complete_{session_id}' not in st.session_state: |
| | st.session_state[f'processing_complete_{session_id}'] = False |
| | if f'currently_processing_{session_id}' not in st.session_state: |
| | st.session_state[f'currently_processing_{session_id}'] = False |
| | if f'processed_file_hashes_{session_id}' not in st.session_state: |
| | st.session_state[f'processed_file_hashes_{session_id}'] = set() |
| | |
| | # File uploader with stable key |
| | uploaded_files = st.file_uploader( |
| | "Choose invoice files (PDF, TXT supported)", |
| | type=['pdf', 'txt'], |
| | accept_multiple_files=True, |
| | help="Maximum file size: 10MB per file", |
| | key=f"file_uploader_stable_{session_id}" |
| | ) |
| | |
| | # Store uploaded files in session state only if they're new |
| | if uploaded_files: |
| | |
| | current_file_hashes = set() |
| | for file in uploaded_files: |
| | file_hash = hash((file.name, file.size)) |
| | current_file_hashes.add(file_hash) |
| | |
| | |
| | stored_hashes = st.session_state.get(f'uploaded_file_hashes_{session_id}', set()) |
| | if current_file_hashes != stored_hashes: |
| | st.session_state[f'uploaded_files_{session_id}'] = uploaded_files |
| | st.session_state[f'uploaded_file_hashes_{session_id}'] = current_file_hashes |
| | st.session_state[f'processing_complete_{session_id}'] = False |
| | st.session_state[f'currently_processing_{session_id}'] = False |
| | st.info("π New files detected - ready for processing") |
| | |
| | |
| | current_files = st.session_state[f'uploaded_files_{session_id}'] |
| | is_processing = st.session_state[f'currently_processing_{session_id}'] |
| | is_complete = st.session_state[f'processing_complete_{session_id}'] |
| | |
| | if current_files: |
| | max_files = 3 |
| | if len(current_files) > max_files: |
| | st.warning(f"β οΈ Too many files selected. Processing first {max_files} files.") |
| | current_files = current_files[:max_files] |
| | |
| | st.info(f"π {len(current_files)} files selected") |
| | |
| | |
| | st.markdown("**Selected Files:**") |
| | for i, file in enumerate(current_files, 1): |
| | file_size_mb = len(file.getvalue()) / (1024 * 1024) |
| | file_hash = hash((file.name, file.size)) |
| | processed_icon = "β
" if file_hash in st.session_state[f'processed_file_hashes_{session_id}'] else "π" |
| | st.write(f"{processed_icon} {i}. {file.name} ({file_size_mb:.2f} MB)") |
| | |
| | |
| | col1, col2 = st.columns([1, 1]) |
| | |
| | with col1: |
| | if not is_processing and not is_complete: |
| | if st.button("π Process Files", type="primary", key=f"process_btn_{session_id}"): |
| | st.session_state[f'currently_processing_{session_id}'] = True |
| | st.rerun() |
| | elif is_processing: |
| | st.info("π Processing in progress...") |
| | |
| | process_files_once(current_files, session_id) |
| | elif is_complete: |
| | st.success("β
Processing completed!") |
| | if st.button("π Process Again", key=f"reprocess_btn_{session_id}"): |
| | st.session_state[f'processing_complete_{session_id}'] = False |
| | st.session_state[f'currently_processing_{session_id}'] = False |
| | st.session_state[f'processed_file_hashes_{session_id}'] = set() |
| | st.rerun() |
| | |
| | with col2: |
| | if st.button("ποΈ Clear Files", key=f"clear_files_{session_id}"): |
| | |
| | keys_to_clear = [ |
| | f'uploaded_files_{session_id}', |
| | f'uploaded_file_hashes_{session_id}', |
| | f'processing_complete_{session_id}', |
| | f'currently_processing_{session_id}', |
| | f'processed_file_hashes_{session_id}' |
| | ] |
| | |
| | for key in keys_to_clear: |
| | if key in st.session_state: |
| | del st.session_state[key] |
| | |
| | st.success("ποΈ Files cleared successfully!") |
| | time.sleep(1) |
| | st.rerun() |
| | |
| | else: |
| | st.info("π Please select invoice files to upload and process") |
| | |
| | |
| | if is_complete: |
| | st.markdown("### π Recent Processing Results") |
| | try: |
| | data = st.session_state.processor.load_json_data() |
| | recent_invoices = sorted( |
| | data.get("invoices", []), |
| | key=lambda x: x.get('timestamps', {}).get('created_at', ''), |
| | reverse=True |
| | )[:5] |
| | |
| | if recent_invoices: |
| | for i, inv in enumerate(recent_invoices, 1): |
| | with st.expander(f"π {inv.get('invoice_number', f'Invoice {i}')} - {inv.get('supplier_name', 'Unknown')}", expanded=False): |
| | col1, col2 = st.columns(2) |
| | with col1: |
| | st.write(f"**Invoice #:** {inv.get('invoice_number', 'N/A')}") |
| | st.write(f"**Supplier:** {inv.get('supplier_name', 'N/A')}") |
| | st.write(f"**Amount:** βΉ{inv.get('amount', 0):.2f}") |
| | with col2: |
| | st.write(f"**Date:** {inv.get('date', 'N/A')}") |
| | st.write(f"**Method:** {inv.get('extraction_info', {}).get('method', 'N/A')}") |
| | st.write(f"**Confidence:** {inv.get('extraction_info', {}).get('confidence', 0):.1%}") |
| | else: |
| | st.info("No recent processing results found.") |
| | except Exception as e: |
| | st.error(f"Error loading recent results: {e}") |
| | |
| | |
| | |
| | |
| | |
| | elif selected_tab == "π¬ AI Chat": |
| | st.header("π¬ AI Chat Interface") |
| | |
| | |
| | if st.session_state.chat_history: |
| | st.markdown("### π¬ Chat History") |
| | for i, message in enumerate(st.session_state.chat_history): |
| | with st.chat_message(message["role"]): |
| | st.markdown(message["content"]) |
| | |
| | |
| | st.markdown("### βοΈ Ask a Question") |
| | |
| | col1, col2 = st.columns([4, 1]) |
| | |
| | with col1: |
| | user_input = st.text_input( |
| | "Type your question:", |
| | placeholder="e.g., 'show me total spending'", |
| | key=f"chat_input_{session_id}" |
| | ) |
| | |
| | with col2: |
| | ask_btn = st.button("π Ask", type="primary", key=f"ask_btn_{session_id}") |
| | |
| | if ask_btn and user_input: |
| | handle_chat_query(user_input) |
| | |
| | |
| | if not st.session_state.chat_history: |
| | st.markdown("### π‘ Try These Queries") |
| | |
| | col1, col2 = st.columns(2) |
| | |
| | with col1: |
| | st.markdown("**π Basic Queries:**") |
| | basic_queries = [ |
| | "Show me a summary of all invoices", |
| | "How much have we spent in total?", |
| | "Who are our top suppliers?", |
| | "Find invoices with high amounts" |
| | ] |
| | for i, query in enumerate(basic_queries): |
| | if st.button(query, key=f"basic_{session_id}_{i}"): |
| | handle_chat_query(query) |
| | |
| | with col2: |
| | st.markdown("**π Advanced Queries:**") |
| | advanced_queries = [ |
| | "Find technology purchases", |
| | "Show office supplies", |
| | "Search consulting services", |
| | "Recent high-value invoices" |
| | ] |
| | for i, query in enumerate(advanced_queries): |
| | if st.button(query, key=f"advanced_{session_id}_{i}"): |
| | handle_chat_query(query) |
| | |
| | |
| | if st.session_state.chat_history: |
| | if st.button("ποΈ Clear Chat", key=f"clear_chat_{session_id}"): |
| | st.session_state.chat_history = [] |
| | st.rerun() |
| | |
| | |
| | |
| | |
| | |
| | elif selected_tab == "π Analytics": |
| | st.header("π Analytics Dashboard") |
| | |
| | try: |
| | data = st.session_state.processor.load_json_data() |
| | invoices = data.get("invoices", []) |
| | |
| | if not invoices: |
| | st.info("π No data available. Upload some invoices to see analytics.") |
| | return |
| | |
| | |
| | df_data = [] |
| | for inv in invoices: |
| | df_data.append({ |
| | 'invoice_number': inv.get('invoice_number', ''), |
| | 'supplier_name': inv.get('supplier_name', ''), |
| | 'amount': inv.get('amount', 0), |
| | 'date': inv.get('date', ''), |
| | 'confidence': inv.get('extraction_info', {}).get('confidence', 0) |
| | }) |
| | |
| | df = pd.DataFrame(df_data) |
| | |
| | |
| | col1, col2, col3, col4 = st.columns(4) |
| | |
| | with col1: |
| | st.metric("Total Invoices", len(df)) |
| | with col2: |
| | st.metric("Total Amount", f"βΉ{df['amount'].sum():,.2f}") |
| | with col3: |
| | st.metric("Avg Amount", f"βΉ{df['amount'].mean():,.2f}") |
| | with col4: |
| | st.metric("Unique Suppliers", df['supplier_name'].nunique()) |
| | |
| | |
| | if len(df) > 0: |
| | |
| | fig_hist = px.histogram( |
| | df, |
| | x='amount', |
| | title="Invoice Amount Distribution", |
| | labels={'amount': 'Amount (βΉ)', 'count': 'Number of Invoices'} |
| | ) |
| | st.plotly_chart(fig_hist, use_container_width=True) |
| | |
| | |
| | if df['supplier_name'].notna().any(): |
| | supplier_amounts = df.groupby('supplier_name')['amount'].sum().sort_values(ascending=False).head(10) |
| | |
| | if len(supplier_amounts) > 0: |
| | fig_suppliers = px.bar( |
| | x=supplier_amounts.values, |
| | y=supplier_amounts.index, |
| | orientation='h', |
| | title="Top 10 Suppliers by Total Amount", |
| | labels={'x': 'Total Amount (βΉ)', 'y': 'Supplier'} |
| | ) |
| | st.plotly_chart(fig_suppliers, use_container_width=True) |
| | |
| | except Exception as e: |
| | st.error(f"Analytics error: {e}") |
| | |
| | |
| | |
| | |
| | |
| | elif selected_tab == "π Data Explorer": |
| | st.header("π Data Explorer") |
| | |
| | try: |
| | data = st.session_state.processor.load_json_data() |
| | invoices = data.get("invoices", []) |
| | |
| | if not invoices: |
| | st.info("π No data available. Upload some invoices first.") |
| | return |
| | |
| | |
| | df_data = [] |
| | for inv in invoices: |
| | df_data.append({ |
| | 'Invoice Number': inv.get('invoice_number', ''), |
| | 'Supplier': inv.get('supplier_name', ''), |
| | 'Buyer': inv.get('buyer_name', ''), |
| | 'Amount': inv.get('amount', 0), |
| | 'Date': inv.get('date', ''), |
| | 'Confidence': inv.get('extraction_info', {}).get('confidence', 0), |
| | 'Method': inv.get('extraction_info', {}).get('method', ''), |
| | 'File': inv.get('file_info', {}).get('file_name', ''), |
| | 'Created': inv.get('timestamps', {}).get('created_at', '')[:19] |
| | }) |
| | |
| | df = pd.DataFrame(df_data) |
| | |
| | |
| | col1, col2, col3 = st.columns(3) |
| | |
| | with col1: |
| | suppliers = ['All'] + sorted(df['Supplier'].dropna().unique().tolist()) |
| | selected_supplier = st.selectbox("Filter by Supplier", suppliers, key=f"supplier_filter_{session_id}") |
| | |
| | with col2: |
| | methods = ['All'] + sorted(df['Method'].dropna().unique().tolist()) |
| | selected_method = st.selectbox("Filter by Method", methods, key=f"method_filter_{session_id}") |
| | |
| | with col3: |
| | min_amount = st.number_input("Min Amount", min_value=0.0, value=0.0, key=f"amount_filter_{session_id}") |
| | |
| | |
| | filtered_df = df.copy() |
| | if selected_supplier != 'All': |
| | filtered_df = filtered_df[filtered_df['Supplier'] == selected_supplier] |
| | if selected_method != 'All': |
| | filtered_df = filtered_df[filtered_df['Method'] == selected_method] |
| | if min_amount > 0: |
| | filtered_df = filtered_df[filtered_df['Amount'] >= min_amount] |
| | |
| | |
| | st.dataframe( |
| | filtered_df, |
| | use_container_width=True, |
| | column_config={ |
| | "Amount": st.column_config.NumberColumn("Amount", format="βΉ%.2f"), |
| | "Confidence": st.column_config.ProgressColumn("Confidence", min_value=0, max_value=1) |
| | } |
| | ) |
| | |
| | |
| | col1, col2 = st.columns(2) |
| | |
| | with col1: |
| | if st.button("π₯ Export CSV", key=f"export_csv_{session_id}"): |
| | csv_data = filtered_df.to_csv(index=False) |
| | st.download_button( |
| | "Download CSV", |
| | csv_data, |
| | f"invoices_{datetime.now().strftime('%Y%m%d_%H%M')}.csv", |
| | "text/csv", |
| | key=f"download_csv_{session_id}" |
| | ) |
| | |
| | with col2: |
| | if st.button("π Export JSON", key=f"export_json_{session_id}"): |
| | filtered_invoices = [inv for inv in invoices |
| | if inv.get('invoice_number') in filtered_df['Invoice Number'].values] |
| | |
| | export_data = { |
| | "exported_at": datetime.now().isoformat(), |
| | "total_records": len(filtered_invoices), |
| | "invoices": filtered_invoices |
| | } |
| | |
| | st.download_button( |
| | "Download JSON", |
| | json.dumps(export_data, indent=2), |
| | f"invoices_{datetime.now().strftime('%Y%m%d_%H%M')}.json", |
| | "application/json", |
| | key=f"download_json_{session_id}" |
| | ) |
| | |
| | except Exception as e: |
| | st.error(f"Data explorer error: {e}") |
| | |
| | |
| | |
| | |
| | |
| | st.markdown("---") |
| | st.markdown("### π¬ Quick Chat (Works from any section)") |
| | |
| | global_query = st.chat_input("Ask about your invoices...", key=f"global_chat_{session_id}") |
| | |
| | if global_query: |
| | handle_chat_query(global_query, show_response=True) |
| | |
| | |
| | st.markdown("---") |
| | st.markdown(""" |
| | <div style="text-align: center; color: #666;"> |
| | <p>π <strong>AI Invoice Processing System</strong> - Optimized for Hugging Face Spaces</p> |
| | <p>Built with β€οΈ using Streamlit, Transformers, and AI</p> |
| | </div> |
| | """, unsafe_allow_html=True) |
| |
|
| | |
| | |
| | |
| |
|
| | def process_files_once(uploaded_files, session_id): |
| | """Process uploaded files only once with proper state management""" |
| | if not uploaded_files: |
| | st.error("No files to process!") |
| | st.session_state[f'currently_processing_{session_id}'] = False |
| | return |
| | |
| | st.markdown("### π Processing Files...") |
| | |
| | |
| | processed_hashes = st.session_state[f'processed_file_hashes_{session_id}'] |
| | |
| | |
| | files_to_process = [] |
| | for file in uploaded_files: |
| | file_hash = hash((file.name, file.size)) |
| | if file_hash not in processed_hashes: |
| | files_to_process.append((file, file_hash)) |
| | |
| | if not files_to_process: |
| | st.info("β
All files have already been processed!") |
| | st.session_state[f'currently_processing_{session_id}'] = False |
| | st.session_state[f'processing_complete_{session_id}'] = True |
| | return |
| | |
| | |
| | progress_container = st.container() |
| | status_container = st.container() |
| | results_container = st.container() |
| | |
| | successful = 0 |
| | failed = 0 |
| | |
| | |
| | with progress_container: |
| | progress_bar = st.progress(0) |
| | progress_text = st.empty() |
| | |
| | with status_container: |
| | st.info(f"Starting to process {len(files_to_process)} new files...") |
| | |
| | |
| | for i, (uploaded_file, file_hash) in enumerate(files_to_process): |
| | current_progress = (i + 1) / len(files_to_process) |
| | |
| | with progress_container: |
| | progress_bar.progress(current_progress) |
| | progress_text.text(f"Processing file {i+1}/{len(files_to_process)}: {uploaded_file.name}") |
| | |
| | with status_container: |
| | st.info(f"π Processing: {uploaded_file.name} ({len(uploaded_file.getvalue())/1024:.1f} KB)") |
| | |
| | try: |
| | |
| | result = st.session_state.processor.process_uploaded_file(uploaded_file) |
| | |
| | |
| | processed_hashes.add(file_hash) |
| | |
| | |
| | with results_container: |
| | if result and hasattr(result, 'invoice_number') and result.invoice_number: |
| | successful += 1 |
| | st.success(f"β
Successfully processed: {uploaded_file.name}") |
| | |
| | |
| | col1, col2, col3 = st.columns(3) |
| | with col1: |
| | st.write(f"**Invoice #:** {result.invoice_number}") |
| | st.write(f"**Supplier:** {result.supplier_name or 'Not found'}") |
| | with col2: |
| | st.write(f"**Amount:** βΉ{result.amount:.2f}") |
| | st.write(f"**Date:** {result.date or 'Not found'}") |
| | with col3: |
| | st.write(f"**Method:** {result.processing_method}") |
| | st.write(f"**Confidence:** {result.extraction_confidence:.1%}") |
| | |
| | st.markdown("---") |
| | else: |
| | failed += 1 |
| | st.warning(f"β οΈ Could not extract complete data from: {uploaded_file.name}") |
| | if result: |
| | st.write(f"Partial data: {result.supplier_name}, βΉ{result.amount}") |
| | st.markdown("---") |
| | |
| | except Exception as e: |
| | failed += 1 |
| | |
| | processed_hashes.add(file_hash) |
| | |
| | with results_container: |
| | st.error(f"β Error processing {uploaded_file.name}: {str(e)}") |
| | st.markdown("---") |
| | |
| | |
| | st.session_state[f'processed_file_hashes_{session_id}'] = processed_hashes |
| | |
| | |
| | with progress_container: |
| | progress_bar.progress(1.0) |
| | progress_text.text("β
Processing completed!") |
| | |
| | with status_container: |
| | if successful > 0: |
| | st.success(f"π Processing complete! {successful} successful, {failed} failed") |
| | if successful > 0: |
| | st.balloons() |
| | else: |
| | st.error(f"β Processing failed for all {failed} files. Please check file formats and content.") |
| | |
| | |
| | st.session_state[f'currently_processing_{session_id}'] = False |
| | st.session_state[f'processing_complete_{session_id}'] = True |
| | |
| | |
| | st.rerun() |
| |
|
def process_files(uploaded_files, session_id):
    """Deprecated alias kept for older call sites; delegates unchanged."""
    # Thin pass-through to the deduplicating implementation.
    return process_files_once(uploaded_files, session_id)
| |
|
def handle_chat_query(query, show_response=False):
    """Append *query* to the chat history, ask the chatbot, record the reply.

    Args:
        query: Natural-language question from the user.
        show_response: When True (used by the global quick-chat), render the
            answer inline in addition to storing it in the history.
    """
    st.session_state.chat_history.append({
        "role": "user",
        "content": query,
        "timestamp": datetime.now()
    })

    try:
        with st.spinner("π€ AI is analyzing..."):
            response = st.session_state.chatbot.query_database(query)

        st.session_state.chat_history.append({
            "role": "assistant",
            "content": response,
            "timestamp": datetime.now()
        })

        if show_response:
            with st.chat_message("assistant"):
                st.markdown(response)
            st.info("π‘ Switch to the 'AI Chat' section to see full conversation history!")

    except Exception as e:
        st.error(f"Chat error: {e}")
        return

    # BUGFIX: st.rerun() works by raising Streamlit's RerunException, which is
    # an Exception subclass. Calling it inside the try block above let the
    # `except Exception` handler swallow the rerun and report a bogus
    # "Chat error", so the rerun must happen outside the handler.
    st.rerun()
| |
|
| | |
| | |
| | |
| |
|
def main():
    """Entry point for Hugging Face Spaces deployment."""
    try:
        # Surface a small banner when the SPACE_ID env var marks an HF Space.
        if IS_HF_SPACE:
            st.sidebar.info("π€ Running on Hugging Face Spaces")

        create_app()

    except Exception as exc:
        # Last-resort handler: render the failure instead of a blank page.
        st.error(f"""
        ## π¨ Application Error
        
        {exc}
        
        Please refresh the page or check the logs for more details.
        """)
| |
|
| | if __name__ == "__main__": |
| | main(), |
| | |
| | |
| | r'([0-9,]+\.?\d*)\s*(?:dollars?|rupees?|usd|inr|eur|gbp)', |
| | |
| | |
| | r'(?:price|cost|rate)\s*:?\s*[\$βΉΒ£β¬]?\s*([0-9,]+\.?\d*)', |
| | |
| | |
| | r'(?:^|\s)([0-9]{1,3}(?:,\d{3})*\.?\d{0,2})(?=\s|$)', |
| | ], |
| | 'date': [ |
| | r'date\s*:?\s*(\d{1,2}[/\-\.]\d{1,2}[/\-\.]\d{2,4})', |
| | r'(?:invoice|bill)\s*date\s*:?\s*(\d{1,2}[/\-\.]\d{1,2}[/\-\.]\d{2,4})', |
| | r'(?:^|\s)(\d{1,2}[/\-\.]\d{1,2}[/\-\.]\d{2,4})(?=\s|$)', |
| | r'(\d{4}[/\-\.]\d{1,2}[/\-\.]\d{1,2})', |
| | r'(\d{1,2}\s+(?:jan|feb|mar|apr|may|jun|jul|aug|sep|oct|nov|dec)\s+\d{2,4})', |
| | ], |
| | 'quantity': [ |
| | r'qty\s*:?\s*(\d+)', |
| | r'quantity\s*:?\s*(\d+)', |
| | r'(?:units?|pcs?|pieces?)\s*:?\s*(\d+)', |
| | r'(\d+)\s*(?:pcs?|units?|items?|pieces?)', |
| | ] |
| | } |
| | |
| | text_lower = text.lower() |
| | |
| | |
| | for pattern in patterns['invoice_number']: |
| | match = re.search(pattern, text_lower, re.IGNORECASE | re.MULTILINE) |
| | if match: |
| | invoice_data.invoice_number = match.group(1).upper().strip() |
| | break |
| | |
| | |
| | amounts_found = [] |
| | for pattern in patterns['amount']: |
| | matches = re.finditer(pattern, text_lower, re.IGNORECASE | re.MULTILINE) |
| | for match in matches: |
| | try: |
| | amount_str = match.group(1).replace(',', '').replace(' ', '') |
| | amount_val = float(amount_str) |
| | if 0.01 <= amount_val <= 1000000: |
| | amounts_found.append(amount_val) |
| | except (ValueError, IndexError): |
| | continue |
| | |
| | |
| | if amounts_found: |
| | |
| | unique_amounts = sorted(set(amounts_found), reverse=True) |
| | |
| | invoice_data.amount = unique_amounts[0] |
| | |
| | |
| | for pattern in patterns['date']: |
| | match = re.search(pattern, text, re.IGNORECASE | re.MULTILINE) |
| | if match: |
| | invoice_data.date = self.parse_date(match.group(1)) |
| | break |
| | |
| | |
| | for pattern in patterns['quantity']: |
| | match = re.search(pattern, text_lower, re.IGNORECASE) |
| | if match: |
| | try: |
| | invoice_data.quantity = int(match.group(1)) |
| | break |
| | except ValueError: |
| | continue |
| | |
| | |
| | company_patterns = [ |
| | r'(?:from|supplier|vendor)\s*:?\s*([A-Z][A-Za-z\s&,\.]{3,50})', |
| | r'(?:to|buyer|client)\s*:?\s*([A-Z][A-Za-z\s&,\.]{3,50})', |
| | r'([A-Z][A-Za-z\s&,\.]{3,50})\s*(?:ltd|inc|corp|llc|co\.|company|pvt|private|limited)', |
| | r'(?:^|\n)([A-Z][A-Za-z\s&,\.]{3,50})\s*(?:\n|$)', |
| | ] |
| | |
| | companies_found = [] |
| | for pattern in company_patterns: |
| | matches = re.findall(pattern, text, re.MULTILINE) |
| | for match in matches: |
| | clean_company = match.strip().title() |
| | if len(clean_company) > 3 and not any(word in clean_company.lower() for word in ['total', 'amount', 'date', 'invoice']): |
| | companies_found.append(clean_company) |
| | |
| | |
| | if companies_found: |
| | invoice_data.supplier_name = companies_found[0] |
| | if len(companies_found) > 1: |
| | invoice_data.buyer_name = companies_found[1] |
| | |
| | |
| | desc_patterns = [ |
| | r'(?:description|item|product|service)\s*:?\s*([A-Za-z0-9\s,.-]{10,200})', |
| | r'(?:for|regarding)\s*:?\s*([A-Za-z0-9\s,.-]{10,200})', |
| | ] |
| | |
| | for pattern in desc_patterns: |
| | match = re.search(pattern, text, re.IGNORECASE) |
| | if match: |
| | desc = match.group(1).strip() |
| | if len(desc) > 5: |
| | invoice_data.product_description = desc[:200] |
| | break |
| | |
| | |
| | confidence_factors = [] |
| | if invoice_data.invoice_number: |
| | confidence_factors.append(0.3) |
| | if invoice_data.amount > 0: |
| | confidence_factors.append(0.3) |
| | if invoice_data.supplier_name: |
| | confidence_factors.append(0.2) |
| | if invoice_data.date: |
| | confidence_factors.append(0.1) |
| | if invoice_data.quantity > 0: |
| | confidence_factors.append(0.1) |
| | |
| | invoice_data.extraction_confidence = sum(confidence_factors) |
| | |
| | return invoice_data |
| | |
| | def parse_date(self, date_str: str) -> str: |
| | """Parse date to YYYY-MM-DD format""" |
| | if not date_str: |
| | return "" |
| | |
| | formats = ['%Y-%m-%d', '%m/%d/%Y', '%d/%m/%Y', '%m-%d-%Y', '%d-%m-%Y', '%Y/%m/%d'] |
| | |
| | for fmt in formats: |
| | try: |
| | parsed_date = datetime.strptime(date_str, fmt) |
| | return parsed_date.strftime('%Y-%m-%d') |
| | except ValueError: |
| | continue |
| | |
| | return date_str |
| |
|
| | |
| | |
| | |
| |
|
class VectorStore:
    """Simplified in-memory vector store for Hugging Face Spaces.

    Embeds invoice records with a SentenceTransformer model, keeps the
    embeddings and per-document metadata in two parallel in-memory lists,
    and persists both to disk with pickle.  Search is a brute-force scan:
    embeddings are stored L2-normalized, so a plain dot product against a
    normalized query equals cosine similarity.
    """

    def __init__(self, embedding_model: str = "all-MiniLM-L6-v2"):
        """Set up storage paths, load the embedding model, and restore saved state.

        Args:
            embedding_model: sentence-transformers model name to load.
        """
        self.embedding_model_name = embedding_model
        self.vector_store_path = os.path.join(HF_CONFIG["data_dir"], "vectors.pkl")
        self.metadata_path = os.path.join(HF_CONFIG["data_dir"], "metadata.pkl")
        self.embedding_model = None
        self.vectors = []            # 1-D numpy embeddings, parallel to document_metadata
        self.document_metadata = []  # one metadata dict per stored document
        self.embedding_dimension = None

        self.setup_embedding_model()
        self.load_vector_store()

    def setup_embedding_model(self):
        """Initialize the sentence transformer model (no-op if the package is missing)."""
        if not SENTENCE_TRANSFORMERS_AVAILABLE:
            st.warning("⚠️ Sentence Transformers not available. Vector search disabled.")
            return

        try:
            with st.spinner(f"Loading embedding model: {self.embedding_model_name}..."):
                self.embedding_model = SentenceTransformer(
                    self.embedding_model_name,
                    cache_folder=HF_CONFIG["cache_dir"]
                )

            # Probe the model once to learn the embedding width.
            # encode() on a *list* returns a (batch, dim) array, so the width
            # is the LAST axis; the previous code read shape[0], which is the
            # batch size (always 1 here), not the dimension.
            test_embedding = self.embedding_model.encode(["test"])
            self.embedding_dimension = test_embedding.shape[-1]

            st.success(f"✅ Embedding model loaded: {self.embedding_model_name}")

        except Exception as e:
            st.error(f"❌ Failed to load embedding model: {e}")
            self.embedding_model = None

    def load_vector_store(self):
        """Load persisted vectors/metadata from disk, or start with empty state."""
        try:
            if os.path.exists(self.vector_store_path) and os.path.exists(self.metadata_path):
                # NOTE(review): pickle is only safe for files this app wrote
                # itself; never point these paths at untrusted data.
                with open(self.vector_store_path, 'rb') as f:
                    self.vectors = pickle.load(f)

                with open(self.metadata_path, 'rb') as f:
                    self.document_metadata = pickle.load(f)

                st.success(f"✅ Vector store loaded: {len(self.document_metadata)} documents")
            else:
                self.vectors = []
                self.document_metadata = []
                st.info("🆕 New vector store initialized")

        except Exception as e:
            st.error(f"❌ Error loading vector store: {e}")
            self.vectors = []
            self.document_metadata = []

    def save_vector_store(self):
        """Persist vectors and metadata to disk.

        Returns:
            bool: True on success, False if writing failed.
        """
        try:
            with open(self.vector_store_path, 'wb') as f:
                pickle.dump(self.vectors, f)

            with open(self.metadata_path, 'wb') as f:
                pickle.dump(self.document_metadata, f)

            return True
        except Exception as e:
            st.error(f"Error saving vector store: {e}")
            return False

    def create_document_text(self, invoice_data: dict, raw_text: str = "") -> str:
        """Build the searchable text for one invoice.

        Concatenates every truthy field (except 'id') as "field: value",
        plus up to 300 characters of the raw document text.
        """
        text_parts = []

        for field, value in invoice_data.items():
            if value and field != 'id':
                text_parts.append(f"{field}: {value}")

        if raw_text:
            text_parts.append(f"content: {raw_text[:300]}")

        return " | ".join(text_parts)

    def add_document(self, invoice_data: dict, raw_text: str = "") -> bool:
        """Embed one invoice and append it to the in-memory store.

        The caller is responsible for calling save_vector_store() afterwards;
        this method does not persist.

        Returns:
            bool: True if the document was embedded and added.
        """
        if not self.embedding_model:
            return False

        try:
            document_text = self.create_document_text(invoice_data, raw_text)

            # Normalized embedding so search can use a plain dot product.
            embedding = self.embedding_model.encode(document_text, normalize_embeddings=True)

            metadata = {
                'invoice_id': invoice_data.get('id', ''),
                'invoice_number': invoice_data.get('invoice_number', ''),
                'supplier_name': invoice_data.get('supplier_name', ''),
                'buyer_name': invoice_data.get('buyer_name', ''),
                'amount': invoice_data.get('amount', 0),
                'date': invoice_data.get('date', ''),
                'file_name': invoice_data.get('file_info', {}).get('file_name', ''),
                'document_text': document_text[:200],
                'timestamp': datetime.now().isoformat()
            }

            self.vectors.append(embedding)
            self.document_metadata.append(metadata)

            return True

        except Exception as e:
            st.error(f"Error adding document to vector store: {e}")
            return False

    def semantic_search(self, query: str, top_k: int = 5) -> List[VectorSearchResult]:
        """Return up to *top_k* stored documents most similar to *query*.

        Results with cosine similarity <= 0.1 are dropped as noise.
        """
        if not self.embedding_model or not self.vectors:
            return []

        try:
            query_embedding = self.embedding_model.encode(query, normalize_embeddings=True)

            # Brute-force scan; both sides are normalized, so the dot
            # product equals cosine similarity.
            similarities = []
            for i, doc_embedding in enumerate(self.vectors):
                similarity = np.dot(query_embedding, doc_embedding)
                similarities.append((similarity, i))

            similarities.sort(reverse=True)

            results = []
            for similarity, idx in similarities[:top_k]:
                if similarity > 0.1:
                    metadata = self.document_metadata[idx]
                    result = VectorSearchResult(
                        invoice_id=metadata.get('invoice_id', ''),
                        invoice_number=metadata.get('invoice_number', ''),
                        supplier_name=metadata.get('supplier_name', ''),
                        similarity_score=float(similarity),
                        content_preview=metadata.get('document_text', ''),
                        metadata=metadata
                    )
                    results.append(result)

            return results

        except Exception as e:
            st.error(f"Error in semantic search: {e}")
            return []
| |
|
| | |
| | |
| | |
| |
|
class InvoiceProcessor:
    """Main invoice processor for Hugging Face Spaces.

    Coordinates document text extraction, AI/regex field extraction,
    JSON persistence, and (optionally) the semantic vector store.
    """

    def __init__(self):
        self.setup_storage()
        self.document_processor = DocumentProcessor()
        self.ai_extractor = AIExtractor()
        self.vector_store = VectorStore() if SENTENCE_TRANSFORMERS_AVAILABLE else None

        # Per-session counters, surfaced in the sidebar UI.
        self.processing_stats = {
            'total_processed': 0,
            'successful': 0,
            'failed': 0,
            'start_time': datetime.now()
        }

    @staticmethod
    def _default_data() -> dict:
        """Return a fresh, empty JSON store structure."""
        return {
            "metadata": {
                "created_at": datetime.now().isoformat(),
                "version": "hf_v1.0",
                "total_invoices": 0
            },
            "invoices": [],
            "summary": {
                "total_amount": 0.0,
                "unique_suppliers": [],
                "processing_stats": {"successful": 0, "failed": 0}
            }
        }

    def setup_storage(self):
        """Resolve storage paths and create the JSON store file if missing."""
        self.data_dir = HF_CONFIG["data_dir"]
        self.json_path = os.path.join(self.data_dir, "invoices.json")

        if not os.path.exists(self.json_path):
            self.save_json_data(self._default_data())

    def load_json_data(self) -> dict:
        """Load invoice data from JSON.

        If the file is missing or contains invalid JSON, it is rewritten
        with an empty store and that empty store is returned.  (The
        previous implementation recursed into itself after setup_storage(),
        which loops forever when the file exists but holds corrupt JSON,
        because setup_storage() only rewrites a *missing* file.)
        """
        try:
            with open(self.json_path, 'r', encoding='utf-8') as f:
                return json.load(f)
        except (FileNotFoundError, json.JSONDecodeError):
            data = self._default_data()
            self.save_json_data(data)
            return data

    def save_json_data(self, data: dict):
        """Save invoice data to JSON (errors are reported in the UI, not raised)."""
        try:
            with open(self.json_path, 'w', encoding='utf-8') as f:
                json.dump(data, f, indent=2, ensure_ascii=False)
        except Exception as e:
            st.error(f"Error saving data: {e}")

    def process_uploaded_file(self, uploaded_file) -> InvoiceData:
        """Process a single uploaded file end to end.

        Validates size/type, extracts text, runs AI/regex field extraction,
        persists the result, and reports progress in the Streamlit UI.
        Returns an empty InvoiceData on any failure.
        """
        self.processing_stats['total_processed'] += 1

        try:
            file_size = len(uploaded_file.getvalue())
            file_extension = uploaded_file.name.split('.')[-1].lower() if '.' in uploaded_file.name else 'unknown'

            st.info(f"📄 Processing: {uploaded_file.name} ({file_size/1024:.1f} KB, .{file_extension})")

            # Enforce the per-file size limit from the Space config.
            if file_size > HF_CONFIG["max_file_size_mb"] * 1024 * 1024:
                error_msg = f"File too large: {file_size / 1024 / 1024:.2f}MB > {HF_CONFIG['max_file_size_mb']}MB"
                st.error(error_msg)
                self.processing_stats['failed'] += 1
                return InvoiceData()

            if file_extension not in ['pdf', 'txt']:
                error_msg = f"Unsupported file type: .{file_extension} (supported: PDF, TXT)"
                st.warning(error_msg)
                self.processing_stats['failed'] += 1
                return InvoiceData()

            # The document processor works on paths, so spill the upload
            # to a named temporary file first.
            with tempfile.NamedTemporaryFile(delete=False, suffix=f".{file_extension}") as tmp_file:
                file_content = uploaded_file.getvalue()
                tmp_file.write(file_content)
                tmp_file_path = tmp_file.name

            st.info(f"💾 Saved temporarily to: {tmp_file_path}")

            try:
                st.info("🔍 Extracting text from document...")
                text = self.document_processor.extract_text_from_document(tmp_file_path)

                if not text or not text.strip():
                    st.warning(f"❌ No text extracted from {uploaded_file.name}")
                    self.processing_stats['failed'] += 1
                    return InvoiceData()

                text_length = len(text)
                st.info(f"📄 Extracted {text_length} characters of text")

                if text_length > 0:
                    with st.expander("📄 Text Preview (First 500 characters)", expanded=False):
                        st.text(text[:500] + "..." if len(text) > 500 else text)

                st.info("🤖 Extracting invoice data using AI/Regex...")
                invoice_data = self.ai_extractor.extract_with_ai(text)
                invoice_data.file_path = uploaded_file.name

                st.info(f"📊 Extraction completed with {invoice_data.extraction_confidence:.1%} confidence")

                st.info("💾 Saving extracted data...")
                self.save_invoice_data(invoice_data, text, file_size)

                self.processing_stats['successful'] += 1
                st.success(f"✅ Successfully processed {uploaded_file.name}")

                return invoice_data

            finally:
                # Best-effort cleanup: only swallow filesystem errors, not
                # everything (the previous bare `except:` could mask bugs).
                try:
                    os.unlink(tmp_file_path)
                    st.info("🧹 Cleaned up temporary file")
                except OSError:
                    pass

        except Exception as e:
            error_msg = f"Error processing {uploaded_file.name}: {str(e)}"
            st.error(error_msg)
            self.processing_stats['failed'] += 1

            with st.expander("🐛 Error Details", expanded=False):
                st.code(str(e))
                import traceback
                st.code(traceback.format_exc())

            return InvoiceData()

    def save_invoice_data(self, invoice_data: InvoiceData, raw_text: str, file_size: int):
        """Append one extracted invoice to the JSON store and the vector store."""
        try:
            data = self.load_json_data()

            invoice_record = {
                "id": len(data["invoices"]) + 1,
                "invoice_number": invoice_data.invoice_number,
                "supplier_name": invoice_data.supplier_name,
                "buyer_name": invoice_data.buyer_name,
                "date": invoice_data.date,
                "amount": invoice_data.amount,
                "quantity": invoice_data.quantity,
                "product_description": invoice_data.product_description,
                "file_info": {
                    "file_name": invoice_data.file_path,
                    "file_size": file_size
                },
                "extraction_info": {
                    "confidence": invoice_data.extraction_confidence,
                    "method": invoice_data.processing_method,
                    "raw_text_preview": raw_text[:300]
                },
                "timestamps": {
                    "created_at": datetime.now().isoformat()
                }
            }

            data["invoices"].append(invoice_record)

            self.update_summary(data)

            self.save_json_data(data)

            # Mirror the record into the semantic index when available.
            if self.vector_store:
                self.vector_store.add_document(invoice_record, raw_text)
                self.vector_store.save_vector_store()

        except Exception as e:
            st.error(f"Error saving invoice data: {e}")

    def update_summary(self, data: dict):
        """Recompute the store's summary block and metadata in place."""
        invoices = data["invoices"]

        total_amount = sum(inv.get("amount", 0) for inv in invoices)
        unique_suppliers = list(set(inv.get("supplier_name", "") for inv in invoices if inv.get("supplier_name")))

        data["summary"] = {
            "total_amount": total_amount,
            "unique_suppliers": unique_suppliers,
            "processing_stats": {
                "successful": self.processing_stats['successful'],
                "failed": self.processing_stats['failed'],
                "total_processed": self.processing_stats['total_processed']
            }
        }

        data["metadata"]["last_updated"] = datetime.now().isoformat()
        data["metadata"]["total_invoices"] = len(invoices)
| |
|
| | |
| | |
| | |
| |
|
class ChatBot:
    """Rule-based chatbot that answers questions about stored invoices.

    Routes a free-text query to a specialized handler by keyword, falling
    back to semantic search (when available) and then plain keyword search.
    All replies are markdown strings for the Streamlit chat UI.
    """

    def __init__(self, processor: InvoiceProcessor):
        self.processor = processor

    def query_database(self, query: str) -> str:
        """Dispatch *query* to the appropriate handler and return a reply."""
        try:
            data = self.processor.load_json_data()
            invoices = data.get("invoices", [])

            if not invoices:
                return "No invoice data found. Please upload some invoices first."

            query_lower = query.lower()

            # Keyword routing, most specific intents first.
            if any(phrase in query_lower for phrase in ["summary", "overview", "total"]):
                return self.generate_summary(data)

            elif "count" in query_lower or "how many" in query_lower:
                return self.handle_count_query(data)

            elif any(phrase in query_lower for phrase in ["amount", "value", "money", "cost"]):
                return self.handle_amount_query(data)

            elif any(phrase in query_lower for phrase in ["supplier", "vendor", "company"]):
                return self.handle_supplier_query(data, query)

            elif self.processor.vector_store:
                return self.handle_semantic_search(query)

            else:
                return self.handle_general_query(data, query)

        except Exception as e:
            return f"Error processing query: {e}"

    def generate_summary(self, data: dict) -> str:
        """Generate a comprehensive system summary with the 5 most recent invoices."""
        invoices = data.get("invoices", [])
        summary = data.get("summary", {})

        if not invoices:
            return "No invoices found in the system."

        total_amount = summary.get("total_amount", 0)
        avg_amount = total_amount / len(invoices) if invoices else 0
        unique_suppliers = len(summary.get("unique_suppliers", []))

        response = f"""
**📊 Invoice System Summary**

• **Total Invoices**: {len(invoices):,}
• **Total Value**: ₹{total_amount:,.2f}
• **Average Invoice**: ₹{avg_amount:,.2f}
• **Unique Suppliers**: {unique_suppliers}

**📈 Processing Stats**
• **Successful**: {summary.get('processing_stats', {}).get('successful', 0)}
• **Failed**: {summary.get('processing_stats', {}).get('failed', 0)}

**📋 Recent Invoices**
"""

        # Newest first by creation timestamp.
        recent = sorted(invoices, key=lambda x: x.get('timestamps', {}).get('created_at', ''), reverse=True)[:5]
        for i, inv in enumerate(recent, 1):
            response += f"\n{i}. **{inv.get('invoice_number', 'N/A')}** - {inv.get('supplier_name', 'Unknown')} (₹{inv.get('amount', 0):,.2f})"

        return response

    def handle_count_query(self, data: dict) -> str:
        """Answer count-style queries (totals, uniques, duplicates, timeline)."""
        invoices = data.get("invoices", [])
        total = len(invoices)
        unique_numbers = len(set(inv.get('invoice_number', '') for inv in invoices if inv.get('invoice_number')))

        return f"""
**📊 Invoice Count Summary**

• **Total Records**: {total}
• **Unique Invoice Numbers**: {unique_numbers}
• **Duplicates**: {total - unique_numbers if total > unique_numbers else 0}

**📅 Processing Timeline**
• **First Invoice**: {invoices[0].get('timestamps', {}).get('created_at', 'N/A')[:10] if invoices else 'N/A'}
• **Latest Invoice**: {invoices[-1].get('timestamps', {}).get('created_at', 'N/A')[:10] if invoices else 'N/A'}
"""

    def handle_amount_query(self, data: dict) -> str:
        """Answer financial queries with totals, averages, and top-value invoices."""
        invoices = data.get("invoices", [])
        amounts = [inv.get('amount', 0) for inv in invoices if inv.get('amount', 0) > 0]

        if not amounts:
            return "No amount information found in invoices."

        total_amount = sum(amounts)
        avg_amount = total_amount / len(amounts)
        max_amount = max(amounts)
        min_amount = min(amounts)

        # Threshold = 5th-largest amount when there are more than 5
        # invoices, otherwise the single maximum.
        high_value_threshold = sorted(amounts, reverse=True)[min(4, len(amounts)-1)] if len(amounts) > 5 else max_amount
        high_value_invoices = [inv for inv in invoices if inv.get('amount', 0) >= high_value_threshold]

        response = f"""
**💰 Financial Analysis**

• **Total Amount**: ₹{total_amount:,.2f}
• **Average Amount**: ₹{avg_amount:,.2f}
• **Highest Invoice**: ₹{max_amount:,.2f}
• **Lowest Invoice**: ₹{min_amount:,.2f}

**🎯 High-Value Invoices (₹{high_value_threshold:,.2f}+)**
"""

        for i, inv in enumerate(high_value_invoices[:5], 1):
            response += f"\n{i}. **{inv.get('invoice_number', 'N/A')}** - {inv.get('supplier_name', 'Unknown')} (₹{inv.get('amount', 0):,.2f})"

        return response

    def handle_supplier_query(self, data: dict, query: str) -> str:
        """Answer supplier-related queries with counts and top-by-amount ranking."""
        invoices = data.get("invoices", [])

        # Aggregate invoice count and total amount per supplier.
        supplier_counts = {}
        supplier_amounts = {}

        for inv in invoices:
            supplier = inv.get('supplier_name', '').strip()
            if supplier:
                supplier_counts[supplier] = supplier_counts.get(supplier, 0) + 1
                supplier_amounts[supplier] = supplier_amounts.get(supplier, 0) + inv.get('amount', 0)

        if not supplier_counts:
            return "No supplier information found in invoices."

        top_suppliers = sorted(supplier_amounts.items(), key=lambda x: x[1], reverse=True)[:10]

        # Compute the most-active supplier once (the previous code ran the
        # same max() scan twice inside the f-string).
        most_active = max(supplier_counts, key=supplier_counts.get)

        response = f"""
**🏢 Supplier Analysis**

• **Total Unique Suppliers**: {len(supplier_counts)}
• **Most Active**: {most_active} ({supplier_counts[most_active]} invoices)

**💰 Top Suppliers by Amount**
"""

        for i, (supplier, amount) in enumerate(top_suppliers, 1):
            count = supplier_counts[supplier]
            avg = amount / count if count > 0 else 0
            response += f"\n{i}. **{supplier}** - ₹{amount:,.2f} ({count} invoices, avg: ₹{avg:,.2f})"

        return response

    def handle_semantic_search(self, query: str) -> str:
        """Answer free-form queries via the vector store's semantic search."""
        try:
            results = self.processor.vector_store.semantic_search(query, top_k=5)

            if not results:
                return f"No relevant results found for '{query}'. Try different keywords."

            response = f"🔍 **Semantic Search Results for '{query}'**\n\n"

            for i, result in enumerate(results, 1):
                response += f"{i}. **{result.invoice_number}** - {result.supplier_name}\n"
                response += f"   • Similarity: {result.similarity_score:.3f}\n"
                response += f"   • Amount: ₹{result.metadata.get('amount', 0):,.2f}\n"
                response += f"   • Preview: {result.content_preview[:100]}...\n\n"

            return response

        except Exception as e:
            return f"Semantic search error: {e}"

    def handle_general_query(self, data: dict, query: str) -> str:
        """Fallback: naive keyword search over supplier/buyer/description/raw text."""
        invoices = data.get("invoices", [])
        query_words = query.lower().split()

        # An invoice matches when ANY query word appears in its text fields.
        matching_invoices = []
        for inv in invoices:
            text_to_search = (
                inv.get('supplier_name', '') + ' ' +
                inv.get('buyer_name', '') + ' ' +
                inv.get('product_description', '') + ' ' +
                inv.get('extraction_info', {}).get('raw_text_preview', '')
            ).lower()

            if any(word in text_to_search for word in query_words):
                matching_invoices.append(inv)

        if not matching_invoices:
            return f"No invoices found matching '{query}'. Try different keywords or check the summary."

        response = f"🔍 **Found {len(matching_invoices)} invoices matching '{query}'**\n\n"

        for i, inv in enumerate(matching_invoices[:5], 1):
            response += f"{i}. **{inv.get('invoice_number', 'N/A')}** - {inv.get('supplier_name', 'Unknown')}\n"
            response += f"   • Amount: ₹{inv.get('amount', 0):,.2f}\n"
            response += f"   • Date: {inv.get('date', 'N/A')}\n\n"

        if len(matching_invoices) > 5:
            response += f"... and {len(matching_invoices) - 5} more results."

        return response
| |
|
| | |
| | |
| | |
| |
|
| | def create_app(): |
| | """Main Streamlit application""" |
| | |
| | |
| | if 'session_id' not in st.session_state: |
| | st.session_state.session_id = str(uuid.uuid4())[:8] |
| | |
| | session_id = st.session_state.session_id |
| | |
| | |
| | st.markdown(""" |
| | <style> |
| | .main-header { |
| | font-size: 2.5rem; |
| | font-weight: bold; |
| | text-align: center; |
| | color: #FF6B35; |
| | margin-bottom: 1rem; |
| | } |
| | .feature-box { |
| | background: linear-gradient(135deg, #667eea 0%, #764ba2 100%); |
| | padding: 1rem; |
| | border-radius: 10px; |
| | color: white; |
| | margin: 0.5rem 0; |
| | text-align: center; |
| | } |
| | .status-ok { color: #28a745; font-weight: bold; } |
| | .status-warning { color: #ffc107; font-weight: bold; } |
| | .status-error { color: #dc3545; font-weight: bold; } |
| | </style> |
| | """, unsafe_allow_html=True) |
| | |
| | |
| | st.markdown('<h1 class="main-header">π AI Invoice Processing System</h1>', unsafe_allow_html=True) |
| | st.markdown(""" |
| | <div style="text-align: center; margin-bottom: 2rem;"> |
| | <p style="font-size: 1.1rem; color: #666;"> |
| | AI-Powered Document Processing β’ Semantic Search β’ Smart Analytics β’ Hugging Face Spaces |
| | </p> |
| | </div> |
| | """, unsafe_allow_html=True) |
| | |
| | |
| | if 'processor' not in st.session_state: |
| | with st.spinner("π§ Initializing AI Invoice Processor..."): |
| | try: |
| | st.session_state.processor = InvoiceProcessor() |
| | st.session_state.chatbot = ChatBot(st.session_state.processor) |
| | st.session_state.chat_history = [] |
| | st.success("β
System initialized successfully!") |
| | except Exception as e: |
| | st.error(f"β Initialization failed: {e}") |
| | st.stop() |
| | |
| | |
| | with st.sidebar: |
| | st.header("ποΈ System Status") |
| | |
| | processor = st.session_state.processor |
| | |
| | |
| | if processor.document_processor.processors: |
| | st.markdown('<span class="status-ok">β
Document Processing</span>', unsafe_allow_html=True) |
| | else: |
| | st.markdown('<span class="status-error">β Document Processing</span>', unsafe_allow_html=True) |
| | |
| | if processor.ai_extractor.use_transformers: |
| | st.markdown('<span class="status-ok">β
AI Extraction</span>', unsafe_allow_html=True) |
| | else: |
| | st.markdown('<span class="status-warning">β οΈ Regex Extraction</span>', unsafe_allow_html=True) |
| | |
| | if processor.vector_store and processor.vector_store.embedding_model: |
| | st.markdown('<span class="status-ok">β
Semantic Search</span>', unsafe_allow_html=True) |
| | else: |
| | st.markdown('<span class="status-warning">β οΈ Keyword Search Only</span>', unsafe_allow_html=True) |
| | |
| | |
| | st.header("π Quick Stats") |
| | try: |
| | data = processor.load_json_data() |
| | total_invoices = len(data.get("invoices", [])) |
| | total_amount = data.get("summary", {}).get("total_amount", 0) |
| | |
| | st.metric("Total Invoices", total_invoices) |
| | st.metric("Total Value", f"βΉ{total_amount:,.2f}") |
| | st.metric("Success Rate", f"{processor.processing_stats['successful']}/{processor.processing_stats['total_processed']}") |
| | |
| | except Exception as e: |
| | st.error(f"Stats error: {e}") |
| | |
| | |
| | st.header("βοΈ System Info") |
| | st.info(f""" |
| | **Session ID:** {session_id} |
| | |
| | **Limits:** |
| | β’ Max file size: 10MB |
| | β’ Max concurrent files: 3 |
| | β’ Timeout: 30s |
| | """) |
| | |
| | |
| | selected_tab = st.radio( |
| | "Choose a section:", |
| | ["π€ Upload & Process", "π¬ AI Chat", "π Analytics", "π Data Explorer"], |
| | horizontal=True, |
| | key=f"main_navigation_{session_id}" |
| | ) |
| | |
| | |
| | |
| | |
| | |
| | if selected_tab == "π€ Upload & Process": |
| | st.header("π€ Upload Invoice Documents") |
| | |
| | |
| | col1, col2, col3 = st.columns(3) |
| | |
| | with col1: |
| | st.markdown(""" |
| | <div class="feature-box"> |
| | <h4>π€ AI Extraction</h4> |
| | <p>Advanced NLP models extract structured data automatically</p> |
| | </div> |
| | """, unsafe_allow_html=True) |
| | |
| | with col2: |
| | st.markdown(""" |
| | <div class="feature-box"> |
| | <h4>π Smart Search</h4> |
| | <p>Semantic search finds invoices using natural language</p> |
| | </div> |
| | """, unsafe_allow_html=True) |
| | |
| | with col3: |
| | st.markdown(""" |
| | <div class="feature-box"> |
| | <h4>π Analytics</h4> |
| | <p>Comprehensive insights and visualizations</p> |
| | </div> |
| | """, unsafe_allow_html=True) |
| | |
| | |
| | st.markdown("### π Upload Your Invoices") |
| | |
| | |
| | if f'uploaded_files_{session_id}' not in st.session_state: |
| | st.session_state[f'uploaded_files_{session_id}'] = None |
| | if f'processing_complete_{session_id}' not in st.session_state: |
| | st.session_state[f'processing_complete_{session_id}'] = False |
| | if f'currently_processing_{session_id}' not in st.session_state: |
| | st.session_state[f'currently_processing_{session_id}'] = False |
| | if f'processed_file_hashes_{session_id}' not in st.session_state: |
| | st.session_state[f'processed_file_hashes_{session_id}'] = set() |
| | |
| | |
| | uploaded_files = st.file_uploader( |
| | "Choose invoice files (PDF, TXT supported)", |
| | type=['pdf', 'txt'], |
| | accept_multiple_files=True, |
| | help="Maximum file size: 10MB per file", |
| | key=f"file_uploader_stable_{session_id}" |
| | ) |
| | |
| | |
| | if uploaded_files: |
| | |
| | current_file_hashes = set() |
| | for file in uploaded_files: |
| | file_hash = hash((file.name, file.size)) |
| | current_file_hashes.add(file_hash) |
| | |
| | |
| | stored_hashes = st.session_state.get(f'uploaded_file_hashes_{session_id}', set()) |
| | if current_file_hashes != stored_hashes: |
| | st.session_state[f'uploaded_files_{session_id}'] = uploaded_files |
| | st.session_state[f'uploaded_file_hashes_{session_id}'] = current_file_hashes |
| | st.session_state[f'processing_complete_{session_id}'] = False |
| | st.session_state[f'currently_processing_{session_id}'] = False |
| | st.info("π New files detected - ready for processing") |
| | |
| | |
| | current_files = st.session_state[f'uploaded_files_{session_id}'] |
| | is_processing = st.session_state[f'currently_processing_{session_id}'] |
| | is_complete = st.session_state[f'processing_complete_{session_id}'] |
| | |
| | if current_files: |
| | max_files = 3 |
| | if len(current_files) > max_files: |
| | st.warning(f"β οΈ Too many files selected. Processing first {max_files} files.") |
| | current_files = current_files[:max_files] |
| | |
| | st.info(f"π {len(current_files)} files selected") |
| | |
| | |
| | st.markdown("**Selected Files:**") |
| | for i, file in enumerate(current_files, 1): |
| | file_size_mb = len(file.getvalue()) / (1024 * 1024) |
| | file_hash = hash((file.name, file.size)) |
| | processed_icon = "β
" if file_hash in st.session_state[f'processed_file_hashes_{session_id}'] else "π" |
| | st.write(f"{processed_icon} {i}. {file.name} ({file_size_mb:.2f} MB)") |
| | |
| | |
| | col1, col2 = st.columns([1, 1]) |
| | |
| | with col1: |
| | if not is_processing and not is_complete: |
| | if st.button("π Process Files", type="primary", key=f"process_btn_{session_id}"): |
| | st.session_state[f'currently_processing_{session_id}'] = True |
| | st.rerun() |
| | elif is_processing: |
| | st.info("π Processing in progress...") |
| | |
| | process_files_once(current_files, session_id) |
| | elif is_complete: |
| | st.success("β
Processing completed!") |
| | if st.button("π Process Again", key=f"reprocess_btn_{session_id}"): |
| | st.session_state[f'processing_complete_{session_id}'] = False |
| | st.session_state[f'currently_processing_{session_id}'] = False |
| | st.session_state[f'processed_file_hashes_{session_id}'] = set() |
| | st.rerun() |
| | |
| | with col2: |
| | if st.button("ποΈ Clear Files", key=f"clear_files_{session_id}"): |
| | st.session_state[f'uploaded_files_{session_id}'] = None |
| | st.session_state[f'uploaded_file_hashes_{session_id}'] = set() |
| | st.session_state[f'processing_complete_{session_id}'] = False |
| | st.session_state[f'currently_processing_{session_id}'] = False |
| | st.session_state[f'processed_file_hashes_{session_id}'] = set() |
| | st.rerun() |
| | |
| | else: |
| | st.info("π Please select invoice files to upload and process") |
| | |
| | |
| | if is_complete: |
| | st.markdown("### π Recent Processing Results") |
| | try: |
| | data = st.session_state.processor.load_json_data() |
| | recent_invoices = sorted( |
| | data.get("invoices", []), |
| | key=lambda x: x.get('timestamps', {}).get('created_at', ''), |
| | reverse=True |
| | )[:5] |
| | |
| | if recent_invoices: |
| | for i, inv in enumerate(recent_invoices, 1): |
| | with st.expander(f"π {inv.get('invoice_number', f'Invoice {i}')} - {inv.get('supplier_name', 'Unknown')}", expanded=False): |
| | col1, col2 = st.columns(2) |
| | with col1: |
| | st.write(f"**Invoice #:** {inv.get('invoice_number', 'N/A')}") |
| | st.write(f"**Supplier:** {inv.get('supplier_name', 'N/A')}") |
| | st.write(f"**Amount:** βΉ{inv.get('amount', 0):.2f}") |
| | with col2: |
| | st.write(f"**Date:** {inv.get('date', 'N/A')}") |
| | st.write(f"**Method:** {inv.get('extraction_info', {}).get('method', 'N/A')}") |
| | st.write(f"**Confidence:** {inv.get('extraction_info', {}).get('confidence', 0):.1%}") |
| | else: |
| | st.info("No recent processing results found.") |
| | except Exception as e: |
| | st.error(f"Error loading recent results: {e}") |
| | |
| | |
| | |
| | |
| | |
| | elif selected_tab == "π¬ AI Chat": |
| | st.header("π¬ AI Chat Interface") |
| | |
| | |
| | if st.session_state.chat_history: |
| | st.markdown("### π¬ Chat History") |
| | for i, message in enumerate(st.session_state.chat_history): |
| | with st.chat_message(message["role"]): |
| | st.markdown(message["content"]) |
| | |
| | |
| | st.markdown("### βοΈ Ask a Question") |
| | |
| | col1, col2 = st.columns([4, 1]) |
| | |
| | with col1: |
| | user_input = st.text_input( |
| | "Type your question:", |
| | placeholder="e.g., 'show me total spending'", |
| | key=f"chat_input_{session_id}" |
| | ) |
| | |
| | with col2: |
| | ask_btn = st.button("π Ask", type="primary", key=f"ask_btn_{session_id}") |
| | |
| | if ask_btn and user_input: |
| | handle_chat_query(user_input) |
| | |
| | |
| | if not st.session_state.chat_history: |
| | st.markdown("### π‘ Try These Queries") |
| | |
| | col1, col2 = st.columns(2) |
| | |
| | with col1: |
| | st.markdown("**π Basic Queries:**") |
| | basic_queries = [ |
| | "Show me a summary of all invoices", |
| | "How much have we spent in total?", |
| | "Who are our top suppliers?", |
| | "Find invoices with high amounts" |
| | ] |
| | for i, query in enumerate(basic_queries): |
| | if st.button(query, key=f"basic_{session_id}_{i}"): |
| | handle_chat_query(query) |
| | |
| | with col2: |
| | st.markdown("**π Advanced Queries:**") |
| | advanced_queries = [ |
| | "Find technology purchases", |
| | "Show office supplies", |
| | "Search consulting services", |
| | "Recent high-value invoices" |
| | ] |
| | for i, query in enumerate(advanced_queries): |
| | if st.button(query, key=f"advanced_{session_id}_{i}"): |
| | handle_chat_query(query) |
| | |
| | |
| | if st.session_state.chat_history: |
| | if st.button("ποΈ Clear Chat", key=f"clear_chat_{session_id}"): |
| | st.session_state.chat_history = [] |
| | st.rerun() |
| | |
| | |
| | |
| | |
| | |
| | elif selected_tab == "π Analytics": |
| | st.header("π Analytics Dashboard") |
| | |
| | try: |
| | data = st.session_state.processor.load_json_data() |
| | invoices = data.get("invoices", []) |
| | |
| | if not invoices: |
| | st.info("π No data available. Upload some invoices to see analytics.") |
| | return |
| | |
| | |
| | df_data = [] |
| | for inv in invoices: |
| | df_data.append({ |
| | 'invoice_number': inv.get('invoice_number', ''), |
| | 'supplier_name': inv.get('supplier_name', ''), |
| | 'amount': inv.get('amount', 0), |
| | 'date': inv.get('date', ''), |
| | 'confidence': inv.get('extraction_info', {}).get('confidence', 0) |
| | }) |
| | |
| | df = pd.DataFrame(df_data) |
| | |
| | |
| | col1, col2, col3, col4 = st.columns(4) |
| | |
| | with col1: |
| | st.metric("Total Invoices", len(df)) |
| | with col2: |
| | st.metric("Total Amount", f"βΉ{df['amount'].sum():,.2f}") |
| | with col3: |
| | st.metric("Avg Amount", f"βΉ{df['amount'].mean():,.2f}") |
| | with col4: |
| | st.metric("Unique Suppliers", df['supplier_name'].nunique()) |
| | |
| | |
| | if len(df) > 0: |
| | |
| | fig_hist = px.histogram( |
| | df, |
| | x='amount', |
| | title="Invoice Amount Distribution", |
| | labels={'amount': 'Amount (βΉ)', 'count': 'Number of Invoices'} |
| | ) |
| | st.plotly_chart(fig_hist, use_container_width=True) |
| | |
| | |
| | if df['supplier_name'].notna().any(): |
| | supplier_amounts = df.groupby('supplier_name')['amount'].sum().sort_values(ascending=False).head(10) |
| | |
| | if len(supplier_amounts) > 0: |
| | fig_suppliers = px.bar( |
| | x=supplier_amounts.values, |
| | y=supplier_amounts.index, |
| | orientation='h', |
| | title="Top 10 Suppliers by Total Amount", |
| | labels={'x': 'Total Amount (βΉ)', 'y': 'Supplier'} |
| | ) |
| | st.plotly_chart(fig_suppliers, use_container_width=True) |
| | |
| | except Exception as e: |
| | st.error(f"Analytics error: {e}") |
| | |
| | |
| | |
| | |
| | |
| | elif selected_tab == "π Data Explorer": |
| | st.header("π Data Explorer") |
| | |
| | try: |
| | data = st.session_state.processor.load_json_data() |
| | invoices = data.get("invoices", []) |
| | |
| | if not invoices: |
| | st.info("π No data available. Upload some invoices first.") |
| | return |
| | |
| | |
| | df_data = [] |
| | for inv in invoices: |
| | df_data.append({ |
| | 'Invoice Number': inv.get('invoice_number', ''), |
| | 'Supplier': inv.get('supplier_name', ''), |
| | 'Buyer': inv.get('buyer_name', ''), |
| | 'Amount': inv.get('amount', 0), |
| | 'Date': inv.get('date', ''), |
| | 'Confidence': inv.get('extraction_info', {}).get('confidence', 0), |
| | 'Method': inv.get('extraction_info', {}).get('method', ''), |
| | 'File': inv.get('file_info', {}).get('file_name', ''), |
| | 'Created': inv.get('timestamps', {}).get('created_at', '')[:19] |
| | }) |
| | |
| | df = pd.DataFrame(df_data) |
| | |
| | |
| | col1, col2, col3 = st.columns(3) |
| | |
| | with col1: |
| | suppliers = ['All'] + sorted(df['Supplier'].dropna().unique().tolist()) |
| | selected_supplier = st.selectbox("Filter by Supplier", suppliers, key=f"supplier_filter_{session_id}") |
| | |
| | with col2: |
| | methods = ['All'] + sorted(df['Method'].dropna().unique().tolist()) |
| | selected_method = st.selectbox("Filter by Method", methods, key=f"method_filter_{session_id}") |
| | |
| | with col3: |
| | min_amount = st.number_input("Min Amount", min_value=0.0, value=0.0, key=f"amount_filter_{session_id}") |
| | |
| | |
| | filtered_df = df.copy() |
| | if selected_supplier != 'All': |
| | filtered_df = filtered_df[filtered_df['Supplier'] == selected_supplier] |
| | if selected_method != 'All': |
| | filtered_df = filtered_df[filtered_df['Method'] == selected_method] |
| | if min_amount > 0: |
| | filtered_df = filtered_df[filtered_df['Amount'] >= min_amount] |
| | |
| | |
| | st.dataframe( |
| | filtered_df, |
| | use_container_width=True, |
| | column_config={ |
| | "Amount": st.column_config.NumberColumn("Amount", format="βΉ%.2f"), |
| | "Confidence": st.column_config.ProgressColumn("Confidence", min_value=0, max_value=1) |
| | } |
| | ) |
| | |
| | |
| | col1, col2 = st.columns(2) |
| | |
| | with col1: |
| | if st.button("π₯ Export CSV", key=f"export_csv_{session_id}"): |
| | csv_data = filtered_df.to_csv(index=False) |
| | st.download_button( |
| | "Download CSV", |
| | csv_data, |
| | f"invoices_{datetime.now().strftime('%Y%m%d_%H%M')}.csv", |
| | "text/csv", |
| | key=f"download_csv_{session_id}" |
| | ) |
| | |
| | with col2: |
| | if st.button("π Export JSON", key=f"export_json_{session_id}"): |
| | filtered_invoices = [inv for inv in invoices |
| | if inv.get('invoice_number') in filtered_df['Invoice Number'].values] |
| | |
| | export_data = { |
| | "exported_at": datetime.now().isoformat(), |
| | "total_records": len(filtered_invoices), |
| | "invoices": filtered_invoices |
| | } |
| | |
| | st.download_button( |
| | "Download JSON", |
| | json.dumps(export_data, indent=2), |
| | f"invoices_{datetime.now().strftime('%Y%m%d_%H%M')}.json", |
| | "application/json", |
| | key=f"download_json_{session_id}" |
| | ) |
| | |
| | except Exception as e: |
| | st.error(f"Data explorer error: {e}") |
| | |
| | |
| | |
| | |
| | |
| | st.markdown("---") |
| | st.markdown("### π¬ Quick Chat (Works from any section)") |
| | |
| | global_query = st.chat_input("Ask about your invoices...", key=f"global_chat_{session_id}") |
| | |
| | if global_query: |
| | handle_chat_query(global_query, show_response=True) |
| | |
| | |
| | st.markdown("---") |
| | st.markdown(""" |
| | <div style="text-align: center; color: #666;"> |
| | <p>π <strong>AI Invoice Processing System</strong> - Optimized for Hugging Face Spaces</p> |
| | <p>Built with β€οΈ using Streamlit, Transformers, and AI</p> |
| | </div> |
| | """, unsafe_allow_html=True) |
| |
|
| | |
| | |
| | |
| |
|
def process_files_once(uploaded_files, session_id):
    """Process uploaded files exactly once per session.

    Streamlit re-runs the whole script on every interaction, so this function
    deduplicates files against a per-session set kept in ``st.session_state``
    to avoid re-processing files that were already handled.

    Args:
        uploaded_files: List of Streamlit ``UploadedFile`` objects from
            ``st.file_uploader``.
        session_id: Per-session identifier used to namespace session-state keys.

    Side effects:
        Renders progress/status/result widgets, updates the per-session
        processed-file set and the ``currently_processing_*`` /
        ``processing_complete_*`` flags, and finally triggers ``st.rerun()``.
    """
    if not uploaded_files:
        st.error("No files to process!")
        st.session_state[f'currently_processing_{session_id}'] = False
        return

    st.markdown("### π Processing Files...")

    # Set of keys for files already handled this session.
    # FIX: store the (name, size) tuple itself instead of hash((name, size)).
    # Python salts str hashes per process and hash() collisions would silently
    # skip a file; the tuple is hashable, stable, and collision-free.
    # NOTE: same name+size is still treated as the same file — content is not
    # compared. TODO(review): hash file content if that ever matters.
    processed_keys = st.session_state[f'processed_file_hashes_{session_id}']

    # Keep only files we have not seen before.
    files_to_process = [
        (file, (file.name, file.size))
        for file in uploaded_files
        if (file.name, file.size) not in processed_keys
    ]

    if not files_to_process:
        # FIX: this literal (and the two below) was corrupted by a garbled
        # multi-byte emoji that split the string across lines.
        st.info("✅ All files have already been processed!")
        st.session_state[f'currently_processing_{session_id}'] = False
        st.session_state[f'processing_complete_{session_id}'] = True
        return

    # Separate containers so progress, status and per-file results render in
    # stable page regions while the loop runs.
    progress_container = st.container()
    status_container = st.container()
    results_container = st.container()

    successful = 0
    failed = 0

    with progress_container:
        progress_bar = st.progress(0)
        progress_text = st.empty()

    with status_container:
        st.info(f"Starting to process {len(files_to_process)} new files...")

    for i, (uploaded_file, file_key) in enumerate(files_to_process):
        current_progress = (i + 1) / len(files_to_process)

        with progress_container:
            progress_bar.progress(current_progress)
            progress_text.text(f"Processing file {i+1}/{len(files_to_process)}: {uploaded_file.name}")

        with status_container:
            # FIX: use the size Streamlit already reports instead of loading
            # the whole payload into memory via getvalue() just to measure it.
            st.info(f"π Processing: {uploaded_file.name} ({uploaded_file.size/1024:.1f} KB)")

        try:
            result = st.session_state.processor.process_uploaded_file(uploaded_file)

            # Mark as processed regardless of extraction quality so a rerun
            # does not process the same file again.
            processed_keys.add(file_key)

            with results_container:
                if result and hasattr(result, 'invoice_number') and result.invoice_number:
                    successful += 1
                    st.success(f"✅ Successfully processed: {uploaded_file.name}")

                    # Compact three-column summary of the extracted fields.
                    col1, col2, col3 = st.columns(3)
                    with col1:
                        st.write(f"**Invoice #:** {result.invoice_number}")
                        st.write(f"**Supplier:** {result.supplier_name or 'Not found'}")
                    with col2:
                        st.write(f"**Amount:** βΉ{result.amount:.2f}")
                        st.write(f"**Date:** {result.date or 'Not found'}")
                    with col3:
                        st.write(f"**Method:** {result.processing_method}")
                        st.write(f"**Confidence:** {result.extraction_confidence:.1%}")

                    st.markdown("---")
                else:
                    failed += 1
                    st.warning(f"β οΈ Could not extract complete data from: {uploaded_file.name}")
                    if result:
                        st.write(f"Partial data: {result.supplier_name}, βΉ{result.amount}")
                    st.markdown("---")

        except Exception as e:
            failed += 1
            # Still mark it processed: a file that raised once will raise
            # again on rerun, so retrying automatically would loop forever.
            processed_keys.add(file_key)

            with results_container:
                st.error(f"β Error processing {uploaded_file.name}: {str(e)}")
                st.markdown("---")

    # Re-assign for clarity; the set was mutated in place above.
    st.session_state[f'processed_file_hashes_{session_id}'] = processed_keys

    with progress_container:
        progress_bar.progress(1.0)
        progress_text.text("✅ Processing completed!")

    with status_container:
        if successful > 0:
            st.success(f"π Processing complete! {successful} successful, {failed} failed")
            # FIX: removed a redundant nested `if successful > 0` check.
            st.balloons()
        else:
            st.error(f"β Processing failed for all {failed} files. Please check file formats and content.")

    # Clear the in-flight flag and mark completion for the caller's state machine.
    st.session_state[f'currently_processing_{session_id}'] = False
    st.session_state[f'processing_complete_{session_id}'] = True

    # Force a rerun so the rest of the UI picks up the new data.
    st.rerun()
| |
|
def process_files(uploaded_files, session_id):
    """Deprecated alias kept for backward compatibility.

    Delegates directly to :func:`process_files_once`, which owns the
    dedupe/state-management logic.
    """
    return process_files_once(uploaded_files, session_id)
| |
|
def handle_chat_query(query, show_response=False):
    """Record a user question, ask the chatbot, and store the reply.

    Args:
        query: Natural-language question typed by the user.
        show_response: When True, also render the assistant reply inline
            (used by the global quick-chat box outside the chat tab).

    Side effects:
        Appends to ``st.session_state.chat_history`` and, on success, reruns
        the script so the chat tab displays the new messages.
    """
    st.session_state.chat_history.append({
        "role": "user",
        "content": query,
        "timestamp": datetime.now()
    })

    try:
        with st.spinner("π€ AI is analyzing..."):
            response = st.session_state.chatbot.query_database(query)

        st.session_state.chat_history.append({
            "role": "assistant",
            "content": response,
            "timestamp": datetime.now()
        })

        if show_response:
            with st.chat_message("assistant"):
                st.markdown(response)
            st.info("π‘ Switch to the 'AI Chat' section to see full conversation history!")

    except Exception as e:
        st.error(f"Chat error: {e}")
        return

    # BUG FIX: st.rerun() works by raising a control-flow exception. It used
    # to be called inside the try block, so the broad `except Exception`
    # swallowed it and reported every successful query as a "Chat error"
    # while never actually rerunning. It must run outside the try/except.
    st.rerun()
| |
|
| | |
| | |
| | |
| |
|
def main():
    """Application entry point for Hugging Face Spaces.

    Shows a sidebar banner when running on Spaces, launches the app, and
    converts any uncaught exception into an on-page error message instead
    of a blank crash screen.
    """
    try:
        # IS_HF_SPACE is derived from the SPACE_ID env var at import time.
        if IS_HF_SPACE:
            st.sidebar.info("π€ Running on Hugging Face Spaces")

        create_app()

    except Exception as exc:
        # Top-level boundary: surface the failure to the user rather than
        # letting Streamlit show a raw traceback.
        st.error(f"""
        ## π¨ Application Error
        
        {exc}
        
        Please refresh the page or check the logs for more details.
        """)
| |
|
| | if __name__ == "__main__": |
| | main() |