diff --git "a/app.py" "b/app.py"
--- "a/app.py"
+++ "b/app.py"
@@ -30,6 +30,15 @@ import uuid
# Check if running on Hugging Face Spaces
IS_HF_SPACE = os.getenv("SPACE_ID") is not None
+# Hugging Face auth token.
+#
+# BUG FIX: `streamlit` is only imported a few lines below this point, so
+# reading `st.secrets` here always raised NameError, which the previous bare
+# `except:` silently swallowed — Streamlit secrets were never actually read.
+# Read the environment instead; on HF Spaces, repository secrets are
+# typically exposed as environment variables as well, so this covers both
+# deployment modes. (If `st.secrets` support is required, read it lazily
+# after the `import streamlit as st` line.)
+HF_TOKEN = os.getenv("HF_TOKEN") or None
+
# Streamlit and core libraries
import streamlit as st
import sqlite3
@@ -235,8 +244,13 @@ class AIExtractor:
self.use_transformers = self.setup_transformers() if not self.use_mistral else False
def setup_mistral(self):
- """Try to setup Mistral 7B model"""
+ """Try to setup Mistral 7B model with proper authentication"""
try:
+ # Check if we have HF token
+ if not HF_TOKEN:
+ st.warning("โ ๏ธ Hugging Face token not found. Add HF_TOKEN to secrets for Mistral access.")
+ return False
+
# Check if we're in a high-resource environment
import psutil
memory_gb = psutil.virtual_memory().total / (1024**3)
@@ -246,15 +260,20 @@ class AIExtractor:
return False
from transformers import AutoModelForCausalLM, AutoTokenizer, pipeline
+ from huggingface_hub import login
+
+ # Login with HF token
+ login(token=HF_TOKEN)
with st.spinner("๐ Loading Mistral 7B model (this may take a few minutes)..."):
- # Use quantized version for better performance
+ # Use the instruction-tuned model
model_name = "mistralai/Mistral-7B-Instruct-v0.1"
# Load with reduced precision for memory efficiency
self.mistral_tokenizer = AutoTokenizer.from_pretrained(
model_name,
- cache_dir=HF_CONFIG["cache_dir"]
+ cache_dir=HF_CONFIG["cache_dir"],
+ token=HF_TOKEN
)
self.mistral_model = AutoModelForCausalLM.from_pretrained(
@@ -262,7 +281,8 @@ class AIExtractor:
torch_dtype=torch.float16 if TORCH_AVAILABLE else None,
device_map="auto" if TORCH_AVAILABLE else None,
load_in_8bit=True, # Use 8-bit quantization
- cache_dir=HF_CONFIG["cache_dir"]
+ cache_dir=HF_CONFIG["cache_dir"],
+ token=HF_TOKEN
)
# Create pipeline
@@ -282,6 +302,7 @@ class AIExtractor:
return False
except Exception as e:
st.warning(f"โ ๏ธ Mistral 7B not available: {e}")
+ st.info("๐ก To use Mistral 7B: Add your Hugging Face token to secrets as 'HF_TOKEN'")
return False
def setup_transformers(self):
@@ -444,98 +465,3997 @@ Respond with JSON only: [/INST]"""
return self.extract_with_regex(text)
def extract_with_regex(self, text: str) -> InvoiceData:
- """Fallback regex extraction"""
+ """Enhanced regex extraction with better amount detection"""
invoice_data = InvoiceData()
invoice_data.processing_method = "regex"
- # Enhanced regex patterns
+ # Enhanced regex patterns with more comprehensive matching
patterns = {
'invoice_number': [
- r'invoice[#\s]*:?\s*([A-Z0-9\-_]+)',
- r'inv[#\s]*:?\s*([A-Z0-9\-_]+)',
- r'bill[#\s]*:?\s*([A-Z0-9\-_]+)',
- r'#([A-Z0-9\-_]{3,})'
+ r'invoice\s*(?:no|number|#)?\s*:?\s*([A-Z0-9\-_/]+)',
+ r'bill\s*(?:no|number|#)?\s*:?\s*([A-Z0-9\-_/]+)',
+ r'inv\s*(?:no|number|#)?\s*:?\s*([A-Z0-9\-_/]+)',
+ r'ref\s*(?:no|number|#)?\s*:?\s*([A-Z0-9\-_/]+)',
+ r'#\s*([A-Z0-9\-_/]{3,})',
+ r'(?:^|\s)([A-Z]{2,}\d{3,}|\d{3,}[A-Z]{2,})', # Common patterns like ABC123 or 123ABC
],
'amount': [
- r'total[:\s]*[\$โน]?([0-9,]+\.?\d*)',
- r'amount[:\s]*[\$โน]?([0-9,]+\.?\d*)',
- r'[\$โน]([0-9,]+\.?\d*)',
- r'([0-9,]+\.?\d*)\s*(?:dollars?|โน|USD|INR)'
- ],
- 'date': [
- r'date[:\s]*(\d{1,2}[/\-]\d{1,2}[/\-]\d{2,4})',
- r'(\d{1,2}[/\-]\d{1,2}[/\-]\d{2,4})',
- r'(\d{4}[/\-]\d{1,2}[/\-]\d{1,2})'
- ],
- 'quantity': [
- r'qty[:\s]*(\d+)',
- r'quantity[:\s]*(\d+)',
- r'(\d+)\s*(?:pcs?|units?|items?)'
- ]
- }
+ # Currency symbols with amounts
+ r'total\s*(?:amount)?\s*:?\s*[\$โนยฃโฌ]?\s*([0-9,]+\.?\d*)',
+ r'amount\s*(?:due|paid|total)?\s*:?\s*[\$โนยฃโฌ]?\s*([0-9,]+\.?\d*)',
+ r'grand\s*total\s*:?\s*[\$โนยฃโฌ]?\s*([0-9,]+\.?\d*)',
+ r'net\s*(?:amount|total)\s*:?\s*[\$โนยฃโฌ]?\s*([0-9,]+\.?\d*)',
+ r'sub\s*total\s*:?\s*[\$โนยฃโฌ]?\s*([0-9,]+\.?\d*)',
+
+ # Currency symbols at the beginning
+ r'[\$โนยฃโฌ]\s*([0-9,]+\.?\d*)',
+
+ # Amounts at end of lines (common in invoices)
+ r'([0-9,]+\.?\d*)\s*[\$โนยฃโฌ]?\s*
+
+ def parse_date(self, date_str: str) -> str:
+ """Parse date to YYYY-MM-DD format"""
+ # Empty/None input: return an empty string rather than raising.
+ if not date_str:
+ return ""
- text_lower = text.lower()
+ # NOTE(review): US month-first ('%m/%d/%Y') is tried before day-first
+ # ('%d/%m/%Y'), so an ambiguous "03/04/2024" parses as March 4 —
+ # confirm this ordering is the intended locale behavior.
+ formats = ['%Y-%m-%d', '%m/%d/%Y', '%d/%m/%Y', '%m-%d-%Y', '%d-%m-%Y', '%Y/%m/%d']
- # Extract using patterns
- for pattern_list in patterns['invoice_number']:
- match = re.search(pattern_list, text_lower, re.IGNORECASE)
- if match:
- invoice_data.invoice_number = match.group(1).upper()
- break
+ # First format that parses wins; normalize to ISO (YYYY-MM-DD).
+ for fmt in formats:
+ try:
+ parsed_date = datetime.strptime(date_str, fmt)
+ return parsed_date.strftime('%Y-%m-%d')
+ except ValueError:
+ continue
- for pattern in patterns['amount']:
- match = re.search(pattern, text_lower, re.IGNORECASE)
- if match:
- try:
- amount_str = match.group(1).replace(',', '')
- invoice_data.amount = float(amount_str)
- break
- except ValueError:
- continue
+ # No format matched: hand back the raw string unchanged.
+ return date_str
+
+# ===============================================================================
+# VECTOR STORE CLASS
+# ===============================================================================
+
+class VectorStore:
+ """Simplified vector store for Hugging Face Spaces"""
+
+ def __init__(self, embedding_model: str = "all-MiniLM-L6-v2"):
+ # Persisted as two parallel pickle files under the app data dir.
+ self.embedding_model_name = embedding_model
+ self.vector_store_path = os.path.join(HF_CONFIG["data_dir"], "vectors.pkl")
+ self.metadata_path = os.path.join(HF_CONFIG["data_dir"], "metadata.pkl")
+ self.embedding_model = None
+ # Parallel lists: vectors[i] is the embedding of document_metadata[i].
+ self.vectors = []
+ self.document_metadata = []
+ self.embedding_dimension = None
- for pattern in patterns['date']:
- match = re.search(pattern, text, re.IGNORECASE)
- if match:
- invoice_data.date = self.parse_date(match.group(1))
- break
+ # Load the model first, then hydrate any previously saved store.
+ self.setup_embedding_model()
+ self.load_vector_store()
+    def setup_embedding_model(self):
+        """Initialize the sentence-transformer model used for embeddings.
+
+        Leaves `self.embedding_model` as None (and warns in the UI) when the
+        optional dependency is missing or loading fails; callers treat None
+        as "vector search disabled".
+        """
+        if not SENTENCE_TRANSFORMERS_AVAILABLE:
+            st.warning("โ ๏ธ Sentence Transformers not available. Vector search disabled.")
+            return
- for pattern in patterns['quantity']:
- match = re.search(pattern, text_lower, re.IGNORECASE)
- if match:
- try:
- invoice_data.quantity = int(match.group(1))
- break
- except ValueError:
- continue
+        try:
+            with st.spinner(f"Loading embedding model: {self.embedding_model_name}..."):
+                self.embedding_model = SentenceTransformer(
+                    self.embedding_model_name,
+                    cache_folder=HF_CONFIG["cache_dir"]
+                )
+
+                # Probe once to record the embedding dimensionality.
+                # BUG FIX: encode() on a LIST returns shape (batch, dim), so
+                # the previous `shape[0]` stored 1 (the batch size) instead
+                # of the true dimension — use shape[1].
+                test_embedding = self.embedding_model.encode(["test"])
+                self.embedding_dimension = int(test_embedding.shape[1])
+
+                # (Also fixes a string literal that was split mid-emoji in
+                # the previous patch, which was a syntax error.)
+                st.success(f"Embedding model loaded: {self.embedding_model_name}")
+
+        except Exception as e:
+            st.error(f"โ Failed to load embedding model: {e}")
+            self.embedding_model = None
+
+    def load_vector_store(self):
+        """Load previously persisted vectors/metadata from disk, if present.
+
+        On any failure the store is reset to empty lists so the app keeps
+        running without vector search history.
+        """
+        self.vectors = []
+        self.document_metadata = []
+        try:
+            if os.path.exists(self.vector_store_path) and os.path.exists(self.metadata_path):
+                # NOTE: pickle is acceptable here only because these files
+                # are written by this app into its own data directory —
+                # never point these paths at untrusted input.
+                with open(self.vector_store_path, 'rb') as f:
+                    self.vectors = pickle.load(f)
+
+                with open(self.metadata_path, 'rb') as f:
+                    self.document_metadata = pickle.load(f)
+
+                # (Fixes a success-message literal that was split mid-emoji
+                # in the previous patch — a syntax error.)
+                st.success(f"Vector store loaded: {len(self.document_metadata)} documents")
+            else:
+                st.info("๐ New vector store initialized")
+
+        except Exception as e:
+            st.error(f"โ Error loading vector store: {e}")
+            self.vectors = []
+            self.document_metadata = []
+
+    def save_vector_store(self):
+        """Persist vectors and metadata to their pickle files; True on success."""
+        payloads = (
+            (self.vector_store_path, self.vectors),
+            (self.metadata_path, self.document_metadata),
+        )
+        try:
+            for target_path, payload in payloads:
+                with open(target_path, 'wb') as handle:
+                    pickle.dump(payload, handle)
+        except Exception as e:
+            st.error(f"Error saving vector store: {e}")
+            return False
+        return True
+
+ def create_document_text(self, invoice_data: dict, raw_text: str = "") -> str:
+ """Create searchable text from invoice data"""
+ text_parts = []
- # Extract company names (basic)
- company_patterns = [
- r'(?:from|to|vendor|supplier)[:]\s*([A-Z][A-Za-z\s&,\.]{2,30})',
- r'([A-Z][A-Za-z\s&,\.]{3,30})\s*(?:Ltd|Inc|Corp|LLC|Co\.|Company)',
- ]
+ # Flatten every truthy field except the numeric id into "field: value".
+ for field, value in invoice_data.items():
+ if value and field != 'id':
+ text_parts.append(f"{field}: {value}")
- for pattern in company_patterns:
- matches = re.findall(pattern, text)
- if matches:
- if not invoice_data.supplier_name:
- invoice_data.supplier_name = matches[0].strip()
- elif len(matches) > 1 and not invoice_data.buyer_name:
- invoice_data.buyer_name = matches[1].strip()
-
- # Extract product description (basic)
- desc_patterns = [
- r'description[:]\s*([A-Za-z0-9\s,.-]{10,100})',
- r'item[:]\s*([A-Za-z0-9\s,.-]{10,100})',
- r'service[:]\s*([A-Za-z0-9\s,.-]{10,100})'
- ]
+ # Cap the raw document body at 300 chars to keep embeddings focused.
+ if raw_text:
+ text_parts.append(f"content: {raw_text[:300]}")
- for pattern in desc_patterns:
- match = re.search(pattern, text, re.IGNORECASE)
- if match:
- invoice_data.product_description = match.group(1).strip()
- break
+ # Single pipe-delimited string fed to the embedding model.
+ return " | ".join(text_parts)
+
+    def add_document(self, invoice_data: dict, raw_text: str = "") -> bool:
+        """Embed one invoice record and append it to the in-memory store.
+
+        Returns False (without raising) when no embedding model is loaded
+        or embedding fails; the caller decides whether to persist.
+        """
+        if not self.embedding_model:
+            return False
+
+        try:
+            doc_text = self.create_document_text(invoice_data, raw_text)
+
+            # Normalized embeddings let searches use a plain dot product.
+            vector = self.embedding_model.encode(doc_text, normalize_embeddings=True)
+
+            file_info = invoice_data.get('file_info', {})
+            record = {
+                'invoice_id': invoice_data.get('id', ''),
+                'invoice_number': invoice_data.get('invoice_number', ''),
+                'supplier_name': invoice_data.get('supplier_name', ''),
+                'buyer_name': invoice_data.get('buyer_name', ''),
+                'amount': invoice_data.get('amount', 0),
+                'date': invoice_data.get('date', ''),
+                'file_name': file_info.get('file_name', ''),
+                'document_text': doc_text[:200],
+                'timestamp': datetime.now().isoformat(),
+            }
+
+            # Keep the two parallel lists in lockstep.
+            self.vectors.append(vector)
+            self.document_metadata.append(record)
+            return True
+
+        except Exception as e:
+            st.error(f"Error adding document to vector store: {e}")
+            return False
+
+    def semantic_search(self, query: str, top_k: int = 5) -> List[VectorSearchResult]:
+        """Rank stored documents against `query` by cosine similarity.
+
+        Returns up to `top_k` results above the relevance threshold, best
+        first; empty list when the model or store is unavailable.
+        """
+        if not self.embedding_model or not self.vectors:
+            return []
+
+        try:
+            query_vec = self.embedding_model.encode(query, normalize_embeddings=True)
+
+            # Embeddings are normalized, so dot product == cosine similarity.
+            scored = sorted(
+                ((float(np.dot(query_vec, doc_vec)), idx)
+                 for idx, doc_vec in enumerate(self.vectors)),
+                reverse=True,
+            )
+
+            hits = []
+            for score, idx in scored[:top_k]:
+                if score <= 0.1:  # relevance threshold
+                    continue
+                meta = self.document_metadata[idx]
+                hits.append(VectorSearchResult(
+                    invoice_id=meta.get('invoice_id', ''),
+                    invoice_number=meta.get('invoice_number', ''),
+                    supplier_name=meta.get('supplier_name', ''),
+                    similarity_score=score,
+                    content_preview=meta.get('document_text', ''),
+                    metadata=meta,
+                ))
+            return hits
+
+        except Exception as e:
+            st.error(f"Error in semantic search: {e}")
+            return []
+
+# ===============================================================================
+# MAIN PROCESSOR CLASS
+# ===============================================================================
+
+class InvoiceProcessor:
+ """Main invoice processor for Hugging Face Spaces"""
+
+ def __init__(self):
+ # Storage must exist before any component reads/writes invoices.json.
+ self.setup_storage()
+ self.document_processor = DocumentProcessor()
+ self.ai_extractor = AIExtractor()
+ # Vector store is optional; None means keyword search only.
+ self.vector_store = VectorStore() if SENTENCE_TRANSFORMERS_AVAILABLE else None
+
+ # Initialize stats
+ # Per-session counters; persisted into the JSON summary on each save.
+ self.processing_stats = {
+ 'total_processed': 0,
+ 'successful': 0,
+ 'failed': 0,
+ 'start_time': datetime.now()
+ }
+
+    def setup_storage(self):
+        """Resolve storage paths and seed the JSON store on first run."""
+        self.data_dir = HF_CONFIG["data_dir"]
+        self.json_path = os.path.join(self.data_dir, "invoices.json")
+
+        if os.path.exists(self.json_path):
+            return
+
+        # First run: write an empty but well-formed document.
+        self.save_json_data({
+            "metadata": {
+                "created_at": datetime.now().isoformat(),
+                "version": "hf_v1.0",
+                "total_invoices": 0,
+            },
+            "invoices": [],
+            "summary": {
+                "total_amount": 0.0,
+                "unique_suppliers": [],
+                "processing_stats": {"successful": 0, "failed": 0},
+            },
+        })
+
+    def load_json_data(self) -> dict:
+        """Load invoice data from JSON, recovering from a missing or corrupt file.
+
+        BUG FIX: the previous version recursed into itself after calling
+        setup_storage(); since setup_storage() only writes the file when it
+        does NOT exist, a corrupt-but-present JSON file caused infinite
+        recursion. Rebuild and persist the initial document in place instead.
+        """
+        try:
+            with open(self.json_path, 'r', encoding='utf-8') as f:
+                return json.load(f)
+        except (FileNotFoundError, json.JSONDecodeError):
+            initial_data = {
+                "metadata": {
+                    "created_at": datetime.now().isoformat(),
+                    "version": "hf_v1.0",
+                    "total_invoices": 0,
+                },
+                "invoices": [],
+                "summary": {
+                    "total_amount": 0.0,
+                    "unique_suppliers": [],
+                    "processing_stats": {"successful": 0, "failed": 0},
+                },
+            }
+            self.save_json_data(initial_data)
+            return initial_data
+
+    def save_json_data(self, data: dict):
+        """Write the invoice document to disk as pretty-printed UTF-8 JSON."""
+        try:
+            serialized = json.dumps(data, indent=2, ensure_ascii=False)
+            with open(self.json_path, 'w', encoding='utf-8') as f:
+                f.write(serialized)
+        except Exception as e:
+            st.error(f"Error saving data: {e}")
+
+    def process_uploaded_file(self, uploaded_file) -> InvoiceData:
+        """Validate, extract, and persist a single uploaded invoice file.
+
+        Returns a populated InvoiceData on success, or an empty InvoiceData
+        on any failure (size/type rejection, empty text, exception).
+        Side effects: updates processing_stats and writes to the JSON store
+        and (when available) the vector store.
+        """
+        self.processing_stats['total_processed'] += 1
+
+        try:
+            # Read once; reused for size check and the temp-file write.
+            file_content = uploaded_file.getvalue()
+            file_size = len(file_content)
+            file_extension = uploaded_file.name.split('.')[-1].lower() if '.' in uploaded_file.name else 'unknown'
+
+            st.info(f"๐ Processing: {uploaded_file.name} ({file_size/1024:.1f} KB, .{file_extension})")
+
+            # Reject oversized files before doing any work.
+            if file_size > HF_CONFIG["max_file_size_mb"] * 1024 * 1024:
+                st.error(f"File too large: {file_size / 1024 / 1024:.2f}MB > {HF_CONFIG['max_file_size_mb']}MB")
+                self.processing_stats['failed'] += 1
+                return InvoiceData()
+
+            if file_extension not in ['pdf', 'txt']:
+                st.warning(f"Unsupported file type: .{file_extension} (supported: PDF, TXT)")
+                self.processing_stats['failed'] += 1
+                return InvoiceData()
+
+            # The text extractors work on paths, so spill to a temp file.
+            with tempfile.NamedTemporaryFile(delete=False, suffix=f".{file_extension}") as tmp_file:
+                tmp_file.write(file_content)
+                tmp_file_path = tmp_file.name
+
+            st.info(f"๐พ Saved temporarily to: {tmp_file_path}")
+
+            try:
+                st.info("๐ Extracting text from document...")
+                text = self.document_processor.extract_text_from_document(tmp_file_path)
+
+                if not text or not text.strip():
+                    st.warning(f"โ No text extracted from {uploaded_file.name}")
+                    self.processing_stats['failed'] += 1
+                    return InvoiceData()
+
+                st.info(f"๐ Extracted {len(text)} characters of text")
+
+                # Debug panel: preview text and show which amount patterns hit.
+                # NOTE(review): this section was truncated in the patch
+                # (unterminated raw string, lost lines); it is reconstructed
+                # to close the pattern list and report matches — verify.
+                with st.expander("๐ Text Preview & Extraction Debug", expanded=True):
+                    st.text_area("Extracted Text (First 1000 chars):", value=text[:1000], height=150, disabled=True)
+                    st.markdown("**๐ Amount Detection Debug:**")
+                    amount_patterns = [
+                        r'total\s*(?:amount)?\s*:?\s*[\$โนยฃโฌ]?\s*([0-9,]+\.?\d*)',
+                        r'[\$โนยฃโฌ]\s*([0-9,]+\.?\d*)',
+                        r'([0-9,]+\.?\d*)\s*[\$โนยฃโฌ]?\s*$',
+                    ]
+                    for dbg_pattern in amount_patterns:
+                        dbg_matches = re.findall(dbg_pattern, text.lower(), re.MULTILINE)
+                        st.write(f"`{dbg_pattern}` โ {dbg_matches[:5]}")
+
+                st.info("๐ค Extracting invoice data using AI/Regex...")
+                invoice_data = self.ai_extractor.extract_with_ai(text)
+                invoice_data.file_path = uploaded_file.name
+
+                st.info(f"๐ Extraction completed with {invoice_data.extraction_confidence:.1%} confidence")
+
+                st.info("๐พ Saving extracted data...")
+                self.save_invoice_data(invoice_data, text, file_size)
+
+                self.processing_stats['successful'] += 1
+                # (Fixes a success string that was split mid-emoji in the
+                # previous patch — a syntax error.)
+                st.success(f"Successfully processed {uploaded_file.name}")
+
+                return invoice_data
+
+            finally:
+                # Best-effort cleanup; a failed unlink is non-fatal, but we
+                # narrow the previous bare `except:` to OSError only.
+                try:
+                    os.unlink(tmp_file_path)
+                    st.info("๐งน Cleaned up temporary file")
+                except OSError:
+                    pass
+
+        except Exception as e:
+            st.error(f"Error processing {uploaded_file.name}: {str(e)}")
+            self.processing_stats['failed'] += 1
+
+            # Surface the traceback for debugging without crashing the app.
+            with st.expander("๐ Error Details", expanded=False):
+                st.code(str(e))
+                import traceback
+                st.code(traceback.format_exc())
+
+            return InvoiceData()
+
+ def save_invoice_data(self, invoice_data: InvoiceData, raw_text: str, file_size: int):
+ """Save invoice data to JSON and vector store"""
+ try:
+ # Load existing data
+ data = self.load_json_data()
+
+ # Create invoice record
+ # ids are sequential positions, not stable keys — deleting an
+ # invoice would make later ids collide. NOTE(review): confirm
+ # deletion is not supported before relying on `id`.
+ invoice_record = {
+ "id": len(data["invoices"]) + 1,
+ "invoice_number": invoice_data.invoice_number,
+ "supplier_name": invoice_data.supplier_name,
+ "buyer_name": invoice_data.buyer_name,
+ "date": invoice_data.date,
+ "amount": invoice_data.amount,
+ "quantity": invoice_data.quantity,
+ "product_description": invoice_data.product_description,
+ "file_info": {
+ "file_name": invoice_data.file_path,
+ "file_size": file_size
+ },
+ "extraction_info": {
+ "confidence": invoice_data.extraction_confidence,
+ "method": invoice_data.processing_method,
+ "raw_text_preview": raw_text[:300]
+ },
+ "timestamps": {
+ "created_at": datetime.now().isoformat()
+ }
+ }
+
+ # Add to invoices
+ data["invoices"].append(invoice_record)
+
+ # Update summary
+ self.update_summary(data)
+
+ # Save JSON
+ self.save_json_data(data)
+
+ # Add to vector store
+ # Vector store is persisted on every save (small stores; cheap).
+ if self.vector_store:
+ self.vector_store.add_document(invoice_record, raw_text)
+ self.vector_store.save_vector_store()
+
+ except Exception as e:
+ st.error(f"Error saving invoice data: {e}")
+
+    def update_summary(self, data: dict):
+        """Recompute aggregate statistics from the invoice list, in place."""
+        invoices = data["invoices"]
+
+        # Distinct non-empty supplier names (walrus keeps the filter inline).
+        suppliers = {s for inv in invoices if (s := inv.get("supplier_name", ""))}
+
+        data["summary"] = {
+            "total_amount": sum(inv.get("amount", 0) for inv in invoices),
+            "unique_suppliers": list(suppliers),
+            "processing_stats": {
+                "successful": self.processing_stats['successful'],
+                "failed": self.processing_stats['failed'],
+                "total_processed": self.processing_stats['total_processed'],
+            },
+        }
+
+        meta = data["metadata"]
+        meta["last_updated"] = datetime.now().isoformat()
+        meta["total_invoices"] = len(invoices)
+
+# ===============================================================================
+# CHATBOT CLASS
+# ===============================================================================
+
+class ChatBot:
+ """Chatbot for invoice queries"""
+
+ def __init__(self, processor: InvoiceProcessor):
+ self.processor = processor
+
+ def query_database(self, query: str) -> str:
+ """Process user query and return response"""
+ try:
+ data = self.processor.load_json_data()
+ invoices = data.get("invoices", [])
+
+ if not invoices:
+ return "No invoice data found. Please upload some invoices first."
+
+ query_lower = query.lower()
+
+ # Handle different query types
+ # Keyword routing, checked in priority order; semantic search is
+ # the fallback when a vector store exists, keyword match otherwise.
+ if any(phrase in query_lower for phrase in ["summary", "overview", "total"]):
+ return self.generate_summary(data)
+
+ elif "count" in query_lower or "how many" in query_lower:
+ return self.handle_count_query(data)
+
+ elif any(phrase in query_lower for phrase in ["amount", "value", "money", "cost"]):
+ return self.handle_amount_query(data)
+
+ elif any(phrase in query_lower for phrase in ["supplier", "vendor", "company"]):
+ return self.handle_supplier_query(data, query)
+
+ elif self.processor.vector_store:
+ return self.handle_semantic_search(query)
+
+ else:
+ return self.handle_general_query(data, query)
+
+ except Exception as e:
+ return f"Error processing query: {e}"
+
+ def generate_summary(self, data: dict) -> str:
+ """Generate comprehensive summary"""
+ invoices = data.get("invoices", [])
+ summary = data.get("summary", {})
+
+ if not invoices:
+ return "No invoices found in the system."
+
+ total_amount = summary.get("total_amount", 0)
+ avg_amount = total_amount / len(invoices) if invoices else 0
+ unique_suppliers = len(summary.get("unique_suppliers", []))
+
+ # Markdown template rendered by Streamlit; currency marker is the
+ # file's (mojibake-encoded) rupee sign used throughout.
+ response = f"""
+**๐ Invoice System Summary**
+
+โข **Total Invoices**: {len(invoices):,}
+โข **Total Value**: โน{total_amount:,.2f}
+โข **Average Invoice**: โน{avg_amount:,.2f}
+โข **Unique Suppliers**: {unique_suppliers}
+
+**๐ Processing Stats**
+โข **Successful**: {summary.get('processing_stats', {}).get('successful', 0)}
+โข **Failed**: {summary.get('processing_stats', {}).get('failed', 0)}
+
+**๐ Recent Invoices**
+"""
+
+ # Show recent invoices
+ recent = sorted(invoices, key=lambda x: x.get('timestamps', {}).get('created_at', ''), reverse=True)[:5]
+ for i, inv in enumerate(recent, 1):
+ response += f"\n{i}. **{inv.get('invoice_number', 'N/A')}** - {inv.get('supplier_name', 'Unknown')} (โน{inv.get('amount', 0):,.2f})"
+
+ return response
+
+ def handle_count_query(self, data: dict) -> str:
+ """Handle count-related queries"""
+ invoices = data.get("invoices", [])
+ total = len(invoices)
+ unique_numbers = len(set(inv.get('invoice_number', '') for inv in invoices if inv.get('invoice_number')))
+
+ # NOTE(review): first/latest below use list order, which reflects
+ # insertion order, not a sort by timestamp — confirm intended.
+ return f"""
+**๐ Invoice Count Summary**
+
+โข **Total Records**: {total}
+โข **Unique Invoice Numbers**: {unique_numbers}
+โข **Duplicates**: {total - unique_numbers if total > unique_numbers else 0}
+
+**๐
 Processing Timeline**
+โข **First Invoice**: {invoices[0].get('timestamps', {}).get('created_at', 'N/A')[:10] if invoices else 'N/A'}
+โข **Latest Invoice**: {invoices[-1].get('timestamps', {}).get('created_at', 'N/A')[:10] if invoices else 'N/A'}
+"""
+
+ def handle_amount_query(self, data: dict) -> str:
+ """Handle amount-related queries"""
+ invoices = data.get("invoices", [])
+ amounts = [inv.get('amount', 0) for inv in invoices if inv.get('amount', 0) > 0]
+
+ if not amounts:
+ return "No amount information found in invoices."
+
+ total_amount = sum(amounts)
+ avg_amount = total_amount / len(amounts)
+ max_amount = max(amounts)
+ min_amount = min(amounts)
+
+ # Find high-value invoices
+ # Threshold = 5th-largest amount when more than 5 invoices exist,
+ # otherwise the maximum (so only the single top invoice qualifies).
+ high_value_threshold = sorted(amounts, reverse=True)[min(4, len(amounts)-1)] if len(amounts) > 5 else max_amount
+ high_value_invoices = [inv for inv in invoices if inv.get('amount', 0) >= high_value_threshold]
+
+ response = f"""
+**๐ฐ Financial Analysis**
+
+โข **Total Amount**: โน{total_amount:,.2f}
+โข **Average Amount**: โน{avg_amount:,.2f}
+โข **Highest Invoice**: โน{max_amount:,.2f}
+โข **Lowest Invoice**: โน{min_amount:,.2f}
+
+**๐ฏ High-Value Invoices (โน{high_value_threshold:,.2f}+)**
+"""
+
+ for i, inv in enumerate(high_value_invoices[:5], 1):
+ response += f"\n{i}. **{inv.get('invoice_number', 'N/A')}** - {inv.get('supplier_name', 'Unknown')} (โน{inv.get('amount', 0):,.2f})"
+
+ return response
+
+ def handle_supplier_query(self, data: dict, query: str) -> str:
+ """Handle supplier-related queries"""
+ invoices = data.get("invoices", [])
+
+ # Count invoices by supplier
+ supplier_counts = {}
+ supplier_amounts = {}
+
+ for inv in invoices:
+ supplier = inv.get('supplier_name', '').strip()
+ if supplier:
+ supplier_counts[supplier] = supplier_counts.get(supplier, 0) + 1
+ supplier_amounts[supplier] = supplier_amounts.get(supplier, 0) + inv.get('amount', 0)
+
+ if not supplier_counts:
+ return "No supplier information found in invoices."
+
+ # Sort suppliers by amount
+ top_suppliers = sorted(supplier_amounts.items(), key=lambda x: x[1], reverse=True)[:10]
+
+ response = f"""
+**๐ข Supplier Analysis**
+
+โข **Total Unique Suppliers**: {len(supplier_counts)}
+โข **Most Active**: {max(supplier_counts, key=supplier_counts.get)} ({supplier_counts[max(supplier_counts, key=supplier_counts.get)]} invoices)
+
+**๐ฐ Top Suppliers by Amount**
+"""
+
+ for i, (supplier, amount) in enumerate(top_suppliers, 1):
+ count = supplier_counts[supplier]
+ avg = amount / count if count > 0 else 0
+ response += f"\n{i}. **{supplier}** - โน{amount:,.2f} ({count} invoices, avg: โน{avg:,.2f})"
+
+ return response
+
+ def handle_semantic_search(self, query: str) -> str:
+ """Handle semantic search queries"""
+ try:
+ results = self.processor.vector_store.semantic_search(query, top_k=5)
+
+ if not results:
+ return f"No relevant results found for '{query}'. Try different keywords."
+
+ response = f"๐ **Semantic Search Results for '{query}'**\n\n"
+
+ for i, result in enumerate(results, 1):
+ response += f"{i}. **{result.invoice_number}** - {result.supplier_name}\n"
+ response += f" โข Similarity: {result.similarity_score:.3f}\n"
+ response += f" โข Amount: โน{result.metadata.get('amount', 0):,.2f}\n"
+ response += f" โข Preview: {result.content_preview[:100]}...\n\n"
+
+ return response
+
+ except Exception as e:
+ return f"Semantic search error: {e}"
+
+ def handle_general_query(self, data: dict, query: str) -> str:
+ """Handle general queries with keyword search"""
+ invoices = data.get("invoices", [])
+ query_words = query.lower().split()
+
+ # Simple keyword matching
+ # An invoice matches if ANY query word appears in its combined
+ # name/description/preview text (case-insensitive substring).
+ matching_invoices = []
+ for inv in invoices:
+ text_to_search = (
+ inv.get('supplier_name', '') + ' ' +
+ inv.get('buyer_name', '') + ' ' +
+ inv.get('product_description', '') + ' ' +
+ inv.get('extraction_info', {}).get('raw_text_preview', '')
+ ).lower()
+
+ if any(word in text_to_search for word in query_words):
+ matching_invoices.append(inv)
+
+ if not matching_invoices:
+ return f"No invoices found matching '{query}'. Try different keywords or check the summary."
+
+ response = f"๐ **Found {len(matching_invoices)} invoices matching '{query}'**\n\n"
+
+ for i, inv in enumerate(matching_invoices[:5], 1):
+ response += f"{i}. **{inv.get('invoice_number', 'N/A')}** - {inv.get('supplier_name', 'Unknown')}\n"
+ response += f" โข Amount: โน{inv.get('amount', 0):,.2f}\n"
+ response += f" โข Date: {inv.get('date', 'N/A')}\n\n"
+
+ if len(matching_invoices) > 5:
+ response += f"... and {len(matching_invoices) - 5} more results."
+
+ return response
+
+# ===============================================================================
+# STREAMLIT APPLICATION
+# ===============================================================================
+
+def create_app():
+ """Main Streamlit application"""
+
+ # Generate unique session ID for this run
+ if 'session_id' not in st.session_state:
+ st.session_state.session_id = str(uuid.uuid4())[:8]
+
+ session_id = st.session_state.session_id
+
+ # Custom CSS
+ st.markdown("""
+
+ """, unsafe_allow_html=True)
+
+ # Header
+ st.markdown('
๐ AI Invoice Processing System
', unsafe_allow_html=True)
+ st.markdown("""
+
+
+ AI-Powered Document Processing โข Semantic Search โข Smart Analytics โข Hugging Face Spaces
+
+
+ """, unsafe_allow_html=True)
+
+ # Initialize processor
+ if 'processor' not in st.session_state:
+ with st.spinner("๐ง Initializing AI Invoice Processor..."):
+ try:
+ st.session_state.processor = InvoiceProcessor()
+ st.session_state.chatbot = ChatBot(st.session_state.processor)
+ st.session_state.chat_history = []
+ st.success("โ
System initialized successfully!")
+ except Exception as e:
+ st.error(f"โ Initialization failed: {e}")
+ st.stop()
+
+ # Sidebar
+ with st.sidebar:
+ st.header("๐๏ธ System Status")
+
+ processor = st.session_state.processor
+
+ # Component status
+ if processor.document_processor.processors:
+ st.markdown('โ
Document Processing', unsafe_allow_html=True)
+ else:
+ st.markdown('โ Document Processing', unsafe_allow_html=True)
+
+ if processor.ai_extractor.use_transformers:
+ st.markdown('โ
AI Extraction', unsafe_allow_html=True)
+ else:
+ st.markdown('โ ๏ธ Regex Extraction', unsafe_allow_html=True)
+
+ if processor.vector_store and processor.vector_store.embedding_model:
+ st.markdown('โ
Semantic Search', unsafe_allow_html=True)
+ else:
+ st.markdown('โ ๏ธ Keyword Search Only', unsafe_allow_html=True)
+
+ # Quick stats
+ st.header("๐ Quick Stats")
+ try:
+ data = processor.load_json_data()
+ total_invoices = len(data.get("invoices", []))
+ total_amount = data.get("summary", {}).get("total_amount", 0)
+
+ st.metric("Total Invoices", total_invoices)
+ st.metric("Total Value", f"โน{total_amount:,.2f}")
+ st.metric("Success Rate", f"{processor.processing_stats['successful']}/{processor.processing_stats['total_processed']}")
+
+ except Exception as e:
+ st.error(f"Stats error: {e}")
+
+ # System info
+ st.header("โ๏ธ System Info")
+ st.info(f"""
+ **Session ID:** {session_id}
+
+ **Limits:**
+ โข Max file size: 10MB
+ โข Max concurrent files: 3
+ โข Timeout: 30s
+ """)
+
+ # Main navigation
+ selected_tab = st.radio(
+ "Choose a section:",
+ ["๐ค Upload & Process", "๐ฌ AI Chat", "๐ Analytics", "๐ Data Explorer"],
+ horizontal=True,
+ key=f"main_navigation_{session_id}"
+ )
+
+ # -------------------------------------------------------------------------
+ # UPLOAD & PROCESS SECTION
+ # -------------------------------------------------------------------------
+
+ if selected_tab == "๐ค Upload & Process":
+ st.header("๐ค Upload Invoice Documents")
+
+ # Feature highlights
+ col1, col2, col3 = st.columns(3)
+
+ with col1:
+ st.markdown("""
+
+
๐ค AI Extraction
+
Advanced NLP models extract structured data automatically
+
+ """, unsafe_allow_html=True)
+
+ with col2:
+ st.markdown("""
+
+
๐ Smart Search
+
Semantic search finds invoices using natural language
+
+ """, unsafe_allow_html=True)
+
+ with col3:
+ st.markdown("""
+
+
๐ Analytics
+
Comprehensive insights and visualizations
+
+ """, unsafe_allow_html=True)
+
+ # File upload
+ st.markdown("### ๐ Upload Your Invoices")
+
+ # Initialize session state for files if not exists
+ if f'uploaded_files_{session_id}' not in st.session_state:
+ st.session_state[f'uploaded_files_{session_id}'] = None
+ if f'processing_complete_{session_id}' not in st.session_state:
+ st.session_state[f'processing_complete_{session_id}'] = False
+ if f'currently_processing_{session_id}' not in st.session_state:
+ st.session_state[f'currently_processing_{session_id}'] = False
+ if f'processed_file_hashes_{session_id}' not in st.session_state:
+ st.session_state[f'processed_file_hashes_{session_id}'] = set()
+
+ # File uploader with stable key
+ uploaded_files = st.file_uploader(
+ "Choose invoice files (PDF, TXT supported)",
+ type=['pdf', 'txt'],
+ accept_multiple_files=True,
+ help="Maximum file size: 10MB per file",
+ key=f"file_uploader_stable_{session_id}"
+ )
+
+ # Store uploaded files in session state only if they're new
+ if uploaded_files:
+ # Create file hashes to detect if files have changed
+ current_file_hashes = set()
+ for file in uploaded_files:
+ file_hash = hash((file.name, file.size))
+ current_file_hashes.add(file_hash)
+
+ # Check if files have changed
+ stored_hashes = st.session_state.get(f'uploaded_file_hashes_{session_id}', set())
+ if current_file_hashes != stored_hashes:
+ st.session_state[f'uploaded_files_{session_id}'] = uploaded_files
+ st.session_state[f'uploaded_file_hashes_{session_id}'] = current_file_hashes
+ st.session_state[f'processing_complete_{session_id}'] = False
+ st.session_state[f'currently_processing_{session_id}'] = False
+ st.info("๐ New files detected - ready for processing")
+
+ # Get files from session state
+ current_files = st.session_state[f'uploaded_files_{session_id}']
+ is_processing = st.session_state[f'currently_processing_{session_id}']
+ is_complete = st.session_state[f'processing_complete_{session_id}']
+
+ if current_files:
+ max_files = 3
+ if len(current_files) > max_files:
+ st.warning(f"โ ๏ธ Too many files selected. Processing first {max_files} files.")
+ current_files = current_files[:max_files]
+
+ st.info(f"๐ {len(current_files)} files selected")
+
+ # Show file names
+ st.markdown("**Selected Files:**")
+ for i, file in enumerate(current_files, 1):
+ file_size_mb = len(file.getvalue()) / (1024 * 1024)
+ file_hash = hash((file.name, file.size))
+ processed_icon = "โ
" if file_hash in st.session_state[f'processed_file_hashes_{session_id}'] else "๐"
+ st.write(f"{processed_icon} {i}. {file.name} ({file_size_mb:.2f} MB)")
+
+ # Process button - only show if not currently processing
+ col1, col2 = st.columns([1, 1])
+
+ with col1:
+ if not is_processing and not is_complete:
+ if st.button("๐ Process Files", type="primary", key=f"process_btn_{session_id}"):
+ st.session_state[f'currently_processing_{session_id}'] = True
+ st.rerun()
+ elif is_processing:
+ st.info("๐ Processing in progress...")
+ # Actually process the files here
+ process_files_once(current_files, session_id)
+ elif is_complete:
+ st.success("โ
Processing completed!")
+ if st.button("๐ Process Again", key=f"reprocess_btn_{session_id}"):
+ st.session_state[f'processing_complete_{session_id}'] = False
+ st.session_state[f'currently_processing_{session_id}'] = False
+ st.session_state[f'processed_file_hashes_{session_id}'] = set()
+ st.rerun()
+
+ with col2:
+ if st.button("๐๏ธ Clear Files", key=f"clear_files_{session_id}"):
+ # Clear all session state related to files
+ keys_to_clear = [
+ f'uploaded_files_{session_id}',
+ f'uploaded_file_hashes_{session_id}',
+ f'processing_complete_{session_id}',
+ f'currently_processing_{session_id}',
+ f'processed_file_hashes_{session_id}'
+ ]
+
+ for key in keys_to_clear:
+ if key in st.session_state:
+ del st.session_state[key]
+
+ st.success("๐๏ธ Files cleared successfully!")
+ time.sleep(1) # Brief pause to show message
+ st.rerun()
+
+ else:
+ st.info("๐ Please select invoice files to upload and process")
+
+ # Show processing results if completed
+ if is_complete:
+ st.markdown("### ๐ Recent Processing Results")
+ try:
+ data = st.session_state.processor.load_json_data()
+ recent_invoices = sorted(
+ data.get("invoices", []),
+ key=lambda x: x.get('timestamps', {}).get('created_at', ''),
+ reverse=True
+ )[:5]
+
+ if recent_invoices:
+ for i, inv in enumerate(recent_invoices, 1):
+ with st.expander(f"๐ {inv.get('invoice_number', f'Invoice {i}')} - {inv.get('supplier_name', 'Unknown')}", expanded=False):
+ col1, col2 = st.columns(2)
+ with col1:
+ st.write(f"**Invoice #:** {inv.get('invoice_number', 'N/A')}")
+ st.write(f"**Supplier:** {inv.get('supplier_name', 'N/A')}")
+ st.write(f"**Amount:** โน{inv.get('amount', 0):.2f}")
+ with col2:
+ st.write(f"**Date:** {inv.get('date', 'N/A')}")
+ st.write(f"**Method:** {inv.get('extraction_info', {}).get('method', 'N/A')}")
+ st.write(f"**Confidence:** {inv.get('extraction_info', {}).get('confidence', 0):.1%}")
+ else:
+ st.info("No recent processing results found.")
+ except Exception as e:
+ st.error(f"Error loading recent results: {e}")
+
+ # -------------------------------------------------------------------------
+ # AI CHAT SECTION
+ # -------------------------------------------------------------------------
+
+ elif selected_tab == "๐ฌ AI Chat":
+ st.header("๐ฌ AI Chat Interface")
+
+ # Display chat history
+ if st.session_state.chat_history:
+ st.markdown("### ๐ฌ Chat History")
+ for i, message in enumerate(st.session_state.chat_history):
+ with st.chat_message(message["role"]):
+ st.markdown(message["content"])
+
+ # Chat input
+ st.markdown("### โ๏ธ Ask a Question")
+
+ col1, col2 = st.columns([4, 1])
+
+ with col1:
+ user_input = st.text_input(
+ "Type your question:",
+ placeholder="e.g., 'show me total spending'",
+ key=f"chat_input_{session_id}"
+ )
+
+ with col2:
+ ask_btn = st.button("๐ Ask", type="primary", key=f"ask_btn_{session_id}")
+
+ if ask_btn and user_input:
+ handle_chat_query(user_input)
+
+ # Suggested queries
+ if not st.session_state.chat_history:
+ st.markdown("### ๐ก Try These Queries")
+
+ col1, col2 = st.columns(2)
+
+ with col1:
+ st.markdown("**๐ Basic Queries:**")
+ basic_queries = [
+ "Show me a summary of all invoices",
+ "How much have we spent in total?",
+ "Who are our top suppliers?",
+ "Find invoices with high amounts"
+ ]
+ for i, query in enumerate(basic_queries):
+ if st.button(query, key=f"basic_{session_id}_{i}"):
+ handle_chat_query(query)
+
+ with col2:
+ st.markdown("**๐ Advanced Queries:**")
+ advanced_queries = [
+ "Find technology purchases",
+ "Show office supplies",
+ "Search consulting services",
+ "Recent high-value invoices"
+ ]
+ for i, query in enumerate(advanced_queries):
+ if st.button(query, key=f"advanced_{session_id}_{i}"):
+ handle_chat_query(query)
+
+ # Clear chat
+ if st.session_state.chat_history:
+ if st.button("๐๏ธ Clear Chat", key=f"clear_chat_{session_id}"):
+ st.session_state.chat_history = []
+ st.rerun()
+
+ # -------------------------------------------------------------------------
+ # ANALYTICS SECTION
+ # -------------------------------------------------------------------------
+
+ elif selected_tab == "๐ Analytics":
+ st.header("๐ Analytics Dashboard")
+
+ try:
+ data = st.session_state.processor.load_json_data()
+ invoices = data.get("invoices", [])
+
+ if not invoices:
+ st.info("๐ No data available. Upload some invoices to see analytics.")
+ return
+
+ # Convert to DataFrame
+ df_data = []
+ for inv in invoices:
+ df_data.append({
+ 'invoice_number': inv.get('invoice_number', ''),
+ 'supplier_name': inv.get('supplier_name', ''),
+ 'amount': inv.get('amount', 0),
+ 'date': inv.get('date', ''),
+ 'confidence': inv.get('extraction_info', {}).get('confidence', 0)
+ })
+
+ df = pd.DataFrame(df_data)
+
+ # Key metrics
+ col1, col2, col3, col4 = st.columns(4)
+
+ with col1:
+ st.metric("Total Invoices", len(df))
+ with col2:
+ st.metric("Total Amount", f"โน{df['amount'].sum():,.2f}")
+ with col3:
+ st.metric("Avg Amount", f"โน{df['amount'].mean():,.2f}")
+ with col4:
+ st.metric("Unique Suppliers", df['supplier_name'].nunique())
+
+ # Visualizations
+ if len(df) > 0:
+ # Amount distribution
+ fig_hist = px.histogram(
+ df,
+ x='amount',
+ title="Invoice Amount Distribution",
+ labels={'amount': 'Amount (โน)', 'count': 'Number of Invoices'}
+ )
+ st.plotly_chart(fig_hist, use_container_width=True)
+
+ # Top suppliers
+ if df['supplier_name'].notna().any():
+ supplier_amounts = df.groupby('supplier_name')['amount'].sum().sort_values(ascending=False).head(10)
+
+ if len(supplier_amounts) > 0:
+ fig_suppliers = px.bar(
+ x=supplier_amounts.values,
+ y=supplier_amounts.index,
+ orientation='h',
+ title="Top 10 Suppliers by Total Amount",
+ labels={'x': 'Total Amount (โน)', 'y': 'Supplier'}
+ )
+ st.plotly_chart(fig_suppliers, use_container_width=True)
+
+ except Exception as e:
+ st.error(f"Analytics error: {e}")
+
+ # -------------------------------------------------------------------------
+ # DATA EXPLORER SECTION
+ # -------------------------------------------------------------------------
+
+ elif selected_tab == "๐ Data Explorer":
+ st.header("๐ Data Explorer")
+
+ try:
+ data = st.session_state.processor.load_json_data()
+ invoices = data.get("invoices", [])
+
+ if not invoices:
+ st.info("๐ No data available. Upload some invoices first.")
+ return
+
+ # Convert to DataFrame
+ df_data = []
+ for inv in invoices:
+ df_data.append({
+ 'Invoice Number': inv.get('invoice_number', ''),
+ 'Supplier': inv.get('supplier_name', ''),
+ 'Buyer': inv.get('buyer_name', ''),
+ 'Amount': inv.get('amount', 0),
+ 'Date': inv.get('date', ''),
+ 'Confidence': inv.get('extraction_info', {}).get('confidence', 0),
+ 'Method': inv.get('extraction_info', {}).get('method', ''),
+ 'File': inv.get('file_info', {}).get('file_name', ''),
+ 'Created': inv.get('timestamps', {}).get('created_at', '')[:19]
+ })
+
+ df = pd.DataFrame(df_data)
+
+ # Filters
+ col1, col2, col3 = st.columns(3)
+
+ with col1:
+ suppliers = ['All'] + sorted(df['Supplier'].dropna().unique().tolist())
+ selected_supplier = st.selectbox("Filter by Supplier", suppliers, key=f"supplier_filter_{session_id}")
+
+ with col2:
+ methods = ['All'] + sorted(df['Method'].dropna().unique().tolist())
+ selected_method = st.selectbox("Filter by Method", methods, key=f"method_filter_{session_id}")
+
+ with col3:
+ min_amount = st.number_input("Min Amount", min_value=0.0, value=0.0, key=f"amount_filter_{session_id}")
+
+ # Apply filters
+ filtered_df = df.copy()
+ if selected_supplier != 'All':
+ filtered_df = filtered_df[filtered_df['Supplier'] == selected_supplier]
+ if selected_method != 'All':
+ filtered_df = filtered_df[filtered_df['Method'] == selected_method]
+ if min_amount > 0:
+ filtered_df = filtered_df[filtered_df['Amount'] >= min_amount]
+
+ # Display data
+ st.dataframe(
+ filtered_df,
+ use_container_width=True,
+ column_config={
+ "Amount": st.column_config.NumberColumn("Amount", format="โน%.2f"),
+ "Confidence": st.column_config.ProgressColumn("Confidence", min_value=0, max_value=1)
+ }
+ )
+
+ # Export options
+ col1, col2 = st.columns(2)
+
+ with col1:
+ if st.button("๐ฅ Export CSV", key=f"export_csv_{session_id}"):
+ csv_data = filtered_df.to_csv(index=False)
+ st.download_button(
+ "Download CSV",
+ csv_data,
+ f"invoices_{datetime.now().strftime('%Y%m%d_%H%M')}.csv",
+ "text/csv",
+ key=f"download_csv_{session_id}"
+ )
+
+ with col2:
+ if st.button("๐ Export JSON", key=f"export_json_{session_id}"):
+ filtered_invoices = [inv for inv in invoices
+ if inv.get('invoice_number') in filtered_df['Invoice Number'].values]
+
+ export_data = {
+ "exported_at": datetime.now().isoformat(),
+ "total_records": len(filtered_invoices),
+ "invoices": filtered_invoices
+ }
+
+ st.download_button(
+ "Download JSON",
+ json.dumps(export_data, indent=2),
+ f"invoices_{datetime.now().strftime('%Y%m%d_%H%M')}.json",
+ "application/json",
+ key=f"download_json_{session_id}"
+ )
+
+ except Exception as e:
+ st.error(f"Data explorer error: {e}")
+
+ # -------------------------------------------------------------------------
+ # GLOBAL CHAT INPUT
+ # -------------------------------------------------------------------------
+
+ st.markdown("---")
+ st.markdown("### ๐ฌ Quick Chat (Works from any section)")
+
+ global_query = st.chat_input("Ask about your invoices...", key=f"global_chat_{session_id}")
+
+ if global_query:
+ handle_chat_query(global_query, show_response=True)
+
+ # Footer
+ st.markdown("---")
+ st.markdown("""
+
+
๐ AI Invoice Processing System - Optimized for Hugging Face Spaces
+
Built with โค๏ธ using Streamlit, Transformers, and AI
+
+ """, unsafe_allow_html=True)
+
+# ===============================================================================
+# HELPER FUNCTIONS
+# ===============================================================================
+
def process_files_once(uploaded_files, session_id):
    """Process uploaded files exactly once, tracking state per session.

    Files are identified by a hash of (name, size); anything already in the
    session's processed-hash set is skipped. Renders progress, status, and
    per-file results as it goes, then flips the session flags and reruns.

    Args:
        uploaded_files: list of Streamlit UploadedFile objects.
        session_id: suffix namespacing this session's state keys.
    """
    if not uploaded_files:
        st.error("No files to process!")
        st.session_state[f'currently_processing_{session_id}'] = False
        return

    st.markdown("### ๐ Processing Files...")

    # Hashes of files already handled in this session.
    processed_hashes = st.session_state[f'processed_file_hashes_{session_id}']

    # Keep only files whose (name, size) hash has not been seen yet.
    files_to_process = []
    for file in uploaded_files:
        file_hash = hash((file.name, file.size))
        if file_hash not in processed_hashes:
            files_to_process.append((file, file_hash))

    if not files_to_process:
        # BUGFIX: this message string was previously broken across two lines.
        st.info("✅ All files have already been processed!")
        st.session_state[f'currently_processing_{session_id}'] = False
        st.session_state[f'processing_complete_{session_id}'] = True
        return

    # Separate containers so progress/status/results update independently.
    progress_container = st.container()
    status_container = st.container()
    results_container = st.container()

    successful = 0
    failed = 0

    with progress_container:
        progress_bar = st.progress(0)
        progress_text = st.empty()

    with status_container:
        st.info(f"Starting to process {len(files_to_process)} new files...")

    # Process each new file exactly once.
    for i, (uploaded_file, file_hash) in enumerate(files_to_process):
        current_progress = (i + 1) / len(files_to_process)

        with progress_container:
            progress_bar.progress(current_progress)
            progress_text.text(f"Processing file {i+1}/{len(files_to_process)}: {uploaded_file.name}")

        with status_container:
            st.info(f"๐ Processing: {uploaded_file.name} ({len(uploaded_file.getvalue())/1024:.1f} KB)")

        try:
            result = st.session_state.processor.process_uploaded_file(uploaded_file)

            # Mark as processed regardless of extraction quality so the
            # same file is never re-run in this session.
            processed_hashes.add(file_hash)

            with results_container:
                if result and hasattr(result, 'invoice_number') and result.invoice_number:
                    successful += 1
                    st.success(f"✅ Successfully processed: {uploaded_file.name}")

                    # Show the key extracted fields side by side.
                    col1, col2, col3 = st.columns(3)
                    with col1:
                        st.write(f"**Invoice #:** {result.invoice_number}")
                        st.write(f"**Supplier:** {result.supplier_name or 'Not found'}")
                    with col2:
                        st.write(f"**Amount:** โน{result.amount:.2f}")
                        st.write(f"**Date:** {result.date or 'Not found'}")
                    with col3:
                        st.write(f"**Method:** {result.processing_method}")
                        st.write(f"**Confidence:** {result.extraction_confidence:.1%}")

                    st.markdown("---")
                else:
                    failed += 1
                    st.warning(f"โ ๏ธ Could not extract complete data from: {uploaded_file.name}")
                    if result:
                        st.write(f"Partial data: {result.supplier_name}, โน{result.amount}")
                    st.markdown("---")

        except Exception as e:
            failed += 1
            # Still mark as processed to avoid reprocessing a failing file.
            processed_hashes.add(file_hash)

            with results_container:
                st.error(f"โ Error processing {uploaded_file.name}: {str(e)}")
                st.markdown("---")

    # Persist the updated hash set back to session state.
    st.session_state[f'processed_file_hashes_{session_id}'] = processed_hashes

    with progress_container:
        progress_bar.progress(1.0)
        progress_text.text("✅ Processing completed!")

    with status_container:
        if successful > 0:
            st.success(f"๐ Processing complete! {successful} successful, {failed} failed")
            # (A redundant nested `if successful > 0` guard was removed here.)
            st.balloons()
        else:
            st.error(f"โ Processing failed for all {failed} files. Please check file formats and content.")

    # Flip processing flags and rerun so the UI reflects completion.
    st.session_state[f'currently_processing_{session_id}'] = False
    st.session_state[f'processing_complete_{session_id}'] = True

    st.rerun()
+
def process_files(uploaded_files, session_id):
    """Backward-compatible alias kept for older call sites.

    Delegates straight to :func:`process_files_once`.
    """
    return process_files_once(uploaded_files, session_id)
+
def handle_chat_query(query, show_response=False):
    """Record a user query, fetch the chatbot's answer, and refresh the UI.

    Args:
        query: the user's natural-language question.
        show_response: when True, also render the answer inline (used by
            the global quick-chat input outside the chat tab).
    """
    st.session_state.chat_history.append({
        "role": "user",
        "content": query,
        "timestamp": datetime.now()
    })

    try:
        with st.spinner("๐ค AI is analyzing..."):
            response = st.session_state.chatbot.query_database(query)

        st.session_state.chat_history.append({
            "role": "assistant",
            "content": response,
            "timestamp": datetime.now()
        })

        if show_response:
            with st.chat_message("assistant"):
                st.markdown(response)
            st.info("๐ก Switch to the 'AI Chat' section to see full conversation history!")

    except Exception as e:
        st.error(f"Chat error: {e}")
        return

    # BUGFIX: st.rerun() signals Streamlit by raising an internal exception.
    # It must sit OUTSIDE the try/except above, otherwise the broad
    # `except Exception` swallows the rerun and reports a bogus "Chat error".
    st.rerun()
+
+# ===============================================================================
+# MAIN ENTRY POINT
+# ===============================================================================
+
def main():
    """Main entry point for Hugging Face Spaces."""
    try:
        # Surface the hosting environment in the sidebar when on Spaces.
        if IS_HF_SPACE:
            st.sidebar.info("๐ค Running on Hugging Face Spaces")

        create_app()

    except Exception as exc:
        st.error(f"""
    ## ๐จ Application Error

    {exc}

    Please refresh the page or check the logs for more details.
    """)
+
if __name__ == "__main__":
    # BUGFIX: the call previously ended with a stray trailing comma
    # (`main(),`), which wrapped the result in a throwaway one-tuple.
    main()
+
+ # Standalone amounts with currency words
+ r'([0-9,]+\.?\d*)\s*(?:dollars?|rupees?|usd|inr|eur|gbp)',
+
+ # Table-like patterns
+ r'(?:price|cost|rate)\s*:?\s*[\$โนยฃโฌ]?\s*([0-9,]+\.?\d*)',
+
+ # Amount with decimal precision
+ r'(?:^|\s)([0-9]{1,3}(?:,\d{3})*\.?\d{0,2})(?=\s|$)',
+ ],
+ 'date': [
+ r'date\s*:?\s*(\d{1,2}[/\-\.]\d{1,2}[/\-\.]\d{2,4})',
+ r'(?:invoice|bill)\s*date\s*:?\s*(\d{1,2}[/\-\.]\d{1,2}[/\-\.]\d{2,4})',
+ r'(?:^|\s)(\d{1,2}[/\-\.]\d{1,2}[/\-\.]\d{2,4})(?=\s|$)',
+ r'(\d{4}[/\-\.]\d{1,2}[/\-\.]\d{1,2})',
+ r'(\d{1,2}\s+(?:jan|feb|mar|apr|may|jun|jul|aug|sep|oct|nov|dec)\s+\d{2,4})',
+ ],
+ 'quantity': [
+ r'qty\s*:?\s*(\d+)',
+ r'quantity\s*:?\s*(\d+)',
+ r'(?:units?|pcs?|pieces?)\s*:?\s*(\d+)',
+ r'(\d+)\s*(?:pcs?|units?|items?|pieces?)',
+ ]
+ }
+
+ text_lower = text.lower()
+
+ # Extract invoice number with multiple attempts
+ for pattern in patterns['invoice_number']:
+ match = re.search(pattern, text_lower, re.IGNORECASE | re.MULTILINE)
+ if match:
+ invoice_data.invoice_number = match.group(1).upper().strip()
+ break
+
+ # Extract amount with enhanced logic
+ amounts_found = []
+ for pattern in patterns['amount']:
+ matches = re.finditer(pattern, text_lower, re.IGNORECASE | re.MULTILINE)
+ for match in matches:
+ try:
+ amount_str = match.group(1).replace(',', '').replace(' ', '')
+ amount_val = float(amount_str)
+ if 0.01 <= amount_val <= 1000000: # Reasonable range
+ amounts_found.append(amount_val)
+ except (ValueError, IndexError):
+ continue
+
+ # Choose the most likely amount (highest value or most repeated)
+ if amounts_found:
+ # Remove duplicates and sort
+ unique_amounts = sorted(set(amounts_found), reverse=True)
+ # Take the highest reasonable amount
+ invoice_data.amount = unique_amounts[0]
+
+ # Extract date
+ for pattern in patterns['date']:
+ match = re.search(pattern, text, re.IGNORECASE | re.MULTILINE)
+ if match:
+ invoice_data.date = self.parse_date(match.group(1))
+ break
+
+ # Extract quantity
+ for pattern in patterns['quantity']:
+ match = re.search(pattern, text_lower, re.IGNORECASE)
+ if match:
+ try:
+ invoice_data.quantity = int(match.group(1))
+ break
+ except ValueError:
+ continue
+
+ # Enhanced company name extraction
+ company_patterns = [
+ r'(?:from|supplier|vendor)\s*:?\s*([A-Z][A-Za-z\s&,\.]{3,50})',
+ r'(?:to|buyer|client)\s*:?\s*([A-Z][A-Za-z\s&,\.]{3,50})',
+ r'([A-Z][A-Za-z\s&,\.]{3,50})\s*(?:ltd|inc|corp|llc|co\.|company|pvt|private|limited)',
+ r'(?:^|\n)([A-Z][A-Za-z\s&,\.]{3,50})\s*(?:\n|$)',
+ ]
+
+ companies_found = []
+ for pattern in company_patterns:
+ matches = re.findall(pattern, text, re.MULTILINE)
+ for match in matches:
+ clean_company = match.strip().title()
+ if len(clean_company) > 3 and not any(word in clean_company.lower() for word in ['total', 'amount', 'date', 'invoice']):
+ companies_found.append(clean_company)
+
+ # Assign companies (first as supplier, second as buyer)
+ if companies_found:
+ invoice_data.supplier_name = companies_found[0]
+ if len(companies_found) > 1:
+ invoice_data.buyer_name = companies_found[1]
+
+ # Extract product description
+ desc_patterns = [
+ r'(?:description|item|product|service)\s*:?\s*([A-Za-z0-9\s,.-]{10,200})',
+ r'(?:for|regarding)\s*:?\s*([A-Za-z0-9\s,.-]{10,200})',
+ ]
+
+ for pattern in desc_patterns:
+ match = re.search(pattern, text, re.IGNORECASE)
+ if match:
+ desc = match.group(1).strip()
+ if len(desc) > 5:
+ invoice_data.product_description = desc[:200] # Limit length
+ break
+
+ # Set confidence based on how much we extracted
+ confidence_factors = []
+ if invoice_data.invoice_number:
+ confidence_factors.append(0.3)
+ if invoice_data.amount > 0:
+ confidence_factors.append(0.3)
+ if invoice_data.supplier_name:
+ confidence_factors.append(0.2)
+ if invoice_data.date:
+ confidence_factors.append(0.1)
+ if invoice_data.quantity > 0:
+ confidence_factors.append(0.1)
+
+ invoice_data.extraction_confidence = sum(confidence_factors)
+
+ return invoice_data
+
+ def parse_date(self, date_str: str) -> str:
+ """Parse date to YYYY-MM-DD format"""
+ if not date_str:
+ return ""
+
+ formats = ['%Y-%m-%d', '%m/%d/%Y', '%d/%m/%Y', '%m-%d-%Y', '%d-%m-%Y', '%Y/%m/%d']
+
+ for fmt in formats:
+ try:
+ parsed_date = datetime.strptime(date_str, fmt)
+ return parsed_date.strftime('%Y-%m-%d')
+ except ValueError:
+ continue
+
+ return date_str
+
+# ===============================================================================
+# VECTOR STORE CLASS
+# ===============================================================================
+
class VectorStore:
    """Lightweight on-disk vector store for semantic invoice search.

    Embeddings come from a SentenceTransformer model and are persisted,
    with per-document metadata, as pickle files under the app data dir.
    Search is brute-force cosine similarity over all stored vectors;
    embeddings are L2-normalized at encode time, so a dot product equals
    cosine similarity.
    """

    def __init__(self, embedding_model: str = "all-MiniLM-L6-v2"):
        self.embedding_model_name = embedding_model
        self.vector_store_path = os.path.join(HF_CONFIG["data_dir"], "vectors.pkl")
        self.metadata_path = os.path.join(HF_CONFIG["data_dir"], "metadata.pkl")
        self.embedding_model = None        # SentenceTransformer instance, or None if unavailable
        self.vectors = []                  # one 1-D embedding per document
        self.document_metadata = []        # metadata dict per vector, same order
        self.embedding_dimension = None    # dimensionality of the embedding space

        self.setup_embedding_model()
        self.load_vector_store()

    def setup_embedding_model(self):
        """Load the sentence-transformer model, or leave vector search disabled."""
        if not SENTENCE_TRANSFORMERS_AVAILABLE:
            st.warning("โ ๏ธ Sentence Transformers not available. Vector search disabled.")
            return

        try:
            with st.spinner(f"Loading embedding model: {self.embedding_model_name}..."):
                self.embedding_model = SentenceTransformer(
                    self.embedding_model_name,
                    cache_folder=HF_CONFIG["cache_dir"]
                )

            # BUGFIX: encode(["test"]) returns a (1, dim) batch, so reading
            # shape[0] stored the batch size (always 1) as the dimension.
            # Encoding a single string yields a 1-D vector whose last axis
            # is the true embedding dimension.
            test_embedding = self.embedding_model.encode("test")
            self.embedding_dimension = int(test_embedding.shape[-1])

            st.success(f"✅ Embedding model loaded: {self.embedding_model_name}")

        except Exception as e:
            st.error(f"โ Failed to load embedding model: {e}")
            self.embedding_model = None

    def load_vector_store(self):
        """Load existing vectors/metadata from disk, or start empty."""
        try:
            if os.path.exists(self.vector_store_path) and os.path.exists(self.metadata_path):
                # NOTE(security): pickle.load executes arbitrary code if the
                # files are tampered with; these files are app-local only.
                with open(self.vector_store_path, 'rb') as f:
                    self.vectors = pickle.load(f)

                with open(self.metadata_path, 'rb') as f:
                    self.document_metadata = pickle.load(f)

                st.success(f"✅ Vector store loaded: {len(self.document_metadata)} documents")
            else:
                self.vectors = []
                self.document_metadata = []
                st.info("๐ New vector store initialized")

        except Exception as e:
            st.error(f"โ Error loading vector store: {e}")
            self.vectors = []
            self.document_metadata = []

    def save_vector_store(self):
        """Persist vectors and metadata to disk. Returns True on success."""
        try:
            with open(self.vector_store_path, 'wb') as f:
                pickle.dump(self.vectors, f)

            with open(self.metadata_path, 'wb') as f:
                pickle.dump(self.document_metadata, f)

            return True
        except Exception as e:
            st.error(f"Error saving vector store: {e}")
            return False

    def create_document_text(self, invoice_data: dict, raw_text: str = "") -> str:
        """Flatten invoice fields (plus a raw-text snippet) into searchable text.

        Skips the 'id' field and any falsy values; raw text is truncated to
        300 characters.
        """
        text_parts = []

        for field, value in invoice_data.items():
            if value and field != 'id':
                text_parts.append(f"{field}: {value}")

        if raw_text:
            text_parts.append(f"content: {raw_text[:300]}")

        return " | ".join(text_parts)

    def add_document(self, invoice_data: dict, raw_text: str = "") -> bool:
        """Embed one invoice and append it to the in-memory store.

        Returns False when no embedding model is available or on error.
        NOTE: does not persist; callers must invoke save_vector_store().
        """
        if not self.embedding_model:
            return False

        try:
            document_text = self.create_document_text(invoice_data, raw_text)

            # Normalized embedding -> dot product below is cosine similarity.
            embedding = self.embedding_model.encode(document_text, normalize_embeddings=True)

            metadata = {
                'invoice_id': invoice_data.get('id', ''),
                'invoice_number': invoice_data.get('invoice_number', ''),
                'supplier_name': invoice_data.get('supplier_name', ''),
                'buyer_name': invoice_data.get('buyer_name', ''),
                'amount': invoice_data.get('amount', 0),
                'date': invoice_data.get('date', ''),
                'file_name': invoice_data.get('file_info', {}).get('file_name', ''),
                'document_text': document_text[:200],
                'timestamp': datetime.now().isoformat()
            }

            self.vectors.append(embedding)
            self.document_metadata.append(metadata)

            return True

        except Exception as e:
            st.error(f"Error adding document to vector store: {e}")
            return False

    def semantic_search(self, query: str, top_k: int = 5) -> "List[VectorSearchResult]":
        """Return up to top_k documents most similar to the query.

        Results below a 0.1 cosine-similarity threshold are dropped.
        """
        if not self.embedding_model or not self.vectors:
            return []

        try:
            query_embedding = self.embedding_model.encode(query, normalize_embeddings=True)

            # Brute-force cosine similarity (embeddings are pre-normalized).
            similarities = []
            for i, doc_embedding in enumerate(self.vectors):
                similarity = np.dot(query_embedding, doc_embedding)
                similarities.append((similarity, i))

            similarities.sort(reverse=True)

            results = []
            for similarity, idx in similarities[:top_k]:
                if similarity > 0.1:  # relevance threshold
                    metadata = self.document_metadata[idx]
                    result = VectorSearchResult(
                        invoice_id=metadata.get('invoice_id', ''),
                        invoice_number=metadata.get('invoice_number', ''),
                        supplier_name=metadata.get('supplier_name', ''),
                        similarity_score=float(similarity),
                        content_preview=metadata.get('document_text', ''),
                        metadata=metadata
                    )
                    results.append(result)

            return results

        except Exception as e:
            st.error(f"Error in semantic search: {e}")
            return []
+
+# ===============================================================================
+# MAIN PROCESSOR CLASS
+# ===============================================================================
+
class InvoiceProcessor:
    """Main invoice processor for Hugging Face Spaces.

    Coordinates text extraction, AI/regex field extraction, JSON persistence,
    and (optionally) the vector store. Keeps simple in-memory processing
    counters for the lifetime of the instance.
    """

    def __init__(self):
        self.setup_storage()
        self.document_processor = DocumentProcessor()
        self.ai_extractor = AIExtractor()
        self.vector_store = VectorStore() if SENTENCE_TRANSFORMERS_AVAILABLE else None

        # Per-instance processing counters.
        self.processing_stats = {
            'total_processed': 0,
            'successful': 0,
            'failed': 0,
            'start_time': datetime.now()
        }

    def setup_storage(self):
        """Ensure the data directory's invoices.json exists with a valid skeleton."""
        self.data_dir = HF_CONFIG["data_dir"]
        self.json_path = os.path.join(self.data_dir, "invoices.json")

        if not os.path.exists(self.json_path):
            initial_data = {
                "metadata": {
                    "created_at": datetime.now().isoformat(),
                    "version": "hf_v1.0",
                    "total_invoices": 0
                },
                "invoices": [],
                "summary": {
                    "total_amount": 0.0,
                    "unique_suppliers": [],
                    "processing_stats": {"successful": 0, "failed": 0}
                }
            }
            self.save_json_data(initial_data)

    def load_json_data(self) -> dict:
        """Load invoice data from JSON, recovering from missing/corrupt files.

        BUGFIX: the previous version recursed into itself after
        setup_storage(), but setup_storage only writes when the file is
        MISSING — a present-but-corrupt JSON file therefore recursed
        forever. Now we retry once and fall back to an empty skeleton.
        """
        try:
            with open(self.json_path, 'r', encoding='utf-8') as f:
                return json.load(f)
        except (FileNotFoundError, json.JSONDecodeError):
            self.setup_storage()
            try:
                with open(self.json_path, 'r', encoding='utf-8') as f:
                    return json.load(f)
            except (FileNotFoundError, json.JSONDecodeError):
                # Last resort: a valid empty structure so callers keep working.
                return {
                    "metadata": {"total_invoices": 0},
                    "invoices": [],
                    "summary": {
                        "total_amount": 0.0,
                        "unique_suppliers": [],
                        "processing_stats": {"successful": 0, "failed": 0}
                    }
                }

    def save_json_data(self, data: dict):
        """Write invoice data to JSON (UTF-8, human-readable)."""
        try:
            with open(self.json_path, 'w', encoding='utf-8') as f:
                json.dump(data, f, indent=2, ensure_ascii=False)
        except Exception as e:
            st.error(f"Error saving data: {e}")

    def process_uploaded_file(self, uploaded_file) -> "InvoiceData":
        """Process one uploaded file end-to-end, with verbose UI debugging.

        Validates size/type, extracts text, runs AI/regex extraction, and
        persists the result. Returns an empty InvoiceData on any failure.
        """
        self.processing_stats['total_processed'] += 1

        try:
            file_size = len(uploaded_file.getvalue())
            file_extension = uploaded_file.name.split('.')[-1].lower() if '.' in uploaded_file.name else 'unknown'

            st.info(f"๐ Processing: {uploaded_file.name} ({file_size/1024:.1f} KB, .{file_extension})")

            # Reject oversized files up front.
            if file_size > HF_CONFIG["max_file_size_mb"] * 1024 * 1024:
                error_msg = f"File too large: {file_size / 1024 / 1024:.2f}MB > {HF_CONFIG['max_file_size_mb']}MB"
                st.error(error_msg)
                self.processing_stats['failed'] += 1
                return InvoiceData()

            # Only PDF and plain text are supported.
            if file_extension not in ['pdf', 'txt']:
                error_msg = f"Unsupported file type: .{file_extension} (supported: PDF, TXT)"
                st.warning(error_msg)
                self.processing_stats['failed'] += 1
                return InvoiceData()

            # Spill to a temp file so the document processor can read a path.
            with tempfile.NamedTemporaryFile(delete=False, suffix=f".{file_extension}") as tmp_file:
                file_content = uploaded_file.getvalue()
                tmp_file.write(file_content)
                tmp_file_path = tmp_file.name

            st.info(f"๐พ Saved temporarily to: {tmp_file_path}")

            try:
                st.info("๐ Extracting text from document...")
                text = self.document_processor.extract_text_from_document(tmp_file_path)

                if not text or not text.strip():
                    st.warning(f"โ No text extracted from {uploaded_file.name}")
                    self.processing_stats['failed'] += 1
                    return InvoiceData()

                text_length = len(text)
                st.info(f"๐ Extracted {text_length} characters of text")

                if text_length > 0:
                    with st.expander("๐ Text Preview (First 500 characters)", expanded=False):
                        st.text(text[:500] + "..." if len(text) > 500 else text)

                st.info("๐ค Extracting invoice data using AI/Regex...")
                invoice_data = self.ai_extractor.extract_with_ai(text)
                invoice_data.file_path = uploaded_file.name

                st.info(f"๐ Extraction completed with {invoice_data.extraction_confidence:.1%} confidence")

                st.info("๐พ Saving extracted data...")
                self.save_invoice_data(invoice_data, text, file_size)

                self.processing_stats['successful'] += 1
                st.success(f"✅ Successfully processed {uploaded_file.name}")

                return invoice_data

            finally:
                # Always remove the temp file; ignore OS-level races only.
                try:
                    os.unlink(tmp_file_path)
                    st.info("๐งน Cleaned up temporary file")
                except OSError:
                    pass

        except Exception as e:
            error_msg = f"Error processing {uploaded_file.name}: {str(e)}"
            st.error(error_msg)
            self.processing_stats['failed'] += 1

            with st.expander("๐ Error Details", expanded=False):
                st.code(str(e))
                import traceback
                st.code(traceback.format_exc())

            return InvoiceData()

    def save_invoice_data(self, invoice_data: "InvoiceData", raw_text: str, file_size: int):
        """Append one invoice record to JSON storage and the vector store."""
        try:
            data = self.load_json_data()

            invoice_record = {
                # NOTE: ids are positional; deleting records elsewhere could
                # produce duplicate ids.
                "id": len(data["invoices"]) + 1,
                "invoice_number": invoice_data.invoice_number,
                "supplier_name": invoice_data.supplier_name,
                "buyer_name": invoice_data.buyer_name,
                "date": invoice_data.date,
                "amount": invoice_data.amount,
                "quantity": invoice_data.quantity,
                "product_description": invoice_data.product_description,
                "file_info": {
                    "file_name": invoice_data.file_path,
                    "file_size": file_size
                },
                "extraction_info": {
                    "confidence": invoice_data.extraction_confidence,
                    "method": invoice_data.processing_method,
                    "raw_text_preview": raw_text[:300]
                },
                "timestamps": {
                    "created_at": datetime.now().isoformat()
                }
            }

            data["invoices"].append(invoice_record)

            self.update_summary(data)
            self.save_json_data(data)

            # Index for semantic search when the vector store is available.
            if self.vector_store:
                self.vector_store.add_document(invoice_record, raw_text)
                self.vector_store.save_vector_store()

        except Exception as e:
            st.error(f"Error saving invoice data: {e}")

    def update_summary(self, data: dict):
        """Recompute summary statistics and refresh metadata in place."""
        invoices = data["invoices"]

        total_amount = sum(inv.get("amount", 0) for inv in invoices)
        unique_suppliers = list(set(inv.get("supplier_name", "") for inv in invoices if inv.get("supplier_name")))

        data["summary"] = {
            "total_amount": total_amount,
            "unique_suppliers": unique_suppliers,
            "processing_stats": {
                "successful": self.processing_stats['successful'],
                "failed": self.processing_stats['failed'],
                "total_processed": self.processing_stats['total_processed']
            }
        }

        data["metadata"]["last_updated"] = datetime.now().isoformat()
        data["metadata"]["total_invoices"] = len(invoices)
+
+# ===============================================================================
+# CHATBOT CLASS
+# ===============================================================================
+
+class ChatBot:
+ """Chatbot for invoice queries"""
+
    def __init__(self, processor: InvoiceProcessor):
        """Store the processor used as the data source for all queries."""
        self.processor = processor
+
+ def query_database(self, query: str) -> str:
+ """Process user query and return response"""
+ try:
+ data = self.processor.load_json_data()
+ invoices = data.get("invoices", [])
+
+ if not invoices:
+ return "No invoice data found. Please upload some invoices first."
+
+ query_lower = query.lower()
+
+ # Handle different query types
+ if any(phrase in query_lower for phrase in ["summary", "overview", "total"]):
+ return self.generate_summary(data)
+
+ elif "count" in query_lower or "how many" in query_lower:
+ return self.handle_count_query(data)
+
+ elif any(phrase in query_lower for phrase in ["amount", "value", "money", "cost"]):
+ return self.handle_amount_query(data)
+
+ elif any(phrase in query_lower for phrase in ["supplier", "vendor", "company"]):
+ return self.handle_supplier_query(data, query)
+
+ elif self.processor.vector_store:
+ return self.handle_semantic_search(query)
+
+ else:
+ return self.handle_general_query(data, query)
+
+ except Exception as e:
+ return f"Error processing query: {e}"
+
+ def generate_summary(self, data: dict) -> str:
+ """Generate comprehensive summary"""
+ invoices = data.get("invoices", [])
+ summary = data.get("summary", {})
+
+ if not invoices:
+ return "No invoices found in the system."
+
+ total_amount = summary.get("total_amount", 0)
+ avg_amount = total_amount / len(invoices) if invoices else 0
+ unique_suppliers = len(summary.get("unique_suppliers", []))
+
+ response = f"""
+**๐ Invoice System Summary**
+
+โข **Total Invoices**: {len(invoices):,}
+โข **Total Value**: โน{total_amount:,.2f}
+โข **Average Invoice**: โน{avg_amount:,.2f}
+โข **Unique Suppliers**: {unique_suppliers}
+
+**๐ Processing Stats**
+โข **Successful**: {summary.get('processing_stats', {}).get('successful', 0)}
+โข **Failed**: {summary.get('processing_stats', {}).get('failed', 0)}
+
+**๐ Recent Invoices**
+"""
+
+ # Show recent invoices
+ recent = sorted(invoices, key=lambda x: x.get('timestamps', {}).get('created_at', ''), reverse=True)[:5]
+ for i, inv in enumerate(recent, 1):
+ response += f"\n{i}. **{inv.get('invoice_number', 'N/A')}** - {inv.get('supplier_name', 'Unknown')} (โน{inv.get('amount', 0):,.2f})"
+
+ return response
+
+ def handle_count_query(self, data: dict) -> str:
+ """Handle count-related queries"""
+ invoices = data.get("invoices", [])
+ total = len(invoices)
+ unique_numbers = len(set(inv.get('invoice_number', '') for inv in invoices if inv.get('invoice_number')))
+
+ return f"""
+**๐ Invoice Count Summary**
+
+โข **Total Records**: {total}
+โข **Unique Invoice Numbers**: {unique_numbers}
+โข **Duplicates**: {total - unique_numbers if total > unique_numbers else 0}
+
+**๐
Processing Timeline**
+โข **First Invoice**: {invoices[0].get('timestamps', {}).get('created_at', 'N/A')[:10] if invoices else 'N/A'}
+โข **Latest Invoice**: {invoices[-1].get('timestamps', {}).get('created_at', 'N/A')[:10] if invoices else 'N/A'}
+"""
+
+ def handle_amount_query(self, data: dict) -> str:
+ """Handle amount-related queries"""
+ invoices = data.get("invoices", [])
+ amounts = [inv.get('amount', 0) for inv in invoices if inv.get('amount', 0) > 0]
+
+ if not amounts:
+ return "No amount information found in invoices."
+
+ total_amount = sum(amounts)
+ avg_amount = total_amount / len(amounts)
+ max_amount = max(amounts)
+ min_amount = min(amounts)
+
+ # Find high-value invoices
+ high_value_threshold = sorted(amounts, reverse=True)[min(4, len(amounts)-1)] if len(amounts) > 5 else max_amount
+ high_value_invoices = [inv for inv in invoices if inv.get('amount', 0) >= high_value_threshold]
+
+ response = f"""
+**๐ฐ Financial Analysis**
+
+โข **Total Amount**: โน{total_amount:,.2f}
+โข **Average Amount**: โน{avg_amount:,.2f}
+โข **Highest Invoice**: โน{max_amount:,.2f}
+โข **Lowest Invoice**: โน{min_amount:,.2f}
+
+**๐ฏ High-Value Invoices (โน{high_value_threshold:,.2f}+)**
+"""
+
+ for i, inv in enumerate(high_value_invoices[:5], 1):
+ response += f"\n{i}. **{inv.get('invoice_number', 'N/A')}** - {inv.get('supplier_name', 'Unknown')} (โน{inv.get('amount', 0):,.2f})"
+
+ return response
+
+ def handle_supplier_query(self, data: dict, query: str) -> str:
+ """Handle supplier-related queries"""
+ invoices = data.get("invoices", [])
+
+ # Count invoices by supplier
+ supplier_counts = {}
+ supplier_amounts = {}
+
+ for inv in invoices:
+ supplier = inv.get('supplier_name', '').strip()
+ if supplier:
+ supplier_counts[supplier] = supplier_counts.get(supplier, 0) + 1
+ supplier_amounts[supplier] = supplier_amounts.get(supplier, 0) + inv.get('amount', 0)
+
+ if not supplier_counts:
+ return "No supplier information found in invoices."
+
+ # Sort suppliers by amount
+ top_suppliers = sorted(supplier_amounts.items(), key=lambda x: x[1], reverse=True)[:10]
+
+ response = f"""
+**๐ข Supplier Analysis**
+
+โข **Total Unique Suppliers**: {len(supplier_counts)}
+โข **Most Active**: {max(supplier_counts, key=supplier_counts.get)} ({supplier_counts[max(supplier_counts, key=supplier_counts.get)]} invoices)
+
+**๐ฐ Top Suppliers by Amount**
+"""
+
+ for i, (supplier, amount) in enumerate(top_suppliers, 1):
+ count = supplier_counts[supplier]
+ avg = amount / count if count > 0 else 0
+ response += f"\n{i}. **{supplier}** - โน{amount:,.2f} ({count} invoices, avg: โน{avg:,.2f})"
+
+ return response
+
+ def handle_semantic_search(self, query: str) -> str:
+ """Handle semantic search queries"""
+ try:
+ results = self.processor.vector_store.semantic_search(query, top_k=5)
+
+ if not results:
+ return f"No relevant results found for '{query}'. Try different keywords."
+
+ response = f"๐ **Semantic Search Results for '{query}'**\n\n"
+
+ for i, result in enumerate(results, 1):
+ response += f"{i}. **{result.invoice_number}** - {result.supplier_name}\n"
+ response += f" โข Similarity: {result.similarity_score:.3f}\n"
+ response += f" โข Amount: โน{result.metadata.get('amount', 0):,.2f}\n"
+ response += f" โข Preview: {result.content_preview[:100]}...\n\n"
+
+ return response
+
+ except Exception as e:
+ return f"Semantic search error: {e}"
+
+ def handle_general_query(self, data: dict, query: str) -> str:
+ """Handle general queries with keyword search"""
+ invoices = data.get("invoices", [])
+ query_words = query.lower().split()
+
+ # Simple keyword matching
+ matching_invoices = []
+ for inv in invoices:
+ text_to_search = (
+ inv.get('supplier_name', '') + ' ' +
+ inv.get('buyer_name', '') + ' ' +
+ inv.get('product_description', '') + ' ' +
+ inv.get('extraction_info', {}).get('raw_text_preview', '')
+ ).lower()
+
+ if any(word in text_to_search for word in query_words):
+ matching_invoices.append(inv)
+
+ if not matching_invoices:
+ return f"No invoices found matching '{query}'. Try different keywords or check the summary."
+
+ response = f"๐ **Found {len(matching_invoices)} invoices matching '{query}'**\n\n"
+
+ for i, inv in enumerate(matching_invoices[:5], 1):
+ response += f"{i}. **{inv.get('invoice_number', 'N/A')}** - {inv.get('supplier_name', 'Unknown')}\n"
+ response += f" โข Amount: โน{inv.get('amount', 0):,.2f}\n"
+ response += f" โข Date: {inv.get('date', 'N/A')}\n\n"
+
+ if len(matching_invoices) > 5:
+ response += f"... and {len(matching_invoices) - 5} more results."
+
+ return response
+
+# ===============================================================================
+# STREAMLIT APPLICATION
+# ===============================================================================
+
def create_app():
    """Main Streamlit application.

    Renders the full UI: header, sidebar status panel, and four tab sections
    (Upload & Process, AI Chat, Analytics, Data Explorer), plus a global
    quick-chat input and footer. Relies on st.session_state to persist the
    processor, chatbot, chat history, and per-session upload state.

    NOTE(review): many markdown/string literals below are corrupted — HTML
    tags were stripped and emoji bytes were mis-decoded, in places splitting
    a literal across lines. Restore the original UTF-8/HTML content.
    """

    # Generate unique session ID for this run; used to namespace every
    # widget key so reruns and multiple sessions don't collide.
    if 'session_id' not in st.session_state:
        st.session_state.session_id = str(uuid.uuid4())[:8]

    session_id = st.session_state.session_id

    # Custom CSS (content lost to corruption — was presumably a <style> block).
    st.markdown("""
    """, unsafe_allow_html=True)

    # Header (corrupted: literal split across lines — was presumably an <h1>).
    st.markdown('๐ AI Invoice Processing System
', unsafe_allow_html=True)
    st.markdown("""

AI-Powered Document Processing โข Semantic Search โข Smart Analytics โข Hugging Face Spaces

    """, unsafe_allow_html=True)

    # Initialize processor once per session; st.stop() halts rendering on failure.
    if 'processor' not in st.session_state:
        with st.spinner("๐ง Initializing AI Invoice Processor..."):
            try:
                st.session_state.processor = InvoiceProcessor()
                st.session_state.chatbot = ChatBot(st.session_state.processor)
                st.session_state.chat_history = []
                st.success("โ
System initialized successfully!")
            except Exception as e:
                st.error(f"โ Initialization failed: {e}")
                st.stop()

    # Sidebar: component status, quick stats, and limits.
    with st.sidebar:
        st.header("๐๏ธ System Status")

        processor = st.session_state.processor

        # Component status — each capability degrades gracefully when missing.
        if processor.document_processor.processors:
            st.markdown('โ
Document Processing', unsafe_allow_html=True)
        else:
            st.markdown('โ Document Processing', unsafe_allow_html=True)

        if processor.ai_extractor.use_transformers:
            st.markdown('โ
AI Extraction', unsafe_allow_html=True)
        else:
            st.markdown('โ ๏ธ Regex Extraction', unsafe_allow_html=True)

        if processor.vector_store and processor.vector_store.embedding_model:
            st.markdown('โ
Semantic Search', unsafe_allow_html=True)
        else:
            st.markdown('โ ๏ธ Keyword Search Only', unsafe_allow_html=True)

        # Quick stats from the JSON store.
        st.header("๐ Quick Stats")
        try:
            data = processor.load_json_data()
            total_invoices = len(data.get("invoices", []))
            total_amount = data.get("summary", {}).get("total_amount", 0)

            st.metric("Total Invoices", total_invoices)
            st.metric("Total Value", f"โน{total_amount:,.2f}")
            st.metric("Success Rate", f"{processor.processing_stats['successful']}/{processor.processing_stats['total_processed']}")

        except Exception as e:
            st.error(f"Stats error: {e}")

        # System info / limits shown to the user.
        st.header("โ๏ธ System Info")
        st.info(f"""
    **Session ID:** {session_id}

    **Limits:**
    โข Max file size: 10MB
    โข Max concurrent files: 3
    โข Timeout: 30s
    """)

    # Main navigation — horizontal radio acting as a tab bar.
    selected_tab = st.radio(
        "Choose a section:",
        ["๐ค Upload & Process", "๐ฌ AI Chat", "๐ Analytics", "๐ Data Explorer"],
        horizontal=True,
        key=f"main_navigation_{session_id}"
    )

    # -------------------------------------------------------------------------
    # UPLOAD & PROCESS SECTION
    # -------------------------------------------------------------------------

    if selected_tab == "๐ค Upload & Process":
        st.header("๐ค Upload Invoice Documents")

        # Feature highlights (markdown content corrupted — was HTML cards).
        col1, col2, col3 = st.columns(3)

        with col1:
            st.markdown("""

๐ค AI Extraction

Advanced NLP models extract structured data automatically

            """, unsafe_allow_html=True)

        with col2:
            st.markdown("""

๐ Smart Search

Semantic search finds invoices using natural language

            """, unsafe_allow_html=True)

        with col3:
            st.markdown("""

๐ Analytics

Comprehensive insights and visualizations

            """, unsafe_allow_html=True)

        # File upload
        st.markdown("### ๐ Upload Your Invoices")

        # Initialize per-session upload state if not present. The four keys
        # track: the file list, completion flag, in-progress flag, and the
        # set of (name, size) hashes already processed.
        if f'uploaded_files_{session_id}' not in st.session_state:
            st.session_state[f'uploaded_files_{session_id}'] = None
        if f'processing_complete_{session_id}' not in st.session_state:
            st.session_state[f'processing_complete_{session_id}'] = False
        if f'currently_processing_{session_id}' not in st.session_state:
            st.session_state[f'currently_processing_{session_id}'] = False
        if f'processed_file_hashes_{session_id}' not in st.session_state:
            st.session_state[f'processed_file_hashes_{session_id}'] = set()

        # File uploader with a stable key so reruns keep the selection.
        uploaded_files = st.file_uploader(
            "Choose invoice files (PDF, TXT supported)",
            type=['pdf', 'txt'],
            accept_multiple_files=True,
            help="Maximum file size: 10MB per file",
            key=f"file_uploader_stable_{session_id}"
        )

        # Store uploaded files in session state only if they're new.
        if uploaded_files:
            # Hash by (name, size) to cheaply detect selection changes.
            # NOTE(review): name+size collisions are possible — a content
            # hash would be safer; also hash() is salted per process.
            current_file_hashes = set()
            for file in uploaded_files:
                file_hash = hash((file.name, file.size))
                current_file_hashes.add(file_hash)

            # Reset the processing flags when the selection changed.
            stored_hashes = st.session_state.get(f'uploaded_file_hashes_{session_id}', set())
            if current_file_hashes != stored_hashes:
                st.session_state[f'uploaded_files_{session_id}'] = uploaded_files
                st.session_state[f'uploaded_file_hashes_{session_id}'] = current_file_hashes
                st.session_state[f'processing_complete_{session_id}'] = False
                st.session_state[f'currently_processing_{session_id}'] = False
                st.info("๐ New files detected - ready for processing")

        # Read the current upload/processing state back out of session state.
        current_files = st.session_state[f'uploaded_files_{session_id}']
        is_processing = st.session_state[f'currently_processing_{session_id}']
        is_complete = st.session_state[f'processing_complete_{session_id}']

        if current_files:
            # Enforce the concurrent-files limit advertised in the sidebar.
            max_files = 3
            if len(current_files) > max_files:
                st.warning(f"โ ๏ธ Too many files selected. Processing first {max_files} files.")
                current_files = current_files[:max_files]

            st.info(f"๐ {len(current_files)} files selected")

            # Show file names with a processed/pending icon.
            st.markdown("**Selected Files:**")
            for i, file in enumerate(current_files, 1):
                file_size_mb = len(file.getvalue()) / (1024 * 1024)
                file_hash = hash((file.name, file.size))
                processed_icon = "โ
" if file_hash in st.session_state[f'processed_file_hashes_{session_id}'] else "๐"
                st.write(f"{processed_icon} {i}. {file.name} ({file_size_mb:.2f} MB)")

            # Process button — three-state machine: idle → processing → complete.
            col1, col2 = st.columns([1, 1])

            with col1:
                if not is_processing and not is_complete:
                    # Idle: arm the processing flag, then rerun so the
                    # "processing" branch below actually does the work.
                    if st.button("๐ Process Files", type="primary", key=f"process_btn_{session_id}"):
                        st.session_state[f'currently_processing_{session_id}'] = True
                        st.rerun()
                elif is_processing:
                    st.info("๐ Processing in progress...")
                    # Actually process the files here (sets flags and reruns).
                    process_files_once(current_files, session_id)
                elif is_complete:
                    st.success("โ
Processing completed!")
                    if st.button("๐ Process Again", key=f"reprocess_btn_{session_id}"):
                        # Clear completion + processed hashes so the same
                        # files can be processed again.
                        st.session_state[f'processing_complete_{session_id}'] = False
                        st.session_state[f'currently_processing_{session_id}'] = False
                        st.session_state[f'processed_file_hashes_{session_id}'] = set()
                        st.rerun()

            with col2:
                if st.button("๐๏ธ Clear Files", key=f"clear_files_{session_id}"):
                    # Reset all per-session upload state.
                    st.session_state[f'uploaded_files_{session_id}'] = None
                    st.session_state[f'uploaded_file_hashes_{session_id}'] = set()
                    st.session_state[f'processing_complete_{session_id}'] = False
                    st.session_state[f'currently_processing_{session_id}'] = False
                    st.session_state[f'processed_file_hashes_{session_id}'] = set()
                    st.rerun()

        else:
            st.info("๐ Please select invoice files to upload and process")

        # Show processing results if completed.
        if is_complete:
            st.markdown("### ๐ Recent Processing Results")
            try:
                data = st.session_state.processor.load_json_data()
                recent_invoices = sorted(
                    data.get("invoices", []),
                    key=lambda x: x.get('timestamps', {}).get('created_at', ''),
                    reverse=True
                )[:5]

                if recent_invoices:
                    for i, inv in enumerate(recent_invoices, 1):
                        with st.expander(f"๐ {inv.get('invoice_number', f'Invoice {i}')} - {inv.get('supplier_name', 'Unknown')}", expanded=False):
                            col1, col2 = st.columns(2)
                            with col1:
                                st.write(f"**Invoice #:** {inv.get('invoice_number', 'N/A')}")
                                st.write(f"**Supplier:** {inv.get('supplier_name', 'N/A')}")
                                st.write(f"**Amount:** โน{inv.get('amount', 0):.2f}")
                            with col2:
                                st.write(f"**Date:** {inv.get('date', 'N/A')}")
                                st.write(f"**Method:** {inv.get('extraction_info', {}).get('method', 'N/A')}")
                                st.write(f"**Confidence:** {inv.get('extraction_info', {}).get('confidence', 0):.1%}")
                else:
                    st.info("No recent processing results found.")
            except Exception as e:
                st.error(f"Error loading recent results: {e}")

    # -------------------------------------------------------------------------
    # AI CHAT SECTION
    # -------------------------------------------------------------------------

    elif selected_tab == "๐ฌ AI Chat":
        st.header("๐ฌ AI Chat Interface")

        # Display chat history accumulated by handle_chat_query().
        if st.session_state.chat_history:
            st.markdown("### ๐ฌ Chat History")
            for i, message in enumerate(st.session_state.chat_history):
                with st.chat_message(message["role"]):
                    st.markdown(message["content"])

        # Chat input
        st.markdown("### โ๏ธ Ask a Question")

        col1, col2 = st.columns([4, 1])

        with col1:
            user_input = st.text_input(
                "Type your question:",
                placeholder="e.g., 'show me total spending'",
                key=f"chat_input_{session_id}"
            )

        with col2:
            ask_btn = st.button("๐ Ask", type="primary", key=f"ask_btn_{session_id}")

        if ask_btn and user_input:
            handle_chat_query(user_input)

        # Suggested queries — only shown before the first exchange.
        if not st.session_state.chat_history:
            st.markdown("### ๐ก Try These Queries")

            col1, col2 = st.columns(2)

            with col1:
                st.markdown("**๐ Basic Queries:**")
                basic_queries = [
                    "Show me a summary of all invoices",
                    "How much have we spent in total?",
                    "Who are our top suppliers?",
                    "Find invoices with high amounts"
                ]
                for i, query in enumerate(basic_queries):
                    if st.button(query, key=f"basic_{session_id}_{i}"):
                        handle_chat_query(query)

            with col2:
                st.markdown("**๐ Advanced Queries:**")
                advanced_queries = [
                    "Find technology purchases",
                    "Show office supplies",
                    "Search consulting services",
                    "Recent high-value invoices"
                ]
                for i, query in enumerate(advanced_queries):
                    if st.button(query, key=f"advanced_{session_id}_{i}"):
                        handle_chat_query(query)

        # Clear chat
        if st.session_state.chat_history:
            if st.button("๐๏ธ Clear Chat", key=f"clear_chat_{session_id}"):
                st.session_state.chat_history = []
                st.rerun()

    # -------------------------------------------------------------------------
    # ANALYTICS SECTION
    # -------------------------------------------------------------------------

    elif selected_tab == "๐ Analytics":
        st.header("๐ Analytics Dashboard")

        try:
            data = st.session_state.processor.load_json_data()
            invoices = data.get("invoices", [])

            if not invoices:
                st.info("๐ No data available. Upload some invoices to see analytics.")
                # Early exit — skips the global chat/footer below for this run.
                return

            # Flatten invoice records into a DataFrame for metrics/plots.
            df_data = []
            for inv in invoices:
                df_data.append({
                    'invoice_number': inv.get('invoice_number', ''),
                    'supplier_name': inv.get('supplier_name', ''),
                    'amount': inv.get('amount', 0),
                    'date': inv.get('date', ''),
                    'confidence': inv.get('extraction_info', {}).get('confidence', 0)
                })

            df = pd.DataFrame(df_data)

            # Key metrics
            col1, col2, col3, col4 = st.columns(4)

            with col1:
                st.metric("Total Invoices", len(df))
            with col2:
                st.metric("Total Amount", f"โน{df['amount'].sum():,.2f}")
            with col3:
                st.metric("Avg Amount", f"โน{df['amount'].mean():,.2f}")
            with col4:
                st.metric("Unique Suppliers", df['supplier_name'].nunique())

            # Visualizations
            if len(df) > 0:
                # Amount distribution histogram.
                fig_hist = px.histogram(
                    df,
                    x='amount',
                    title="Invoice Amount Distribution",
                    labels={'amount': 'Amount (โน)', 'count': 'Number of Invoices'}
                )
                st.plotly_chart(fig_hist, use_container_width=True)

                # Top suppliers by total amount, horizontal bar chart.
                if df['supplier_name'].notna().any():
                    supplier_amounts = df.groupby('supplier_name')['amount'].sum().sort_values(ascending=False).head(10)

                    if len(supplier_amounts) > 0:
                        fig_suppliers = px.bar(
                            x=supplier_amounts.values,
                            y=supplier_amounts.index,
                            orientation='h',
                            title="Top 10 Suppliers by Total Amount",
                            labels={'x': 'Total Amount (โน)', 'y': 'Supplier'}
                        )
                        st.plotly_chart(fig_suppliers, use_container_width=True)

        except Exception as e:
            st.error(f"Analytics error: {e}")

    # -------------------------------------------------------------------------
    # DATA EXPLORER SECTION
    # -------------------------------------------------------------------------

    elif selected_tab == "๐ Data Explorer":
        st.header("๐ Data Explorer")

        try:
            data = st.session_state.processor.load_json_data()
            invoices = data.get("invoices", [])

            if not invoices:
                st.info("๐ No data available. Upload some invoices first.")
                # Early exit — skips the global chat/footer below for this run.
                return

            # Flatten records into display-ready columns.
            df_data = []
            for inv in invoices:
                df_data.append({
                    'Invoice Number': inv.get('invoice_number', ''),
                    'Supplier': inv.get('supplier_name', ''),
                    'Buyer': inv.get('buyer_name', ''),
                    'Amount': inv.get('amount', 0),
                    'Date': inv.get('date', ''),
                    'Confidence': inv.get('extraction_info', {}).get('confidence', 0),
                    'Method': inv.get('extraction_info', {}).get('method', ''),
                    'File': inv.get('file_info', {}).get('file_name', ''),
                    'Created': inv.get('timestamps', {}).get('created_at', '')[:19]
                })

            df = pd.DataFrame(df_data)

            # Filters
            col1, col2, col3 = st.columns(3)

            with col1:
                suppliers = ['All'] + sorted(df['Supplier'].dropna().unique().tolist())
                selected_supplier = st.selectbox("Filter by Supplier", suppliers, key=f"supplier_filter_{session_id}")

            with col2:
                methods = ['All'] + sorted(df['Method'].dropna().unique().tolist())
                selected_method = st.selectbox("Filter by Method", methods, key=f"method_filter_{session_id}")

            with col3:
                min_amount = st.number_input("Min Amount", min_value=0.0, value=0.0, key=f"amount_filter_{session_id}")

            # Apply filters (all combine with AND semantics).
            filtered_df = df.copy()
            if selected_supplier != 'All':
                filtered_df = filtered_df[filtered_df['Supplier'] == selected_supplier]
            if selected_method != 'All':
                filtered_df = filtered_df[filtered_df['Method'] == selected_method]
            if min_amount > 0:
                filtered_df = filtered_df[filtered_df['Amount'] >= min_amount]

            # Display data
            st.dataframe(
                filtered_df,
                use_container_width=True,
                column_config={
                    "Amount": st.column_config.NumberColumn("Amount", format="โน%.2f"),
                    "Confidence": st.column_config.ProgressColumn("Confidence", min_value=0, max_value=1)
                }
            )

            # Export options
            col1, col2 = st.columns(2)

            with col1:
                if st.button("๐ฅ Export CSV", key=f"export_csv_{session_id}"):
                    csv_data = filtered_df.to_csv(index=False)
                    st.download_button(
                        "Download CSV",
                        csv_data,
                        f"invoices_{datetime.now().strftime('%Y%m%d_%H%M')}.csv",
                        "text/csv",
                        key=f"download_csv_{session_id}"
                    )

            with col2:
                if st.button("๐ Export JSON", key=f"export_json_{session_id}"):
                    # Re-select the full records matching the filtered view.
                    filtered_invoices = [inv for inv in invoices
                                         if inv.get('invoice_number') in filtered_df['Invoice Number'].values]

                    export_data = {
                        "exported_at": datetime.now().isoformat(),
                        "total_records": len(filtered_invoices),
                        "invoices": filtered_invoices
                    }

                    st.download_button(
                        "Download JSON",
                        json.dumps(export_data, indent=2),
                        f"invoices_{datetime.now().strftime('%Y%m%d_%H%M')}.json",
                        "application/json",
                        key=f"download_json_{session_id}"
                    )

        except Exception as e:
            st.error(f"Data explorer error: {e}")

    # -------------------------------------------------------------------------
    # GLOBAL CHAT INPUT
    # -------------------------------------------------------------------------

    st.markdown("---")
    st.markdown("### ๐ฌ Quick Chat (Works from any section)")

    global_query = st.chat_input("Ask about your invoices...", key=f"global_chat_{session_id}")

    if global_query:
        handle_chat_query(global_query, show_response=True)

    # Footer
    st.markdown("---")
    st.markdown("""

๐ AI Invoice Processing System - Optimized for Hugging Face Spaces

Built with โค๏ธ using Streamlit, Transformers, and AI

    """, unsafe_allow_html=True)
+
+# ===============================================================================
+# HELPER FUNCTIONS
+# ===============================================================================
+
def process_files_once(uploaded_files, session_id):
    """Process uploaded files only once with proper state management.

    Skips files whose (name, size) hash has already been processed this
    session, processes the rest through the InvoiceProcessor, renders
    per-file progress/results, then flips the session flags from
    "processing" to "complete" and reruns the app.

    Args:
        uploaded_files: List of Streamlit UploadedFile objects.
        session_id: Short session identifier used to namespace state keys.
    """
    if not uploaded_files:
        st.error("No files to process!")
        st.session_state[f'currently_processing_{session_id}'] = False
        return

    st.markdown("### ๐ Processing Files...")

    # Get already processed file hashes for this session.
    processed_hashes = st.session_state[f'processed_file_hashes_{session_id}']

    # Filter out already processed files (hash by name+size; see caller).
    files_to_process = []
    for file in uploaded_files:
        file_hash = hash((file.name, file.size))
        if file_hash not in processed_hashes:
            files_to_process.append((file, file_hash))

    if not files_to_process:
        # Nothing new: mark complete so the UI shows the finished state.
        st.info("โ
All files have already been processed!")
        st.session_state[f'currently_processing_{session_id}'] = False
        st.session_state[f'processing_complete_{session_id}'] = True
        return

    # Containers let us update progress/status/results independently in place.
    progress_container = st.container()
    status_container = st.container()
    results_container = st.container()

    successful = 0
    failed = 0

    # Show progress
    with progress_container:
        progress_bar = st.progress(0)
        progress_text = st.empty()

    with status_container:
        st.info(f"Starting to process {len(files_to_process)} new files...")

    # Process each file only once.
    for i, (uploaded_file, file_hash) in enumerate(files_to_process):
        current_progress = (i + 1) / len(files_to_process)

        with progress_container:
            progress_bar.progress(current_progress)
            progress_text.text(f"Processing file {i+1}/{len(files_to_process)}: {uploaded_file.name}")

        with status_container:
            st.info(f"๐ Processing: {uploaded_file.name} ({len(uploaded_file.getvalue())/1024:.1f} KB)")

        try:
            # Process the file through the full extract-and-save pipeline.
            result = st.session_state.processor.process_uploaded_file(uploaded_file)

            # Mark file as processed regardless of result, so a failing file
            # is not retried in an endless loop.
            processed_hashes.add(file_hash)

            # Show result immediately.
            with results_container:
                # Success requires a result object with a non-empty invoice_number.
                if result and hasattr(result, 'invoice_number') and result.invoice_number:
                    successful += 1
                    st.success(f"โ
Successfully processed: {uploaded_file.name}")

                    # Show extracted data
                    col1, col2, col3 = st.columns(3)
                    with col1:
                        st.write(f"**Invoice #:** {result.invoice_number}")
                        st.write(f"**Supplier:** {result.supplier_name or 'Not found'}")
                    with col2:
                        st.write(f"**Amount:** โน{result.amount:.2f}")
                        st.write(f"**Date:** {result.date or 'Not found'}")
                    with col3:
                        st.write(f"**Method:** {result.processing_method}")
                        st.write(f"**Confidence:** {result.extraction_confidence:.1%}")

                    st.markdown("---")
                else:
                    failed += 1
                    st.warning(f"โ ๏ธ Could not extract complete data from: {uploaded_file.name}")
                    if result:
                        st.write(f"Partial data: {result.supplier_name}, โน{result.amount}")
                    st.markdown("---")

        except Exception as e:
            failed += 1
            # Still mark as processed to avoid reprocessing.
            processed_hashes.add(file_hash)

            with results_container:
                st.error(f"โ Error processing {uploaded_file.name}: {str(e)}")
                st.markdown("---")

    # Update session state with the hashes accumulated above.
    st.session_state[f'processed_file_hashes_{session_id}'] = processed_hashes

    # Final summary
    with progress_container:
        progress_bar.progress(1.0)
        progress_text.text("โ
Processing completed!")

    with status_container:
        if successful > 0:
            st.success(f"๐ Processing complete! {successful} successful, {failed} failed")
            if successful > 0:
                st.balloons()
        else:
            st.error(f"โ Processing failed for all {failed} files. Please check file formats and content.")

    # Update processing state: leave "processing", enter "complete".
    st.session_state[f'currently_processing_{session_id}'] = False
    st.session_state[f'processing_complete_{session_id}'] = True

    # Force rerun to update UI
    st.rerun()
+
def process_files(uploaded_files, session_id):
    """Legacy function - redirect to process_files_once"""
    # Kept only for backward compatibility with older call sites;
    # all work happens in process_files_once.
    result = process_files_once(uploaded_files, session_id)
    return result
+
def handle_chat_query(query, show_response=False):
    """Handle chat query.

    Appends the user message to session chat history, asks the chatbot for
    an answer, records the reply, optionally renders it inline, then reruns
    the Streamlit script so the chat view refreshes.

    Args:
        query: The user's natural-language question.
        show_response: When True, render the assistant reply immediately
            (used by the global quick-chat input outside the chat tab).
    """
    st.session_state.chat_history.append({
        "role": "user",
        "content": query,
        "timestamp": datetime.now()
    })

    try:
        with st.spinner("๐ค AI is analyzing..."):
            response = st.session_state.chatbot.query_database(query)

        st.session_state.chat_history.append({
            "role": "assistant",
            "content": response,
            "timestamp": datetime.now()
        })

        if show_response:
            with st.chat_message("assistant"):
                st.markdown(response)
            st.info("๐ก Switch to the 'AI Chat' section to see full conversation history!")

        # Rerun so the chat-history pane picks up the new messages.
        st.rerun()

    except Exception as e:
        st.error(f"Chat error: {e}")
+
+# ===============================================================================
+# MAIN ENTRY POINT
+# ===============================================================================
+
def main():
    """Main entry point for Hugging Face Spaces"""
    try:
        if IS_HF_SPACE:
            # Let the user know the Spaces environment was detected.
            st.sidebar.info("๐ค Running on Hugging Face Spaces")
        create_app()
    except Exception as err:
        # Render a friendly error panel instead of a bare traceback.
        st.error(f"""
    ## ๐จ Application Error
    
    {err}
    
    Please refresh the page or check the logs for more details.
    """)
+
# Script entry point. Fixed from the corrupted merge leftover
# `main(),` + stray `]`: the trailing comma wrapped the call in a tuple
# expression and the orphan bracket was a syntax error.
if __name__ == "__main__":
    main()
+
+ amounts_found = []
+ for i, pattern in enumerate(amount_patterns):
+ matches = re.findall(pattern, text.lower(), re.IGNORECASE | re.MULTILINE)
+ if matches:
+ st.write(f"Pattern {i+1}: {matches}")
+ for match in matches:
+ try:
+ amount_val = float(match.replace(',', ''))
+ amounts_found.append(amount_val)
+ except:
+ pass
+
+ if amounts_found:
+ st.success(f"โ
Found amounts: {amounts_found}")
+ else:
+ st.warning("โ ๏ธ No amounts detected in text")
+
+ # Debug invoice number detection
+ st.markdown("**๐ Invoice Number Detection Debug:**")
+ inv_patterns = [
+ r'invoice\s*(?:no|number|#)?\s*:?\s*([A-Z0-9\-_/]+)',
+ r'#\s*([A-Z0-9\-_/]{3,})',
+ ]
+
+ for i, pattern in enumerate(inv_patterns):
+ matches = re.findall(pattern, text.lower(), re.IGNORECASE)
+ if matches:
+ st.write(f"Invoice Pattern {i+1}: {matches}")
+
+ # Show full text for manual inspection
+ if st.checkbox("Show Full Extracted Text", key=f"debug_full_text_{uploaded_file.name}"):
+ st.text_area("Full Text:", value=text, height=300, disabled=True)
+
+ # Extract invoice data
+ st.info("๐ค Extracting invoice data using AI/Regex...")
+ invoice_data = self.ai_extractor.extract_with_ai(text)
+ invoice_data.file_path = uploaded_file.name
+
+ # Show extraction results
+ st.info(f"๐ Extraction completed with {invoice_data.extraction_confidence:.1%} confidence")
+
+ # Save to storage
+ st.info("๐พ Saving extracted data...")
+ self.save_invoice_data(invoice_data, text, file_size)
+
+ self.processing_stats['successful'] += 1
+ st.success(f"โ
Successfully processed {uploaded_file.name}")
+
+ return invoice_data
+
+ finally:
+ # Cleanup
+ try:
+ os.unlink(tmp_file_path)
+ st.info("๐งน Cleaned up temporary file")
+ except:
+ pass
+
+ except Exception as e:
+ error_msg = f"Error processing {uploaded_file.name}: {str(e)}"
+ st.error(error_msg)
+ self.processing_stats['failed'] += 1
+
+ # Show detailed error for debugging
+ with st.expander("๐ Error Details", expanded=False):
+ st.code(str(e))
+ import traceback
+ st.code(traceback.format_exc())
+
+ return InvoiceData()
+
    def save_invoice_data(self, invoice_data: InvoiceData, raw_text: str, file_size: int):
        """Save invoice data to JSON and vector store.

        Builds a serializable record from the extracted fields, appends it to
        the JSON store, refreshes summary statistics, and (when available)
        indexes the document in the vector store.

        Args:
            invoice_data: Structured fields extracted from the document.
            raw_text: Full extracted text; only a 300-char preview is stored.
            file_size: Size of the uploaded file in bytes.
        """
        try:
            # Load existing data
            data = self.load_json_data()

            # Create invoice record
            invoice_record = {
                # NOTE(review): sequential id assumes records are never
                # deleted — ids would collide after a deletion.
                "id": len(data["invoices"]) + 1,
                "invoice_number": invoice_data.invoice_number,
                "supplier_name": invoice_data.supplier_name,
                "buyer_name": invoice_data.buyer_name,
                "date": invoice_data.date,
                "amount": invoice_data.amount,
                "quantity": invoice_data.quantity,
                "product_description": invoice_data.product_description,
                "file_info": {
                    "file_name": invoice_data.file_path,
                    "file_size": file_size
                },
                "extraction_info": {
                    "confidence": invoice_data.extraction_confidence,
                    "method": invoice_data.processing_method,
                    # Preview only — keeps the JSON store compact.
                    "raw_text_preview": raw_text[:300]
                },
                "timestamps": {
                    "created_at": datetime.now().isoformat()
                }
            }

            # Add to invoices
            data["invoices"].append(invoice_record)

            # Update summary
            self.update_summary(data)

            # Save JSON
            self.save_json_data(data)

            # Add to vector store (optional component; skipped when absent).
            if self.vector_store:
                self.vector_store.add_document(invoice_record, raw_text)
                self.vector_store.save_vector_store()

        except Exception as e:
            # Report but don't raise — the caller already counts failures.
            st.error(f"Error saving invoice data: {e}")
+
    def update_summary(self, data: dict):
        """Update summary statistics.

        Recomputes the grand total, the unique-supplier list, and the
        processing-stat snapshot, then refreshes metadata timestamps.
        Mutates ``data`` in place.
        """
        invoices = data["invoices"]

        total_amount = sum(inv.get("amount", 0) for inv in invoices)
        # Set-based dedup: resulting list order is unspecified.
        unique_suppliers = list(set(inv.get("supplier_name", "") for inv in invoices if inv.get("supplier_name")))

        data["summary"] = {
            "total_amount": total_amount,
            "unique_suppliers": unique_suppliers,
            "processing_stats": {
                "successful": self.processing_stats['successful'],
                "failed": self.processing_stats['failed'],
                "total_processed": self.processing_stats['total_processed']
            }
        }

        # Bookkeeping so consumers can tell how fresh the store is.
        data["metadata"]["last_updated"] = datetime.now().isoformat()
        data["metadata"]["total_invoices"] = len(invoices)
+
+# ===============================================================================
+# CHATBOT CLASS
+# ===============================================================================
+
+class ChatBot:
+ """Chatbot for invoice queries"""
+
    def __init__(self, processor: InvoiceProcessor):
        # The processor owns the JSON invoice store and (optionally) the
        # vector store used for semantic search.
        self.processor = processor
+
+ def query_database(self, query: str) -> str:
+ """Process user query and return response"""
+ try:
+ data = self.processor.load_json_data()
+ invoices = data.get("invoices", [])
+
+ if not invoices:
+ return "No invoice data found. Please upload some invoices first."
+
+ query_lower = query.lower()
+
+ # Handle different query types
+ if any(phrase in query_lower for phrase in ["summary", "overview", "total"]):
+ return self.generate_summary(data)
+
+ elif "count" in query_lower or "how many" in query_lower:
+ return self.handle_count_query(data)
+
+ elif any(phrase in query_lower for phrase in ["amount", "value", "money", "cost"]):
+ return self.handle_amount_query(data)
+
+ elif any(phrase in query_lower for phrase in ["supplier", "vendor", "company"]):
+ return self.handle_supplier_query(data, query)
+
+ elif self.processor.vector_store:
+ return self.handle_semantic_search(query)
+
+ else:
+ return self.handle_general_query(data, query)
+
+ except Exception as e:
+ return f"Error processing query: {e}"
+
+ def generate_summary(self, data: dict) -> str:
+ """Generate comprehensive summary"""
+ invoices = data.get("invoices", [])
+ summary = data.get("summary", {})
+
+ if not invoices:
+ return "No invoices found in the system."
+
+ total_amount = summary.get("total_amount", 0)
+ avg_amount = total_amount / len(invoices) if invoices else 0
+ unique_suppliers = len(summary.get("unique_suppliers", []))
+
+ response = f"""
+**๐ Invoice System Summary**
+
+โข **Total Invoices**: {len(invoices):,}
+โข **Total Value**: โน{total_amount:,.2f}
+โข **Average Invoice**: โน{avg_amount:,.2f}
+โข **Unique Suppliers**: {unique_suppliers}
+
+**๐ Processing Stats**
+โข **Successful**: {summary.get('processing_stats', {}).get('successful', 0)}
+โข **Failed**: {summary.get('processing_stats', {}).get('failed', 0)}
+
+**๐ Recent Invoices**
+"""
+
+ # Show recent invoices
+ recent = sorted(invoices, key=lambda x: x.get('timestamps', {}).get('created_at', ''), reverse=True)[:5]
+ for i, inv in enumerate(recent, 1):
+ response += f"\n{i}. **{inv.get('invoice_number', 'N/A')}** - {inv.get('supplier_name', 'Unknown')} (โน{inv.get('amount', 0):,.2f})"
+
+ return response
+
+ def handle_count_query(self, data: dict) -> str:
+ """Handle count-related queries"""
+ invoices = data.get("invoices", [])
+ total = len(invoices)
+ unique_numbers = len(set(inv.get('invoice_number', '') for inv in invoices if inv.get('invoice_number')))
+
+ return f"""
+**๐ Invoice Count Summary**
+
+โข **Total Records**: {total}
+โข **Unique Invoice Numbers**: {unique_numbers}
+โข **Duplicates**: {total - unique_numbers if total > unique_numbers else 0}
+
+**๐
Processing Timeline**
+โข **First Invoice**: {invoices[0].get('timestamps', {}).get('created_at', 'N/A')[:10] if invoices else 'N/A'}
+โข **Latest Invoice**: {invoices[-1].get('timestamps', {}).get('created_at', 'N/A')[:10] if invoices else 'N/A'}
+"""
+
+ def handle_amount_query(self, data: dict) -> str:
+ """Handle amount-related queries"""
+ invoices = data.get("invoices", [])
+ amounts = [inv.get('amount', 0) for inv in invoices if inv.get('amount', 0) > 0]
+
+ if not amounts:
+ return "No amount information found in invoices."
+
+ total_amount = sum(amounts)
+ avg_amount = total_amount / len(amounts)
+ max_amount = max(amounts)
+ min_amount = min(amounts)
+
+ # Find high-value invoices
+ high_value_threshold = sorted(amounts, reverse=True)[min(4, len(amounts)-1)] if len(amounts) > 5 else max_amount
+ high_value_invoices = [inv for inv in invoices if inv.get('amount', 0) >= high_value_threshold]
+
+ response = f"""
+**๐ฐ Financial Analysis**
+
+โข **Total Amount**: โน{total_amount:,.2f}
+โข **Average Amount**: โน{avg_amount:,.2f}
+โข **Highest Invoice**: โน{max_amount:,.2f}
+โข **Lowest Invoice**: โน{min_amount:,.2f}
+
+**๐ฏ High-Value Invoices (โน{high_value_threshold:,.2f}+)**
+"""
+
+ for i, inv in enumerate(high_value_invoices[:5], 1):
+ response += f"\n{i}. **{inv.get('invoice_number', 'N/A')}** - {inv.get('supplier_name', 'Unknown')} (โน{inv.get('amount', 0):,.2f})"
+
+ return response
+
+ def handle_supplier_query(self, data: dict, query: str) -> str:
+ """Handle supplier-related queries"""
+ invoices = data.get("invoices", [])
+
+ # Count invoices by supplier
+ supplier_counts = {}
+ supplier_amounts = {}
+
+ for inv in invoices:
+ supplier = inv.get('supplier_name', '').strip()
+ if supplier:
+ supplier_counts[supplier] = supplier_counts.get(supplier, 0) + 1
+ supplier_amounts[supplier] = supplier_amounts.get(supplier, 0) + inv.get('amount', 0)
+
+ if not supplier_counts:
+ return "No supplier information found in invoices."
+
+ # Sort suppliers by amount
+ top_suppliers = sorted(supplier_amounts.items(), key=lambda x: x[1], reverse=True)[:10]
+
+ response = f"""
+**๐ข Supplier Analysis**
+
+โข **Total Unique Suppliers**: {len(supplier_counts)}
+โข **Most Active**: {max(supplier_counts, key=supplier_counts.get)} ({supplier_counts[max(supplier_counts, key=supplier_counts.get)]} invoices)
+
+**๐ฐ Top Suppliers by Amount**
+"""
+
+ for i, (supplier, amount) in enumerate(top_suppliers, 1):
+ count = supplier_counts[supplier]
+ avg = amount / count if count > 0 else 0
+ response += f"\n{i}. **{supplier}** - โน{amount:,.2f} ({count} invoices, avg: โน{avg:,.2f})"
+
+ return response
+
+ def handle_semantic_search(self, query: str) -> str:
+ """Handle semantic search queries"""
+ try:
+ results = self.processor.vector_store.semantic_search(query, top_k=5)
+
+ if not results:
+ return f"No relevant results found for '{query}'. Try different keywords."
+
+ response = f"๐ **Semantic Search Results for '{query}'**\n\n"
+
+ for i, result in enumerate(results, 1):
+ response += f"{i}. **{result.invoice_number}** - {result.supplier_name}\n"
+ response += f" โข Similarity: {result.similarity_score:.3f}\n"
+ response += f" โข Amount: โน{result.metadata.get('amount', 0):,.2f}\n"
+ response += f" โข Preview: {result.content_preview[:100]}...\n\n"
+
+ return response
+
+ except Exception as e:
+ return f"Semantic search error: {e}"
+
+ def handle_general_query(self, data: dict, query: str) -> str:
+ """Handle general queries with keyword search"""
+ invoices = data.get("invoices", [])
+ query_words = query.lower().split()
+
+ # Simple keyword matching
+ matching_invoices = []
+ for inv in invoices:
+ text_to_search = (
+ inv.get('supplier_name', '') + ' ' +
+ inv.get('buyer_name', '') + ' ' +
+ inv.get('product_description', '') + ' ' +
+ inv.get('extraction_info', {}).get('raw_text_preview', '')
+ ).lower()
+
+ if any(word in text_to_search for word in query_words):
+ matching_invoices.append(inv)
+
+ if not matching_invoices:
+ return f"No invoices found matching '{query}'. Try different keywords or check the summary."
+
+ response = f"๐ **Found {len(matching_invoices)} invoices matching '{query}'**\n\n"
+
+ for i, inv in enumerate(matching_invoices[:5], 1):
+ response += f"{i}. **{inv.get('invoice_number', 'N/A')}** - {inv.get('supplier_name', 'Unknown')}\n"
+ response += f" โข Amount: โน{inv.get('amount', 0):,.2f}\n"
+ response += f" โข Date: {inv.get('date', 'N/A')}\n\n"
+
+ if len(matching_invoices) > 5:
+ response += f"... and {len(matching_invoices) - 5} more results."
+
+ return response
+
+# ===============================================================================
+# STREAMLIT APPLICATION
+# ===============================================================================
+
def create_app():
    """Main Streamlit application: renders the entire single-page UI.

    Layout: header, sidebar (component status, quick stats, limits), four
    sections selected by a radio control (upload, chat, analytics, data
    explorer), plus a global chat input shown under every section. All
    per-user widget keys are namespaced with a short random session id so
    reruns and concurrent sessions do not collide.

    NOTE(review): the HTML/CSS payloads of several st.markdown calls and
    most emoji were stripped/garbled during extraction; runtime strings are
    preserved as found.
    """

    # Generate unique session ID for this run (8 hex chars is enough to
    # namespace widget keys for one browser session).
    if 'session_id' not in st.session_state:
        st.session_state.session_id = str(uuid.uuid4())[:8]

    session_id = st.session_state.session_id

    # Custom CSS (style block lost in extraction — body is empty)
    st.markdown("""

    """, unsafe_allow_html=True)

    # Header (original heading markup stripped; text retained)
    st.markdown('๐ AI Invoice Processing System', unsafe_allow_html=True)
    st.markdown("""


    AI-Powered Document Processing โข Semantic Search โข Smart Analytics โข Hugging Face Spaces


    """, unsafe_allow_html=True)

    # Initialize processor once per session; chatbot shares the processor.
    if 'processor' not in st.session_state:
        with st.spinner("๐ง Initializing AI Invoice Processor..."):
            try:
                st.session_state.processor = InvoiceProcessor()
                st.session_state.chatbot = ChatBot(st.session_state.processor)
                st.session_state.chat_history = []
                st.success("โ System initialized successfully!")
            except Exception as e:
                # Without a processor the app cannot function; halt the script.
                st.error(f"โ Initialization failed: {e}")
                st.stop()

    # Sidebar
    with st.sidebar:
        st.header("๐๏ธ System Status")

        processor = st.session_state.processor

        # Component status (NOTE(review): the ok/fail emoji were garbled in
        # extraction, so both branches currently render near-identical markers)
        if processor.document_processor.processors:
            st.markdown('โ Document Processing', unsafe_allow_html=True)
        else:
            st.markdown('โ Document Processing', unsafe_allow_html=True)

        if processor.ai_extractor.use_transformers:
            st.markdown('โ AI Extraction', unsafe_allow_html=True)
        else:
            st.markdown('โ ๏ธ Regex Extraction', unsafe_allow_html=True)

        if processor.vector_store and processor.vector_store.embedding_model:
            st.markdown('โ Semantic Search', unsafe_allow_html=True)
        else:
            st.markdown('โ ๏ธ Keyword Search Only', unsafe_allow_html=True)

        # Quick stats
        st.header("๐ Quick Stats")
        try:
            data = processor.load_json_data()
            total_invoices = len(data.get("invoices", []))
            total_amount = data.get("summary", {}).get("total_amount", 0)

            st.metric("Total Invoices", total_invoices)
            st.metric("Total Value", f"โน{total_amount:,.2f}")
            st.metric("Success Rate", f"{processor.processing_stats['successful']}/{processor.processing_stats['total_processed']}")

        except Exception as e:
            st.error(f"Stats error: {e}")

        # System info
        st.header("โ๏ธ System Info")
        st.info(f"""
        **Session ID:** {session_id}

        **Limits:**
        โข Max file size: 10MB
        โข Max concurrent files: 3
        โข Timeout: 30s
        """)

    # Main navigation (labels must match the section comparisons below)
    selected_tab = st.radio(
        "Choose a section:",
        ["๐ค Upload & Process", "๐ฌ AI Chat", "๐ Analytics", "๐ Data Explorer"],
        horizontal=True,
        key=f"main_navigation_{session_id}"
    )

    # -------------------------------------------------------------------------
    # UPLOAD & PROCESS SECTION
    # -------------------------------------------------------------------------

    if selected_tab == "๐ค Upload & Process":
        st.header("๐ค Upload Invoice Documents")

        # Feature highlights (HTML card markup stripped in extraction)
        col1, col2, col3 = st.columns(3)

        with col1:
            st.markdown("""

๐ค AI Extraction

Advanced NLP models extract structured data automatically

            """, unsafe_allow_html=True)

        with col2:
            st.markdown("""

๐ Smart Search

Semantic search finds invoices using natural language

            """, unsafe_allow_html=True)

        with col3:
            st.markdown("""

๐ Analytics

Comprehensive insights and visualizations

            """, unsafe_allow_html=True)

        # File upload
        st.markdown("### ๐ Upload Your Invoices")

        # Initialize session state for files if not exists
        if f'uploaded_files_{session_id}' not in st.session_state:
            st.session_state[f'uploaded_files_{session_id}'] = None
        if f'processing_complete_{session_id}' not in st.session_state:
            st.session_state[f'processing_complete_{session_id}'] = False
        if f'currently_processing_{session_id}' not in st.session_state:
            st.session_state[f'currently_processing_{session_id}'] = False
        if f'processed_file_hashes_{session_id}' not in st.session_state:
            st.session_state[f'processed_file_hashes_{session_id}'] = set()

        # File uploader with stable key (stable key prevents Streamlit from
        # resetting the widget between reruns)
        uploaded_files = st.file_uploader(
            "Choose invoice files (PDF, TXT supported)",
            type=['pdf', 'txt'],
            accept_multiple_files=True,
            help="Maximum file size: 10MB per file",
            key=f"file_uploader_stable_{session_id}"
        )

        # Store uploaded files in session state only if they're new
        if uploaded_files:
            # Create (name, size) hashes to detect if the selection changed
            current_file_hashes = set()
            for file in uploaded_files:
                file_hash = hash((file.name, file.size))
                current_file_hashes.add(file_hash)

            # Check if files have changed; if so, reset the processing flags
            stored_hashes = st.session_state.get(f'uploaded_file_hashes_{session_id}', set())
            if current_file_hashes != stored_hashes:
                st.session_state[f'uploaded_files_{session_id}'] = uploaded_files
                st.session_state[f'uploaded_file_hashes_{session_id}'] = current_file_hashes
                st.session_state[f'processing_complete_{session_id}'] = False
                st.session_state[f'currently_processing_{session_id}'] = False
                st.info("๐ New files detected - ready for processing")

        # Get files from session state
        current_files = st.session_state[f'uploaded_files_{session_id}']
        is_processing = st.session_state[f'currently_processing_{session_id}']
        is_complete = st.session_state[f'processing_complete_{session_id}']

        if current_files:
            # Hard cap on concurrent files (matches the sidebar limits box)
            max_files = 3
            if len(current_files) > max_files:
                st.warning(f"โ ๏ธ Too many files selected. Processing first {max_files} files.")
                current_files = current_files[:max_files]

            st.info(f"๐ {len(current_files)} files selected")

            # Show file names with a processed/pending marker
            st.markdown("**Selected Files:**")
            for i, file in enumerate(current_files, 1):
                file_size_mb = len(file.getvalue()) / (1024 * 1024)
                file_hash = hash((file.name, file.size))
                processed_icon = "โ" if file_hash in st.session_state[f'processed_file_hashes_{session_id}'] else "๐"
                st.write(f"{processed_icon} {i}. {file.name} ({file_size_mb:.2f} MB)")

            # Process button - only show if not currently processing
            col1, col2 = st.columns([1, 1])

            with col1:
                if not is_processing and not is_complete:
                    if st.button("๐ Process Files", type="primary", key=f"process_btn_{session_id}"):
                        # Flag + rerun so processing happens on the next pass
                        st.session_state[f'currently_processing_{session_id}'] = True
                        st.rerun()
                elif is_processing:
                    st.info("๐ Processing in progress...")
                    # Actually process the files here
                    process_files_once(current_files, session_id)
                elif is_complete:
                    st.success("โ Processing completed!")
                    if st.button("๐ Process Again", key=f"reprocess_btn_{session_id}"):
                        st.session_state[f'processing_complete_{session_id}'] = False
                        st.session_state[f'currently_processing_{session_id}'] = False
                        st.session_state[f'processed_file_hashes_{session_id}'] = set()
                        st.rerun()

            with col2:
                if st.button("๐๏ธ Clear Files", key=f"clear_files_{session_id}"):
                    # Clear all session state related to files
                    keys_to_clear = [
                        f'uploaded_files_{session_id}',
                        f'uploaded_file_hashes_{session_id}',
                        f'processing_complete_{session_id}',
                        f'currently_processing_{session_id}',
                        f'processed_file_hashes_{session_id}'
                    ]

                    for key in keys_to_clear:
                        if key in st.session_state:
                            del st.session_state[key]

                    st.success("๐๏ธ Files cleared successfully!")
                    time.sleep(1)  # Brief pause to show message
                    st.rerun()

        else:
            st.info("๐ Please select invoice files to upload and process")

        # Show processing results if completed
        if is_complete:
            st.markdown("### ๐ Recent Processing Results")
            try:
                data = st.session_state.processor.load_json_data()
                recent_invoices = sorted(
                    data.get("invoices", []),
                    key=lambda x: x.get('timestamps', {}).get('created_at', ''),
                    reverse=True
                )[:5]

                if recent_invoices:
                    for i, inv in enumerate(recent_invoices, 1):
                        with st.expander(f"๐ {inv.get('invoice_number', f'Invoice {i}')} - {inv.get('supplier_name', 'Unknown')}", expanded=False):
                            col1, col2 = st.columns(2)
                            with col1:
                                st.write(f"**Invoice #:** {inv.get('invoice_number', 'N/A')}")
                                st.write(f"**Supplier:** {inv.get('supplier_name', 'N/A')}")
                                st.write(f"**Amount:** โน{inv.get('amount', 0):.2f}")
                            with col2:
                                st.write(f"**Date:** {inv.get('date', 'N/A')}")
                                st.write(f"**Method:** {inv.get('extraction_info', {}).get('method', 'N/A')}")
                                st.write(f"**Confidence:** {inv.get('extraction_info', {}).get('confidence', 0):.1%}")
                else:
                    st.info("No recent processing results found.")
            except Exception as e:
                st.error(f"Error loading recent results: {e}")

    # -------------------------------------------------------------------------
    # AI CHAT SECTION
    # -------------------------------------------------------------------------

    elif selected_tab == "๐ฌ AI Chat":
        st.header("๐ฌ AI Chat Interface")

        # Display chat history
        if st.session_state.chat_history:
            st.markdown("### ๐ฌ Chat History")
            for i, message in enumerate(st.session_state.chat_history):
                with st.chat_message(message["role"]):
                    st.markdown(message["content"])

        # Chat input
        st.markdown("### โ๏ธ Ask a Question")

        col1, col2 = st.columns([4, 1])

        with col1:
            user_input = st.text_input(
                "Type your question:",
                placeholder="e.g., 'show me total spending'",
                key=f"chat_input_{session_id}"
            )

        with col2:
            ask_btn = st.button("๐ Ask", type="primary", key=f"ask_btn_{session_id}")

        if ask_btn and user_input:
            handle_chat_query(user_input)

        # Suggested queries (only shown before the first exchange)
        if not st.session_state.chat_history:
            st.markdown("### ๐ก Try These Queries")

            col1, col2 = st.columns(2)

            with col1:
                st.markdown("**๐ Basic Queries:**")
                basic_queries = [
                    "Show me a summary of all invoices",
                    "How much have we spent in total?",
                    "Who are our top suppliers?",
                    "Find invoices with high amounts"
                ]
                for i, query in enumerate(basic_queries):
                    if st.button(query, key=f"basic_{session_id}_{i}"):
                        handle_chat_query(query)

            with col2:
                st.markdown("**๐ Advanced Queries:**")
                advanced_queries = [
                    "Find technology purchases",
                    "Show office supplies",
                    "Search consulting services",
                    "Recent high-value invoices"
                ]
                for i, query in enumerate(advanced_queries):
                    if st.button(query, key=f"advanced_{session_id}_{i}"):
                        handle_chat_query(query)

        # Clear chat
        if st.session_state.chat_history:
            if st.button("๐๏ธ Clear Chat", key=f"clear_chat_{session_id}"):
                st.session_state.chat_history = []
                st.rerun()

    # -------------------------------------------------------------------------
    # ANALYTICS SECTION
    # -------------------------------------------------------------------------

    elif selected_tab == "๐ Analytics":
        st.header("๐ Analytics Dashboard")

        try:
            data = st.session_state.processor.load_json_data()
            invoices = data.get("invoices", [])

            if not invoices:
                st.info("๐ No data available. Upload some invoices to see analytics.")
                return

            # Convert to DataFrame for metrics and charts
            df_data = []
            for inv in invoices:
                df_data.append({
                    'invoice_number': inv.get('invoice_number', ''),
                    'supplier_name': inv.get('supplier_name', ''),
                    'amount': inv.get('amount', 0),
                    'date': inv.get('date', ''),
                    'confidence': inv.get('extraction_info', {}).get('confidence', 0)
                })

            df = pd.DataFrame(df_data)

            # Key metrics
            col1, col2, col3, col4 = st.columns(4)

            with col1:
                st.metric("Total Invoices", len(df))
            with col2:
                st.metric("Total Amount", f"โน{df['amount'].sum():,.2f}")
            with col3:
                st.metric("Avg Amount", f"โน{df['amount'].mean():,.2f}")
            with col4:
                st.metric("Unique Suppliers", df['supplier_name'].nunique())

            # Visualizations
            if len(df) > 0:
                # Amount distribution
                fig_hist = px.histogram(
                    df,
                    x='amount',
                    title="Invoice Amount Distribution",
                    labels={'amount': 'Amount (โน)', 'count': 'Number of Invoices'}
                )
                st.plotly_chart(fig_hist, use_container_width=True)

                # Top suppliers
                if df['supplier_name'].notna().any():
                    supplier_amounts = df.groupby('supplier_name')['amount'].sum().sort_values(ascending=False).head(10)

                    if len(supplier_amounts) > 0:
                        fig_suppliers = px.bar(
                            x=supplier_amounts.values,
                            y=supplier_amounts.index,
                            orientation='h',
                            title="Top 10 Suppliers by Total Amount",
                            labels={'x': 'Total Amount (โน)', 'y': 'Supplier'}
                        )
                        st.plotly_chart(fig_suppliers, use_container_width=True)

        except Exception as e:
            st.error(f"Analytics error: {e}")

    # -------------------------------------------------------------------------
    # DATA EXPLORER SECTION
    # -------------------------------------------------------------------------

    elif selected_tab == "๐ Data Explorer":
        st.header("๐ Data Explorer")

        try:
            data = st.session_state.processor.load_json_data()
            invoices = data.get("invoices", [])

            if not invoices:
                st.info("๐ No data available. Upload some invoices first.")
                return

            # Convert to DataFrame with display-friendly column names
            df_data = []
            for inv in invoices:
                df_data.append({
                    'Invoice Number': inv.get('invoice_number', ''),
                    'Supplier': inv.get('supplier_name', ''),
                    'Buyer': inv.get('buyer_name', ''),
                    'Amount': inv.get('amount', 0),
                    'Date': inv.get('date', ''),
                    'Confidence': inv.get('extraction_info', {}).get('confidence', 0),
                    'Method': inv.get('extraction_info', {}).get('method', ''),
                    'File': inv.get('file_info', {}).get('file_name', ''),
                    # Trim ISO timestamp to "YYYY-MM-DDTHH:MM:SS"
                    'Created': inv.get('timestamps', {}).get('created_at', '')[:19]
                })

            df = pd.DataFrame(df_data)

            # Filters
            col1, col2, col3 = st.columns(3)

            with col1:
                suppliers = ['All'] + sorted(df['Supplier'].dropna().unique().tolist())
                selected_supplier = st.selectbox("Filter by Supplier", suppliers, key=f"supplier_filter_{session_id}")

            with col2:
                methods = ['All'] + sorted(df['Method'].dropna().unique().tolist())
                selected_method = st.selectbox("Filter by Method", methods, key=f"method_filter_{session_id}")

            with col3:
                min_amount = st.number_input("Min Amount", min_value=0.0, value=0.0, key=f"amount_filter_{session_id}")

            # Apply filters
            filtered_df = df.copy()
            if selected_supplier != 'All':
                filtered_df = filtered_df[filtered_df['Supplier'] == selected_supplier]
            if selected_method != 'All':
                filtered_df = filtered_df[filtered_df['Method'] == selected_method]
            if min_amount > 0:
                filtered_df = filtered_df[filtered_df['Amount'] >= min_amount]

            # Display data
            st.dataframe(
                filtered_df,
                use_container_width=True,
                column_config={
                    "Amount": st.column_config.NumberColumn("Amount", format="โน%.2f"),
                    "Confidence": st.column_config.ProgressColumn("Confidence", min_value=0, max_value=1)
                }
            )

            # Export options
            col1, col2 = st.columns(2)

            with col1:
                if st.button("๐ฅ Export CSV", key=f"export_csv_{session_id}"):
                    csv_data = filtered_df.to_csv(index=False)
                    st.download_button(
                        "Download CSV",
                        csv_data,
                        f"invoices_{datetime.now().strftime('%Y%m%d_%H%M')}.csv",
                        "text/csv",
                        key=f"download_csv_{session_id}"
                    )

            with col2:
                if st.button("๐ Export JSON", key=f"export_json_{session_id}"):
                    # Map filtered rows back to the full invoice records
                    filtered_invoices = [inv for inv in invoices
                                         if inv.get('invoice_number') in filtered_df['Invoice Number'].values]

                    export_data = {
                        "exported_at": datetime.now().isoformat(),
                        "total_records": len(filtered_invoices),
                        "invoices": filtered_invoices
                    }

                    st.download_button(
                        "Download JSON",
                        json.dumps(export_data, indent=2),
                        f"invoices_{datetime.now().strftime('%Y%m%d_%H%M')}.json",
                        "application/json",
                        key=f"download_json_{session_id}"
                    )

        except Exception as e:
            st.error(f"Data explorer error: {e}")

    # -------------------------------------------------------------------------
    # GLOBAL CHAT INPUT
    # -------------------------------------------------------------------------

    st.markdown("---")
    st.markdown("### ๐ฌ Quick Chat (Works from any section)")

    global_query = st.chat_input("Ask about your invoices...", key=f"global_chat_{session_id}")

    if global_query:
        handle_chat_query(global_query, show_response=True)

    # Footer
    st.markdown("---")
    st.markdown("""

๐ AI Invoice Processing System - Optimized for Hugging Face Spaces

Built with โค๏ธ using Streamlit, Transformers, and AI

    """, unsafe_allow_html=True)
+
+# ===============================================================================
+# HELPER FUNCTIONS
+# ===============================================================================
+
def process_files_once(uploaded_files, session_id):
    """Process each uploaded file exactly once per session.

    Skips files whose (name, size) hash is already in the session's
    processed set, streams progress/status/results into dedicated Streamlit
    containers, records per-file success or failure, then flips the
    ``currently_processing`` / ``processing_complete`` session flags and
    reruns so the UI reflects the final state.

    Fixes vs. previous revision: restores the "โœ…"/"๐ŸŽ‰" status strings that
    had been corrupted to split/replacement characters, and removes a
    redundant nested ``if successful > 0`` check.

    Args:
        uploaded_files: list of Streamlit UploadedFile objects (may be empty/None).
        session_id: short unique id namespacing this session's state keys.
    """
    if not uploaded_files:
        st.error("No files to process!")
        st.session_state[f'currently_processing_{session_id}'] = False
        return

    st.markdown("### ๐ Processing Files...")

    # Get already processed file hashes for this session
    processed_hashes = st.session_state[f'processed_file_hashes_{session_id}']

    # Filter out already processed files
    files_to_process = []
    for file in uploaded_files:
        file_hash = hash((file.name, file.size))
        if file_hash not in processed_hashes:
            files_to_process.append((file, file_hash))

    if not files_to_process:
        # Nothing new: mark the run complete so the UI shows the results view.
        st.info("✅ All files have already been processed!")
        st.session_state[f'currently_processing_{session_id}'] = False
        st.session_state[f'processing_complete_{session_id}'] = True
        return

    # Create containers for dynamic updates (progress bar, status line,
    # per-file result cards)
    progress_container = st.container()
    status_container = st.container()
    results_container = st.container()

    successful = 0
    failed = 0

    # Show progress
    with progress_container:
        progress_bar = st.progress(0)
        progress_text = st.empty()

    with status_container:
        st.info(f"Starting to process {len(files_to_process)} new files...")

    # Process each file only once
    for i, (uploaded_file, file_hash) in enumerate(files_to_process):
        current_progress = (i + 1) / len(files_to_process)

        with progress_container:
            progress_bar.progress(current_progress)
            progress_text.text(f"Processing file {i+1}/{len(files_to_process)}: {uploaded_file.name}")

        with status_container:
            st.info(f"๐ Processing: {uploaded_file.name} ({len(uploaded_file.getvalue())/1024:.1f} KB)")

        try:
            # Process the file through the shared processor
            result = st.session_state.processor.process_uploaded_file(uploaded_file)

            # Mark file as processed regardless of result so it is never retried
            processed_hashes.add(file_hash)

            # Show result immediately
            with results_container:
                if result and hasattr(result, 'invoice_number') and result.invoice_number:
                    successful += 1
                    st.success(f"✅ Successfully processed: {uploaded_file.name}")

                    # Show extracted data
                    col1, col2, col3 = st.columns(3)
                    with col1:
                        st.write(f"**Invoice #:** {result.invoice_number}")
                        st.write(f"**Supplier:** {result.supplier_name or 'Not found'}")
                    with col2:
                        st.write(f"**Amount:** โน{result.amount:.2f}")
                        st.write(f"**Date:** {result.date or 'Not found'}")
                    with col3:
                        st.write(f"**Method:** {result.processing_method}")
                        st.write(f"**Confidence:** {result.extraction_confidence:.1%}")

                    st.markdown("---")
                else:
                    failed += 1
                    st.warning(f"โ ๏ธ Could not extract complete data from: {uploaded_file.name}")
                    if result:
                        st.write(f"Partial data: {result.supplier_name}, โน{result.amount}")
                    st.markdown("---")

        except Exception as e:
            failed += 1
            # Still mark as processed to avoid reprocessing a crashing file
            processed_hashes.add(file_hash)

            with results_container:
                st.error(f"โ Error processing {uploaded_file.name}: {str(e)}")
                st.markdown("---")

    # Persist the updated processed-file set
    st.session_state[f'processed_file_hashes_{session_id}'] = processed_hashes

    # Final summary
    with progress_container:
        progress_bar.progress(1.0)
        progress_text.text("✅ Processing completed!")

    with status_container:
        if successful > 0:
            st.success(f"🎉 Processing complete! {successful} successful, {failed} failed")
            st.balloons()
        else:
            st.error(f"โ Processing failed for all {failed} files. Please check file formats and content.")

    # Update processing state
    st.session_state[f'currently_processing_{session_id}'] = False
    st.session_state[f'processing_complete_{session_id}'] = True

    # Force rerun to update UI
    st.rerun()
+
def process_files(uploaded_files, session_id):
    """Legacy function - redirect to process_files_once"""
    # Kept only for backward compatibility with older call sites;
    # all behavior lives in process_files_once.
    return process_files_once(uploaded_files, session_id)
+
def handle_chat_query(query, show_response=False):
    """Record a user query, run it through the chatbot, and store the reply.

    Appends the user message to st.session_state.chat_history, queries the
    chatbot, appends the assistant reply, optionally renders the reply
    inline, then reruns the app so the chat section refreshes.

    Fix: st.rerun() signals the rerun by raising a control-flow exception;
    previously it sat inside the ``try/except Exception`` block, so the
    rerun was swallowed and reported as "Chat error". It now runs after the
    handler.

    Args:
        query: free-text question from the user.
        show_response: when True, also render the reply inline (used by the
            global chat input that lives outside the AI Chat section).
    """
    st.session_state.chat_history.append({
        "role": "user",
        "content": query,
        "timestamp": datetime.now()
    })

    try:
        with st.spinner("๐ค AI is analyzing..."):
            response = st.session_state.chatbot.query_database(query)

        st.session_state.chat_history.append({
            "role": "assistant",
            "content": response,
            "timestamp": datetime.now()
        })

        if show_response:
            with st.chat_message("assistant"):
                st.markdown(response)
            st.info("๐ก Switch to the 'AI Chat' section to see full conversation history!")

    except Exception as e:
        st.error(f"Chat error: {e}")
        return

    # Outside the try/except so the rerun exception propagates to Streamlit.
    st.rerun()
+
+# ===============================================================================
+# MAIN ENTRY POINT
+# ===============================================================================
+
def main():
    """Main entry point for Hugging Face Spaces.

    Wraps create_app() so any uncaught exception is rendered as a friendly
    error panel instead of a bare Streamlit traceback. Shows a sidebar note
    when the SPACE_ID environment variable indicates an HF Spaces host.
    """
    try:
        if IS_HF_SPACE:
            st.sidebar.info("๐ค Running on Hugging Face Spaces")

        create_app()

    except Exception as e:
        # Last-resort handler: surface the error to the user.
        st.error(f"""
        ## ๐จ Application Error

        {e}

        Please refresh the page or check the logs for more details.
        """)
+
# Script entry point. Fix: the previous revision had a stray trailing comma
# ("main(),") which turned the call statement into a one-element tuple
# expression.
if __name__ == "__main__":
    main()
+
+ # Standalone amounts with currency words
+ r'([0-9,]+\.?\d*)\s*(?:dollars?|rupees?|usd|inr|eur|gbp)',
+
+ # Table-like patterns
+ r'(?:price|cost|rate)\s*:?\s*[\$โนยฃโฌ]?\s*([0-9,]+\.?\d*)',
+
+ # Amount with decimal precision
+ r'(?:^|\s)([0-9]{1,3}(?:,\d{3})*\.?\d{0,2})(?=\s|$)',
+ ],
+ 'date': [
+ r'date\s*:?\s*(\d{1,2}[/\-\.]\d{1,2}[/\-\.]\d{2,4})',
+ r'(?:invoice|bill)\s*date\s*:?\s*(\d{1,2}[/\-\.]\d{1,2}[/\-\.]\d{2,4})',
+ r'(?:^|\s)(\d{1,2}[/\-\.]\d{1,2}[/\-\.]\d{2,4})(?=\s|$)',
+ r'(\d{4}[/\-\.]\d{1,2}[/\-\.]\d{1,2})',
+ r'(\d{1,2}\s+(?:jan|feb|mar|apr|may|jun|jul|aug|sep|oct|nov|dec)\s+\d{2,4})',
+ ],
+ 'quantity': [
+ r'qty\s*:?\s*(\d+)',
+ r'quantity\s*:?\s*(\d+)',
+ r'(?:units?|pcs?|pieces?)\s*:?\s*(\d+)',
+ r'(\d+)\s*(?:pcs?|units?|items?|pieces?)',
+ ]
+ }
+
+ text_lower = text.lower()
+
+ # Extract invoice number with multiple attempts
+ for pattern in patterns['invoice_number']:
+ match = re.search(pattern, text_lower, re.IGNORECASE | re.MULTILINE)
+ if match:
+ invoice_data.invoice_number = match.group(1).upper().strip()
+ break
+
+ # Extract amount with enhanced logic
+ amounts_found = []
+ for pattern in patterns['amount']:
+ matches = re.finditer(pattern, text_lower, re.IGNORECASE | re.MULTILINE)
+ for match in matches:
+ try:
+ amount_str = match.group(1).replace(',', '').replace(' ', '')
+ amount_val = float(amount_str)
+ if 0.01 <= amount_val <= 1000000: # Reasonable range
+ amounts_found.append(amount_val)
+ except (ValueError, IndexError):
+ continue
+
+ # Choose the most likely amount (highest value or most repeated)
+ if amounts_found:
+ # Remove duplicates and sort
+ unique_amounts = sorted(set(amounts_found), reverse=True)
+ # Take the highest reasonable amount
+ invoice_data.amount = unique_amounts[0]
+
+ # Extract date
+ for pattern in patterns['date']:
+ match = re.search(pattern, text, re.IGNORECASE | re.MULTILINE)
+ if match:
+ invoice_data.date = self.parse_date(match.group(1))
+ break
+
+ # Extract quantity
+ for pattern in patterns['quantity']:
+ match = re.search(pattern, text_lower, re.IGNORECASE)
+ if match:
+ try:
+ invoice_data.quantity = int(match.group(1))
+ break
+ except ValueError:
+ continue
+
+ # Enhanced company name extraction
+ company_patterns = [
+ r'(?:from|supplier|vendor)\s*:?\s*([A-Z][A-Za-z\s&,\.]{3,50})',
+ r'(?:to|buyer|client)\s*:?\s*([A-Z][A-Za-z\s&,\.]{3,50})',
+ r'([A-Z][A-Za-z\s&,\.]{3,50})\s*(?:ltd|inc|corp|llc|co\.|company|pvt|private|limited)',
+ r'(?:^|\n)([A-Z][A-Za-z\s&,\.]{3,50})\s*(?:\n|$)',
+ ]
+
+ companies_found = []
+ for pattern in company_patterns:
+ matches = re.findall(pattern, text, re.MULTILINE)
+ for match in matches:
+ clean_company = match.strip().title()
+ if len(clean_company) > 3 and not any(word in clean_company.lower() for word in ['total', 'amount', 'date', 'invoice']):
+ companies_found.append(clean_company)
+
+ # Assign companies (first as supplier, second as buyer)
+ if companies_found:
+ invoice_data.supplier_name = companies_found[0]
+ if len(companies_found) > 1:
+ invoice_data.buyer_name = companies_found[1]
+
+ # Extract product description
+ desc_patterns = [
+ r'(?:description|item|product|service)\s*:?\s*([A-Za-z0-9\s,.-]{10,200})',
+ r'(?:for|regarding)\s*:?\s*([A-Za-z0-9\s,.-]{10,200})',
+ ]
+
+ for pattern in desc_patterns:
+ match = re.search(pattern, text, re.IGNORECASE)
+ if match:
+ desc = match.group(1).strip()
+ if len(desc) > 5:
+ invoice_data.product_description = desc[:200] # Limit length
+ break
+
+ # Set confidence based on how much we extracted
+ confidence_factors = []
+ if invoice_data.invoice_number:
+ confidence_factors.append(0.3)
+ if invoice_data.amount > 0:
+ confidence_factors.append(0.3)
+ if invoice_data.supplier_name:
+ confidence_factors.append(0.2)
+ if invoice_data.date:
+ confidence_factors.append(0.1)
+ if invoice_data.quantity > 0:
+ confidence_factors.append(0.1)
+
+ invoice_data.extraction_confidence = sum(confidence_factors)
- invoice_data.extraction_confidence = 0.6
return invoice_data
def parse_date(self, date_str: str) -> str: