shivam701171 committed on
Commit
6a4369a
·
verified ·
1 Parent(s): baa5d38

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +248 -438
app.py CHANGED
@@ -1,15 +1,15 @@
1
  #!/usr/bin/env python3
2
  """
3
- Enhanced Invoice Processing & Analysis System - Hugging Face Spaces Compatible
4
  A comprehensive system with AI-powered extraction, semantic search, and analytics.
5
 
6
  Author: AI Assistant
7
  Date: 2024
8
- Version: HuggingFace v1.0
9
  """
10
 
11
  # ===============================================================================
12
- # IMPORTS AND HUGGING FACE COMPATIBILITY
13
  # ===============================================================================
14
 
15
  import os
@@ -25,8 +25,7 @@ from dataclasses import dataclass
25
  from pathlib import Path
26
  import time
27
  import logging
28
-
29
-
30
 
31
  # Check if running on Hugging Face Spaces
32
  IS_HF_SPACE = os.getenv("SPACE_ID") is not None
@@ -39,26 +38,7 @@ import plotly.express as px
39
  import plotly.graph_objects as go
40
  import requests
41
 
42
- # This should be the FIRST Streamlit command
43
- st.set_page_config(
44
- page_title="AI Invoice Processing System",
45
- page_icon="📄",
46
- layout="wide",
47
- initial_sidebar_state="expanded",
48
- menu_items={
49
- 'Get Help': 'https://huggingface.co/spaces',
50
- 'Report a bug': 'https://huggingface.co/spaces',
51
- 'About': """
52
- # AI Invoice Processing System
53
- Built for Hugging Face Spaces with AI-powered extraction and semantic search.
54
- """
55
- }
56
- )
57
-
58
-
59
-
60
-
61
- # Vector storage and embeddings (HF compatible)
62
  try:
63
  import faiss
64
  FAISS_AVAILABLE = True
@@ -80,41 +60,51 @@ except ImportError:
80
  TORCH_AVAILABLE = False
81
 
82
  # Document processing (simplified for HF)
83
- try:
84
- from docling.document_converter import DocumentConverter
85
- from docling.datamodel.base_models import InputFormat
86
- from docling.datamodel.pipeline_options import PdfPipelineOptions
87
- from docling.document_converter import PdfFormatOption
88
- DOCLING_AVAILABLE = True
89
- except ImportError:
90
- DOCLING_AVAILABLE = False
91
- st.warning("⚠️ Docling not available. Using simplified document processing.")
92
-
93
- # Alternative document processing for HF
94
  try:
95
  import pdfplumber
96
  PDF_PROCESSING_AVAILABLE = True
 
97
  except ImportError:
98
  try:
99
  import PyPDF2
100
  PDF_PROCESSING_AVAILABLE = True
 
101
  except ImportError:
102
  PDF_PROCESSING_AVAILABLE = False
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
103
 
104
  # ===============================================================================
105
- # HUGGING FACE CONFIGURATION
106
  # ===============================================================================
107
 
108
- # Hugging Face Spaces configuration
109
  HF_CONFIG = {
110
- "max_file_size_mb": 10, # Reduced for HF Spaces
111
- "max_concurrent_files": 3, # Reduced for HF Spaces
112
  "timeout_seconds": 30,
113
- "use_cpu_only": True, # Force CPU for HF Spaces
114
- "embedding_model": "all-MiniLM-L6-v2", # Lightweight model
115
  "cache_dir": "./cache",
116
  "data_dir": "./data",
117
- "enable_ollama": False, # Disable Ollama for HF Spaces
118
  }
119
 
120
  # Create necessary directories
@@ -122,15 +112,12 @@ os.makedirs(HF_CONFIG["cache_dir"], exist_ok=True)
122
  os.makedirs(HF_CONFIG["data_dir"], exist_ok=True)
123
 
124
  # ===============================================================================
125
- # STREAMLIT CONFIGURATION FOR HUGGING FACE
126
- # ===============================================================================
127
- # ===============================================================================
128
- # SIMPLIFIED DATA STRUCTURES FOR HF
129
  # ===============================================================================
130
 
131
  @dataclass
132
  class InvoiceData:
133
- """Simplified data structure for extracted invoice information"""
134
  supplier_name: str = ""
135
  buyer_name: str = ""
136
  invoice_number: str = ""
@@ -153,182 +140,10 @@ class VectorSearchResult:
153
  metadata: Dict
154
 
155
  # ===============================================================================
156
- # HUGGING FACE COMPATIBLE VECTOR STORE
157
- # ===============================================================================
158
-
159
- class HuggingFaceVectorStore:
160
- """Simplified vector store compatible with Hugging Face Spaces"""
161
-
162
- def __init__(self, embedding_model: str = "all-MiniLM-L6-v2"):
163
- self.embedding_model_name = embedding_model
164
- self.vector_store_path = os.path.join(HF_CONFIG["data_dir"], "vectors.pkl")
165
- self.metadata_path = os.path.join(HF_CONFIG["data_dir"], "metadata.pkl")
166
- self.embedding_model = None
167
- self.vectors = []
168
- self.document_metadata = []
169
- self.embedding_dimension = None
170
-
171
- self.setup_embedding_model()
172
- self.load_vector_store()
173
-
174
- def setup_embedding_model(self):
175
- """Initialize the sentence transformer model"""
176
- if not SENTENCE_TRANSFORMERS_AVAILABLE:
177
- st.warning("⚠️ Sentence Transformers not available. Vector search disabled.")
178
- return
179
-
180
- try:
181
- with st.spinner(f"Loading embedding model: {self.embedding_model_name}..."):
182
- self.embedding_model = SentenceTransformer(
183
- self.embedding_model_name,
184
- cache_folder=HF_CONFIG["cache_dir"]
185
- )
186
-
187
- # Get embedding dimension
188
- test_embedding = self.embedding_model.encode(["test"])
189
- self.embedding_dimension = test_embedding.shape[0]
190
-
191
- st.success(f"✅ Embedding model loaded: {self.embedding_model_name}")
192
-
193
- except Exception as e:
194
- st.error(f"❌ Failed to load embedding model: {e}")
195
- self.embedding_model = None
196
-
197
- def load_vector_store(self):
198
- """Load existing vector store"""
199
- try:
200
- if os.path.exists(self.vector_store_path) and os.path.exists(self.metadata_path):
201
- with open(self.vector_store_path, 'rb') as f:
202
- self.vectors = pickle.load(f)
203
-
204
- with open(self.metadata_path, 'rb') as f:
205
- self.document_metadata = pickle.load(f)
206
-
207
- st.success(f"✅ Vector store loaded: {len(self.document_metadata)} documents")
208
- else:
209
- self.vectors = []
210
- self.document_metadata = []
211
- st.info("📄 New vector store initialized")
212
-
213
- except Exception as e:
214
- st.error(f"❌ Error loading vector store: {e}")
215
- self.vectors = []
216
- self.document_metadata = []
217
-
218
- def save_vector_store(self):
219
- """Save vector store to disk"""
220
- try:
221
- with open(self.vector_store_path, 'wb') as f:
222
- pickle.dump(self.vectors, f)
223
-
224
- with open(self.metadata_path, 'wb') as f:
225
- pickle.dump(self.document_metadata, f)
226
-
227
- return True
228
- except Exception as e:
229
- st.error(f"Error saving vector store: {e}")
230
- return False
231
-
232
- def create_document_text(self, invoice_data: dict, raw_text: str = "") -> str:
233
- """Create searchable text from invoice data"""
234
- text_parts = []
235
-
236
- for field, value in invoice_data.items():
237
- if value and field != 'id':
238
- text_parts.append(f"{field}: {value}")
239
-
240
- if raw_text:
241
- text_parts.append(f"content: {raw_text[:300]}")
242
-
243
- return " | ".join(text_parts)
244
-
245
- def add_document(self, invoice_data: dict, raw_text: str = "") -> bool:
246
- """Add a document to the vector store"""
247
- if not self.embedding_model:
248
- return False
249
-
250
- try:
251
- document_text = self.create_document_text(invoice_data, raw_text)
252
-
253
- # Generate embedding
254
- embedding = self.embedding_model.encode(document_text, normalize_embeddings=True)
255
-
256
- # Create metadata
257
- metadata = {
258
- 'invoice_id': invoice_data.get('id', ''),
259
- 'invoice_number': invoice_data.get('invoice_number', ''),
260
- 'supplier_name': invoice_data.get('supplier_name', ''),
261
- 'buyer_name': invoice_data.get('buyer_name', ''),
262
- 'amount': invoice_data.get('amount', 0),
263
- 'date': invoice_data.get('date', ''),
264
- 'file_name': invoice_data.get('file_info', {}).get('file_name', ''),
265
- 'document_text': document_text[:200],
266
- 'timestamp': datetime.now().isoformat()
267
- }
268
-
269
- # Add to store
270
- self.vectors.append(embedding)
271
- self.document_metadata.append(metadata)
272
-
273
- return True
274
-
275
- except Exception as e:
276
- st.error(f"Error adding document to vector store: {e}")
277
- return False
278
-
279
- def semantic_search(self, query: str, top_k: int = 5) -> List[VectorSearchResult]:
280
- """Perform semantic search using cosine similarity"""
281
- if not self.embedding_model or not self.vectors:
282
- return []
283
-
284
- try:
285
- # Generate query embedding
286
- query_embedding = self.embedding_model.encode(query, normalize_embeddings=True)
287
-
288
- # Calculate similarities
289
- similarities = []
290
- for i, doc_embedding in enumerate(self.vectors):
291
- similarity = np.dot(query_embedding, doc_embedding)
292
- similarities.append((similarity, i))
293
-
294
- # Sort by similarity
295
- similarities.sort(reverse=True)
296
-
297
- # Return top results
298
- results = []
299
- for similarity, idx in similarities[:top_k]:
300
- if similarity > 0.1: # Relevance threshold
301
- metadata = self.document_metadata[idx]
302
- result = VectorSearchResult(
303
- invoice_id=metadata.get('invoice_id', ''),
304
- invoice_number=metadata.get('invoice_number', ''),
305
- supplier_name=metadata.get('supplier_name', ''),
306
- similarity_score=float(similarity),
307
- content_preview=metadata.get('document_text', ''),
308
- metadata=metadata
309
- )
310
- results.append(result)
311
-
312
- return results
313
-
314
- except Exception as e:
315
- st.error(f"Error in semantic search: {e}")
316
- return []
317
-
318
- def get_stats(self) -> Dict:
319
- """Get vector store statistics"""
320
- return {
321
- 'total_documents': len(self.document_metadata),
322
- 'embedding_dimension': self.embedding_dimension,
323
- 'model_name': self.embedding_model_name,
324
- 'vector_store_size': len(self.vectors)
325
- }
326
-
327
- # ===============================================================================
328
- # SIMPLIFIED DOCUMENT PROCESSING FOR HF
329
  # ===============================================================================
330
 
331
- class HuggingFaceDocumentProcessor:
332
  """Simplified document processor for Hugging Face Spaces"""
333
 
334
  def __init__(self):
@@ -340,23 +155,17 @@ class HuggingFaceDocumentProcessor:
340
 
341
  # PDF processing
342
  if PDF_PROCESSING_AVAILABLE:
343
- try:
344
- import pdfplumber
345
  self.processors['pdf'] = self.extract_with_pdfplumber
346
  st.success("✅ PDF processing available (pdfplumber)")
347
- except ImportError:
348
- try:
349
- import PyPDF2
350
- self.processors['pdf'] = self.extract_with_pypdf2
351
- st.success(" PDF processing available (PyPDF2)")
352
- except ImportError:
353
- st.warning("⚠️ No PDF processor available")
354
 
355
  # Text files
356
  self.processors['txt'] = self.extract_text_file
357
-
358
- # Images (basic OCR alternative)
359
- self.processors['image'] = self.extract_image_text
360
 
361
  def extract_with_pdfplumber(self, file_path: str) -> str:
362
  """Extract text using pdfplumber"""
@@ -396,11 +205,6 @@ class HuggingFaceDocumentProcessor:
396
  st.error(f"Text file extraction failed: {e}")
397
  return ""
398
 
399
- def extract_image_text(self, file_path: str) -> str:
400
- """Basic image text extraction (placeholder for OCR)"""
401
- st.warning("⚠️ OCR not available in this environment. Please use text-based documents.")
402
- return ""
403
-
404
  def extract_text_from_document(self, file_path: str) -> str:
405
  """Extract text from document based on file type"""
406
  file_ext = Path(file_path).suffix.lower()
@@ -409,8 +213,6 @@ class HuggingFaceDocumentProcessor:
409
  processor = self.processors.get('pdf')
410
  elif file_ext == '.txt':
411
  processor = self.processors.get('txt')
412
- elif file_ext in ['.jpg', '.jpeg', '.png']:
413
- processor = self.processors.get('image')
414
  else:
415
  st.warning(f"Unsupported file type: {file_ext}")
416
  return ""
@@ -422,11 +224,11 @@ class HuggingFaceDocumentProcessor:
422
  return ""
423
 
424
  # ===============================================================================
425
- # SIMPLIFIED AI EXTRACTION FOR HF
426
  # ===============================================================================
427
 
428
- class HuggingFaceAIExtractor:
429
- """Simplified AI extraction for Hugging Face Spaces"""
430
 
431
  def __init__(self):
432
  self.use_transformers = self.setup_transformers()
@@ -434,16 +236,12 @@ class HuggingFaceAIExtractor:
434
  def setup_transformers(self):
435
  """Try to setup Hugging Face transformers for NER"""
436
  try:
437
- from transformers import pipeline, AutoTokenizer, AutoModelForTokenClassification
438
-
439
- # Use a lightweight NER model
440
- model_name = "dbmdz/bert-large-cased-finetuned-conll03-english"
441
 
442
  with st.spinner("Loading AI extraction model..."):
443
  self.ner_pipeline = pipeline(
444
  "ner",
445
- model=model_name,
446
- tokenizer=model_name,
447
  aggregation_strategy="simple"
448
  )
449
 
@@ -614,17 +412,180 @@ class HuggingFaceAIExtractor:
614
  return date_str
615
 
616
  # ===============================================================================
617
- # MAIN PROCESSOR FOR HUGGING FACE
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
618
  # ===============================================================================
619
 
620
- class HuggingFaceInvoiceProcessor:
621
- """Main invoice processor optimized for Hugging Face Spaces"""
622
 
623
  def __init__(self):
624
  self.setup_storage()
625
- self.document_processor = HuggingFaceDocumentProcessor()
626
- self.ai_extractor = HuggingFaceAIExtractor()
627
- self.vector_store = HuggingFaceVectorStore() if SENTENCE_TRANSFORMERS_AVAILABLE else None
628
 
629
  # Initialize stats
630
  self.processing_stats = {
@@ -786,13 +747,13 @@ class HuggingFaceInvoiceProcessor:
786
  data["metadata"]["total_invoices"] = len(invoices)
787
 
788
  # ===============================================================================
789
- # SIMPLIFIED CHATBOT FOR HF
790
  # ===============================================================================
791
 
792
- class HuggingFaceChatBot:
793
- """Simplified chatbot for Hugging Face Spaces"""
794
 
795
- def __init__(self, processor: HuggingFaceInvoiceProcessor):
796
  self.processor = processor
797
 
798
  def query_database(self, query: str) -> str:
@@ -819,7 +780,6 @@ class HuggingFaceChatBot:
819
  elif any(phrase in query_lower for phrase in ["supplier", "vendor", "company"]):
820
  return self.handle_supplier_query(data, query)
821
 
822
-
823
  elif self.processor.vector_store:
824
  return self.handle_semantic_search(query)
825
 
@@ -1005,40 +965,19 @@ class HuggingFaceChatBot:
1005
  return response
1006
 
1007
  # ===============================================================================
1008
- # STREAMLIT APPLICATION FOR HUGGING FACE
1009
- # ===============================================================================
1010
-
1011
- # ===============================================================================
1012
- # FIXED MAIN APPLICATION WITH PROPER CHAT INPUT PLACEMENT
1013
- # ===============================================================================
1014
-
1015
- # ===============================================================================
1016
- # FIXED APPLICATION WITH UNIQUE WIDGET KEYS
1017
- # ===============================================================================
1018
-
1019
- # ===============================================================================
1020
- # FIXED APPLICATION WITH DYNAMIC UNIQUE KEYS AND SESSION STATE
1021
  # ===============================================================================
1022
 
1023
- import streamlit as st
1024
- import pandas as pd
1025
- import plotly.express as px
1026
- import json
1027
- from datetime import datetime
1028
- import os
1029
- import uuid
1030
-
1031
- # Generate unique session ID for this run
1032
- if 'session_id' not in st.session_state:
1033
- st.session_state.session_id = str(uuid.uuid4())[:8]
1034
-
1035
- def create_huggingface_app():
1036
- """Main Streamlit application optimized for Hugging Face Spaces"""
1037
 
1038
- # Get unique session ID
1039
  session_id = st.session_state.session_id
1040
 
1041
- # Custom CSS for better UI
1042
  st.markdown("""
1043
  <style>
1044
  .main-header {
@@ -1073,40 +1012,35 @@ def create_huggingface_app():
1073
  """, unsafe_allow_html=True)
1074
 
1075
  # Initialize processor
1076
- if 'hf_processor' not in st.session_state:
1077
  with st.spinner("🔧 Initializing AI Invoice Processor..."):
1078
  try:
1079
- from enhanced_invoice_system_part1 import (
1080
- HuggingFaceInvoiceProcessor, HF_CONFIG
1081
- )
1082
- st.session_state.hf_processor = HuggingFaceInvoiceProcessor()
1083
- st.session_state.hf_chatbot = HuggingFaceChatBot(st.session_state.hf_processor)
1084
  st.session_state.chat_history = []
1085
  st.success("✅ System initialized successfully!")
1086
  except Exception as e:
1087
  st.error(f"❌ Initialization failed: {e}")
1088
  st.stop()
1089
 
1090
- # Sidebar with system status
1091
  with st.sidebar:
1092
  st.header("🎛️ System Status")
1093
 
1094
- processor = st.session_state.hf_processor
1095
 
1096
- # Document processing status
1097
- if hasattr(processor, 'document_processor') and processor.document_processor.processors:
1098
  st.markdown('<span class="status-ok">✅ Document Processing</span>', unsafe_allow_html=True)
1099
  else:
1100
  st.markdown('<span class="status-error">❌ Document Processing</span>', unsafe_allow_html=True)
1101
 
1102
- # AI extraction status
1103
- if hasattr(processor, 'ai_extractor') and processor.ai_extractor.use_transformers:
1104
  st.markdown('<span class="status-ok">✅ AI Extraction</span>', unsafe_allow_html=True)
1105
  else:
1106
  st.markdown('<span class="status-warning">⚠️ Regex Extraction</span>', unsafe_allow_html=True)
1107
 
1108
- # Vector search status
1109
- if hasattr(processor, 'vector_store') and processor.vector_store and processor.vector_store.embedding_model:
1110
  st.markdown('<span class="status-ok">✅ Semantic Search</span>', unsafe_allow_html=True)
1111
  else:
1112
  st.markdown('<span class="status-warning">⚠️ Keyword Search Only</span>', unsafe_allow_html=True)
@@ -1120,15 +1054,12 @@ def create_huggingface_app():
1120
 
1121
  st.metric("Total Invoices", total_invoices)
1122
  st.metric("Total Value", f"₹{total_amount:,.2f}")
1123
-
1124
- if hasattr(processor, 'processing_stats'):
1125
- success_rate = f"{processor.processing_stats['successful']}/{processor.processing_stats['total_processed']}"
1126
- st.metric("Success Rate", success_rate)
1127
 
1128
  except Exception as e:
1129
  st.error(f"Stats error: {e}")
1130
 
1131
- # Processing info
1132
  st.header("⚙️ System Info")
1133
  st.info(f"""
1134
  **Session ID:** {session_id}
@@ -1181,10 +1112,9 @@ def create_huggingface_app():
1181
  </div>
1182
  """, unsafe_allow_html=True)
1183
 
1184
- # File upload interface
1185
  st.markdown("### 📁 Upload Your Invoices")
1186
 
1187
- # Use timestamp to ensure unique keys
1188
  timestamp = datetime.now().strftime("%H%M%S")
1189
 
1190
  uploaded_files = st.file_uploader(
@@ -1220,7 +1150,7 @@ def create_huggingface_app():
1220
  with st.chat_message(message["role"]):
1221
  st.markdown(message["content"])
1222
 
1223
- # Chat input area
1224
  st.markdown("### ✍️ Ask a Question")
1225
 
1226
  col1, col2 = st.columns([4, 1])
@@ -1282,7 +1212,7 @@ def create_huggingface_app():
1282
  st.header("📊 Analytics Dashboard")
1283
 
1284
  try:
1285
- data = st.session_state.hf_processor.load_json_data()
1286
  invoices = data.get("invoices", [])
1287
 
1288
  if not invoices:
@@ -1350,7 +1280,7 @@ def create_huggingface_app():
1350
  st.header("📋 Data Explorer")
1351
 
1352
  try:
1353
- data = st.session_state.hf_processor.load_json_data()
1354
  invoices = data.get("invoices", [])
1355
 
1356
  if not invoices:
@@ -1444,13 +1374,12 @@ def create_huggingface_app():
1444
  st.error(f"Data explorer error: {e}")
1445
 
1446
  # -------------------------------------------------------------------------
1447
- # GLOBAL CHAT INPUT (Outside sections)
1448
  # -------------------------------------------------------------------------
1449
 
1450
  st.markdown("---")
1451
  st.markdown("### 💬 Quick Chat (Works from any section)")
1452
 
1453
- # Global chat input with unique key
1454
  global_query = st.chat_input("Ask about your invoices...", key=f"global_chat_{session_id}")
1455
 
1456
  if global_query:
@@ -1485,8 +1414,7 @@ def process_files(uploaded_files, session_id):
1485
  st.info(f"Processing: {uploaded_file.name}")
1486
 
1487
  try:
1488
- # Process file
1489
- result = st.session_state.hf_processor.process_uploaded_file(uploaded_file)
1490
 
1491
  with results_container:
1492
  if result and result.invoice_number:
@@ -1510,7 +1438,6 @@ def process_files(uploaded_files, session_id):
1510
  with results_container:
1511
  st.error(f"❌ Error processing {uploaded_file.name}: {str(e)[:100]}")
1512
 
1513
- # Final status
1514
  with status_container:
1515
  st.success(f"✅ Processing complete! {successful} successful, {failed} failed")
1516
 
@@ -1519,17 +1446,15 @@ def process_files(uploaded_files, session_id):
1519
 
1520
  def handle_chat_query(query, show_response=False):
1521
  """Handle chat query"""
1522
- # Add user message
1523
  st.session_state.chat_history.append({
1524
  "role": "user",
1525
  "content": query,
1526
  "timestamp": datetime.now()
1527
  })
1528
 
1529
- # Get AI response
1530
  try:
1531
  with st.spinner("🤖 AI is analyzing..."):
1532
- response = st.session_state.hf_chatbot.query_database(query)
1533
 
1534
  st.session_state.chat_history.append({
1535
  "role": "assistant",
@@ -1537,7 +1462,6 @@ def handle_chat_query(query, show_response=False):
1537
  "timestamp": datetime.now()
1538
  })
1539
 
1540
- # Show response if requested
1541
  if show_response:
1542
  with st.chat_message("assistant"):
1543
  st.markdown(response)
@@ -1555,26 +1479,10 @@ def handle_chat_query(query, show_response=False):
1555
  def main():
1556
  """Main entry point for Hugging Face Spaces"""
1557
  try:
1558
- # Import required classes
1559
- from enhanced_invoice_system_part1 import IS_HF_SPACE
1560
-
1561
- # Display environment info
1562
  if IS_HF_SPACE:
1563
  st.sidebar.info("🤗 Running on Hugging Face Spaces")
1564
 
1565
- # Create and run the app
1566
- create_huggingface_app()
1567
-
1568
- except ImportError as e:
1569
- st.error(f"""
1570
- ## 🚨 Import Error
1571
-
1572
- Missing required modules: {e}
1573
-
1574
- Please ensure all files are uploaded to your Hugging Face Space:
1575
- - enhanced_invoice_system_part1.py
1576
- - enhanced_invoice_system_part2.py (this file)
1577
- """)
1578
 
1579
  except Exception as e:
1580
  st.error(f"""
@@ -1585,103 +1493,5 @@ def main():
1585
  Please refresh the page or check the logs for more details.
1586
  """)
1587
 
1588
- if __name__ == "__main__":
1589
- main()
1590
- # ===============================================================================
1591
- # MAIN APPLICATION ENTRY POINT
1592
- # ===============================================================================
1593
-
1594
- def main():
1595
- """Main entry point for Hugging Face Spaces"""
1596
- try:
1597
- # Display Hugging Face info if running on HF Spaces
1598
- if IS_HF_SPACE:
1599
- st.sidebar.info("🤗 Running on Hugging Face Spaces")
1600
-
1601
- # Create and run the app
1602
- create_huggingface_app()
1603
-
1604
- except Exception as e:
1605
- st.error(f"Application error: {e}")
1606
- st.info("Please refresh the page or contact support if the error persists.")
1607
-
1608
- if __name__ == "__main__":
1609
- main()
1610
-
1611
- # ===============================================================================
1612
- # MAIN APPLICATION ENTRY POINT
1613
- # ===============================================================================
1614
-
1615
- def main():
1616
- """Main entry point for Hugging Face Spaces"""
1617
- try:
1618
- # Display Hugging Face info if running on HF Spaces
1619
- if IS_HF_SPACE:
1620
- st.sidebar.info("🤗 Running on Hugging Face Spaces")
1621
-
1622
- # Create and run the app
1623
- create_huggingface_app()
1624
-
1625
- except Exception as e:
1626
- st.error(f"Application error: {e}")
1627
- st.info("Please refresh the page or contact support if the error persists.")
1628
-
1629
- if __name__ == "__main__":
1630
- main()
1631
-
1632
- # ===============================================================================
1633
- # HUGGING FACE REQUIREMENTS AND CONFIGURATION
1634
- # ===============================================================================
1635
-
1636
- def generate_hf_requirements():
1637
- """Generate requirements.txt optimized for Hugging Face Spaces"""
1638
- requirements = """streamlit>=1.28.0
1639
- pandas>=1.5.0
1640
- numpy>=1.21.0
1641
- plotly>=5.0.0
1642
- sentence-transformers>=2.2.0
1643
- transformers>=4.21.0
1644
- torch>=1.13.0
1645
- faiss-cpu>=1.7.0
1646
- pdfplumber>=0.7.0
1647
- requests>=2.28.0
1648
- python-dateutil>=2.8.0
1649
- Pillow>=9.0.0
1650
- """
1651
- return requirements.strip()
1652
-
1653
- def generate_hf_config():
1654
- """Generate app configuration for Hugging Face Spaces"""
1655
- config = {
1656
- "title": "AI Invoice Processing System",
1657
- "emoji": "📄",
1658
- "colorFrom": "blue",
1659
- "colorTo": "purple",
1660
- "sdk": "streamlit",
1661
- "sdk_version": "1.28.0",
1662
- "app_file": "app.py",
1663
- "pinned": False,
1664
- "python_version": "3.9"
1665
- }
1666
- return config
1667
-
1668
- # ===============================================================================
1669
- # MAIN APPLICATION ENTRY POINT
1670
- # ===============================================================================
1671
-
1672
- def main():
1673
- """Main entry point for Hugging Face Spaces"""
1674
- try:
1675
- # Display Hugging Face info if running on HF Spaces
1676
- if IS_HF_SPACE:
1677
- st.sidebar.info("🤗 Running on Hugging Face Spaces")
1678
-
1679
- # Create and run the app
1680
- create_huggingface_app()
1681
-
1682
- except Exception as e:
1683
- st.error(f"Application error: {e}")
1684
- st.info("Please refresh the page or contact support if the error persists.")
1685
-
1686
  if __name__ == "__main__":
1687
  main()
 
1
  #!/usr/bin/env python3
2
  """
3
+ AI Invoice Processing System - Complete Single File for Hugging Face Spaces
4
  A comprehensive system with AI-powered extraction, semantic search, and analytics.
5
 
6
  Author: AI Assistant
7
  Date: 2024
8
+ Version: HuggingFace Single File v1.0
9
  """
10
 
11
  # ===============================================================================
12
+ # IMPORTS AND COMPATIBILITY CHECKS
13
  # ===============================================================================
14
 
15
  import os
 
25
  from pathlib import Path
26
  import time
27
  import logging
28
+ import uuid
 
29
 
30
  # Check if running on Hugging Face Spaces
31
  IS_HF_SPACE = os.getenv("SPACE_ID") is not None
 
38
  import plotly.graph_objects as go
39
  import requests
40
 
41
+ # Vector storage and embeddings (with fallbacks)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
42
  try:
43
  import faiss
44
  FAISS_AVAILABLE = True
 
60
  TORCH_AVAILABLE = False
61
 
62
  # Document processing (simplified for HF)
 
 
 
 
 
 
 
 
 
 
 
63
  try:
64
  import pdfplumber
65
  PDF_PROCESSING_AVAILABLE = True
66
+ PDF_PROCESSOR = "pdfplumber"
67
  except ImportError:
68
  try:
69
  import PyPDF2
70
  PDF_PROCESSING_AVAILABLE = True
71
+ PDF_PROCESSOR = "PyPDF2"
72
  except ImportError:
73
  PDF_PROCESSING_AVAILABLE = False
74
+ PDF_PROCESSOR = None
75
+
76
+ # ===============================================================================
77
+ # STREAMLIT CONFIGURATION
78
+ # ===============================================================================
79
+
80
+ st.set_page_config(
81
+ page_title="AI Invoice Processing System",
82
+ page_icon="📄",
83
+ layout="wide",
84
+ initial_sidebar_state="expanded",
85
+ menu_items={
86
+ 'Get Help': 'https://huggingface.co/spaces',
87
+ 'Report a bug': 'https://huggingface.co/spaces',
88
+ 'About': """
89
+ # AI Invoice Processing System
90
+ Built for Hugging Face Spaces with AI-powered extraction and semantic search.
91
+ """
92
+ }
93
+ )
94
 
95
  # ===============================================================================
96
+ # CONFIGURATION
97
  # ===============================================================================
98
 
 
99
  HF_CONFIG = {
100
+ "max_file_size_mb": 10,
101
+ "max_concurrent_files": 3,
102
  "timeout_seconds": 30,
103
+ "use_cpu_only": True,
104
+ "embedding_model": "all-MiniLM-L6-v2",
105
  "cache_dir": "./cache",
106
  "data_dir": "./data",
107
+ "enable_ollama": False,
108
  }
109
 
110
  # Create necessary directories
 
112
  os.makedirs(HF_CONFIG["data_dir"], exist_ok=True)
113
 
114
  # ===============================================================================
115
+ # DATA STRUCTURES
 
 
 
116
  # ===============================================================================
117
 
118
  @dataclass
119
  class InvoiceData:
120
+ """Data structure for extracted invoice information"""
121
  supplier_name: str = ""
122
  buyer_name: str = ""
123
  invoice_number: str = ""
 
140
  metadata: Dict
141
 
142
  # ===============================================================================
143
+ # DOCUMENT PROCESSING CLASSES
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
144
  # ===============================================================================
145
 
146
+ class DocumentProcessor:
147
  """Simplified document processor for Hugging Face Spaces"""
148
 
149
  def __init__(self):
 
155
 
156
  # PDF processing
157
  if PDF_PROCESSING_AVAILABLE:
158
+ if PDF_PROCESSOR == "pdfplumber":
 
159
  self.processors['pdf'] = self.extract_with_pdfplumber
160
  st.success("✅ PDF processing available (pdfplumber)")
161
+ elif PDF_PROCESSOR == "PyPDF2":
162
+ self.processors['pdf'] = self.extract_with_pypdf2
163
+ st.success("✅ PDF processing available (PyPDF2)")
164
+ else:
165
+ st.warning("⚠️ No PDF processor available")
 
 
166
 
167
  # Text files
168
  self.processors['txt'] = self.extract_text_file
 
 
 
169
 
170
  def extract_with_pdfplumber(self, file_path: str) -> str:
171
  """Extract text using pdfplumber"""
 
205
  st.error(f"Text file extraction failed: {e}")
206
  return ""
207
 
 
 
 
 
 
208
  def extract_text_from_document(self, file_path: str) -> str:
209
  """Extract text from document based on file type"""
210
  file_ext = Path(file_path).suffix.lower()
 
213
  processor = self.processors.get('pdf')
214
  elif file_ext == '.txt':
215
  processor = self.processors.get('txt')
 
 
216
  else:
217
  st.warning(f"Unsupported file type: {file_ext}")
218
  return ""
 
224
  return ""
225
 
226
  # ===============================================================================
227
+ # AI EXTRACTION CLASS
228
  # ===============================================================================
229
 
230
+ class AIExtractor:
231
+ """AI extraction for Hugging Face Spaces"""
232
 
233
  def __init__(self):
234
  self.use_transformers = self.setup_transformers()
 
236
  def setup_transformers(self):
237
  """Try to setup Hugging Face transformers for NER"""
238
  try:
239
+ from transformers import pipeline
 
 
 
240
 
241
  with st.spinner("Loading AI extraction model..."):
242
  self.ner_pipeline = pipeline(
243
  "ner",
244
+ model="dbmdz/bert-large-cased-finetuned-conll03-english",
 
245
  aggregation_strategy="simple"
246
  )
247
 
 
412
  return date_str
413
 
414
  # ===============================================================================
415
+ # VECTOR STORE CLASS
416
+ # ===============================================================================
417
+
418
+ class VectorStore:
419
+ """Simplified vector store for Hugging Face Spaces"""
420
+
421
+ def __init__(self, embedding_model: str = "all-MiniLM-L6-v2"):
422
+ self.embedding_model_name = embedding_model
423
+ self.vector_store_path = os.path.join(HF_CONFIG["data_dir"], "vectors.pkl")
424
+ self.metadata_path = os.path.join(HF_CONFIG["data_dir"], "metadata.pkl")
425
+ self.embedding_model = None
426
+ self.vectors = []
427
+ self.document_metadata = []
428
+ self.embedding_dimension = None
429
+
430
+ self.setup_embedding_model()
431
+ self.load_vector_store()
432
+
433
+ def setup_embedding_model(self):
434
+ """Initialize the sentence transformer model"""
435
+ if not SENTENCE_TRANSFORMERS_AVAILABLE:
436
+ st.warning("⚠️ Sentence Transformers not available. Vector search disabled.")
437
+ return
438
+
439
+ try:
440
+ with st.spinner(f"Loading embedding model: {self.embedding_model_name}..."):
441
+ self.embedding_model = SentenceTransformer(
442
+ self.embedding_model_name,
443
+ cache_folder=HF_CONFIG["cache_dir"]
444
+ )
445
+
446
+ # Get embedding dimension
447
+ test_embedding = self.embedding_model.encode(["test"])
448
+ self.embedding_dimension = test_embedding.shape[0]
449
+
450
+ st.success(f"✅ Embedding model loaded: {self.embedding_model_name}")
451
+
452
+ except Exception as e:
453
+ st.error(f"❌ Failed to load embedding model: {e}")
454
+ self.embedding_model = None
455
+
456
+ def load_vector_store(self):
457
+ """Load existing vector store"""
458
+ try:
459
+ if os.path.exists(self.vector_store_path) and os.path.exists(self.metadata_path):
460
+ with open(self.vector_store_path, 'rb') as f:
461
+ self.vectors = pickle.load(f)
462
+
463
+ with open(self.metadata_path, 'rb') as f:
464
+ self.document_metadata = pickle.load(f)
465
+
466
+ st.success(f"✅ Vector store loaded: {len(self.document_metadata)} documents")
467
+ else:
468
+ self.vectors = []
469
+ self.document_metadata = []
470
+ st.info("📄 New vector store initialized")
471
+
472
+ except Exception as e:
473
+ st.error(f"❌ Error loading vector store: {e}")
474
+ self.vectors = []
475
+ self.document_metadata = []
476
+
477
+ def save_vector_store(self):
478
+ """Save vector store to disk"""
479
+ try:
480
+ with open(self.vector_store_path, 'wb') as f:
481
+ pickle.dump(self.vectors, f)
482
+
483
+ with open(self.metadata_path, 'wb') as f:
484
+ pickle.dump(self.document_metadata, f)
485
+
486
+ return True
487
+ except Exception as e:
488
+ st.error(f"Error saving vector store: {e}")
489
+ return False
490
+
491
+ def create_document_text(self, invoice_data: dict, raw_text: str = "") -> str:
492
+ """Create searchable text from invoice data"""
493
+ text_parts = []
494
+
495
+ for field, value in invoice_data.items():
496
+ if value and field != 'id':
497
+ text_parts.append(f"{field}: {value}")
498
+
499
+ if raw_text:
500
+ text_parts.append(f"content: {raw_text[:300]}")
501
+
502
+ return " | ".join(text_parts)
503
+
504
+ def add_document(self, invoice_data: dict, raw_text: str = "") -> bool:
505
+ """Add a document to the vector store"""
506
+ if not self.embedding_model:
507
+ return False
508
+
509
+ try:
510
+ document_text = self.create_document_text(invoice_data, raw_text)
511
+
512
+ # Generate embedding
513
+ embedding = self.embedding_model.encode(document_text, normalize_embeddings=True)
514
+
515
+ # Create metadata
516
+ metadata = {
517
+ 'invoice_id': invoice_data.get('id', ''),
518
+ 'invoice_number': invoice_data.get('invoice_number', ''),
519
+ 'supplier_name': invoice_data.get('supplier_name', ''),
520
+ 'buyer_name': invoice_data.get('buyer_name', ''),
521
+ 'amount': invoice_data.get('amount', 0),
522
+ 'date': invoice_data.get('date', ''),
523
+ 'file_name': invoice_data.get('file_info', {}).get('file_name', ''),
524
+ 'document_text': document_text[:200],
525
+ 'timestamp': datetime.now().isoformat()
526
+ }
527
+
528
+ # Add to store
529
+ self.vectors.append(embedding)
530
+ self.document_metadata.append(metadata)
531
+
532
+ return True
533
+
534
+ except Exception as e:
535
+ st.error(f"Error adding document to vector store: {e}")
536
+ return False
537
+
538
+ def semantic_search(self, query: str, top_k: int = 5) -> List[VectorSearchResult]:
539
+ """Perform semantic search using cosine similarity"""
540
+ if not self.embedding_model or not self.vectors:
541
+ return []
542
+
543
+ try:
544
+ # Generate query embedding
545
+ query_embedding = self.embedding_model.encode(query, normalize_embeddings=True)
546
+
547
+ # Calculate similarities
548
+ similarities = []
549
+ for i, doc_embedding in enumerate(self.vectors):
550
+ similarity = np.dot(query_embedding, doc_embedding)
551
+ similarities.append((similarity, i))
552
+
553
+ # Sort by similarity
554
+ similarities.sort(reverse=True)
555
+
556
+ # Return top results
557
+ results = []
558
+ for similarity, idx in similarities[:top_k]:
559
+ if similarity > 0.1: # Relevance threshold
560
+ metadata = self.document_metadata[idx]
561
+ result = VectorSearchResult(
562
+ invoice_id=metadata.get('invoice_id', ''),
563
+ invoice_number=metadata.get('invoice_number', ''),
564
+ supplier_name=metadata.get('supplier_name', ''),
565
+ similarity_score=float(similarity),
566
+ content_preview=metadata.get('document_text', ''),
567
+ metadata=metadata
568
+ )
569
+ results.append(result)
570
+
571
+ return results
572
+
573
+ except Exception as e:
574
+ st.error(f"Error in semantic search: {e}")
575
+ return []
576
+
577
+ # ===============================================================================
578
+ # MAIN PROCESSOR CLASS
579
  # ===============================================================================
580
 
581
+ class InvoiceProcessor:
582
+ """Main invoice processor for Hugging Face Spaces"""
583
 
584
  def __init__(self):
585
  self.setup_storage()
586
+ self.document_processor = DocumentProcessor()
587
+ self.ai_extractor = AIExtractor()
588
+ self.vector_store = VectorStore() if SENTENCE_TRANSFORMERS_AVAILABLE else None
589
 
590
  # Initialize stats
591
  self.processing_stats = {
 
747
  data["metadata"]["total_invoices"] = len(invoices)
748
 
749
  # ===============================================================================
750
+ # CHATBOT CLASS
751
  # ===============================================================================
752
 
753
+ class ChatBot:
754
+ """Chatbot for invoice queries"""
755
 
756
+ def __init__(self, processor: InvoiceProcessor):
757
  self.processor = processor
758
 
759
  def query_database(self, query: str) -> str:
 
780
  elif any(phrase in query_lower for phrase in ["supplier", "vendor", "company"]):
781
  return self.handle_supplier_query(data, query)
782
 
 
783
  elif self.processor.vector_store:
784
  return self.handle_semantic_search(query)
785
 
 
965
  return response
966
 
967
  # ===============================================================================
968
+ # STREAMLIT APPLICATION
 
 
 
 
 
 
 
 
 
 
 
 
969
  # ===============================================================================
970
 
971
+ def create_app():
972
+ """Main Streamlit application"""
973
+
974
+ # Generate unique session ID for this run
975
+ if 'session_id' not in st.session_state:
976
+ st.session_state.session_id = str(uuid.uuid4())[:8]
 
 
 
 
 
 
 
 
977
 
 
978
  session_id = st.session_state.session_id
979
 
980
+ # Custom CSS
981
  st.markdown("""
982
  <style>
983
  .main-header {
 
1012
  """, unsafe_allow_html=True)
1013
 
1014
  # Initialize processor
1015
+ if 'processor' not in st.session_state:
1016
  with st.spinner("🔧 Initializing AI Invoice Processor..."):
1017
  try:
1018
+ st.session_state.processor = InvoiceProcessor()
1019
+ st.session_state.chatbot = ChatBot(st.session_state.processor)
 
 
 
1020
  st.session_state.chat_history = []
1021
  st.success("✅ System initialized successfully!")
1022
  except Exception as e:
1023
  st.error(f"❌ Initialization failed: {e}")
1024
  st.stop()
1025
 
1026
+ # Sidebar
1027
  with st.sidebar:
1028
  st.header("🎛️ System Status")
1029
 
1030
+ processor = st.session_state.processor
1031
 
1032
+ # Component status
1033
+ if processor.document_processor.processors:
1034
  st.markdown('<span class="status-ok">✅ Document Processing</span>', unsafe_allow_html=True)
1035
  else:
1036
  st.markdown('<span class="status-error">❌ Document Processing</span>', unsafe_allow_html=True)
1037
 
1038
+ if processor.ai_extractor.use_transformers:
 
1039
  st.markdown('<span class="status-ok">✅ AI Extraction</span>', unsafe_allow_html=True)
1040
  else:
1041
  st.markdown('<span class="status-warning">⚠️ Regex Extraction</span>', unsafe_allow_html=True)
1042
 
1043
+ if processor.vector_store and processor.vector_store.embedding_model:
 
1044
  st.markdown('<span class="status-ok">✅ Semantic Search</span>', unsafe_allow_html=True)
1045
  else:
1046
  st.markdown('<span class="status-warning">⚠️ Keyword Search Only</span>', unsafe_allow_html=True)
 
1054
 
1055
  st.metric("Total Invoices", total_invoices)
1056
  st.metric("Total Value", f"₹{total_amount:,.2f}")
1057
+ st.metric("Success Rate", f"{processor.processing_stats['successful']}/{processor.processing_stats['total_processed']}")
 
 
 
1058
 
1059
  except Exception as e:
1060
  st.error(f"Stats error: {e}")
1061
 
1062
+ # System info
1063
  st.header("⚙️ System Info")
1064
  st.info(f"""
1065
  **Session ID:** {session_id}
 
1112
  </div>
1113
  """, unsafe_allow_html=True)
1114
 
1115
+ # File upload
1116
  st.markdown("### 📁 Upload Your Invoices")
1117
 
 
1118
  timestamp = datetime.now().strftime("%H%M%S")
1119
 
1120
  uploaded_files = st.file_uploader(
 
1150
  with st.chat_message(message["role"]):
1151
  st.markdown(message["content"])
1152
 
1153
+ # Chat input
1154
  st.markdown("### ✍️ Ask a Question")
1155
 
1156
  col1, col2 = st.columns([4, 1])
 
1212
  st.header("📊 Analytics Dashboard")
1213
 
1214
  try:
1215
+ data = st.session_state.processor.load_json_data()
1216
  invoices = data.get("invoices", [])
1217
 
1218
  if not invoices:
 
1280
  st.header("📋 Data Explorer")
1281
 
1282
  try:
1283
+ data = st.session_state.processor.load_json_data()
1284
  invoices = data.get("invoices", [])
1285
 
1286
  if not invoices:
 
1374
  st.error(f"Data explorer error: {e}")
1375
 
1376
  # -------------------------------------------------------------------------
1377
+ # GLOBAL CHAT INPUT
1378
  # -------------------------------------------------------------------------
1379
 
1380
  st.markdown("---")
1381
  st.markdown("### 💬 Quick Chat (Works from any section)")
1382
 
 
1383
  global_query = st.chat_input("Ask about your invoices...", key=f"global_chat_{session_id}")
1384
 
1385
  if global_query:
 
1414
  st.info(f"Processing: {uploaded_file.name}")
1415
 
1416
  try:
1417
+ result = st.session_state.processor.process_uploaded_file(uploaded_file)
 
1418
 
1419
  with results_container:
1420
  if result and result.invoice_number:
 
1438
  with results_container:
1439
  st.error(f"❌ Error processing {uploaded_file.name}: {str(e)[:100]}")
1440
 
 
1441
  with status_container:
1442
  st.success(f"✅ Processing complete! {successful} successful, {failed} failed")
1443
 
 
1446
 
1447
  def handle_chat_query(query, show_response=False):
1448
  """Handle chat query"""
 
1449
  st.session_state.chat_history.append({
1450
  "role": "user",
1451
  "content": query,
1452
  "timestamp": datetime.now()
1453
  })
1454
 
 
1455
  try:
1456
  with st.spinner("🤖 AI is analyzing..."):
1457
+ response = st.session_state.chatbot.query_database(query)
1458
 
1459
  st.session_state.chat_history.append({
1460
  "role": "assistant",
 
1462
  "timestamp": datetime.now()
1463
  })
1464
 
 
1465
  if show_response:
1466
  with st.chat_message("assistant"):
1467
  st.markdown(response)
 
1479
  def main():
1480
  """Main entry point for Hugging Face Spaces"""
1481
  try:
 
 
 
 
1482
  if IS_HF_SPACE:
1483
  st.sidebar.info("🤗 Running on Hugging Face Spaces")
1484
 
1485
+ create_app()
 
 
 
 
 
 
 
 
 
 
 
 
1486
 
1487
  except Exception as e:
1488
  st.error(f"""
 
1493
  Please refresh the page or check the logs for more details.
1494
  """)
1495
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1496
  if __name__ == "__main__":
1497
  main()