Update app.py
Browse files
app.py
CHANGED
|
@@ -1,15 +1,15 @@
|
|
| 1 |
#!/usr/bin/env python3
|
| 2 |
"""
|
| 3 |
-
|
| 4 |
A comprehensive system with AI-powered extraction, semantic search, and analytics.
|
| 5 |
|
| 6 |
Author: AI Assistant
|
| 7 |
Date: 2024
|
| 8 |
-
Version: HuggingFace v1.0
|
| 9 |
"""
|
| 10 |
|
| 11 |
# ===============================================================================
|
| 12 |
-
# IMPORTS AND
|
| 13 |
# ===============================================================================
|
| 14 |
|
| 15 |
import os
|
|
@@ -25,8 +25,7 @@ from dataclasses import dataclass
|
|
| 25 |
from pathlib import Path
|
| 26 |
import time
|
| 27 |
import logging
|
| 28 |
-
|
| 29 |
-
|
| 30 |
|
| 31 |
# Check if running on Hugging Face Spaces
|
| 32 |
IS_HF_SPACE = os.getenv("SPACE_ID") is not None
|
|
@@ -39,26 +38,7 @@ import plotly.express as px
|
|
| 39 |
import plotly.graph_objects as go
|
| 40 |
import requests
|
| 41 |
|
| 42 |
-
#
|
| 43 |
-
st.set_page_config(
|
| 44 |
-
page_title="AI Invoice Processing System",
|
| 45 |
-
page_icon="📄",
|
| 46 |
-
layout="wide",
|
| 47 |
-
initial_sidebar_state="expanded",
|
| 48 |
-
menu_items={
|
| 49 |
-
'Get Help': 'https://huggingface.co/spaces',
|
| 50 |
-
'Report a bug': 'https://huggingface.co/spaces',
|
| 51 |
-
'About': """
|
| 52 |
-
# AI Invoice Processing System
|
| 53 |
-
Built for Hugging Face Spaces with AI-powered extraction and semantic search.
|
| 54 |
-
"""
|
| 55 |
-
}
|
| 56 |
-
)
|
| 57 |
-
|
| 58 |
-
|
| 59 |
-
|
| 60 |
-
|
| 61 |
-
# Vector storage and embeddings (HF compatible)
|
| 62 |
try:
|
| 63 |
import faiss
|
| 64 |
FAISS_AVAILABLE = True
|
|
@@ -80,41 +60,51 @@ except ImportError:
|
|
| 80 |
TORCH_AVAILABLE = False
|
| 81 |
|
| 82 |
# Document processing (simplified for HF)
|
| 83 |
-
try:
|
| 84 |
-
from docling.document_converter import DocumentConverter
|
| 85 |
-
from docling.datamodel.base_models import InputFormat
|
| 86 |
-
from docling.datamodel.pipeline_options import PdfPipelineOptions
|
| 87 |
-
from docling.document_converter import PdfFormatOption
|
| 88 |
-
DOCLING_AVAILABLE = True
|
| 89 |
-
except ImportError:
|
| 90 |
-
DOCLING_AVAILABLE = False
|
| 91 |
-
st.warning("⚠️ Docling not available. Using simplified document processing.")
|
| 92 |
-
|
| 93 |
-
# Alternative document processing for HF
|
| 94 |
try:
|
| 95 |
import pdfplumber
|
| 96 |
PDF_PROCESSING_AVAILABLE = True
|
|
|
|
| 97 |
except ImportError:
|
| 98 |
try:
|
| 99 |
import PyPDF2
|
| 100 |
PDF_PROCESSING_AVAILABLE = True
|
|
|
|
| 101 |
except ImportError:
|
| 102 |
PDF_PROCESSING_AVAILABLE = False
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 103 |
|
| 104 |
# ===============================================================================
|
| 105 |
-
#
|
| 106 |
# ===============================================================================
|
| 107 |
|
| 108 |
-
# Hugging Face Spaces configuration
|
| 109 |
HF_CONFIG = {
|
| 110 |
-
"max_file_size_mb": 10,
|
| 111 |
-
"max_concurrent_files": 3,
|
| 112 |
"timeout_seconds": 30,
|
| 113 |
-
"use_cpu_only": True,
|
| 114 |
-
"embedding_model": "all-MiniLM-L6-v2",
|
| 115 |
"cache_dir": "./cache",
|
| 116 |
"data_dir": "./data",
|
| 117 |
-
"enable_ollama": False,
|
| 118 |
}
|
| 119 |
|
| 120 |
# Create necessary directories
|
|
@@ -122,15 +112,12 @@ os.makedirs(HF_CONFIG["cache_dir"], exist_ok=True)
|
|
| 122 |
os.makedirs(HF_CONFIG["data_dir"], exist_ok=True)
|
| 123 |
|
| 124 |
# ===============================================================================
|
| 125 |
-
#
|
| 126 |
-
# ===============================================================================
|
| 127 |
-
# ===============================================================================
|
| 128 |
-
# SIMPLIFIED DATA STRUCTURES FOR HF
|
| 129 |
# ===============================================================================
|
| 130 |
|
| 131 |
@dataclass
|
| 132 |
class InvoiceData:
|
| 133 |
-
"""
|
| 134 |
supplier_name: str = ""
|
| 135 |
buyer_name: str = ""
|
| 136 |
invoice_number: str = ""
|
|
@@ -153,182 +140,10 @@ class VectorSearchResult:
|
|
| 153 |
metadata: Dict
|
| 154 |
|
| 155 |
# ===============================================================================
|
| 156 |
-
#
|
| 157 |
-
# ===============================================================================
|
| 158 |
-
|
| 159 |
-
class HuggingFaceVectorStore:
|
| 160 |
-
"""Simplified vector store compatible with Hugging Face Spaces"""
|
| 161 |
-
|
| 162 |
-
def __init__(self, embedding_model: str = "all-MiniLM-L6-v2"):
|
| 163 |
-
self.embedding_model_name = embedding_model
|
| 164 |
-
self.vector_store_path = os.path.join(HF_CONFIG["data_dir"], "vectors.pkl")
|
| 165 |
-
self.metadata_path = os.path.join(HF_CONFIG["data_dir"], "metadata.pkl")
|
| 166 |
-
self.embedding_model = None
|
| 167 |
-
self.vectors = []
|
| 168 |
-
self.document_metadata = []
|
| 169 |
-
self.embedding_dimension = None
|
| 170 |
-
|
| 171 |
-
self.setup_embedding_model()
|
| 172 |
-
self.load_vector_store()
|
| 173 |
-
|
| 174 |
-
def setup_embedding_model(self):
|
| 175 |
-
"""Initialize the sentence transformer model"""
|
| 176 |
-
if not SENTENCE_TRANSFORMERS_AVAILABLE:
|
| 177 |
-
st.warning("⚠️ Sentence Transformers not available. Vector search disabled.")
|
| 178 |
-
return
|
| 179 |
-
|
| 180 |
-
try:
|
| 181 |
-
with st.spinner(f"Loading embedding model: {self.embedding_model_name}..."):
|
| 182 |
-
self.embedding_model = SentenceTransformer(
|
| 183 |
-
self.embedding_model_name,
|
| 184 |
-
cache_folder=HF_CONFIG["cache_dir"]
|
| 185 |
-
)
|
| 186 |
-
|
| 187 |
-
# Get embedding dimension
|
| 188 |
-
test_embedding = self.embedding_model.encode(["test"])
|
| 189 |
-
self.embedding_dimension = test_embedding.shape[0]
|
| 190 |
-
|
| 191 |
-
st.success(f"✅ Embedding model loaded: {self.embedding_model_name}")
|
| 192 |
-
|
| 193 |
-
except Exception as e:
|
| 194 |
-
st.error(f"❌ Failed to load embedding model: {e}")
|
| 195 |
-
self.embedding_model = None
|
| 196 |
-
|
| 197 |
-
def load_vector_store(self):
|
| 198 |
-
"""Load existing vector store"""
|
| 199 |
-
try:
|
| 200 |
-
if os.path.exists(self.vector_store_path) and os.path.exists(self.metadata_path):
|
| 201 |
-
with open(self.vector_store_path, 'rb') as f:
|
| 202 |
-
self.vectors = pickle.load(f)
|
| 203 |
-
|
| 204 |
-
with open(self.metadata_path, 'rb') as f:
|
| 205 |
-
self.document_metadata = pickle.load(f)
|
| 206 |
-
|
| 207 |
-
st.success(f"✅ Vector store loaded: {len(self.document_metadata)} documents")
|
| 208 |
-
else:
|
| 209 |
-
self.vectors = []
|
| 210 |
-
self.document_metadata = []
|
| 211 |
-
st.info("📄 New vector store initialized")
|
| 212 |
-
|
| 213 |
-
except Exception as e:
|
| 214 |
-
st.error(f"❌ Error loading vector store: {e}")
|
| 215 |
-
self.vectors = []
|
| 216 |
-
self.document_metadata = []
|
| 217 |
-
|
| 218 |
-
def save_vector_store(self):
|
| 219 |
-
"""Save vector store to disk"""
|
| 220 |
-
try:
|
| 221 |
-
with open(self.vector_store_path, 'wb') as f:
|
| 222 |
-
pickle.dump(self.vectors, f)
|
| 223 |
-
|
| 224 |
-
with open(self.metadata_path, 'wb') as f:
|
| 225 |
-
pickle.dump(self.document_metadata, f)
|
| 226 |
-
|
| 227 |
-
return True
|
| 228 |
-
except Exception as e:
|
| 229 |
-
st.error(f"Error saving vector store: {e}")
|
| 230 |
-
return False
|
| 231 |
-
|
| 232 |
-
def create_document_text(self, invoice_data: dict, raw_text: str = "") -> str:
|
| 233 |
-
"""Create searchable text from invoice data"""
|
| 234 |
-
text_parts = []
|
| 235 |
-
|
| 236 |
-
for field, value in invoice_data.items():
|
| 237 |
-
if value and field != 'id':
|
| 238 |
-
text_parts.append(f"{field}: {value}")
|
| 239 |
-
|
| 240 |
-
if raw_text:
|
| 241 |
-
text_parts.append(f"content: {raw_text[:300]}")
|
| 242 |
-
|
| 243 |
-
return " | ".join(text_parts)
|
| 244 |
-
|
| 245 |
-
def add_document(self, invoice_data: dict, raw_text: str = "") -> bool:
|
| 246 |
-
"""Add a document to the vector store"""
|
| 247 |
-
if not self.embedding_model:
|
| 248 |
-
return False
|
| 249 |
-
|
| 250 |
-
try:
|
| 251 |
-
document_text = self.create_document_text(invoice_data, raw_text)
|
| 252 |
-
|
| 253 |
-
# Generate embedding
|
| 254 |
-
embedding = self.embedding_model.encode(document_text, normalize_embeddings=True)
|
| 255 |
-
|
| 256 |
-
# Create metadata
|
| 257 |
-
metadata = {
|
| 258 |
-
'invoice_id': invoice_data.get('id', ''),
|
| 259 |
-
'invoice_number': invoice_data.get('invoice_number', ''),
|
| 260 |
-
'supplier_name': invoice_data.get('supplier_name', ''),
|
| 261 |
-
'buyer_name': invoice_data.get('buyer_name', ''),
|
| 262 |
-
'amount': invoice_data.get('amount', 0),
|
| 263 |
-
'date': invoice_data.get('date', ''),
|
| 264 |
-
'file_name': invoice_data.get('file_info', {}).get('file_name', ''),
|
| 265 |
-
'document_text': document_text[:200],
|
| 266 |
-
'timestamp': datetime.now().isoformat()
|
| 267 |
-
}
|
| 268 |
-
|
| 269 |
-
# Add to store
|
| 270 |
-
self.vectors.append(embedding)
|
| 271 |
-
self.document_metadata.append(metadata)
|
| 272 |
-
|
| 273 |
-
return True
|
| 274 |
-
|
| 275 |
-
except Exception as e:
|
| 276 |
-
st.error(f"Error adding document to vector store: {e}")
|
| 277 |
-
return False
|
| 278 |
-
|
| 279 |
-
def semantic_search(self, query: str, top_k: int = 5) -> List[VectorSearchResult]:
|
| 280 |
-
"""Perform semantic search using cosine similarity"""
|
| 281 |
-
if not self.embedding_model or not self.vectors:
|
| 282 |
-
return []
|
| 283 |
-
|
| 284 |
-
try:
|
| 285 |
-
# Generate query embedding
|
| 286 |
-
query_embedding = self.embedding_model.encode(query, normalize_embeddings=True)
|
| 287 |
-
|
| 288 |
-
# Calculate similarities
|
| 289 |
-
similarities = []
|
| 290 |
-
for i, doc_embedding in enumerate(self.vectors):
|
| 291 |
-
similarity = np.dot(query_embedding, doc_embedding)
|
| 292 |
-
similarities.append((similarity, i))
|
| 293 |
-
|
| 294 |
-
# Sort by similarity
|
| 295 |
-
similarities.sort(reverse=True)
|
| 296 |
-
|
| 297 |
-
# Return top results
|
| 298 |
-
results = []
|
| 299 |
-
for similarity, idx in similarities[:top_k]:
|
| 300 |
-
if similarity > 0.1: # Relevance threshold
|
| 301 |
-
metadata = self.document_metadata[idx]
|
| 302 |
-
result = VectorSearchResult(
|
| 303 |
-
invoice_id=metadata.get('invoice_id', ''),
|
| 304 |
-
invoice_number=metadata.get('invoice_number', ''),
|
| 305 |
-
supplier_name=metadata.get('supplier_name', ''),
|
| 306 |
-
similarity_score=float(similarity),
|
| 307 |
-
content_preview=metadata.get('document_text', ''),
|
| 308 |
-
metadata=metadata
|
| 309 |
-
)
|
| 310 |
-
results.append(result)
|
| 311 |
-
|
| 312 |
-
return results
|
| 313 |
-
|
| 314 |
-
except Exception as e:
|
| 315 |
-
st.error(f"Error in semantic search: {e}")
|
| 316 |
-
return []
|
| 317 |
-
|
| 318 |
-
def get_stats(self) -> Dict:
|
| 319 |
-
"""Get vector store statistics"""
|
| 320 |
-
return {
|
| 321 |
-
'total_documents': len(self.document_metadata),
|
| 322 |
-
'embedding_dimension': self.embedding_dimension,
|
| 323 |
-
'model_name': self.embedding_model_name,
|
| 324 |
-
'vector_store_size': len(self.vectors)
|
| 325 |
-
}
|
| 326 |
-
|
| 327 |
-
# ===============================================================================
|
| 328 |
-
# SIMPLIFIED DOCUMENT PROCESSING FOR HF
|
| 329 |
# ===============================================================================
|
| 330 |
|
| 331 |
-
class
|
| 332 |
"""Simplified document processor for Hugging Face Spaces"""
|
| 333 |
|
| 334 |
def __init__(self):
|
|
@@ -340,23 +155,17 @@ class HuggingFaceDocumentProcessor:
|
|
| 340 |
|
| 341 |
# PDF processing
|
| 342 |
if PDF_PROCESSING_AVAILABLE:
|
| 343 |
-
|
| 344 |
-
import pdfplumber
|
| 345 |
self.processors['pdf'] = self.extract_with_pdfplumber
|
| 346 |
st.success("✅ PDF processing available (pdfplumber)")
|
| 347 |
-
|
| 348 |
-
|
| 349 |
-
|
| 350 |
-
|
| 351 |
-
|
| 352 |
-
except ImportError:
|
| 353 |
-
st.warning("⚠️ No PDF processor available")
|
| 354 |
|
| 355 |
# Text files
|
| 356 |
self.processors['txt'] = self.extract_text_file
|
| 357 |
-
|
| 358 |
-
# Images (basic OCR alternative)
|
| 359 |
-
self.processors['image'] = self.extract_image_text
|
| 360 |
|
| 361 |
def extract_with_pdfplumber(self, file_path: str) -> str:
|
| 362 |
"""Extract text using pdfplumber"""
|
|
@@ -396,11 +205,6 @@ class HuggingFaceDocumentProcessor:
|
|
| 396 |
st.error(f"Text file extraction failed: {e}")
|
| 397 |
return ""
|
| 398 |
|
| 399 |
-
def extract_image_text(self, file_path: str) -> str:
|
| 400 |
-
"""Basic image text extraction (placeholder for OCR)"""
|
| 401 |
-
st.warning("⚠️ OCR not available in this environment. Please use text-based documents.")
|
| 402 |
-
return ""
|
| 403 |
-
|
| 404 |
def extract_text_from_document(self, file_path: str) -> str:
|
| 405 |
"""Extract text from document based on file type"""
|
| 406 |
file_ext = Path(file_path).suffix.lower()
|
|
@@ -409,8 +213,6 @@ class HuggingFaceDocumentProcessor:
|
|
| 409 |
processor = self.processors.get('pdf')
|
| 410 |
elif file_ext == '.txt':
|
| 411 |
processor = self.processors.get('txt')
|
| 412 |
-
elif file_ext in ['.jpg', '.jpeg', '.png']:
|
| 413 |
-
processor = self.processors.get('image')
|
| 414 |
else:
|
| 415 |
st.warning(f"Unsupported file type: {file_ext}")
|
| 416 |
return ""
|
|
@@ -422,11 +224,11 @@ class HuggingFaceDocumentProcessor:
|
|
| 422 |
return ""
|
| 423 |
|
| 424 |
# ===============================================================================
|
| 425 |
-
#
|
| 426 |
# ===============================================================================
|
| 427 |
|
| 428 |
-
class
|
| 429 |
-
"""
|
| 430 |
|
| 431 |
def __init__(self):
|
| 432 |
self.use_transformers = self.setup_transformers()
|
|
@@ -434,16 +236,12 @@ class HuggingFaceAIExtractor:
|
|
| 434 |
def setup_transformers(self):
|
| 435 |
"""Try to setup Hugging Face transformers for NER"""
|
| 436 |
try:
|
| 437 |
-
from transformers import pipeline
|
| 438 |
-
|
| 439 |
-
# Use a lightweight NER model
|
| 440 |
-
model_name = "dbmdz/bert-large-cased-finetuned-conll03-english"
|
| 441 |
|
| 442 |
with st.spinner("Loading AI extraction model..."):
|
| 443 |
self.ner_pipeline = pipeline(
|
| 444 |
"ner",
|
| 445 |
-
model=
|
| 446 |
-
tokenizer=model_name,
|
| 447 |
aggregation_strategy="simple"
|
| 448 |
)
|
| 449 |
|
|
@@ -614,17 +412,180 @@ class HuggingFaceAIExtractor:
|
|
| 614 |
return date_str
|
| 615 |
|
| 616 |
# ===============================================================================
|
| 617 |
-
#
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 618 |
# ===============================================================================
|
| 619 |
|
| 620 |
-
class
|
| 621 |
-
"""Main invoice processor
|
| 622 |
|
| 623 |
def __init__(self):
|
| 624 |
self.setup_storage()
|
| 625 |
-
self.document_processor =
|
| 626 |
-
self.ai_extractor =
|
| 627 |
-
self.vector_store =
|
| 628 |
|
| 629 |
# Initialize stats
|
| 630 |
self.processing_stats = {
|
|
@@ -786,13 +747,13 @@ class HuggingFaceInvoiceProcessor:
|
|
| 786 |
data["metadata"]["total_invoices"] = len(invoices)
|
| 787 |
|
| 788 |
# ===============================================================================
|
| 789 |
-
#
|
| 790 |
# ===============================================================================
|
| 791 |
|
| 792 |
-
class
|
| 793 |
-
"""
|
| 794 |
|
| 795 |
-
def __init__(self, processor:
|
| 796 |
self.processor = processor
|
| 797 |
|
| 798 |
def query_database(self, query: str) -> str:
|
|
@@ -819,7 +780,6 @@ class HuggingFaceChatBot:
|
|
| 819 |
elif any(phrase in query_lower for phrase in ["supplier", "vendor", "company"]):
|
| 820 |
return self.handle_supplier_query(data, query)
|
| 821 |
|
| 822 |
-
|
| 823 |
elif self.processor.vector_store:
|
| 824 |
return self.handle_semantic_search(query)
|
| 825 |
|
|
@@ -1005,40 +965,19 @@ class HuggingFaceChatBot:
|
|
| 1005 |
return response
|
| 1006 |
|
| 1007 |
# ===============================================================================
|
| 1008 |
-
# STREAMLIT APPLICATION
|
| 1009 |
-
# ===============================================================================
|
| 1010 |
-
|
| 1011 |
-
# ===============================================================================
|
| 1012 |
-
# FIXED MAIN APPLICATION WITH PROPER CHAT INPUT PLACEMENT
|
| 1013 |
-
# ===============================================================================
|
| 1014 |
-
|
| 1015 |
-
# ===============================================================================
|
| 1016 |
-
# FIXED APPLICATION WITH UNIQUE WIDGET KEYS
|
| 1017 |
-
# ===============================================================================
|
| 1018 |
-
|
| 1019 |
-
# ===============================================================================
|
| 1020 |
-
# FIXED APPLICATION WITH DYNAMIC UNIQUE KEYS AND SESSION STATE
|
| 1021 |
# ===============================================================================
|
| 1022 |
|
| 1023 |
-
|
| 1024 |
-
|
| 1025 |
-
|
| 1026 |
-
|
| 1027 |
-
|
| 1028 |
-
|
| 1029 |
-
import uuid
|
| 1030 |
-
|
| 1031 |
-
# Generate unique session ID for this run
|
| 1032 |
-
if 'session_id' not in st.session_state:
|
| 1033 |
-
st.session_state.session_id = str(uuid.uuid4())[:8]
|
| 1034 |
-
|
| 1035 |
-
def create_huggingface_app():
|
| 1036 |
-
"""Main Streamlit application optimized for Hugging Face Spaces"""
|
| 1037 |
|
| 1038 |
-
# Get unique session ID
|
| 1039 |
session_id = st.session_state.session_id
|
| 1040 |
|
| 1041 |
-
# Custom CSS
|
| 1042 |
st.markdown("""
|
| 1043 |
<style>
|
| 1044 |
.main-header {
|
|
@@ -1073,40 +1012,35 @@ def create_huggingface_app():
|
|
| 1073 |
""", unsafe_allow_html=True)
|
| 1074 |
|
| 1075 |
# Initialize processor
|
| 1076 |
-
if '
|
| 1077 |
with st.spinner("🔧 Initializing AI Invoice Processor..."):
|
| 1078 |
try:
|
| 1079 |
-
|
| 1080 |
-
|
| 1081 |
-
)
|
| 1082 |
-
st.session_state.hf_processor = HuggingFaceInvoiceProcessor()
|
| 1083 |
-
st.session_state.hf_chatbot = HuggingFaceChatBot(st.session_state.hf_processor)
|
| 1084 |
st.session_state.chat_history = []
|
| 1085 |
st.success("✅ System initialized successfully!")
|
| 1086 |
except Exception as e:
|
| 1087 |
st.error(f"❌ Initialization failed: {e}")
|
| 1088 |
st.stop()
|
| 1089 |
|
| 1090 |
-
# Sidebar
|
| 1091 |
with st.sidebar:
|
| 1092 |
st.header("🎛️ System Status")
|
| 1093 |
|
| 1094 |
-
processor = st.session_state.
|
| 1095 |
|
| 1096 |
-
#
|
| 1097 |
-
if
|
| 1098 |
st.markdown('<span class="status-ok">✅ Document Processing</span>', unsafe_allow_html=True)
|
| 1099 |
else:
|
| 1100 |
st.markdown('<span class="status-error">❌ Document Processing</span>', unsafe_allow_html=True)
|
| 1101 |
|
| 1102 |
-
|
| 1103 |
-
if hasattr(processor, 'ai_extractor') and processor.ai_extractor.use_transformers:
|
| 1104 |
st.markdown('<span class="status-ok">✅ AI Extraction</span>', unsafe_allow_html=True)
|
| 1105 |
else:
|
| 1106 |
st.markdown('<span class="status-warning">⚠️ Regex Extraction</span>', unsafe_allow_html=True)
|
| 1107 |
|
| 1108 |
-
|
| 1109 |
-
if hasattr(processor, 'vector_store') and processor.vector_store and processor.vector_store.embedding_model:
|
| 1110 |
st.markdown('<span class="status-ok">✅ Semantic Search</span>', unsafe_allow_html=True)
|
| 1111 |
else:
|
| 1112 |
st.markdown('<span class="status-warning">⚠️ Keyword Search Only</span>', unsafe_allow_html=True)
|
|
@@ -1120,15 +1054,12 @@ def create_huggingface_app():
|
|
| 1120 |
|
| 1121 |
st.metric("Total Invoices", total_invoices)
|
| 1122 |
st.metric("Total Value", f"₹{total_amount:,.2f}")
|
| 1123 |
-
|
| 1124 |
-
if hasattr(processor, 'processing_stats'):
|
| 1125 |
-
success_rate = f"{processor.processing_stats['successful']}/{processor.processing_stats['total_processed']}"
|
| 1126 |
-
st.metric("Success Rate", success_rate)
|
| 1127 |
|
| 1128 |
except Exception as e:
|
| 1129 |
st.error(f"Stats error: {e}")
|
| 1130 |
|
| 1131 |
-
#
|
| 1132 |
st.header("⚙️ System Info")
|
| 1133 |
st.info(f"""
|
| 1134 |
**Session ID:** {session_id}
|
|
@@ -1181,10 +1112,9 @@ def create_huggingface_app():
|
|
| 1181 |
</div>
|
| 1182 |
""", unsafe_allow_html=True)
|
| 1183 |
|
| 1184 |
-
# File upload
|
| 1185 |
st.markdown("### 📁 Upload Your Invoices")
|
| 1186 |
|
| 1187 |
-
# Use timestamp to ensure unique keys
|
| 1188 |
timestamp = datetime.now().strftime("%H%M%S")
|
| 1189 |
|
| 1190 |
uploaded_files = st.file_uploader(
|
|
@@ -1220,7 +1150,7 @@ def create_huggingface_app():
|
|
| 1220 |
with st.chat_message(message["role"]):
|
| 1221 |
st.markdown(message["content"])
|
| 1222 |
|
| 1223 |
-
# Chat input
|
| 1224 |
st.markdown("### ✍️ Ask a Question")
|
| 1225 |
|
| 1226 |
col1, col2 = st.columns([4, 1])
|
|
@@ -1282,7 +1212,7 @@ def create_huggingface_app():
|
|
| 1282 |
st.header("📊 Analytics Dashboard")
|
| 1283 |
|
| 1284 |
try:
|
| 1285 |
-
data = st.session_state.
|
| 1286 |
invoices = data.get("invoices", [])
|
| 1287 |
|
| 1288 |
if not invoices:
|
|
@@ -1350,7 +1280,7 @@ def create_huggingface_app():
|
|
| 1350 |
st.header("📋 Data Explorer")
|
| 1351 |
|
| 1352 |
try:
|
| 1353 |
-
data = st.session_state.
|
| 1354 |
invoices = data.get("invoices", [])
|
| 1355 |
|
| 1356 |
if not invoices:
|
|
@@ -1444,13 +1374,12 @@ def create_huggingface_app():
|
|
| 1444 |
st.error(f"Data explorer error: {e}")
|
| 1445 |
|
| 1446 |
# -------------------------------------------------------------------------
|
| 1447 |
-
# GLOBAL CHAT INPUT
|
| 1448 |
# -------------------------------------------------------------------------
|
| 1449 |
|
| 1450 |
st.markdown("---")
|
| 1451 |
st.markdown("### 💬 Quick Chat (Works from any section)")
|
| 1452 |
|
| 1453 |
-
# Global chat input with unique key
|
| 1454 |
global_query = st.chat_input("Ask about your invoices...", key=f"global_chat_{session_id}")
|
| 1455 |
|
| 1456 |
if global_query:
|
|
@@ -1485,8 +1414,7 @@ def process_files(uploaded_files, session_id):
|
|
| 1485 |
st.info(f"Processing: {uploaded_file.name}")
|
| 1486 |
|
| 1487 |
try:
|
| 1488 |
-
|
| 1489 |
-
result = st.session_state.hf_processor.process_uploaded_file(uploaded_file)
|
| 1490 |
|
| 1491 |
with results_container:
|
| 1492 |
if result and result.invoice_number:
|
|
@@ -1510,7 +1438,6 @@ def process_files(uploaded_files, session_id):
|
|
| 1510 |
with results_container:
|
| 1511 |
st.error(f"❌ Error processing {uploaded_file.name}: {str(e)[:100]}")
|
| 1512 |
|
| 1513 |
-
# Final status
|
| 1514 |
with status_container:
|
| 1515 |
st.success(f"✅ Processing complete! {successful} successful, {failed} failed")
|
| 1516 |
|
|
@@ -1519,17 +1446,15 @@ def process_files(uploaded_files, session_id):
|
|
| 1519 |
|
| 1520 |
def handle_chat_query(query, show_response=False):
|
| 1521 |
"""Handle chat query"""
|
| 1522 |
-
# Add user message
|
| 1523 |
st.session_state.chat_history.append({
|
| 1524 |
"role": "user",
|
| 1525 |
"content": query,
|
| 1526 |
"timestamp": datetime.now()
|
| 1527 |
})
|
| 1528 |
|
| 1529 |
-
# Get AI response
|
| 1530 |
try:
|
| 1531 |
with st.spinner("🤖 AI is analyzing..."):
|
| 1532 |
-
response = st.session_state.
|
| 1533 |
|
| 1534 |
st.session_state.chat_history.append({
|
| 1535 |
"role": "assistant",
|
|
@@ -1537,7 +1462,6 @@ def handle_chat_query(query, show_response=False):
|
|
| 1537 |
"timestamp": datetime.now()
|
| 1538 |
})
|
| 1539 |
|
| 1540 |
-
# Show response if requested
|
| 1541 |
if show_response:
|
| 1542 |
with st.chat_message("assistant"):
|
| 1543 |
st.markdown(response)
|
|
@@ -1555,26 +1479,10 @@ def handle_chat_query(query, show_response=False):
|
|
| 1555 |
def main():
|
| 1556 |
"""Main entry point for Hugging Face Spaces"""
|
| 1557 |
try:
|
| 1558 |
-
# Import required classes
|
| 1559 |
-
from enhanced_invoice_system_part1 import IS_HF_SPACE
|
| 1560 |
-
|
| 1561 |
-
# Display environment info
|
| 1562 |
if IS_HF_SPACE:
|
| 1563 |
st.sidebar.info("🤗 Running on Hugging Face Spaces")
|
| 1564 |
|
| 1565 |
-
|
| 1566 |
-
create_huggingface_app()
|
| 1567 |
-
|
| 1568 |
-
except ImportError as e:
|
| 1569 |
-
st.error(f"""
|
| 1570 |
-
## 🚨 Import Error
|
| 1571 |
-
|
| 1572 |
-
Missing required modules: {e}
|
| 1573 |
-
|
| 1574 |
-
Please ensure all files are uploaded to your Hugging Face Space:
|
| 1575 |
-
- enhanced_invoice_system_part1.py
|
| 1576 |
-
- enhanced_invoice_system_part2.py (this file)
|
| 1577 |
-
""")
|
| 1578 |
|
| 1579 |
except Exception as e:
|
| 1580 |
st.error(f"""
|
|
@@ -1585,103 +1493,5 @@ def main():
|
|
| 1585 |
Please refresh the page or check the logs for more details.
|
| 1586 |
""")
|
| 1587 |
|
| 1588 |
-
if __name__ == "__main__":
|
| 1589 |
-
main()
|
| 1590 |
-
# ===============================================================================
|
| 1591 |
-
# MAIN APPLICATION ENTRY POINT
|
| 1592 |
-
# ===============================================================================
|
| 1593 |
-
|
| 1594 |
-
def main():
|
| 1595 |
-
"""Main entry point for Hugging Face Spaces"""
|
| 1596 |
-
try:
|
| 1597 |
-
# Display Hugging Face info if running on HF Spaces
|
| 1598 |
-
if IS_HF_SPACE:
|
| 1599 |
-
st.sidebar.info("🤗 Running on Hugging Face Spaces")
|
| 1600 |
-
|
| 1601 |
-
# Create and run the app
|
| 1602 |
-
create_huggingface_app()
|
| 1603 |
-
|
| 1604 |
-
except Exception as e:
|
| 1605 |
-
st.error(f"Application error: {e}")
|
| 1606 |
-
st.info("Please refresh the page or contact support if the error persists.")
|
| 1607 |
-
|
| 1608 |
-
if __name__ == "__main__":
|
| 1609 |
-
main()
|
| 1610 |
-
|
| 1611 |
-
# ===============================================================================
|
| 1612 |
-
# MAIN APPLICATION ENTRY POINT
|
| 1613 |
-
# ===============================================================================
|
| 1614 |
-
|
| 1615 |
-
def main():
|
| 1616 |
-
"""Main entry point for Hugging Face Spaces"""
|
| 1617 |
-
try:
|
| 1618 |
-
# Display Hugging Face info if running on HF Spaces
|
| 1619 |
-
if IS_HF_SPACE:
|
| 1620 |
-
st.sidebar.info("🤗 Running on Hugging Face Spaces")
|
| 1621 |
-
|
| 1622 |
-
# Create and run the app
|
| 1623 |
-
create_huggingface_app()
|
| 1624 |
-
|
| 1625 |
-
except Exception as e:
|
| 1626 |
-
st.error(f"Application error: {e}")
|
| 1627 |
-
st.info("Please refresh the page or contact support if the error persists.")
|
| 1628 |
-
|
| 1629 |
-
if __name__ == "__main__":
|
| 1630 |
-
main()
|
| 1631 |
-
|
| 1632 |
-
# ===============================================================================
|
| 1633 |
-
# HUGGING FACE REQUIREMENTS AND CONFIGURATION
|
| 1634 |
-
# ===============================================================================
|
| 1635 |
-
|
| 1636 |
-
def generate_hf_requirements():
|
| 1637 |
-
"""Generate requirements.txt optimized for Hugging Face Spaces"""
|
| 1638 |
-
requirements = """streamlit>=1.28.0
|
| 1639 |
-
pandas>=1.5.0
|
| 1640 |
-
numpy>=1.21.0
|
| 1641 |
-
plotly>=5.0.0
|
| 1642 |
-
sentence-transformers>=2.2.0
|
| 1643 |
-
transformers>=4.21.0
|
| 1644 |
-
torch>=1.13.0
|
| 1645 |
-
faiss-cpu>=1.7.0
|
| 1646 |
-
pdfplumber>=0.7.0
|
| 1647 |
-
requests>=2.28.0
|
| 1648 |
-
python-dateutil>=2.8.0
|
| 1649 |
-
Pillow>=9.0.0
|
| 1650 |
-
"""
|
| 1651 |
-
return requirements.strip()
|
| 1652 |
-
|
| 1653 |
-
def generate_hf_config():
|
| 1654 |
-
"""Generate app configuration for Hugging Face Spaces"""
|
| 1655 |
-
config = {
|
| 1656 |
-
"title": "AI Invoice Processing System",
|
| 1657 |
-
"emoji": "📄",
|
| 1658 |
-
"colorFrom": "blue",
|
| 1659 |
-
"colorTo": "purple",
|
| 1660 |
-
"sdk": "streamlit",
|
| 1661 |
-
"sdk_version": "1.28.0",
|
| 1662 |
-
"app_file": "app.py",
|
| 1663 |
-
"pinned": False,
|
| 1664 |
-
"python_version": "3.9"
|
| 1665 |
-
}
|
| 1666 |
-
return config
|
| 1667 |
-
|
| 1668 |
-
# ===============================================================================
|
| 1669 |
-
# MAIN APPLICATION ENTRY POINT
|
| 1670 |
-
# ===============================================================================
|
| 1671 |
-
|
| 1672 |
-
def main():
|
| 1673 |
-
"""Main entry point for Hugging Face Spaces"""
|
| 1674 |
-
try:
|
| 1675 |
-
# Display Hugging Face info if running on HF Spaces
|
| 1676 |
-
if IS_HF_SPACE:
|
| 1677 |
-
st.sidebar.info("🤗 Running on Hugging Face Spaces")
|
| 1678 |
-
|
| 1679 |
-
# Create and run the app
|
| 1680 |
-
create_huggingface_app()
|
| 1681 |
-
|
| 1682 |
-
except Exception as e:
|
| 1683 |
-
st.error(f"Application error: {e}")
|
| 1684 |
-
st.info("Please refresh the page or contact support if the error persists.")
|
| 1685 |
-
|
| 1686 |
if __name__ == "__main__":
|
| 1687 |
main()
|
|
|
|
| 1 |
#!/usr/bin/env python3
|
| 2 |
"""
|
| 3 |
+
AI Invoice Processing System - Complete Single File for Hugging Face Spaces
|
| 4 |
A comprehensive system with AI-powered extraction, semantic search, and analytics.
|
| 5 |
|
| 6 |
Author: AI Assistant
|
| 7 |
Date: 2024
|
| 8 |
+
Version: HuggingFace Single File v1.0
|
| 9 |
"""
|
| 10 |
|
| 11 |
# ===============================================================================
|
| 12 |
+
# IMPORTS AND COMPATIBILITY CHECKS
|
| 13 |
# ===============================================================================
|
| 14 |
|
| 15 |
import os
|
|
|
|
| 25 |
from pathlib import Path
|
| 26 |
import time
|
| 27 |
import logging
|
| 28 |
+
import uuid
|
|
|
|
| 29 |
|
| 30 |
# Check if running on Hugging Face Spaces
|
| 31 |
IS_HF_SPACE = os.getenv("SPACE_ID") is not None
|
|
|
|
| 38 |
import plotly.graph_objects as go
|
| 39 |
import requests
|
| 40 |
|
| 41 |
+
# Vector storage and embeddings (with fallbacks)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 42 |
try:
|
| 43 |
import faiss
|
| 44 |
FAISS_AVAILABLE = True
|
|
|
|
| 60 |
TORCH_AVAILABLE = False
|
| 61 |
|
| 62 |
# Document processing (simplified for HF)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 63 |
try:
|
| 64 |
import pdfplumber
|
| 65 |
PDF_PROCESSING_AVAILABLE = True
|
| 66 |
+
PDF_PROCESSOR = "pdfplumber"
|
| 67 |
except ImportError:
|
| 68 |
try:
|
| 69 |
import PyPDF2
|
| 70 |
PDF_PROCESSING_AVAILABLE = True
|
| 71 |
+
PDF_PROCESSOR = "PyPDF2"
|
| 72 |
except ImportError:
|
| 73 |
PDF_PROCESSING_AVAILABLE = False
|
| 74 |
+
PDF_PROCESSOR = None
|
| 75 |
+
|
| 76 |
+
# ===============================================================================
# STREAMLIT CONFIGURATION
# ===============================================================================

# Page configuration has to be the first Streamlit command the script runs.
# Collected in a dict first so the settings read as one declarative table.
_page_settings = dict(
    page_title="AI Invoice Processing System",
    page_icon="📄",
    layout="wide",
    initial_sidebar_state="expanded",
    menu_items={
        'Get Help': 'https://huggingface.co/spaces',
        'Report a bug': 'https://huggingface.co/spaces',
        'About': """
        # AI Invoice Processing System
        Built for Hugging Face Spaces with AI-powered extraction and semantic search.
        """
    },
)
st.set_page_config(**_page_settings)
|
| 94 |
|
| 95 |
# ===============================================================================
# CONFIGURATION
# ===============================================================================

# Resource limits and filesystem paths sized for the Hugging Face Spaces runtime.
HF_CONFIG = dict(
    max_file_size_mb=10,                   # per-upload size cap
    max_concurrent_files=3,                # how many files to process at once
    timeout_seconds=30,
    use_cpu_only=True,                     # CPU-only inference flag
    embedding_model="all-MiniLM-L6-v2",    # sentence-transformers model name
    cache_dir="./cache",                   # model download cache
    data_dir="./data",                     # persisted vectors and JSON data
    enable_ollama=False,                   # Ollama integration disabled
)
|
| 109 |
|
| 110 |
# Create necessary directories
|
|
|
|
| 112 |
os.makedirs(HF_CONFIG["data_dir"], exist_ok=True)
|
| 113 |
|
| 114 |
# ===============================================================================
|
| 115 |
+
# DATA STRUCTURES
|
|
|
|
|
|
|
|
|
|
| 116 |
# ===============================================================================
|
| 117 |
|
| 118 |
@dataclass
|
| 119 |
class InvoiceData:
|
| 120 |
+
"""Data structure for extracted invoice information"""
|
| 121 |
supplier_name: str = ""
|
| 122 |
buyer_name: str = ""
|
| 123 |
invoice_number: str = ""
|
|
|
|
| 140 |
metadata: Dict
|
| 141 |
|
| 142 |
# ===============================================================================
|
| 143 |
+
# DOCUMENT PROCESSING CLASSES
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 144 |
# ===============================================================================
|
| 145 |
|
| 146 |
+
class DocumentProcessor:
|
| 147 |
"""Simplified document processor for Hugging Face Spaces"""
|
| 148 |
|
| 149 |
def __init__(self):
|
|
|
|
| 155 |
|
| 156 |
# PDF processing
|
| 157 |
if PDF_PROCESSING_AVAILABLE:
|
| 158 |
+
if PDF_PROCESSOR == "pdfplumber":
|
|
|
|
| 159 |
self.processors['pdf'] = self.extract_with_pdfplumber
|
| 160 |
st.success("✅ PDF processing available (pdfplumber)")
|
| 161 |
+
elif PDF_PROCESSOR == "PyPDF2":
|
| 162 |
+
self.processors['pdf'] = self.extract_with_pypdf2
|
| 163 |
+
st.success("✅ PDF processing available (PyPDF2)")
|
| 164 |
+
else:
|
| 165 |
+
st.warning("⚠️ No PDF processor available")
|
|
|
|
|
|
|
| 166 |
|
| 167 |
# Text files
|
| 168 |
self.processors['txt'] = self.extract_text_file
|
|
|
|
|
|
|
|
|
|
| 169 |
|
| 170 |
def extract_with_pdfplumber(self, file_path: str) -> str:
|
| 171 |
"""Extract text using pdfplumber"""
|
|
|
|
| 205 |
st.error(f"Text file extraction failed: {e}")
|
| 206 |
return ""
|
| 207 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 208 |
def extract_text_from_document(self, file_path: str) -> str:
|
| 209 |
"""Extract text from document based on file type"""
|
| 210 |
file_ext = Path(file_path).suffix.lower()
|
|
|
|
| 213 |
processor = self.processors.get('pdf')
|
| 214 |
elif file_ext == '.txt':
|
| 215 |
processor = self.processors.get('txt')
|
|
|
|
|
|
|
| 216 |
else:
|
| 217 |
st.warning(f"Unsupported file type: {file_ext}")
|
| 218 |
return ""
|
|
|
|
| 224 |
return ""
|
| 225 |
|
| 226 |
# ===============================================================================
|
| 227 |
+
# AI EXTRACTION CLASS
|
| 228 |
# ===============================================================================
|
| 229 |
|
| 230 |
+
class AIExtractor:
|
| 231 |
+
"""AI extraction for Hugging Face Spaces"""
|
| 232 |
|
| 233 |
def __init__(self):
|
| 234 |
self.use_transformers = self.setup_transformers()
|
|
|
|
| 236 |
def setup_transformers(self):
|
| 237 |
"""Try to setup Hugging Face transformers for NER"""
|
| 238 |
try:
|
| 239 |
+
from transformers import pipeline
|
|
|
|
|
|
|
|
|
|
| 240 |
|
| 241 |
with st.spinner("Loading AI extraction model..."):
|
| 242 |
self.ner_pipeline = pipeline(
|
| 243 |
"ner",
|
| 244 |
+
model="dbmdz/bert-large-cased-finetuned-conll03-english",
|
|
|
|
| 245 |
aggregation_strategy="simple"
|
| 246 |
)
|
| 247 |
|
|
|
|
| 412 |
return date_str
|
| 413 |
|
| 414 |
# ===============================================================================
# VECTOR STORE CLASS
# ===============================================================================

class VectorStore:
    """Simplified vector store for Hugging Face Spaces.

    Embeddings are kept in a plain Python list (no FAISS index) and persisted
    to disk with pickle under HF_CONFIG["data_dir"]. Embeddings are normalized
    at encode time, so cosine similarity reduces to a dot product and search
    is a brute-force linear scan — adequate for the small document counts
    expected on a Space.
    """

    def __init__(self, embedding_model: str = "all-MiniLM-L6-v2"):
        """Set up storage paths, load the embedding model, and restore any saved store.

        Args:
            embedding_model: Sentence-Transformers model name to load.
        """
        self.embedding_model_name = embedding_model
        self.vector_store_path = os.path.join(HF_CONFIG["data_dir"], "vectors.pkl")
        self.metadata_path = os.path.join(HF_CONFIG["data_dir"], "metadata.pkl")
        self.embedding_model = None
        self.vectors = []              # 1-D normalized embeddings, parallel to document_metadata
        self.document_metadata = []    # one metadata dict per stored document
        self.embedding_dimension = None

        self.setup_embedding_model()
        self.load_vector_store()

    def setup_embedding_model(self):
        """Initialize the sentence transformer model; leave search disabled on failure."""
        if not SENTENCE_TRANSFORMERS_AVAILABLE:
            st.warning("⚠️ Sentence Transformers not available. Vector search disabled.")
            return

        try:
            with st.spinner(f"Loading embedding model: {self.embedding_model_name}..."):
                self.embedding_model = SentenceTransformer(
                    self.embedding_model_name,
                    cache_folder=HF_CONFIG["cache_dir"]
                )

            # Fix: ask the model for its output width directly. The previous
            # code encoded ["test"] and read .shape[0], which is the batch
            # size (always 1), not the embedding dimension.
            self.embedding_dimension = self.embedding_model.get_sentence_embedding_dimension()

            st.success(f"✅ Embedding model loaded: {self.embedding_model_name}")

        except Exception as e:
            st.error(f"❌ Failed to load embedding model: {e}")
            self.embedding_model = None

    def load_vector_store(self):
        """Load an existing vector store from disk, falling back to an empty one."""
        try:
            if os.path.exists(self.vector_store_path) and os.path.exists(self.metadata_path):
                # NOTE: pickle is acceptable here only because these files are
                # written by this app itself; never point these paths at
                # untrusted data.
                with open(self.vector_store_path, 'rb') as f:
                    self.vectors = pickle.load(f)

                with open(self.metadata_path, 'rb') as f:
                    self.document_metadata = pickle.load(f)

                st.success(f"✅ Vector store loaded: {len(self.document_metadata)} documents")
            else:
                self.vectors = []
                self.document_metadata = []
                st.info("📄 New vector store initialized")

        except Exception as e:
            st.error(f"❌ Error loading vector store: {e}")
            self.vectors = []
            self.document_metadata = []

    def save_vector_store(self):
        """Persist vectors and metadata to disk.

        Returns:
            bool: True on success, False if either file could not be written.
        """
        try:
            with open(self.vector_store_path, 'wb') as f:
                pickle.dump(self.vectors, f)

            with open(self.metadata_path, 'wb') as f:
                pickle.dump(self.document_metadata, f)

            return True
        except Exception as e:
            st.error(f"Error saving vector store: {e}")
            return False

    def create_document_text(self, invoice_data: dict, raw_text: str = "") -> str:
        """Create searchable text from invoice data.

        Joins every truthy field (except 'id') as "field: value" pairs, plus
        the first 300 characters of the raw document text, with " | ".
        """
        text_parts = []

        for field, value in invoice_data.items():
            if value and field != 'id':
                text_parts.append(f"{field}: {value}")

        if raw_text:
            text_parts.append(f"content: {raw_text[:300]}")

        return " | ".join(text_parts)

    def add_document(self, invoice_data: dict, raw_text: str = "") -> bool:
        """Embed one invoice and append it to the in-memory store.

        The caller is responsible for calling save_vector_store() afterwards
        to persist the addition.

        Returns:
            bool: True if the document was added, False otherwise (including
            when no embedding model is loaded).
        """
        if not self.embedding_model:
            return False

        try:
            document_text = self.create_document_text(invoice_data, raw_text)

            # Normalized embeddings let semantic_search use a plain dot product.
            embedding = self.embedding_model.encode(document_text, normalize_embeddings=True)

            metadata = {
                'invoice_id': invoice_data.get('id', ''),
                'invoice_number': invoice_data.get('invoice_number', ''),
                'supplier_name': invoice_data.get('supplier_name', ''),
                'buyer_name': invoice_data.get('buyer_name', ''),
                'amount': invoice_data.get('amount', 0),
                'date': invoice_data.get('date', ''),
                'file_name': invoice_data.get('file_info', {}).get('file_name', ''),
                'document_text': document_text[:200],  # preview only, keeps metadata small
                'timestamp': datetime.now().isoformat()
            }

            self.vectors.append(embedding)
            self.document_metadata.append(metadata)

            return True

        except Exception as e:
            st.error(f"Error adding document to vector store: {e}")
            return False

    def semantic_search(self, query: str, top_k: int = 5) -> "List[VectorSearchResult]":
        """Perform semantic search using cosine similarity.

        Args:
            query: Free-text search query.
            top_k: Maximum number of results to return.

        Returns:
            Results with similarity above 0.1, best first; empty list when the
            model is unavailable or the store is empty.
        """
        if not self.embedding_model or not self.vectors:
            return []

        try:
            query_embedding = self.embedding_model.encode(query, normalize_embeddings=True)

            # Brute-force scan: dot product of normalized vectors == cosine similarity.
            similarities = []
            for i, doc_embedding in enumerate(self.vectors):
                similarity = np.dot(query_embedding, doc_embedding)
                similarities.append((similarity, i))

            similarities.sort(reverse=True)

            results = []
            for similarity, idx in similarities[:top_k]:
                if similarity > 0.1:  # relevance threshold: drop near-orthogonal matches
                    metadata = self.document_metadata[idx]
                    result = VectorSearchResult(
                        invoice_id=metadata.get('invoice_id', ''),
                        invoice_number=metadata.get('invoice_number', ''),
                        supplier_name=metadata.get('supplier_name', ''),
                        similarity_score=float(similarity),
                        content_preview=metadata.get('document_text', ''),
                        metadata=metadata
                    )
                    results.append(result)

            return results

        except Exception as e:
            st.error(f"Error in semantic search: {e}")
            return []
|
| 576 |
+
|
| 577 |
+
# ===============================================================================
|
| 578 |
+
# MAIN PROCESSOR CLASS
|
| 579 |
# ===============================================================================
|
| 580 |
|
| 581 |
+
class InvoiceProcessor:
|
| 582 |
+
"""Main invoice processor for Hugging Face Spaces"""
|
| 583 |
|
| 584 |
def __init__(self):
|
| 585 |
self.setup_storage()
|
| 586 |
+
self.document_processor = DocumentProcessor()
|
| 587 |
+
self.ai_extractor = AIExtractor()
|
| 588 |
+
self.vector_store = VectorStore() if SENTENCE_TRANSFORMERS_AVAILABLE else None
|
| 589 |
|
| 590 |
# Initialize stats
|
| 591 |
self.processing_stats = {
|
|
|
|
| 747 |
data["metadata"]["total_invoices"] = len(invoices)
|
| 748 |
|
| 749 |
# ===============================================================================
|
| 750 |
+
# CHATBOT CLASS
|
| 751 |
# ===============================================================================
|
| 752 |
|
| 753 |
+
class ChatBot:
|
| 754 |
+
"""Chatbot for invoice queries"""
|
| 755 |
|
| 756 |
+
def __init__(self, processor: InvoiceProcessor):
|
| 757 |
self.processor = processor
|
| 758 |
|
| 759 |
def query_database(self, query: str) -> str:
|
|
|
|
| 780 |
elif any(phrase in query_lower for phrase in ["supplier", "vendor", "company"]):
|
| 781 |
return self.handle_supplier_query(data, query)
|
| 782 |
|
|
|
|
| 783 |
elif self.processor.vector_store:
|
| 784 |
return self.handle_semantic_search(query)
|
| 785 |
|
|
|
|
| 965 |
return response
|
| 966 |
|
| 967 |
# ===============================================================================
|
| 968 |
+
# STREAMLIT APPLICATION
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 969 |
# ===============================================================================
|
| 970 |
|
| 971 |
+
def create_app():
|
| 972 |
+
"""Main Streamlit application"""
|
| 973 |
+
|
| 974 |
+
# Generate unique session ID for this run
|
| 975 |
+
if 'session_id' not in st.session_state:
|
| 976 |
+
st.session_state.session_id = str(uuid.uuid4())[:8]
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 977 |
|
|
|
|
| 978 |
session_id = st.session_state.session_id
|
| 979 |
|
| 980 |
+
# Custom CSS
|
| 981 |
st.markdown("""
|
| 982 |
<style>
|
| 983 |
.main-header {
|
|
|
|
| 1012 |
""", unsafe_allow_html=True)
|
| 1013 |
|
| 1014 |
# Initialize processor
|
| 1015 |
+
if 'processor' not in st.session_state:
|
| 1016 |
with st.spinner("🔧 Initializing AI Invoice Processor..."):
|
| 1017 |
try:
|
| 1018 |
+
st.session_state.processor = InvoiceProcessor()
|
| 1019 |
+
st.session_state.chatbot = ChatBot(st.session_state.processor)
|
|
|
|
|
|
|
|
|
|
| 1020 |
st.session_state.chat_history = []
|
| 1021 |
st.success("✅ System initialized successfully!")
|
| 1022 |
except Exception as e:
|
| 1023 |
st.error(f"❌ Initialization failed: {e}")
|
| 1024 |
st.stop()
|
| 1025 |
|
| 1026 |
+
# Sidebar
|
| 1027 |
with st.sidebar:
|
| 1028 |
st.header("🎛️ System Status")
|
| 1029 |
|
| 1030 |
+
processor = st.session_state.processor
|
| 1031 |
|
| 1032 |
+
# Component status
|
| 1033 |
+
if processor.document_processor.processors:
|
| 1034 |
st.markdown('<span class="status-ok">✅ Document Processing</span>', unsafe_allow_html=True)
|
| 1035 |
else:
|
| 1036 |
st.markdown('<span class="status-error">❌ Document Processing</span>', unsafe_allow_html=True)
|
| 1037 |
|
| 1038 |
+
if processor.ai_extractor.use_transformers:
|
|
|
|
| 1039 |
st.markdown('<span class="status-ok">✅ AI Extraction</span>', unsafe_allow_html=True)
|
| 1040 |
else:
|
| 1041 |
st.markdown('<span class="status-warning">⚠️ Regex Extraction</span>', unsafe_allow_html=True)
|
| 1042 |
|
| 1043 |
+
if processor.vector_store and processor.vector_store.embedding_model:
|
|
|
|
| 1044 |
st.markdown('<span class="status-ok">✅ Semantic Search</span>', unsafe_allow_html=True)
|
| 1045 |
else:
|
| 1046 |
st.markdown('<span class="status-warning">⚠️ Keyword Search Only</span>', unsafe_allow_html=True)
|
|
|
|
| 1054 |
|
| 1055 |
st.metric("Total Invoices", total_invoices)
|
| 1056 |
st.metric("Total Value", f"₹{total_amount:,.2f}")
|
| 1057 |
+
st.metric("Success Rate", f"{processor.processing_stats['successful']}/{processor.processing_stats['total_processed']}")
|
|
|
|
|
|
|
|
|
|
| 1058 |
|
| 1059 |
except Exception as e:
|
| 1060 |
st.error(f"Stats error: {e}")
|
| 1061 |
|
| 1062 |
+
# System info
|
| 1063 |
st.header("⚙️ System Info")
|
| 1064 |
st.info(f"""
|
| 1065 |
**Session ID:** {session_id}
|
|
|
|
| 1112 |
</div>
|
| 1113 |
""", unsafe_allow_html=True)
|
| 1114 |
|
| 1115 |
+
# File upload
|
| 1116 |
st.markdown("### 📁 Upload Your Invoices")
|
| 1117 |
|
|
|
|
| 1118 |
timestamp = datetime.now().strftime("%H%M%S")
|
| 1119 |
|
| 1120 |
uploaded_files = st.file_uploader(
|
|
|
|
| 1150 |
with st.chat_message(message["role"]):
|
| 1151 |
st.markdown(message["content"])
|
| 1152 |
|
| 1153 |
+
# Chat input
|
| 1154 |
st.markdown("### ✍️ Ask a Question")
|
| 1155 |
|
| 1156 |
col1, col2 = st.columns([4, 1])
|
|
|
|
| 1212 |
st.header("📊 Analytics Dashboard")
|
| 1213 |
|
| 1214 |
try:
|
| 1215 |
+
data = st.session_state.processor.load_json_data()
|
| 1216 |
invoices = data.get("invoices", [])
|
| 1217 |
|
| 1218 |
if not invoices:
|
|
|
|
| 1280 |
st.header("📋 Data Explorer")
|
| 1281 |
|
| 1282 |
try:
|
| 1283 |
+
data = st.session_state.processor.load_json_data()
|
| 1284 |
invoices = data.get("invoices", [])
|
| 1285 |
|
| 1286 |
if not invoices:
|
|
|
|
| 1374 |
st.error(f"Data explorer error: {e}")
|
| 1375 |
|
| 1376 |
# -------------------------------------------------------------------------
|
| 1377 |
+
# GLOBAL CHAT INPUT
|
| 1378 |
# -------------------------------------------------------------------------
|
| 1379 |
|
| 1380 |
st.markdown("---")
|
| 1381 |
st.markdown("### 💬 Quick Chat (Works from any section)")
|
| 1382 |
|
|
|
|
| 1383 |
global_query = st.chat_input("Ask about your invoices...", key=f"global_chat_{session_id}")
|
| 1384 |
|
| 1385 |
if global_query:
|
|
|
|
| 1414 |
st.info(f"Processing: {uploaded_file.name}")
|
| 1415 |
|
| 1416 |
try:
|
| 1417 |
+
result = st.session_state.processor.process_uploaded_file(uploaded_file)
|
|
|
|
| 1418 |
|
| 1419 |
with results_container:
|
| 1420 |
if result and result.invoice_number:
|
|
|
|
| 1438 |
with results_container:
|
| 1439 |
st.error(f"❌ Error processing {uploaded_file.name}: {str(e)[:100]}")
|
| 1440 |
|
|
|
|
| 1441 |
with status_container:
|
| 1442 |
st.success(f"✅ Processing complete! {successful} successful, {failed} failed")
|
| 1443 |
|
|
|
|
| 1446 |
|
| 1447 |
def handle_chat_query(query, show_response=False):
|
| 1448 |
"""Handle chat query"""
|
|
|
|
| 1449 |
st.session_state.chat_history.append({
|
| 1450 |
"role": "user",
|
| 1451 |
"content": query,
|
| 1452 |
"timestamp": datetime.now()
|
| 1453 |
})
|
| 1454 |
|
|
|
|
| 1455 |
try:
|
| 1456 |
with st.spinner("🤖 AI is analyzing..."):
|
| 1457 |
+
response = st.session_state.chatbot.query_database(query)
|
| 1458 |
|
| 1459 |
st.session_state.chat_history.append({
|
| 1460 |
"role": "assistant",
|
|
|
|
| 1462 |
"timestamp": datetime.now()
|
| 1463 |
})
|
| 1464 |
|
|
|
|
| 1465 |
if show_response:
|
| 1466 |
with st.chat_message("assistant"):
|
| 1467 |
st.markdown(response)
|
|
|
|
| 1479 |
def main():
|
| 1480 |
"""Main entry point for Hugging Face Spaces"""
|
| 1481 |
try:
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1482 |
if IS_HF_SPACE:
|
| 1483 |
st.sidebar.info("🤗 Running on Hugging Face Spaces")
|
| 1484 |
|
| 1485 |
+
create_app()
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1486 |
|
| 1487 |
except Exception as e:
|
| 1488 |
st.error(f"""
|
|
|
|
| 1493 |
Please refresh the page or check the logs for more details.
|
| 1494 |
""")
|
| 1495 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1496 |
if __name__ == "__main__":
|
| 1497 |
main()
|