|
|
|
|
|
|
|
|
"""
|
|
|
Main invoice processing pipeline
|
|
|
Orchestrates preprocessing, OCR, and extraction
|
|
|
"""
|
|
|
|
|
|
from typing import Dict, Any, Optional
|
|
|
from pathlib import Path
|
|
|
import json
|
|
|
import threading
|
|
|
from pydantic import ValidationError
|
|
|
import cv2
|
|
|
|
|
|
|
|
|
from src.preprocessing import load_image, convert_to_grayscale, remove_noise
|
|
|
from src.extraction import structure_output
|
|
|
from src.ml_extraction import extract_ml_based
|
|
|
from src.schema import InvoiceData
|
|
|
from src.pdf_utils import extract_text_from_pdf, convert_pdf_to_images
|
|
|
from src.utils import generate_semantic_hash
|
|
|
from src.repository import InvoiceRepository
|
|
|
from src.database import DB_CONNECTED
|
|
|
|
|
|
def process_invoice(image_path: str,
|
|
|
method: str = 'ml',
|
|
|
save_results: bool = False,
|
|
|
output_dir: str = 'outputs') -> Dict[str, Any]:
|
|
|
"""
|
|
|
Process an invoice image using either rule-based or ML-based extraction.
|
|
|
|
|
|
Args:
|
|
|
image_path: Path to the invoice image.
|
|
|
method: The extraction method to use ('ml' or 'rules'). Default is 'ml'.
|
|
|
save_results: Whether to save JSON results to a file.
|
|
|
output_dir: Directory to save results.
|
|
|
|
|
|
Returns:
|
|
|
A dictionary with the extracted invoice data.
|
|
|
"""
|
|
|
|
|
|
if not Path(image_path).exists():
|
|
|
raise FileNotFoundError(f"Image/PDF not found at path: {image_path}")
|
|
|
|
|
|
print(f"Processing: {image_path}")
|
|
|
|
|
|
raw_result = {}
|
|
|
is_digital_pdf = False
|
|
|
|
|
|
|
|
|
if image_path.lower().endswith('.pdf'):
|
|
|
print("📄 PDF detected. Checking type...")
|
|
|
try:
|
|
|
|
|
|
digital_text = extract_text_from_pdf(image_path)
|
|
|
|
|
|
|
|
|
if len(digital_text.strip()) > 50:
|
|
|
print(" ✅ Digital Text found. Using Rule-Based Engine (Fast Mode).")
|
|
|
|
|
|
raw_result = structure_output(digital_text)
|
|
|
is_digital_pdf = True
|
|
|
method = 'rules (digital)'
|
|
|
else:
|
|
|
print(" ⚠️ Sparse text detected. Treating as Scanned PDF.")
|
|
|
|
|
|
print(" 🔄 Converting Page 1 to Image...")
|
|
|
images = convert_pdf_to_images(image_path)
|
|
|
|
|
|
|
|
|
|
|
|
temp_jpg = image_path.replace('.pdf', '.jpg')
|
|
|
cv2.imwrite(temp_jpg, images[0])
|
|
|
|
|
|
|
|
|
image_path = temp_jpg
|
|
|
print(f" ➡️ Continuing with converted image: {image_path}")
|
|
|
|
|
|
except Exception as e:
|
|
|
print(f" ❌ PDF Error: {e}. Falling back to standard processing.")
|
|
|
|
|
|
|
|
|
|
|
|
if not is_digital_pdf:
|
|
|
print(f"⚙️ Using '{method}' method on image...")
|
|
|
|
|
|
if method == 'ml':
|
|
|
try:
|
|
|
raw_result = extract_ml_based(image_path)
|
|
|
except Exception as e:
|
|
|
raise ValueError(f"Error during ML-based extraction: {e}")
|
|
|
|
|
|
elif method == 'rules':
|
|
|
try:
|
|
|
print("⚠️ Rule-based mode is deprecated. Redirecting to ML-based extraction.")
|
|
|
raw_result = extract_ml_based(image_path)
|
|
|
except Exception as e:
|
|
|
raise ValueError(f"Error during ML-based extraction: {e}")
|
|
|
|
|
|
|
|
|
if image_path.endswith('.jpg') and 'sample_pdf' in image_path:
|
|
|
|
|
|
pass
|
|
|
|
|
|
|
|
|
final_data = raw_result
|
|
|
|
|
|
if method == 'ml':
|
|
|
try:
|
|
|
invoice = InvoiceData(**raw_result)
|
|
|
final_data = invoice.model_dump(mode='json')
|
|
|
final_data['validation_status'] = 'passed'
|
|
|
print("✅ Data Validation Passed")
|
|
|
except ValidationError as e:
|
|
|
print(f"❌ Data Validation Failed: {len(e.errors())} errors")
|
|
|
|
|
|
|
|
|
|
|
|
final_data = raw_result.copy()
|
|
|
final_data['validation_status'] = 'failed'
|
|
|
|
|
|
|
|
|
error_list = []
|
|
|
for err in e.errors():
|
|
|
field = " -> ".join(str(loc) for loc in err['loc'])
|
|
|
msg = err['msg']
|
|
|
print(f" - {field}: {msg}")
|
|
|
error_list.append(f"{field}: {msg}")
|
|
|
|
|
|
final_data['validation_errors'] = error_list
|
|
|
|
|
|
|
|
|
if 'raw_predictions' in raw_result:
|
|
|
final_data['raw_predictions'] = raw_result['raw_predictions']
|
|
|
if 'raw_text' in raw_result:
|
|
|
final_data['raw_text'] = raw_result['raw_text']
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
final_data['semantic_hash'] = generate_semantic_hash(final_data)
|
|
|
|
|
|
|
|
|
def background_db_operation(data_to_save):
|
|
|
"""Check for duplicate and save in background thread"""
|
|
|
try:
|
|
|
repo = InvoiceRepository()
|
|
|
if repo.session:
|
|
|
|
|
|
existing = repo.get_by_hash(data_to_save.get('semantic_hash', ''))
|
|
|
if existing:
|
|
|
print(f" ⚠️ [Background] Duplicate: {data_to_save.get('receipt_number')}")
|
|
|
else:
|
|
|
|
|
|
saved = repo.save_invoice(data_to_save)
|
|
|
if saved:
|
|
|
print(f" ✅ [Background] Saved: {data_to_save.get('receipt_number')}")
|
|
|
else:
|
|
|
print(f" ⚠️ [Background] Save failed: {data_to_save.get('receipt_number')}")
|
|
|
except Exception as e:
|
|
|
print(f" ⚠️ [Background] DB Error: {e}")
|
|
|
|
|
|
if DB_CONNECTED:
|
|
|
|
|
|
save_thread = threading.Thread(target=background_db_operation, args=(final_data.copy(),))
|
|
|
save_thread.start()
|
|
|
final_data['_db_status'] = 'queued'
|
|
|
else:
|
|
|
final_data['_db_status'] = 'disabled'
|
|
|
|
|
|
|
|
|
if save_results:
|
|
|
output_path = Path(output_dir)
|
|
|
output_path.mkdir(parents=True, exist_ok=True)
|
|
|
|
|
|
|
|
|
|
|
|
json_path = output_path / (Path(image_path).stem + f"_{method}.json")
|
|
|
try:
|
|
|
with open(json_path, 'w', encoding='utf-8') as f:
|
|
|
|
|
|
json.dump(final_data, f, indent=2, ensure_ascii=False, default=str)
|
|
|
except Exception as e:
|
|
|
raise IOError(f"Error saving results to {json_path}: {e}")
|
|
|
|
|
|
return final_data
|
|
|
|
|
|
|
|
|
def process_batch(image_folder: str, output_dir: str = 'outputs') -> list:
|
|
|
"""Process multiple invoices in a folder"""
|
|
|
results = []
|
|
|
|
|
|
supported_extensions = ['*.jpg', '*.png', '*.jpeg']
|
|
|
|
|
|
for ext in supported_extensions:
|
|
|
for img_file in Path(image_folder).glob(ext):
|
|
|
print(f"🔄 Processing: {img_file}")
|
|
|
try:
|
|
|
result = process_invoice(str(img_file), save_results=True, output_dir=output_dir)
|
|
|
results.append(result)
|
|
|
except Exception as e:
|
|
|
print(f"❌ Error processing {img_file}: {e}")
|
|
|
|
|
|
print(f"\n🎉 Batch processing complete! {len(results)} invoices processed.")
|
|
|
return results
|
|
|
|
|
|
|
|
|
def main():
|
|
|
"""Command-line interface for invoice processing"""
|
|
|
import argparse
|
|
|
|
|
|
parser = argparse.ArgumentParser(
|
|
|
description='Process invoice images or folders and extract structured data.',
|
|
|
formatter_class=argparse.RawDescriptionHelpFormatter,
|
|
|
epilog="""
|
|
|
Examples:
|
|
|
# Process a single invoice
|
|
|
python src/pipeline.py data/raw/receipt1.jpg
|
|
|
|
|
|
# Process and save a single invoice
|
|
|
python src/pipeline.py data/raw/receipt1.jpg --save
|
|
|
|
|
|
# Process an entire folder of invoices
|
|
|
python src/pipeline.py data/raw --save --output results/
|
|
|
"""
|
|
|
)
|
|
|
|
|
|
|
|
|
parser.add_argument('path', help='Path to an invoice image or a folder of images')
|
|
|
parser.add_argument('--save', action='store_true', help='Save results to JSON files')
|
|
|
parser.add_argument('--output', default='outputs', help='Output directory for JSON files')
|
|
|
parser.add_argument('--method', default='ml', choices=['ml', 'rules'], help="Extraction method: 'ml' or 'rules'")
|
|
|
|
|
|
args = parser.parse_args()
|
|
|
|
|
|
try:
|
|
|
|
|
|
if Path(args.path).is_dir():
|
|
|
process_batch(args.path, output_dir=args.output)
|
|
|
elif Path(args.path).is_file():
|
|
|
|
|
|
print(f"🔄 Processing: {args.path}")
|
|
|
result = process_invoice(args.path, method=args.method, save_results=args.save, output_dir=args.output)
|
|
|
|
|
|
print("\n📊 Extracted Data:")
|
|
|
print("=" * 60)
|
|
|
print(f"Vendor: {result.get('vendor', 'N/A')}")
|
|
|
print(f"Invoice Number: {result.get('invoice_number', 'N/A')}")
|
|
|
print(f"Date: {result.get('date', 'N/A')}")
|
|
|
print(f"Total: ${result.get('total_amount', 0.0)}")
|
|
|
print("=" * 60)
|
|
|
|
|
|
if args.save:
|
|
|
print(f"\n💾 JSON saved to: {args.output}/{Path(args.path).stem}.json")
|
|
|
else:
|
|
|
raise FileNotFoundError(f"Path does not exist: {args.path}")
|
|
|
|
|
|
except Exception as e:
|
|
|
print(f"❌ An error occurred: {e}")
|
|
|
return 1
|
|
|
|
|
|
return 0
|
|
|
|
|
|
|
|
|
if __name__ == '__main__':
|
|
|
import sys
|
|
|
sys.exit(main()) |