GSoumyajit2005 committed on
Commit
d79b7f7
·
1 Parent(s): 22fe020

feat: Add Phase 3 generalization scripts and clean up legacy files

Browse files
.gitignore CHANGED
@@ -1,70 +1,73 @@
1
- # Python
2
- __pycache__/
3
- *.pyc
4
- *.pyo
5
- *.pyd
6
-
7
- # Environment
8
- env/
9
- venv/
10
- .env
11
- config.yaml
12
- credentials.json
13
-
14
- # IDE / Editor
15
- .vscode/
16
- .idea/
17
- *.swp
18
- *.swo
19
-
20
- # Notebooks / caches / logs
21
- .ipynb_checkpoints/
22
- .pytest_cache/
23
- *.log
24
- logs/
25
- .cache/
26
-
27
- # OS
28
- .DS_Store
29
- Thumbs.db
30
- ehthumbs.db
31
- *.code-workspace
32
- Desktop.ini
33
-
34
- # Streamlit temp folder
35
- temp/
36
- .streamlit/
37
-
38
- # Jupyter Notebook
39
- .ipynb_checkpoints
40
-
41
- # JSON outputs
42
- outputs/
43
-
44
- # Logs
45
- logs/
46
- *.log
47
-
48
- # --- Data Folders ---
49
- # Ignore all files inside the raw and processed data folders
50
- data/raw/*
51
- data/processed/*
52
-
53
- # But DO NOT ignore the .gitkeep files inside them
54
- !data/raw/.gitkeep
55
- !data/processed/.gitkeep
56
-
57
- !requirements.txt
58
- !README.md
59
-
60
- datasets/
61
- checkpoints/
62
- lightning_logs/
63
- wandb/
64
- mlruns/
65
-
66
-
67
- # Ignore all files in the models directory
68
- models/*
69
- !models/.gitkeep
70
- !models/README.md
 
 
 
 
1
+ # Python
2
+ __pycache__/
3
+ *.pyc
4
+ *.pyo
5
+ *.pyd
6
+
7
+ # Environment
8
+ env/
9
+ venv/
10
+ .env
11
+ config.yaml
12
+ credentials.json
13
+
14
+ # IDE / Editor
15
+ .vscode/
16
+ .idea/
17
+ *.swp
18
+ *.swo
19
+
20
+ # Notebooks / caches / logs
21
+ .ipynb_checkpoints/
22
+ .pytest_cache/
23
+ *.log
24
+ logs/
25
+ .cache/
26
+
27
+ # OS
28
+ .DS_Store
29
+ Thumbs.db
30
+ ehthumbs.db
31
+ *.code-workspace
32
+ Desktop.ini
33
+
34
+ # Streamlit temp folder
35
+ temp/
36
+ .streamlit/
37
+
38
+ # Jupyter Notebook
39
+ .ipynb_checkpoints
40
+
41
+ # JSON outputs
42
+ outputs/
43
+
44
+ # Logs
45
+ logs/
46
+ *.log
47
+
48
+ # --- Data Folders ---
49
+ # Ignore all files inside the raw and processed data folders
50
+ data/raw/*
51
+ data/processed/*
52
+
53
+ # But DO NOT ignore the .gitkeep files inside them
54
+ !data/raw/.gitkeep
55
+ !data/processed/.gitkeep
56
+
57
+ !requirements.txt
58
+ !README.md
59
+
60
+ datasets/
61
+ checkpoints/
62
+ lightning_logs/
63
+ wandb/
64
+ mlruns/
65
+
66
+
67
+ # Ignore all files in the models directory
68
+ models/*
69
+ !models/.gitkeep
70
+ !models/README.md
71
+
72
+ # Ignore sroie files in the data directory
73
+ data/sroie/
LICENSE CHANGED
@@ -1,21 +1,21 @@
1
- MIT License
2
-
3
- Copyright (c) 2025 Soumyajit Ghosh
4
-
5
- Permission is hereby granted, free of charge, to any person obtaining a copy
6
- of this software and associated documentation files (the "Software"), to deal
7
- in the Software without restriction, including without limitation the rights
8
- to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9
- copies of the Software, and to permit persons to whom the Software is
10
- furnished to do so, subject to the following conditions:
11
-
12
- The above copyright notice and this permission notice shall be included in all
13
- copies or substantial portions of the Software.
14
-
15
- THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16
- IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17
- FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18
- AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19
- LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20
- OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21
- SOFTWARE.
 
1
+ MIT License
2
+
3
+ Copyright (c) 2025 Soumyajit Ghosh
4
+
5
+ Permission is hereby granted, free of charge, to any person obtaining a copy
6
+ of this software and associated documentation files (the "Software"), to deal
7
+ in the Software without restriction, including without limitation the rights
8
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9
+ copies of the Software, and to permit persons to whom the Software is
10
+ furnished to do so, subject to the following conditions:
11
+
12
+ The above copyright notice and this permission notice shall be included in all
13
+ copies or substantial portions of the Software.
14
+
15
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21
+ SOFTWARE.
README.md CHANGED
@@ -1,323 +1,323 @@
1
- # 📄 Smart Invoice Processor
2
-
3
- End-to-end invoice/receipt processing with OCR + Rule-based extraction and a fine‑tuned LayoutLMv3 model. Upload an image or run via CLI to get clean, structured JSON (vendor, date, totals, address, etc.).
4
-
5
- ![Python](https://img.shields.io/badge/Python-3.10+-blue.svg)
6
- ![Streamlit](https://img.shields.io/badge/Streamlit-1.51+-red.svg)
7
- ![Tesseract](https://img.shields.io/badge/Tesseract-5.0+-green.svg)
8
- ![Transformers](https://img.shields.io/badge/Transformers-4.x-purple.svg)
9
- ![PyTorch](https://img.shields.io/badge/PyTorch-2.x-orange.svg)
10
-
11
- ---
12
-
13
- ## 🎯 Features
14
-
15
- - ✅ OCR using Tesseract (configurable, fast, multi-platform)
16
- - ✅ Rule-based extraction (regex baselines)
17
- - ✅ ML-based extraction (LayoutLMv3 fine‑tuned on SROIE) for robust field detection
18
- - ✅ Clean JSON output (date, total, vendor, address, receipt number*)
19
- - ✅ Confidence and simple validation (e.g., total found among amounts)
20
- - ✅ Streamlit web UI with method toggle (ML vs Regex)
21
- - ✅ CLI for single/batch processing with saving to JSON
22
- - ✅ Tests for preprocessing/OCR/pipeline
23
-
24
- > Note: SROIE does not include invoice/receipt number labels; the ML model won’t output it unless you add labeled data. The rule-based extractor can still provide it when formats allow.
25
-
26
- ---
27
-
28
- ## 📊 Demo
29
-
30
- ### Web Interface
31
- ![Homepage](docs/screenshots/homepage.png)
32
- *Clean upload → extract flow with method selector (ML vs Regex).*
33
-
34
- ### Successful Extraction (ML-based)
35
- ![Success Result](docs/screenshots/success_result.png)
36
- *Fields extracted with LayoutLMv3.*
37
-
38
- ### Format Detection (simulated)
39
- ![Format Detection](docs/screenshots/format_detection.png)
40
- *UI shows simple format hints and confidence.*
41
-
42
- ### Example JSON (Rule-based)
43
- ```json
44
- {
45
- "receipt_number": "PEGIV-1030765",
46
- "date": "15/01/2019",
47
- "bill_to": {
48
- "name": "THE PEAK QUARRY WORKS",
49
- "email": null
50
- },
51
- "items": [],
52
- "total_amount": 193.0,
53
- "extraction_confidence": 100,
54
- "validation_passed": true,
55
- "vendor": "OJC MARKETING SDN BHD",
56
- "address": "NO JALAN BAYU 4, BANDAR SERI ALAM, 81750 MASAI, JOHOR"
57
- }
58
- ```
59
- ### Example JSON (ML-based)
60
- ```json
61
- {
62
- "receipt_number": null,
63
- "date": "15/01/2019",
64
- "bill_to": null,
65
- "items": [],
66
- "total_amount": 193.0,
67
- "vendor": "OJC MARKETING SDN BHD",
68
- "address": "NO JALAN BAYU 4, BANDAR SERI ALAM, 81750 MASAI, JOHOR",
69
- "raw_text": "…",
70
- "raw_ocr_words": ["…"],
71
- "raw_predictions": {
72
- "DATE": {"text": "15/01/2019", "bbox": [[…]]},
73
- "TOTAL": {"text": "193.00", "bbox": [[…]]},
74
- "COMPANY": {"text": "OJC MARKETING SDN BHD", "bbox": [[…]]},
75
- "ADDRESS": {"text": "…", "bbox": [[…]]}
76
- }
77
- }
78
- ```
79
-
80
- ## 🚀 Quick Start
81
-
82
- ### Prerequisites
83
- - Python 3.10+
84
- - Tesseract OCR
85
- - (Optional) CUDA-capable GPU for training/inference speed
86
-
87
- ### Installation
88
-
89
- 1. Clone the repository
90
- ```bash
91
- git clone https://github.com/GSoumyajit2005/invoice-processor-ml
92
- cd invoice-processor-ml
93
- ```
94
-
95
- 2. Install dependencies
96
- ```bash
97
- pip install -r requirements.txt
98
- ```
99
-
100
- 3. Install Tesseract OCR
101
- - **Windows**: Download from [UB Mannheim](https://github.com/UB-Mannheim/tesseract/wiki)
102
- - **Mac**: `brew install tesseract`
103
- - **Linux**: `sudo apt install tesseract-ocr`
104
-
105
- 4. (Optional, Windows) Set Tesseract path in src/ocr.py if needed:
106
- ```bash
107
- pytesseract.pytesseract.tesseract_cmd = r'C:\Program Files\Tesseract-OCR\tesseract.exe'
108
- ```
109
-
110
- 5. Run the web app
111
- ```bash
112
- streamlit run app.py
113
- ```
114
-
115
- ## 💻 Usage
116
-
117
- ### Web Interface (Recommended)
118
-
119
- The easiest way to use the processor is via the web interface.
120
-
121
- ```bash
122
- streamlit run app.py
123
- ```
124
- - Upload an invoice image (PNG/JPG).
125
- - Choose extraction method in sidebar:
126
- - ML-Based (LayoutLMv3)
127
- - Rule-Based (Regex)
128
- - View JSON, download results.
129
-
130
- ### Command-Line Interface (CLI)
131
-
132
- You can also process invoices directly from the command line.
133
-
134
- #### 1. Processing a Single Invoice
135
-
136
- This command processes the provided sample invoice and prints the results to the console.
137
-
138
- ```bash
139
- python src/pipeline.py data/samples/sample_invoice.jpg --save --method ml
140
- # or
141
- python src/pipeline.py data/samples/sample_invoice.jpg --save --method rules
142
- ```
143
-
144
- #### 2. Batch Processing a Folder
145
-
146
- The CLI can process an entire folder of images at once.
147
-
148
- First, place your own invoice images (e.g., `my_invoice1.jpg`, `my_invoice2.png`) into the `data/raw/` folder.
149
-
150
- Then, run the following command. It will process all images in `data/raw/`. Saved files are written to `outputs/{stem}_{method}.json`.
151
-
152
- ```bash
153
- python src/pipeline.py data/raw --save --method ml
154
- ```
155
-
156
- ### Python API
157
-
158
- You can integrate the pipeline directly into your own Python scripts.
159
-
160
- ```python
161
- from src.pipeline import process_invoice
162
- import json
163
-
164
- result = process_invoice('data/samples/sample_invoice.jpg', method='ml')
165
- print(json.dumps(result, indent=2))
166
- ```
167
-
168
- ## 🏗️ Architecture
169
-
170
- ```
171
- ┌────────────────┐
172
- │ Upload Image │
173
- └───────┬────────┘
174
-
175
-
176
- ┌────────────────────┐
177
- │ Preprocessing │ (OpenCV grayscale/denoise)
178
- └────────┬───────────┘
179
-
180
-
181
- ┌───────────────┐
182
- │ OCR │ (Tesseract)
183
- └───────┬───────┘
184
-
185
- ┌──────────────┴──────────────┐
186
- │ │
187
- ▼ ▼
188
- ┌──────────────────┐ ┌────────────────────────┐
189
- │ Rule-based IE │ │ ML-based IE (NER) │
190
- │ (regex, heur.) │ │ LayoutLMv3 token-class │
191
- └────────┬─────────┘ └───────────┬────────────┘
192
- │ │
193
- └──────────────┬──────────────────┘
194
-
195
- ┌──────────────────┐
196
- │ Post-process │
197
- │ validate, scores │
198
- └────────┬─────────┘
199
-
200
- ┌──────────────────┐
201
- │ JSON Output │
202
- └──────────────────┘
203
- ```
204
-
205
- ## 📁 Project Structure
206
-
207
- ```
208
- invoice-processor-ml/
209
-
210
- ├── data/
211
- │ ├── raw/ # Input invoice images for processing
212
- │ └── processed/ # (Reserved for future use)
213
-
214
-
215
- ├── data/samples/
216
- │ └── sample_invoice.jpg # Public sample for quick testing
217
-
218
- ├── docs/
219
- │ └── screenshots/ # UI Screenshots for the README demo
220
-
221
-
222
- ├── models/
223
- │ └── layoutlmv3-sroie-best/ # Fine-tuned model (created after training)
224
-
225
- ├── outputs/ # Default folder for saved JSON results
226
-
227
- ├── src/
228
- │ ├── preprocessing.py # Image preprocessing functions (grayscale, denoise)
229
- │ ├── ocr.py # Tesseract OCR integration
230
- │ ├── extraction.py # Regex-based information extraction logic
231
- │ ├── ml_extraction.py # ML-based extraction (LayoutLMv3)
232
- │ └── pipeline.py # Main orchestrator for the pipeline and CLI
233
-
234
-
235
- ├── tests/ # <-- ADD THIS FOLDER
236
- │ ├── test_preprocessing.py # Tests for the preprocessing module
237
- │ ├── test_ocr.py # Tests for the OCR module
238
- │ └── test_pipeline.py # End-to-end pipeline tests
239
-
240
- ├── app.py # Streamlit web interface
241
- ├── requirements.txt # Python dependencies
242
- └── README.md # You are Here!
243
- ```
244
-
245
- ## 🧠 Model & Training
246
-
247
- - **Model**: `microsoft/layoutlmv3-base` (125M params)
248
- - **Task**: Token Classification (NER) with 9 labels: `O, B/I-COMPANY, B/I-ADDRESS, B/I-DATE, B/I-TOTAL`
249
- - **Dataset**: SROIE (ICDAR 2019, English retail receipts)
250
- - **Training**: RTX 3050 6GB, PyTorch 2.x, Transformers 4.x
251
- - **Result**: Best F1 ≈ 0.922 on validation (epoch 5 saved)
252
-
253
- - Training scripts(local):
254
- - `train_layoutlm.py` (data prep, training loop with validation + model save)
255
- - Model saved to: `models/layoutlmv3-sroie-best/`
256
-
257
- ## 📈 Performance
258
-
259
- - **OCR accuracy (clear images)**: High with Tesseract
260
- - **Rule-based extraction**: Strong on simple retail receipts
261
- - **ML-based extraction (SROIE-style)**:
262
- - COMPANY / ADDRESS / DATE / TOTAL: High F1 on simple receipts
263
- - Complex business invoices: Partial extraction unless further fine-tuned
264
-
265
- ## ⚠️ Known Limitations
266
-
267
- 1. **Layout Sensitivity**: The ML model was fine‑tuned only on SROIE (retail receipts). Professional multi-column invoices may underperform until you fine‑tune on more diverse datasets.
268
- 2. **Invoice Number (ML)**: SROIE lacks invoice number labels; the ML model won’t output it unless you add labeled data. The rule-based method can still recover it on many formats.
269
- 3. **Line Items/Tables**: Not trained for table extraction yet. Rule-based supports simple totals; table extraction comes later.
270
- 4. **OCR Variability**: Tesseract outputs can vary; preprocessing and thresholds can impact ML results.
271
-
272
- ## 🔮 Future Enhancements
273
-
274
- - [ ] Add and fine‑tune on mychen76/invoices-and-receipts_ocr_v1 (English) for broader invoice formats
275
- - [ ] (Optional) Add FATURA (table-focused) for line-item extraction
276
- - [ ] Sliding-window chunking for >512 token documents (to avoid truncation)
277
- - [ ] Table detection (Camelot/Tabula/DeepDeSRT) for line items
278
- - [ ] PDF support (pdf2image) for multipage invoices
279
- - [ ] FastAPI backend + Docker
280
- - [ ] Multilingual OCR (PaddleOCR) and multilingual fine‑tuning
281
- - [ ] Confidence calibration and better validation rules
282
-
283
- ## 🛠️ Tech Stack
284
-
285
- | Component | Technology |
286
- |-----------|------------|
287
- | OCR | Tesseract 5.0+ |
288
- | Image Processing | OpenCV, Pillow |
289
- | ML/NLP | PyTorch 2.x, Transformers |
290
- | Model | LayoutLMv3 (token class.) |
291
- | Web Interface | Streamlit |
292
- | Data Format | JSON |
293
-
294
- ## 📚 What I Learned
295
-
296
- - OCR challenges (confusable characters, confidence-based filtering)
297
- - Layout-aware NER with LayoutLMv3 (text + bbox + pixels)
298
- - Data normalization (bbox to 0–1000 scale)
299
- - End-to-end pipelines (UI + CLI + JSON output)
300
- - When regex is enough vs when ML is needed
301
- - Evaluation (seqeval F1 for NER)
302
-
303
- ## 🤝 Contributing
304
-
305
- Contributions welcome! Areas needing improvement:
306
- - New patterns for regex extractor
307
- - Better preprocessing for OCR
308
- - New datasets and training configs
309
- - Tests and CI
310
-
311
- ## 📝 License
312
-
313
- MIT License - See LICENSE file for details
314
-
315
- ## 👨‍💻 Author
316
-
317
- **Soumyajit Ghosh** - 3rd Year BTech Student
318
- - Exploring AI/ML and practical applications
319
- - [LinkedIn](https://www.linkedin.com/in/soumyajit-ghosh-49a5b02b2?utm_source=share&utm_campaign) | [GitHub](https://github.com/GSoumyajit2005) | [Portfolio](#)(Coming Soon)
320
-
321
- ---
322
-
323
  **Note**: "This is a learning project demonstrating an end-to-end ML pipeline. Not recommended for production use without further validation, retraining on diverse datasets, and security hardening."
 
1
+ # 📄 Smart Invoice Processor
2
+
3
+ End-to-end invoice/receipt processing with OCR + Rule-based extraction and a fine‑tuned LayoutLMv3 model. Upload an image or run via CLI to get clean, structured JSON (vendor, date, totals, address, etc.).
4
+
5
+ ![Python](https://img.shields.io/badge/Python-3.10+-blue.svg)
6
+ ![Streamlit](https://img.shields.io/badge/Streamlit-1.51+-red.svg)
7
+ ![Tesseract](https://img.shields.io/badge/Tesseract-5.0+-green.svg)
8
+ ![Transformers](https://img.shields.io/badge/Transformers-4.x-purple.svg)
9
+ ![PyTorch](https://img.shields.io/badge/PyTorch-2.x-orange.svg)
10
+
11
+ ---
12
+
13
+ ## 🎯 Features
14
+
15
+ - ✅ OCR using Tesseract (configurable, fast, multi-platform)
16
+ - ✅ Rule-based extraction (regex baselines)
17
+ - ✅ ML-based extraction (LayoutLMv3 fine‑tuned on SROIE) for robust field detection
18
+ - ✅ Clean JSON output (date, total, vendor, address, receipt number*)
19
+ - ✅ Confidence and simple validation (e.g., total found among amounts)
20
+ - ✅ Streamlit web UI with method toggle (ML vs Regex)
21
+ - ✅ CLI for single/batch processing with saving to JSON
22
+ - ✅ Tests for preprocessing/OCR/pipeline
23
+
24
+ > Note: SROIE does not include invoice/receipt number labels; the ML model won’t output it unless you add labeled data. The rule-based extractor can still provide it when formats allow.
25
+
26
+ ---
27
+
28
+ ## 📊 Demo
29
+
30
+ ### Web Interface
31
+ ![Homepage](docs/screenshots/homepage.png)
32
+ *Clean upload → extract flow with method selector (ML vs Regex).*
33
+
34
+ ### Successful Extraction (ML-based)
35
+ ![Success Result](docs/screenshots/success_result.png)
36
+ *Fields extracted with LayoutLMv3.*
37
+
38
+ ### Format Detection (simulated)
39
+ ![Format Detection](docs/screenshots/format_detection.png)
40
+ *UI shows simple format hints and confidence.*
41
+
42
+ ### Example JSON (Rule-based)
43
+ ```json
44
+ {
45
+ "receipt_number": "PEGIV-1030765",
46
+ "date": "15/01/2019",
47
+ "bill_to": {
48
+ "name": "THE PEAK QUARRY WORKS",
49
+ "email": null
50
+ },
51
+ "items": [],
52
+ "total_amount": 193.0,
53
+ "extraction_confidence": 100,
54
+ "validation_passed": true,
55
+ "vendor": "OJC MARKETING SDN BHD",
56
+ "address": "NO JALAN BAYU 4, BANDAR SERI ALAM, 81750 MASAI, JOHOR"
57
+ }
58
+ ```
59
+ ### Example JSON (ML-based)
60
+ ```json
61
+ {
62
+ "receipt_number": null,
63
+ "date": "15/01/2019",
64
+ "bill_to": null,
65
+ "items": [],
66
+ "total_amount": 193.0,
67
+ "vendor": "OJC MARKETING SDN BHD",
68
+ "address": "NO JALAN BAYU 4, BANDAR SERI ALAM, 81750 MASAI, JOHOR",
69
+ "raw_text": "…",
70
+ "raw_ocr_words": ["…"],
71
+ "raw_predictions": {
72
+ "DATE": {"text": "15/01/2019", "bbox": [[…]]},
73
+ "TOTAL": {"text": "193.00", "bbox": [[…]]},
74
+ "COMPANY": {"text": "OJC MARKETING SDN BHD", "bbox": [[…]]},
75
+ "ADDRESS": {"text": "…", "bbox": [[…]]}
76
+ }
77
+ }
78
+ ```
79
+
80
+ ## 🚀 Quick Start
81
+
82
+ ### Prerequisites
83
+ - Python 3.10+
84
+ - Tesseract OCR
85
+ - (Optional) CUDA-capable GPU for training/inference speed
86
+
87
+ ### Installation
88
+
89
+ 1. Clone the repository
90
+ ```bash
91
+ git clone https://github.com/GSoumyajit2005/invoice-processor-ml
92
+ cd invoice-processor-ml
93
+ ```
94
+
95
+ 2. Install dependencies
96
+ ```bash
97
+ pip install -r requirements.txt
98
+ ```
99
+
100
+ 3. Install Tesseract OCR
101
+ - **Windows**: Download from [UB Mannheim](https://github.com/UB-Mannheim/tesseract/wiki)
102
+ - **Mac**: `brew install tesseract`
103
+ - **Linux**: `sudo apt install tesseract-ocr`
104
+
105
+ 4. (Optional, Windows) Set Tesseract path in src/ocr.py if needed:
106
+ ```bash
107
+ pytesseract.pytesseract.tesseract_cmd = r'C:\Program Files\Tesseract-OCR\tesseract.exe'
108
+ ```
109
+
110
+ 5. Run the web app
111
+ ```bash
112
+ streamlit run app.py
113
+ ```
114
+
115
+ ## 💻 Usage
116
+
117
+ ### Web Interface (Recommended)
118
+
119
+ The easiest way to use the processor is via the web interface.
120
+
121
+ ```bash
122
+ streamlit run app.py
123
+ ```
124
+ - Upload an invoice image (PNG/JPG).
125
+ - Choose extraction method in sidebar:
126
+ - ML-Based (LayoutLMv3)
127
+ - Rule-Based (Regex)
128
+ - View JSON, download results.
129
+
130
+ ### Command-Line Interface (CLI)
131
+
132
+ You can also process invoices directly from the command line.
133
+
134
+ #### 1. Processing a Single Invoice
135
+
136
+ This command processes the provided sample invoice and prints the results to the console.
137
+
138
+ ```bash
139
+ python src/pipeline.py data/samples/sample_invoice.jpg --save --method ml
140
+ # or
141
+ python src/pipeline.py data/samples/sample_invoice.jpg --save --method rules
142
+ ```
143
+
144
+ #### 2. Batch Processing a Folder
145
+
146
+ The CLI can process an entire folder of images at once.
147
+
148
+ First, place your own invoice images (e.g., `my_invoice1.jpg`, `my_invoice2.png`) into the `data/raw/` folder.
149
+
150
+ Then, run the following command. It will process all images in `data/raw/`. Saved files are written to `outputs/{stem}_{method}.json`.
151
+
152
+ ```bash
153
+ python src/pipeline.py data/raw --save --method ml
154
+ ```
155
+
156
+ ### Python API
157
+
158
+ You can integrate the pipeline directly into your own Python scripts.
159
+
160
+ ```python
161
+ from src.pipeline import process_invoice
162
+ import json
163
+
164
+ result = process_invoice('data/samples/sample_invoice.jpg', method='ml')
165
+ print(json.dumps(result, indent=2))
166
+ ```
167
+
168
+ ## 🏗️ Architecture
169
+
170
+ ```
171
+ ┌────────────────┐
172
+ │ Upload Image │
173
+ └───────┬────────┘
174
+
175
+
176
+ ┌────────────────────┐
177
+ │ Preprocessing │ (OpenCV grayscale/denoise)
178
+ └────────┬───────────┘
179
+
180
+
181
+ ┌───────────────┐
182
+ │ OCR │ (Tesseract)
183
+ └───────┬───────┘
184
+
185
+ ┌──────────────┴──────────────┐
186
+ │ │
187
+ ▼ ▼
188
+ ┌──────────────────┐ ┌────────────────────────┐
189
+ │ Rule-based IE │ │ ML-based IE (NER) │
190
+ │ (regex, heur.) │ │ LayoutLMv3 token-class │
191
+ └────────┬─────────┘ └───────────┬────────────┘
192
+ │ │
193
+ └──────────────┬──────────────────┘
194
+
195
+ ┌──────────────────┐
196
+ │ Post-process │
197
+ │ validate, scores │
198
+ └────────┬─────────┘
199
+
200
+ ┌──────────────────┐
201
+ │ JSON Output │
202
+ └──────────────────┘
203
+ ```
204
+
205
+ ## 📁 Project Structure
206
+
207
+ ```
208
+ invoice-processor-ml/
209
+
210
+ ├── data/
211
+ │ ├── raw/ # Input invoice images for processing
212
+ │ └── processed/ # (Reserved for future use)
213
+
214
+
215
+ ├── data/samples/
216
+ │ └── sample_invoice.jpg # Public sample for quick testing
217
+
218
+ ├── docs/
219
+ │ └── screenshots/ # UI Screenshots for the README demo
220
+
221
+
222
+ ├── models/
223
+ │ └── layoutlmv3-sroie-best/ # Fine-tuned model (created after training)
224
+
225
+ ├── outputs/ # Default folder for saved JSON results
226
+
227
+ ├── src/
228
+ │ ├── preprocessing.py # Image preprocessing functions (grayscale, denoise)
229
+ │ ├── ocr.py # Tesseract OCR integration
230
+ │ ├── extraction.py # Regex-based information extraction logic
231
+ │ ├── ml_extraction.py # ML-based extraction (LayoutLMv3)
232
+ │ └── pipeline.py # Main orchestrator for the pipeline and CLI
233
+
234
+
235
+ ├── tests/ # Unit and end-to-end tests
236
+ │ ├── test_preprocessing.py # Tests for the preprocessing module
237
+ │ ├── test_ocr.py # Tests for the OCR module
238
+ │ └── test_pipeline.py # End-to-end pipeline tests
239
+
240
+ ├── app.py # Streamlit web interface
241
+ ├── requirements.txt # Python dependencies
242
+ └── README.md # You are Here!
243
+ ```
244
+
245
+ ## 🧠 Model & Training
246
+
247
+ - **Model**: `microsoft/layoutlmv3-base` (125M params)
248
+ - **Task**: Token Classification (NER) with 9 labels: `O, B/I-COMPANY, B/I-ADDRESS, B/I-DATE, B/I-TOTAL`
249
+ - **Dataset**: SROIE (ICDAR 2019, English retail receipts)
250
+ - **Training**: RTX 3050 6GB, PyTorch 2.x, Transformers 4.x
251
+ - **Result**: Best F1 ≈ 0.922 on validation (epoch 5 saved)
252
+
253
+ - Training scripts (local):
254
+ - `train_layoutlm.py` (data prep, training loop with validation + model save)
255
+ - Model saved to: `models/layoutlmv3-sroie-best/`
256
+
257
+ ## 📈 Performance
258
+
259
+ - **OCR accuracy (clear images)**: High with Tesseract
260
+ - **Rule-based extraction**: Strong on simple retail receipts
261
+ - **ML-based extraction (SROIE-style)**:
262
+ - COMPANY / ADDRESS / DATE / TOTAL: High F1 on simple receipts
263
+ - Complex business invoices: Partial extraction unless further fine-tuned
264
+
265
+ ## ⚠️ Known Limitations
266
+
267
+ 1. **Layout Sensitivity**: The ML model was fine‑tuned only on SROIE (retail receipts). Professional multi-column invoices may underperform until you fine‑tune on more diverse datasets.
268
+ 2. **Invoice Number (ML)**: SROIE lacks invoice number labels; the ML model won’t output it unless you add labeled data. The rule-based method can still recover it on many formats.
269
+ 3. **Line Items/Tables**: Not trained for table extraction yet. Rule-based supports simple totals; table extraction comes later.
270
+ 4. **OCR Variability**: Tesseract outputs can vary; preprocessing and thresholds can impact ML results.
271
+
272
+ ## 🔮 Future Enhancements
273
+
274
+ - [ ] Add and fine‑tune on mychen76/invoices-and-receipts_ocr_v1 (English) for broader invoice formats
275
+ - [ ] (Optional) Add FATURA (table-focused) for line-item extraction
276
+ - [ ] Sliding-window chunking for >512 token documents (to avoid truncation)
277
+ - [ ] Table detection (Camelot/Tabula/DeepDeSRT) for line items
278
+ - [ ] PDF support (pdf2image) for multipage invoices
279
+ - [ ] FastAPI backend + Docker
280
+ - [ ] Multilingual OCR (PaddleOCR) and multilingual fine‑tuning
281
+ - [ ] Confidence calibration and better validation rules
282
+
283
+ ## 🛠️ Tech Stack
284
+
285
+ | Component | Technology |
286
+ |-----------|------------|
287
+ | OCR | Tesseract 5.0+ |
288
+ | Image Processing | OpenCV, Pillow |
289
+ | ML/NLP | PyTorch 2.x, Transformers |
290
+ | Model | LayoutLMv3 (token class.) |
291
+ | Web Interface | Streamlit |
292
+ | Data Format | JSON |
293
+
294
+ ## 📚 What I Learned
295
+
296
+ - OCR challenges (confusable characters, confidence-based filtering)
297
+ - Layout-aware NER with LayoutLMv3 (text + bbox + pixels)
298
+ - Data normalization (bbox to 0–1000 scale)
299
+ - End-to-end pipelines (UI + CLI + JSON output)
300
+ - When regex is enough vs when ML is needed
301
+ - Evaluation (seqeval F1 for NER)
302
+
303
+ ## 🤝 Contributing
304
+
305
+ Contributions welcome! Areas needing improvement:
306
+ - New patterns for regex extractor
307
+ - Better preprocessing for OCR
308
+ - New datasets and training configs
309
+ - Tests and CI
310
+
311
+ ## 📝 License
312
+
313
+ MIT License - See LICENSE file for details
314
+
315
+ ## 👨‍💻 Author
316
+
317
+ **Soumyajit Ghosh** - 3rd Year BTech Student
318
+ - Exploring AI/ML and practical applications
319
+ - [LinkedIn](https://www.linkedin.com/in/soumyajit-ghosh-49a5b02b2) | [GitHub](https://github.com/GSoumyajit2005) | [Portfolio](#) (Coming Soon)
320
+
321
+ ---
322
+
323
  **Note**: "This is a learning project demonstrating an end-to-end ML pipeline. Not recommended for production use without further validation, retraining on diverse datasets, and security hardening."
app.py CHANGED
@@ -1,313 +1,313 @@
1
- import streamlit as st
2
- import os
3
- import json
4
- from datetime import datetime
5
- from PIL import Image
6
- import numpy as np
7
- import pandas as pd
8
- from pathlib import Path
9
-
10
- # Import our actual, working pipeline function
11
- import sys
12
- sys.path.append('src')
13
- from pipeline import process_invoice
14
-
15
- # --- Mock Functions to support the UI without errors ---
16
- # These functions simulate the ones from your example README.
17
- # They allow the UI to render without needing to build a complex format detector today.
18
-
19
- def detect_invoice_format(ocr_text: str):
20
- """
21
- A mock function to simulate format detection.
22
- In a real system, this would analyze the text layout.
23
- """
24
- # Simple heuristic: if it contains "SDN BHD", it's our known format.
25
- if "SDN BHD" in ocr_text:
26
- return {
27
- 'name': 'Template A (Retail)',
28
- 'confidence': 95.0,
29
- 'supported': True,
30
- 'indicators': ["Found 'SDN BHD' suffix", "Date format DD/MM/YYYY detected"]
31
- }
32
- else:
33
- return {
34
- 'name': 'Unknown Format',
35
- 'confidence': 20.0,
36
- 'supported': False,
37
- 'indicators': ["No known company suffixes found"]
38
- }
39
-
40
- def get_format_recommendations(format_info):
41
- """Mock recommendations based on the detected format."""
42
- if format_info['supported']:
43
- return ["• Extraction should be highly accurate."]
44
- else:
45
- return ["• Results may be incomplete.", "• Consider adding patterns for this format."]
46
-
47
- # --- Streamlit App ---
48
-
49
- # Page configuration
50
- st.set_page_config(
51
- page_title="Invoice Processor",
52
- page_icon="📄",
53
- layout="wide",
54
- initial_sidebar_state="expanded"
55
- )
56
-
57
- # Custom CSS for styling
58
- st.markdown("""
59
- <style>
60
- .main-header {
61
- font-size: 3rem;
62
- color: #1f77b4;
63
- text-align: center;
64
- margin-bottom: 2rem;
65
- }
66
- .success-box {
67
- padding: 1rem;
68
- border-radius: 0.5rem;
69
- background-color: #d4edda;
70
- border: 1px solid #c3e6cb;
71
- margin: 1rem 0;
72
- }
73
- .warning-box {
74
- padding: 1rem;
75
- border-radius: 0.5rem;
76
- background-color: #fff3cd;
77
- border: 1px solid #ffeaa7;
78
- margin: 1rem 0;
79
- }
80
- .error-box {
81
- padding: 1rem;
82
- border-radius: 0.5rem;
83
- background-color: #f8d7da;
84
- border: 1px solid #f5c6cb;
85
- margin: 1rem 0;
86
- }
87
- </style>
88
- """, unsafe_allow_html=True)
89
-
90
- # Title
91
- st.markdown('<h1 class="main-header">📄 Smart Invoice Processor</h1>', unsafe_allow_html=True)
92
- st.markdown("### Extract structured data from invoices using your custom-built OCR pipeline")
93
-
94
- # Sidebar
95
- with st.sidebar:
96
- st.header("ℹ️ About")
97
- st.info("""
98
- This app uses the pipeline you built to automatically extract:
99
- - Receipt/Invoice number
100
- - Date
101
- - Customer information
102
- - Line items
103
- - Total amount
104
-
105
- **Technology Stack:**
106
- - Tesseract OCR
107
- - OpenCV
108
- - Python Regex
109
- - Streamlit
110
- """)
111
-
112
- st.header("📊 Stats")
113
- if 'processed_count' not in st.session_state:
114
- st.session_state.processed_count = 0
115
- st.metric("Invoices Processed Today", st.session_state.processed_count)
116
-
117
- st.header("⚙️ Configuration")
118
- extraction_method = st.selectbox(
119
- "Choose Extraction Method:",
120
- ('ML-Based (LayoutLMv3)', 'Rule-Based (Regex)'),
121
- help="ML-Based is more robust but may miss fields not in its training data. Rule-Based is faster but more fragile."
122
- )
123
-
124
- # Main content
125
- tab1, tab2, tab3 = st.tabs(["📤 Upload & Process", "📚 Sample Invoices", "ℹ️ How It Works"])
126
-
127
- with tab1:
128
- st.header("Upload an Invoice")
129
-
130
- uploaded_file = st.file_uploader(
131
- "Choose an invoice image (JPG, PNG)",
132
- type=['jpg', 'jpeg', 'png'],
133
- help="Upload a clear image of an invoice or receipt"
134
- )
135
-
136
- if uploaded_file is not None:
137
- col1, col2 = st.columns([1, 1])
138
-
139
- with col1:
140
- st.subheader("📸 Original Image")
141
- image = Image.open(uploaded_file)
142
- st.image(image, use_container_width=True)
143
- st.caption(f"Filename: {uploaded_file.name}")
144
-
145
- with col2:
146
- st.subheader("🔄 Processing Status")
147
-
148
- if st.button("🚀 Extract Data", type="primary"):
149
- with st.spinner("Executing your custom pipeline..."):
150
- try:
151
- # Save the uploaded file to a temporary path to be used by our pipeline
152
- temp_dir = "temp"
153
- os.makedirs(temp_dir, exist_ok=True)
154
- temp_path = os.path.join(temp_dir, uploaded_file.name)
155
- with open(temp_path, "wb") as f:
156
- f.write(uploaded_file.getbuffer())
157
-
158
- # Step 1: Call YOUR full pipeline function
159
- st.write("✅ Calling `process_invoice`...")
160
- # Map the user-friendly name from the dropdown to the actual method parameter
161
- method = 'ml' if extraction_method == 'ML-Based (LayoutLMv3)' else 'rules'
162
- st.write(f"⚙️ Using **{method.upper()}** extraction method...")
163
-
164
- # Call the pipeline with the selected method
165
- extracted_data = process_invoice(temp_path, method=method)
166
-
167
- # Step 2: Simulate format detection using the extracted data
168
- st.write("✅ Simulating format detection...")
169
- format_info = detect_invoice_format(extracted_data.get("raw_text", ""))
170
-
171
- # Store results in session state to display them
172
- st.session_state.extracted_data = extracted_data
173
- st.session_state.format_info = format_info
174
- st.session_state.processed_count += 1
175
-
176
- st.success("✅ Pipeline executed successfully!")
177
-
178
- except Exception as e:
179
- st.error(f"❌ An error occurred in the pipeline: {str(e)}")
180
-
181
- # Display results if they exist in the session state
182
- if 'extracted_data' in st.session_state:
183
- st.markdown("---")
184
- st.header("📊 Extraction Results")
185
-
186
- # --- Format Detection Section ---
187
- format_info = st.session_state.format_info
188
- st.subheader("📋 Detected Format (Simulated)")
189
- col1_fmt, col2_fmt = st.columns([2, 3])
190
- with col1_fmt:
191
- st.metric("Format Type", format_info['name'])
192
- st.metric("Detection Confidence", f"{format_info['confidence']:.0f}%")
193
- if format_info['supported']: st.success("✅ Fully Supported")
194
- else: st.warning("⚠️ Limited Support")
195
- with col2_fmt:
196
- st.write("**Detected Indicators:**")
197
- for indicator in format_info['indicators']: st.write(f"• {indicator}")
198
- st.write("**Recommendations:**")
199
- for rec in get_format_recommendations(format_info): st.write(rec)
200
- st.markdown("---")
201
-
202
- # --- Main Results Section ---
203
- data = st.session_state.extracted_data
204
-
205
- # Confidence display
206
- confidence = data.get('extraction_confidence', 0)
207
- if confidence >= 80:
208
- st.markdown(f'<div class="success-box">✅ <strong>High Confidence: {confidence}%</strong> - Most key fields were found.</div>', unsafe_allow_html=True)
209
- elif confidence >= 50:
210
- st.markdown(f'<div class="warning-box">⚠️ <strong>Medium Confidence: {confidence}%</strong> - Some fields may be missing.</div>', unsafe_allow_html=True)
211
- else:
212
- st.markdown(f'<div class="error-box">❌ <strong>Low Confidence: {confidence}%</strong> - Format likely unsupported.</div>', unsafe_allow_html=True)
213
-
214
- # Validation display
215
- if data.get('validation_passed', False):
216
- st.success("✔️ Validation Passed: Total amount appears consistent with other extracted amounts.")
217
- else:
218
- st.warning("⚠️ Validation Failed: Total amount could not be verified against other numbers.")
219
-
220
- # Key metrics display
221
- # Key metrics display
222
- st.metric("🏢 Vendor", data.get('vendor') or "N/A") # <-- ADD THIS
223
-
224
- res_col1, res_col2, res_col3 = st.columns(3)
225
- res_col1.metric("📄 Receipt Number", data.get('receipt_number') or "N/A")
226
- res_col2.metric("📅 Date", data.get('date') or "N/A")
227
- res_col3.metric("💵 Total Amount", f"${data.get('total_amount'):.2f}" if data.get('total_amount') is not None else "N/A")
228
-
229
- # Use an expander for longer text fields like address
230
- with st.expander("Show More Details"):
231
- st.markdown(f"**👤 Bill To:** {data.get('bill_to', {}).get('name') if data.get('bill_to') else 'N/A'}")
232
- st.markdown(f"**📍 Vendor Address:** {data.get('address') or 'N/A'}")
233
-
234
- # Line items table
235
- if data.get('items'):
236
- st.subheader("🛒 Line Items")
237
- # Ensure data is in the right format for DataFrame
238
- items_df_data = [{
239
- "Description": item.get("description", "N/A"),
240
- "Qty": item.get("quantity", "N/A"),
241
- "Unit Price": f"${item.get('unit_price', 0.0):.2f}",
242
- "Total": f"${item.get('total', 0.0):.2f}"
243
- } for item in data['items']]
244
- df = pd.DataFrame(items_df_data)
245
- st.dataframe(df, use_container_width=True)
246
- else:
247
- st.info("ℹ️ No line items were extracted.")
248
-
249
- # JSON output and download
250
- with st.expander("📄 View Full JSON Output"):
251
- st.json(data)
252
-
253
- json_str = json.dumps(data, indent=2)
254
- st.download_button(
255
- label="💾 Download JSON",
256
- data=json_str,
257
- file_name=f"invoice_data_{datetime.now().strftime('%Y%m%d_%H%M%S')}.json",
258
- mime="application/json"
259
- )
260
-
261
- with st.expander("📝 View Raw OCR Text"):
262
- raw_text = data.get('raw_text', '')
263
- if raw_text:
264
- st.text(raw_text)
265
- else:
266
- st.info("No OCR text available.")
267
-
268
- with tab2:
269
- st.header("📚 Sample Invoices")
270
- st.write("Try the sample invoice below to see how the system performs:")
271
-
272
- sample_dir = "data/samples" # ✅ Points to the correct folder
273
- if os.path.exists(sample_dir):
274
- sample_files = [f for f in os.listdir(sample_dir) if f.endswith(('.jpg', '.png', '.jpeg'))]
275
-
276
- if sample_files:
277
- # Display the first sample found
278
- img_path = os.path.join(sample_dir, sample_files[0])
279
- st.image(Image.open(img_path), caption=sample_files[0], use_container_width=True)
280
- st.info("You can download this image and upload it in the 'Upload & Process' tab to test the pipeline.")
281
- else:
282
- st.warning("No sample invoices found in `data/samples/`.")
283
- else:
284
- st.error("The `data/samples` directory was not found.")
285
-
286
- with tab3:
287
- st.header("ℹ️ How It Works (Your Custom Pipeline)")
288
- st.markdown("""
289
- This app follows the exact pipeline you built:
290
- ```
291
- 1. 📸 Image Upload
292
-
293
- 2. 🔄 Preprocessing (OpenCV)
294
- Grayscale conversion and noise removal.
295
-
296
- 3. 🔍 OCR (Tesseract)
297
- Optimized with PSM 6 for receipt layouts.
298
-
299
- 4. 🎯 Rule-Based Extraction (Regex)
300
- Your custom patterns find specific fields.
301
-
302
- 5. ✅ Confidence & Validation
303
- Heuristics to check the quality of the extraction.
304
-
305
- 6. 📊 Output JSON
306
- Presents all extracted data in a structured format.
307
- ```
308
- """)
309
- st.info("This rule-based system is a great foundation. The next step is to replace the extraction logic with an ML model like LayoutLM to handle more diverse formats!")
310
-
311
- # Footer
312
- st.markdown("---")
313
  st.markdown("<div style='text-align: center; color: #666;'>Built with your custom Python pipeline | UI by Streamlit</div>", unsafe_allow_html=True)
 
1
+ import streamlit as st
2
+ import os
3
+ import json
4
+ from datetime import datetime
5
+ from PIL import Image
6
+ import numpy as np
7
+ import pandas as pd
8
+ from pathlib import Path
9
+
10
+ # Import our actual, working pipeline function
11
+ import sys
12
+ sys.path.append('src')
13
+ from pipeline import process_invoice
14
+
15
+ # --- Mock Functions to support the UI without errors ---
16
+ # These functions simulate the ones from your example README.
17
+ # They allow the UI to render without needing to build a complex format detector today.
18
+
19
+ def detect_invoice_format(ocr_text: str):
20
+ """
21
+ A mock function to simulate format detection.
22
+ In a real system, this would analyze the text layout.
23
+ """
24
+ # Simple heuristic: if it contains "SDN BHD", it's our known format.
25
+ if "SDN BHD" in ocr_text:
26
+ return {
27
+ 'name': 'Template A (Retail)',
28
+ 'confidence': 95.0,
29
+ 'supported': True,
30
+ 'indicators': ["Found 'SDN BHD' suffix", "Date format DD/MM/YYYY detected"]
31
+ }
32
+ else:
33
+ return {
34
+ 'name': 'Unknown Format',
35
+ 'confidence': 20.0,
36
+ 'supported': False,
37
+ 'indicators': ["No known company suffixes found"]
38
+ }
39
+
40
+ def get_format_recommendations(format_info):
41
+ """Mock recommendations based on the detected format."""
42
+ if format_info['supported']:
43
+ return ["• Extraction should be highly accurate."]
44
+ else:
45
+ return ["• Results may be incomplete.", "• Consider adding patterns for this format."]
46
+
47
+ # --- Streamlit App ---
48
+
49
+ # Page configuration
50
+ st.set_page_config(
51
+ page_title="Invoice Processor",
52
+ page_icon="📄",
53
+ layout="wide",
54
+ initial_sidebar_state="expanded"
55
+ )
56
+
57
+ # Custom CSS for styling
58
+ st.markdown("""
59
+ <style>
60
+ .main-header {
61
+ font-size: 3rem;
62
+ color: #1f77b4;
63
+ text-align: center;
64
+ margin-bottom: 2rem;
65
+ }
66
+ .success-box {
67
+ padding: 1rem;
68
+ border-radius: 0.5rem;
69
+ background-color: #d4edda;
70
+ border: 1px solid #c3e6cb;
71
+ margin: 1rem 0;
72
+ }
73
+ .warning-box {
74
+ padding: 1rem;
75
+ border-radius: 0.5rem;
76
+ background-color: #fff3cd;
77
+ border: 1px solid #ffeaa7;
78
+ margin: 1rem 0;
79
+ }
80
+ .error-box {
81
+ padding: 1rem;
82
+ border-radius: 0.5rem;
83
+ background-color: #f8d7da;
84
+ border: 1px solid #f5c6cb;
85
+ margin: 1rem 0;
86
+ }
87
+ </style>
88
+ """, unsafe_allow_html=True)
89
+
90
+ # Title
91
+ st.markdown('<h1 class="main-header">📄 Smart Invoice Processor</h1>', unsafe_allow_html=True)
92
+ st.markdown("### Extract structured data from invoices using your custom-built OCR pipeline")
93
+
94
+ # Sidebar
95
+ with st.sidebar:
96
+ st.header("ℹ️ About")
97
+ st.info("""
98
+ This app uses the pipeline you built to automatically extract:
99
+ - Receipt/Invoice number
100
+ - Date
101
+ - Customer information
102
+ - Line items
103
+ - Total amount
104
+
105
+ **Technology Stack:**
106
+ - Tesseract OCR
107
+ - OpenCV
108
+ - Python Regex
109
+ - Streamlit
110
+ """)
111
+
112
+ st.header("📊 Stats")
113
+ if 'processed_count' not in st.session_state:
114
+ st.session_state.processed_count = 0
115
+ st.metric("Invoices Processed Today", st.session_state.processed_count)
116
+
117
+ st.header("⚙️ Configuration")
118
+ extraction_method = st.selectbox(
119
+ "Choose Extraction Method:",
120
+ ('ML-Based (LayoutLMv3)', 'Rule-Based (Regex)'),
121
+ help="ML-Based is more robust but may miss fields not in its training data. Rule-Based is faster but more fragile."
122
+ )
123
+
124
+ # Main content
125
+ tab1, tab2, tab3 = st.tabs(["📤 Upload & Process", "📚 Sample Invoices", "ℹ️ How It Works"])
126
+
127
+ with tab1:
128
+ st.header("Upload an Invoice")
129
+
130
+ uploaded_file = st.file_uploader(
131
+ "Choose an invoice image (JPG, PNG)",
132
+ type=['jpg', 'jpeg', 'png'],
133
+ help="Upload a clear image of an invoice or receipt"
134
+ )
135
+
136
+ if uploaded_file is not None:
137
+ col1, col2 = st.columns([1, 1])
138
+
139
+ with col1:
140
+ st.subheader("📸 Original Image")
141
+ image = Image.open(uploaded_file)
142
+ st.image(image, use_container_width=True)
143
+ st.caption(f"Filename: {uploaded_file.name}")
144
+
145
+ with col2:
146
+ st.subheader("🔄 Processing Status")
147
+
148
+ if st.button("🚀 Extract Data", type="primary"):
149
+ with st.spinner("Executing your custom pipeline..."):
150
+ try:
151
+ # Save the uploaded file to a temporary path to be used by our pipeline
152
+ temp_dir = "temp"
153
+ os.makedirs(temp_dir, exist_ok=True)
154
+ temp_path = os.path.join(temp_dir, uploaded_file.name)
155
+ with open(temp_path, "wb") as f:
156
+ f.write(uploaded_file.getbuffer())
157
+
158
+ # Step 1: Call YOUR full pipeline function
159
+ st.write("✅ Calling `process_invoice`...")
160
+ # Map the user-friendly name from the dropdown to the actual method parameter
161
+ method = 'ml' if extraction_method == 'ML-Based (LayoutLMv3)' else 'rules'
162
+ st.write(f"⚙️ Using **{method.upper()}** extraction method...")
163
+
164
+ # Call the pipeline with the selected method
165
+ extracted_data = process_invoice(temp_path, method=method)
166
+
167
+ # Step 2: Simulate format detection using the extracted data
168
+ st.write("✅ Simulating format detection...")
169
+ format_info = detect_invoice_format(extracted_data.get("raw_text", ""))
170
+
171
+ # Store results in session state to display them
172
+ st.session_state.extracted_data = extracted_data
173
+ st.session_state.format_info = format_info
174
+ st.session_state.processed_count += 1
175
+
176
+ st.success("✅ Pipeline executed successfully!")
177
+
178
+ except Exception as e:
179
+ st.error(f"❌ An error occurred in the pipeline: {str(e)}")
180
+
181
+ # Display results if they exist in the session state
182
+ if 'extracted_data' in st.session_state:
183
+ st.markdown("---")
184
+ st.header("📊 Extraction Results")
185
+
186
+ # --- Format Detection Section ---
187
+ format_info = st.session_state.format_info
188
+ st.subheader("📋 Detected Format (Simulated)")
189
+ col1_fmt, col2_fmt = st.columns([2, 3])
190
+ with col1_fmt:
191
+ st.metric("Format Type", format_info['name'])
192
+ st.metric("Detection Confidence", f"{format_info['confidence']:.0f}%")
193
+ if format_info['supported']: st.success("✅ Fully Supported")
194
+ else: st.warning("⚠️ Limited Support")
195
+ with col2_fmt:
196
+ st.write("**Detected Indicators:**")
197
+ for indicator in format_info['indicators']: st.write(f"• {indicator}")
198
+ st.write("**Recommendations:**")
199
+ for rec in get_format_recommendations(format_info): st.write(rec)
200
+ st.markdown("---")
201
+
202
+ # --- Main Results Section ---
203
+ data = st.session_state.extracted_data
204
+
205
+ # Confidence display
206
+ confidence = data.get('extraction_confidence', 0)
207
+ if confidence >= 80:
208
+ st.markdown(f'<div class="success-box">✅ <strong>High Confidence: {confidence}%</strong> - Most key fields were found.</div>', unsafe_allow_html=True)
209
+ elif confidence >= 50:
210
+ st.markdown(f'<div class="warning-box">⚠️ <strong>Medium Confidence: {confidence}%</strong> - Some fields may be missing.</div>', unsafe_allow_html=True)
211
+ else:
212
+ st.markdown(f'<div class="error-box">❌ <strong>Low Confidence: {confidence}%</strong> - Format likely unsupported.</div>', unsafe_allow_html=True)
213
+
214
+ # Validation display
215
+ if data.get('validation_passed', False):
216
+ st.success("✔️ Validation Passed: Total amount appears consistent with other extracted amounts.")
217
+ else:
218
+ st.warning("⚠️ Validation Failed: Total amount could not be verified against other numbers.")
219
+
220
+ # Key metrics display
221
+ # Key metrics display
222
+ st.metric("🏢 Vendor", data.get('vendor') or "N/A")  # Vendor name extracted by the pipeline
223
+
224
+ res_col1, res_col2, res_col3 = st.columns(3)
225
+ res_col1.metric("📄 Receipt Number", data.get('receipt_number') or "N/A")
226
+ res_col2.metric("📅 Date", data.get('date') or "N/A")
227
+ res_col3.metric("💵 Total Amount", f"${data.get('total_amount'):.2f}" if data.get('total_amount') is not None else "N/A")
228
+
229
+ # Use an expander for longer text fields like address
230
+ with st.expander("Show More Details"):
231
+ st.markdown(f"**👤 Bill To:** {data.get('bill_to', {}).get('name') if data.get('bill_to') else 'N/A'}")
232
+ st.markdown(f"**📍 Vendor Address:** {data.get('address') or 'N/A'}")
233
+
234
+ # Line items table
235
+ if data.get('items'):
236
+ st.subheader("🛒 Line Items")
237
+ # Ensure data is in the right format for DataFrame
238
+ items_df_data = [{
239
+ "Description": item.get("description", "N/A"),
240
+ "Qty": item.get("quantity", "N/A"),
241
+ "Unit Price": f"${item.get('unit_price', 0.0):.2f}",
242
+ "Total": f"${item.get('total', 0.0):.2f}"
243
+ } for item in data['items']]
244
+ df = pd.DataFrame(items_df_data)
245
+ st.dataframe(df, use_container_width=True)
246
+ else:
247
+ st.info("ℹ️ No line items were extracted.")
248
+
249
+ # JSON output and download
250
+ with st.expander("📄 View Full JSON Output"):
251
+ st.json(data)
252
+
253
+ json_str = json.dumps(data, indent=2)
254
+ st.download_button(
255
+ label="💾 Download JSON",
256
+ data=json_str,
257
+ file_name=f"invoice_data_{datetime.now().strftime('%Y%m%d_%H%M%S')}.json",
258
+ mime="application/json"
259
+ )
260
+
261
+ with st.expander("📝 View Raw OCR Text"):
262
+ raw_text = data.get('raw_text', '')
263
+ if raw_text:
264
+ st.text(raw_text)
265
+ else:
266
+ st.info("No OCR text available.")
267
+
268
+ with tab2:
269
+ st.header("📚 Sample Invoices")
270
+ st.write("Try the sample invoice below to see how the system performs:")
271
+
272
+ sample_dir = "data/samples" # ✅ Points to the correct folder
273
+ if os.path.exists(sample_dir):
274
+ sample_files = [f for f in os.listdir(sample_dir) if f.endswith(('.jpg', '.png', '.jpeg'))]
275
+
276
+ if sample_files:
277
+ # Display the first sample found
278
+ img_path = os.path.join(sample_dir, sample_files[0])
279
+ st.image(Image.open(img_path), caption=sample_files[0], use_container_width=True)
280
+ st.info("You can download this image and upload it in the 'Upload & Process' tab to test the pipeline.")
281
+ else:
282
+ st.warning("No sample invoices found in `data/samples/`.")
283
+ else:
284
+ st.error("The `data/samples` directory was not found.")
285
+
286
+ with tab3:
287
+ st.header("ℹ️ How It Works (Your Custom Pipeline)")
288
+ st.markdown("""
289
+ This app follows the exact pipeline you built:
290
+ ```
291
+ 1. 📸 Image Upload
292
+
293
+ 2. 🔄 Preprocessing (OpenCV)
294
+ Grayscale conversion and noise removal.
295
+
296
+ 3. 🔍 OCR (Tesseract)
297
+ Optimized with PSM 6 for receipt layouts.
298
+
299
+ 4. 🎯 Rule-Based Extraction (Regex)
300
+ Your custom patterns find specific fields.
301
+
302
+ 5. ✅ Confidence & Validation
303
+ Heuristics to check the quality of the extraction.
304
+
305
+ 6. 📊 Output JSON
306
+ Presents all extracted data in a structured format.
307
+ ```
308
+ """)
309
+ st.info("This rule-based system is a great foundation. The next step is to replace the extraction logic with an ML model like LayoutLM to handle more diverse formats!")
310
+
311
+ # Footer
312
+ st.markdown("---")
313
  st.markdown("<div style='text-align: center; color: #666;'>Built with your custom Python pipeline | UI by Streamlit</div>", unsafe_allow_html=True)
eval_new_dataset.py ADDED
@@ -0,0 +1,42 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import torch
2
+ from src.data_loader import load_unified_dataset
3
+ from transformers import LayoutLMv3ForTokenClassification, LayoutLMv3Processor, DataCollatorForTokenClassification
4
+ from torch.utils.data import DataLoader
5
+ from seqeval.metrics import classification_report
6
+ from tqdm import tqdm
7
+ from train_combined import UnifiedDataset, label2id, id2label, LABEL_LIST
8
+
9
+ # Load Model
10
+ model_path = "./models/layoutlmv3-generalized"
11
+ model = LayoutLMv3ForTokenClassification.from_pretrained(model_path)
12
+ processor = LayoutLMv3Processor.from_pretrained(model_path, apply_ocr=False)
13
+ device = torch.device("cuda")
14
+ model.to(device)
15
+
16
+ # Load ONLY the new dataset (validation split)
17
+ # We want to see how well it learned THIS specific dataset
18
+ print("Loading new dataset validation split...")
19
+ val_data = load_unified_dataset(split="valid", sample_size=None)
20
+ dataset = UnifiedDataset(val_data, processor, label2id)
21
+ loader = DataLoader(dataset, batch_size=4, collate_fn=DataCollatorForTokenClassification(processor.tokenizer, padding=True, return_tensors="pt"))
22
+
23
+ print("Running evaluation...")
24
+ model.eval()
25
+ preds, labs = [], []
26
+
27
+ for batch in tqdm(loader):
28
+ batch = {k: v.to(device) for k, v in batch.items()}
29
+ with torch.no_grad():
30
+ outputs = model(**batch)
31
+
32
+ predictions = outputs.logits.argmax(dim=-1)
33
+ labels = batch['labels']
34
+
35
+ for i in range(len(labels)):
36
+ p = [id2label[p.item()] for p, l in zip(predictions[i], labels[i]) if l.item() != -100]
37
+ l = [id2label[l.item()] for l in labels[i] if l.item() != -100]
38
+ preds.append(p)
39
+ labs.append(l)
40
+
41
+ print("\nClassification Report:")
42
+ print(classification_report(labs, preds))
explore_new_dataset.py ADDED
@@ -0,0 +1,113 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from datasets import load_dataset
2
+ import json
3
+ import ast # <--- Added for robust parsing
4
+
5
+ # --- 1. Load the dataset ---
6
+ print("📥 Loading 'mychen76/invoices-and-receipts_ocr_v1' from Hugging Face...")
7
+ try:
8
+ dataset = load_dataset("mychen76/invoices-and-receipts_ocr_v1", split='train')
9
+ print("✅ Dataset loaded successfully!")
10
+ except Exception as e:
11
+ print(f"❌ Failed to load dataset. Error: {e}")
12
+ exit()
13
+
14
+ # --- 2. Print Dataset Information ---
15
+ print("\n" + "="*60)
16
+ print("📊 DATASET INFORMATION & FEATURES")
17
+ print("="*60)
18
+ print(f"Number of examples: {len(dataset)}")
19
+ print(f"\nFeatures (Columns): {dataset.features}")
20
+
21
+
22
+ # --- 3. Explore a Single Example ---
23
+ print("\n" + "="*60)
24
+ print("📄 EXPLORING THE FIRST SAMPLE")
25
+ print("="*60)
26
+ if len(dataset) > 0:
27
+ sample = dataset[0]
28
+
29
+ # Parse the main wrapper JSONs
30
+ try:
31
+ raw_data = json.loads(sample['raw_data'])
32
+ parsed_data = json.loads(sample['parsed_data'])
33
+ except json.JSONDecodeError as e:
34
+ print(f"❌ Error decoding main JSON wrappers: {e}")
35
+ exit()
36
+
37
+ print(f"\nImage object: {sample['image']}")
38
+
39
+ # --- ROBUST PARSING LOGIC ---
40
+ def safe_parse(content):
41
+ """Try JSON, fallback to AST (for single quotes)"""
42
+ if isinstance(content, list):
43
+ return content # Already a list
44
+ if isinstance(content, str):
45
+ try:
46
+ return json.loads(content)
47
+ except json.JSONDecodeError:
48
+ try:
49
+ return ast.literal_eval(content)
50
+ except:
51
+ return None
52
+ return None
53
+
54
+ ocr_words = safe_parse(raw_data.get('ocr_words'))
55
+ ocr_boxes = safe_parse(raw_data.get('ocr_boxes'))
56
+
57
+ if ocr_words and ocr_boxes:
58
+ print(f"\nFound {len(ocr_words)} OCR words.")
59
+ print("Sample Word & Box Format:")
60
+ # Print first 3 to check coordinate format (4 numbers or 8 numbers?)
61
+ for i in range(min(3, len(ocr_words))):
62
+ print(f" Word: '{ocr_words[i]}' | Box: {ocr_boxes[i]}")
63
+ else:
64
+ print("❌ OCR fields missing or could not be parsed.")
65
+
66
+
67
+ else:
68
+ print("Dataset is empty.")
69
+
70
+
71
+ # --- 4. Discover All Unique NER Tags ---
72
+ print("\n" + "="*60)
73
+ print("📋 ALL UNIQUE ENTITY LABELS IN THIS DATASET")
74
+ print("="*60)
75
+ if len(dataset) > 0:
76
+ all_entity_labels = set()
77
+
78
+ print("Scanning dataset for labels...")
79
+ for i, example in enumerate(dataset):
80
+ try:
81
+ # Parse parsed_data
82
+ parsed_example = json.loads(example['parsed_data'])
83
+
84
+ # The 'json' field inside might be a string or a dict
85
+ fields_data = parsed_example.get('json', {})
86
+
87
+ if isinstance(fields_data, str):
88
+ try:
89
+ fields = json.loads(fields_data)
90
+ except:
91
+ fields = ast.literal_eval(fields_data)
92
+ else:
93
+ fields = fields_data
94
+
95
+ if fields:
96
+ all_entity_labels.update(fields.keys())
97
+
98
+ except Exception:
99
+ continue # Skip corrupted examples silently
100
+
101
+ if all_entity_labels:
102
+ print(f"\nFound {len(all_entity_labels)} unique entity labels:")
103
+ print(sorted(list(all_entity_labels)))
104
+ else:
105
+ print("Could not find any entity labels.")
106
+ else:
107
+ print("Cannot analyze tags of an empty dataset.")
108
+
109
+
110
+ # Save the first sample image so it can be inspected manually / used as a test input
111
+ sample = dataset[0]
112
+ sample['image'].save("data/samples/test_invoice_no.jpg")
113
+ print("Saved sample image to data/samples/test_invoice_no.jpg")
load_sroie_dataset.py ADDED
@@ -0,0 +1,65 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import json
2
+ from pathlib import Path
3
+ from PIL import Image
4
+
5
+ def load_sroie(path):
6
+ print(f"🔄 Loading SROIE from local path: {path}")
7
+ path = Path(path)
8
+ dataset = {'train': [], 'test': []}
9
+
10
+ for split in ["train", "test"]:
11
+ split_path = path / split
12
+
13
+ if (split_path / "images").exists(): img_dir = split_path / "images"
14
+ elif (split_path / "img").exists(): img_dir = split_path / "img"
15
+ else: continue
16
+
17
+ if (split_path / "tagged").exists(): ann_dir = split_path / "tagged"
18
+ elif (split_path / "box").exists(): ann_dir = split_path / "box"
19
+ else: continue
20
+
21
+ examples = []
22
+ for img_file in sorted(img_dir.iterdir()):
23
+ if img_file.suffix.lower() not in [".jpg", ".png"]: continue
24
+
25
+ name = img_file.stem
26
+ json_path = ann_dir / f"{name}.json"
27
+ if not json_path.exists(): continue
28
+
29
+ with open(json_path, encoding="utf8") as f:
30
+ data = json.load(f)
31
+
32
+ if "words" in data and "bbox" in data and "labels" in data:
33
+ # --- NORMALIZATION HAPPENS HERE (YOUR FIX) ---
34
+ try:
35
+ with Image.open(img_file) as img:
36
+ width, height = img.size
37
+
38
+ norm_boxes = []
39
+ for box in data["bbox"]:
40
+ # SROIE is raw [x0, y0, x1, y1]
41
+ x0, y0, x1, y1 = box
42
+
43
+ # Normalize and Clamp
44
+ norm_box = [
45
+ int(max(0, min(1000 * (x0 / width), 1000))),
46
+ int(max(0, min(1000 * (y0 / height), 1000))),
47
+ int(max(0, min(1000 * (x1 / width), 1000))),
48
+ int(max(0, min(1000 * (y1 / height), 1000)))
49
+ ]
50
+ norm_boxes.append(norm_box)
51
+
52
+ examples.append({
53
+ "image_path": str(img_file),
54
+ "words": data["words"],
55
+ "bboxes": norm_boxes, # Storing normalized boxes
56
+ "ner_tags": data["labels"]
57
+ })
58
+ except Exception as e:
59
+ print(f"Skipping {name}: {e}")
60
+ continue
61
+
62
+ dataset[split] = examples
63
+ print(f" Mapped {len(examples)} paths for {split}")
64
+
65
+ return dataset
notebooks/test_setup.py DELETED
@@ -1,11 +0,0 @@
1
- # This is just a verification script - you can copy this
2
- import pytesseract
3
- from PIL import Image
4
- import cv2
5
- import numpy as np
6
-
7
- # If Windows, you might need to set this path:
8
- # pytesseract.pytesseract.tesseract_cmd = r'C:\Program Files\Tesseract-OCR\tesseract.exe'
9
-
10
- print("✅ All imports successful!")
11
- print(f"Tesseract version: {pytesseract.get_tesseract_version()}")
 
 
 
 
 
 
 
 
 
 
 
 
notebooks/test_visual.ipynb DELETED
File without changes
requirements.txt CHANGED
Binary files a/requirements.txt and b/requirements.txt differ
 
src/data_loader.py ADDED
@@ -0,0 +1,197 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # src/data_loader.py
2
+
3
+ import json
4
+ import ast
5
+ import numpy as np
6
+ from datasets import load_dataset
7
+ from difflib import SequenceMatcher
8
+
9
+ # --- CONFIGURATION ---
10
+ LABEL_MAPPING = {
11
+ # Vendor/Company
12
+ "seller": "COMPANY",
13
+ "store_name": "COMPANY",
14
+
15
+ # Address
16
+ "store_addr": "ADDRESS",
17
+
18
+ # Date
19
+ "date": "DATE",
20
+ "invoice_date": "DATE",
21
+
22
+ # Total
23
+ "total": "TOTAL",
24
+ "total_gross_worth": "TOTAL",
25
+
26
+ # Receipt Number / Invoice No
27
+ "invoice_no": "INVOICE_NO",
28
+
29
+ # Bill To / Client
30
+ "client": "BILL_TO"
31
+ }
32
+
33
def safe_parse(content):
    """Best-effort conversion of *content* into a Python object.

    Accepts a list (returned untouched), a JSON string, or a Python
    literal string; anything unparseable yields an empty list.
    """
    if isinstance(content, list):
        return content
    if not isinstance(content, str):
        return []
    for parser in (json.loads, ast.literal_eval):
        try:
            return parser(content)
        except (ValueError, SyntaxError):
            # json raises JSONDecodeError (a ValueError); literal_eval
            # raises ValueError/SyntaxError. Try the next parser.
            continue
    return []
47
+
48
def normalize_box(box, width, height):
    """Collapse a quad polygon into an axis-aligned box in [0, 1000].

    Accepts either a ``[polygon, text]`` pair or a bare list of four
    ``[x, y]`` points. Returns ``None`` when the shape is unrecognized
    or any arithmetic fails.
    """
    try:
        if not isinstance(box, list):
            return None
        if len(box) == 2 and isinstance(box[0], list):
            polygon = box[0]        # [polygon, recognized_text] pair
        elif len(box) == 4 and isinstance(box[0], list):
            polygon = box           # bare list of 4 [x, y] points
        else:
            return None

        def clamp(value):
            # Normalized coordinates must stay inside [0, 1000].
            return int(max(0, min(value, 1000)))

        xs = [point[0] for point in polygon]
        ys = [point[1] for point in polygon]
        return [
            clamp(1000 * min(xs) / width),
            clamp(1000 * min(ys) / height),
            clamp(1000 * max(xs) / width),
            clamp(1000 * max(ys) / height),
        ]
    except Exception:
        return None
70
+
71
def tokenize_and_spread_boxes(words, boxes):
    """Split multi-word phrases into single tokens, repeating each box.

    Input:  ['Invoice #123'], [BOX_A]
    Output: ['Invoice', '#123'], [BOX_A, BOX_A]
    """
    out_words, out_boxes = [], []
    for phrase, box in zip(words, boxes):
        # str() guards against non-string OCR payloads.
        for token in str(phrase).split():
            out_words.append(token)
            out_boxes.append(box)
    return out_words, out_boxes
88
+
89
def align_labels(ocr_words, label_map):
    """Assign BIO tags to OCR words by matching ground-truth values.

    ``label_map`` maps a ground-truth string (e.g. the invoice total) to
    its entity class. Every window of OCR words that fuzzily matches the
    target's tokens is tagged ``B-CLASS`` / ``I-CLASS``; all other words
    stay ``"O"``.
    """
    tags = ["O"] * len(ocr_words)
    punct = ".,-:"

    for target_text, label_class in label_map.items():
        if not target_text:
            continue
        target_tokens = str(target_text).split()
        if not target_tokens:
            continue
        n_target = len(target_tokens)

        # Slide a window of n_target words across the OCR sequence.
        for i in range(len(ocr_words) - n_target + 1):
            window = ocr_words[i:i + n_target]

            match = True
            for ocr_tok, tgt_tok in zip(window, target_tokens):
                w_clean = ocr_tok.strip(punct)
                t_clean = tgt_tok.strip(punct)
                # BUGFIX: punctuation-only words strip to "" and
                # "" is a substring of everything, so they previously
                # matched any target token. Require both sides non-empty.
                if not w_clean or not t_clean:
                    match = False
                    break
                if w_clean not in t_clean and t_clean not in w_clean:
                    match = False
                    break

            if match:
                tags[i] = f"B-{label_class}"
                for k in range(1, n_target):
                    tags[i + k] = f"I-{label_class}"

    return tags
121
+
122
def load_unified_dataset(split="train", sample_size=None):
    """Load and preprocess the invoices-and-receipts OCR dataset.

    Parameters
    ----------
    split : str
        Dataset split to load (e.g. "train").
    sample_size : int | None
        If given, only the first ``sample_size`` rows are processed.

    Returns
    -------
    list[dict]
        One dict per usable example with keys ``image``, ``words``,
        ``bboxes`` (normalized to [0, 1000]) and ``ner_tags``.
    """
    print(f"🔄 Loading dataset 'mychen76/invoices-and-receipts_ocr_v1' ({split})...")
    dataset = load_dataset("mychen76/invoices-and-receipts_ocr_v1", split=split)

    if sample_size:
        dataset = dataset.select(range(sample_size))

    processed_data = []

    print("⚙️ Processing, Tokenizing, and Aligning...")
    for example in dataset:
        try:
            image = example['image']
            if image.mode != "RGB":
                image = image.convert("RGB")
            width, height = image.size

            # 1. Parse raw OCR output. FIX: parse the JSON payload once
            #    (it was previously json.loads'ed twice per example).
            raw_payload = json.loads(example['raw_data'])
            raw_words = safe_parse(raw_payload.get('ocr_words'))
            raw_boxes = safe_parse(raw_payload.get('ocr_boxes'))

            if not raw_words or not raw_boxes or len(raw_words) != len(raw_boxes):
                continue

            # 2. Normalize boxes, dropping words whose box is malformed.
            norm_boxes = []
            valid_words = []
            for i, box in enumerate(raw_boxes):
                nb = normalize_box(box, width, height)
                if nb:
                    norm_boxes.append(nb)
                    valid_words.append(raw_words[i])

            # 3. Split phrases into single tokens (box duplicated per token).
            final_words, final_boxes = tokenize_and_spread_boxes(valid_words, norm_boxes)

            # 4. Build {ground-truth value -> entity class} map.
            parsed_json = json.loads(example['parsed_data'])
            fields = safe_parse(parsed_json.get('json', {}))
            label_value_map = {}
            if isinstance(fields, dict):
                for k, v in fields.items():
                    if k in LABEL_MAPPING and v:
                        label_value_map[v] = LABEL_MAPPING[k]

            # 5. Align ground-truth values to OCR tokens as BIO tags.
            final_tags = align_labels(final_words, label_value_map)

            # Keep only examples with at least one non-"O" tag; all-"O"
            # examples add noise to training.
            if len(set(final_tags)) > 1:
                processed_data.append({
                    "image": image,
                    "words": final_words,
                    "bboxes": final_boxes,
                    "ner_tags": final_tags
                })

        except Exception:
            # Skip malformed rows rather than abort the whole load.
            continue

    print(f"✅ Successfully processed {len(processed_data)} examples.")
    return processed_data
185
+
186
if __name__ == "__main__":
    # Smoke test: process a small sample and report which tags appear.
    data = load_unified_dataset(sample_size=20)
    if not data:
        print("No valid examples found in sample.")
    else:
        print(f"\nSample 0 Words: {data[0]['words'][:10]}...")
        print(f"Sample 0 Tags: {data[0]['ner_tags'][:10]}...")

        found_tags = {tag for item in data for tag in item['ner_tags']}
        print(f"\nUnique Tags Found in Sample: {found_tags}")
src/extraction.py CHANGED
@@ -1,273 +1,123 @@
1
- import re
2
- from typing import List, Dict, Optional, Any
3
-
4
-
5
- def extract_dates(text: str) -> List[str]:
6
- if not text:
7
- return []
8
-
9
- dates = []
10
-
11
- pattern1 = r'\d{2}[/-]\d{2}[/-]\d{4}'
12
- pattern2 = r'\d{2}[/-]\d{2}[/-]\d{2}(?!\d)'
13
- pattern3 = r'\d{4}[/-]\d{2}[/-]\d{2}'
14
-
15
- dates.extend(re.findall(pattern1, text))
16
- dates.extend(re.findall(pattern2, text))
17
- dates.extend(re.findall(pattern3, text))
18
-
19
- dates = list(dict.fromkeys(dates))
20
- return dates
21
-
22
-
23
- def extract_amounts(text: str) -> List[float]:
24
- if not text:
25
- return []
26
- # Matches: 123.45, 1,234.56, $123.45, 123.45 RM
27
- pattern = r'(?:RM|Rs\.?|\$|€)?\s*\d{1,3}(?:,\d{3})*[.,]\d{2}'
28
- amounts_strings = (re.findall(pattern, text))
29
-
30
- amounts = []
31
- for amt_str in amounts_strings:
32
- amt_cleaned = re.sub(r'[^\d.,]', '', amt_str)
33
- amt_cleaned = amt_cleaned.replace(',', '.')
34
- try:
35
- amounts.append(float(amt_cleaned))
36
- except ValueError:
37
- continue
38
- return amounts
39
-
40
-
41
- def extract_total(text: str) -> Optional[float]:
42
- if not text:
43
- return None
44
-
45
- pattern = r'(?:TOTAL|GRAND\s*TOTAL|AMOUNT\s*DUE|BALANCE)\s*:?\s*(\d+[.,]\d{2})'
46
- match = re.search(pattern, text, re.IGNORECASE)
47
-
48
- if match:
49
- amount_str = match.group(1).replace(',', '.')
50
- return float(amount_str)
51
-
52
- return None
53
-
54
-
55
- def extract_vendor(text: str) -> Optional[str]:
56
- if not text:
57
- return None
58
-
59
- lines = text.strip().split('\n')
60
-
61
- company_suffixes = ['SDN BHD', 'INC', 'LTD', 'LLC', 'PLC', 'CORP', 'PTY', 'PVT']
62
-
63
- for line in lines:
64
- line = line.strip()
65
-
66
- # Skip empty or very short line
67
- if len(line) < 3:
68
- continue
69
-
70
- # Skip lines with only symbols
71
- if all(c in '*-=_#' for c in line.replace(' ', '')):
72
- continue
73
-
74
- for suffix in company_suffixes:
75
- if suffix in line.upper():
76
- return line
77
-
78
- # If we've gone through 10 lines and found nothing,
79
- # return the first substantial line
80
- # (Vendor is usually in first few lines)
81
-
82
- # Fallback: return first non-trivial line
83
- for line in lines[:10]:
84
- line = line.strip()
85
- if len(line) >= 3 and not all(c in '*-=_#' for c in line.replace(' ', '')):
86
- return line
87
- return None
88
-
89
-
90
- def extract_invoice_number(text: str) -> Optional[str]:
91
- if not text:
92
- return None
93
-
94
- # Look for invoice number patterns (alphanumeric with hyphens, 5+ chars)
95
- # Typically near invoice-related text
96
- lines = text.split('\n')
97
-
98
- for line in lines[:15]: # Check first 15 lines (invoice # is usually at top)
99
- # If line mentions anything invoice-related
100
- if any(keyword in line.lower() for keyword in ['nvoice', 'receipt', 'bill', 'no']):
101
- # Find alphanumeric patterns
102
- patterns = re.findall(r'[A-Z]{2,}[A-Z0-9\-]{3,}', line, re.IGNORECASE)
103
- for pattern in patterns:
104
- # Must be 5+ chars and contain both letters and numbers
105
- if (len(pattern) >= 5 and
106
- any(c.isdigit() for c in pattern) and
107
- any(c.isalpha() for c in pattern)):
108
- return pattern.upper()
109
-
110
- return None
111
-
112
-
113
- def extract_bill_to(text: str) -> Optional[Dict[str, str]]:
114
- if not text:
115
- return None
116
-
117
- bill_to = None
118
-
119
- # Normalize lines and remove empty lines
120
- lines = [line.strip() for line in text.splitlines() if line.strip()]
121
-
122
- # Possible headings
123
- headings = ['bill to', 'billed to', 'billing name', 'customer']
124
-
125
- bill_to_text = None
126
- for i, line in enumerate(lines):
127
- lower_line = line.lower()
128
- if any(h in lower_line for h in headings):
129
- # Capture text after colon or hyphen if present
130
- split_line = re.split(r'[:\-]', line, maxsplit=1)
131
- if len(split_line) > 1:
132
- bill_to_text = split_line[1].strip()
133
- else:
134
- # If name is on next line
135
- if i + 1 < len(lines):
136
- bill_to_text = lines[i + 1].strip()
137
- break
138
-
139
- if not bill_to_text:
140
- return None
141
-
142
- # Extract email if present
143
- email_match = re.search(r'[\w\.-]+@[\w\.-]+\.\w+', bill_to_text)
144
- email = email_match.group(0) if email_match else None
145
-
146
- # Remove email from name
147
- if email:
148
- bill_to_text = bill_to_text.replace(email, '').strip()
149
-
150
- if len(bill_to_text) > 2: # Basic validation
151
- bill_to = {"name": bill_to_text, "email": email}
152
-
153
- return bill_to
154
-
155
-
156
- def extract_line_items(text: str) -> List[Dict[str, Any]]:
157
- """
158
- Extract line items from receipt text more robustly.
159
- Handles:
160
- - Multi-line descriptions
161
- - Prices with or without currency symbols
162
- - Quantities in different formats
163
- - Missing decimals
164
-
165
- Args:
166
- text: Raw OCR text
167
-
168
- Returns:
169
- List of dictionaries with description, quantity, unit_price, total
170
- """
171
- items = []
172
- lines = text.split('\n')
173
-
174
- # Keywords to detect start/end of item section
175
- start_keywords = ['description', 'item', 'qty', 'price', 'amount']
176
- end_keywords = ['total', 'subtotal', 'tax', 'gst']
177
-
178
- # Detect section
179
- start_index = -1
180
- end_index = len(lines)
181
- for i, line in enumerate(lines):
182
- lower = line.lower()
183
- if start_index == -1 and any(k in lower for k in start_keywords):
184
- start_index = i + 1
185
- if start_index != -1 and any(k in lower for k in end_keywords):
186
- end_index = i
187
- break
188
-
189
- if start_index == -1:
190
- return []
191
-
192
- item_lines = lines[start_index:end_index]
193
-
194
- current_description = ""
195
- for line in item_lines:
196
- # Remove currency symbols, commas, etc.
197
- clean_line = re.sub(r'[^\d\.\s]', '', line)
198
-
199
- # Find all numbers (floats or integers)
200
- amounts_on_line = re.findall(r'\d+(?:\.\d+)?', clean_line)
201
-
202
- # Attempt to detect quantity at the start: "2 ", "3 x", etc.
203
- qty_match = re.match(r'^\s*(\d+)\s*(?:x)?', line)
204
- quantity = int(qty_match.group(1)) if qty_match else 1
205
-
206
- # Extract description by removing numbers and common symbols
207
- desc_part = re.sub(r'[\d\.\s]+', '', line).strip()
208
- if len(desc_part) > 0:
209
- if current_description:
210
- current_description += " " + desc_part
211
- else:
212
- current_description = desc_part
213
-
214
- # If there are numbers and a description, create item
215
- if amounts_on_line and current_description:
216
- try:
217
- # Heuristic: last number is total, second last is unit price
218
- item_total = float(amounts_on_line[-1])
219
- unit_price = float(amounts_on_line[-2]) if len(amounts_on_line) > 1 else item_total
220
-
221
- items.append({
222
- "description": current_description.strip(),
223
- "quantity": quantity,
224
- "unit_price": unit_price,
225
- "total": item_total
226
- })
227
- current_description = "" # reset for next item
228
- except ValueError:
229
- current_description = ""
230
- continue
231
-
232
- return items
233
-
234
-
235
- def structure_output(text: str) -> Dict[str, Any]:
236
- """
237
- Extract all information and return in the desired advanced format.
238
- """
239
-
240
- # Old fields
241
- date = extract_dates(text)[0] if extract_dates(text) else None
242
- total = extract_total(text)
243
-
244
- # New fields
245
- bill_to = extract_bill_to(text)
246
- items = extract_line_items(text)
247
- invoice_num = extract_invoice_number(text) # Renamed for clarity
248
-
249
- data = {
250
- "receipt_number": invoice_num,
251
- "date": date,
252
- "bill_to": bill_to,
253
- "items": items,
254
- "total_amount": total,
255
- "raw_text": text
256
- }
257
-
258
- # --- Confidence and Validation ---
259
- fields_to_check = ['receipt_number', 'date', 'bill_to', 'total_amount']
260
- extracted_fields = sum(1 for field in fields_to_check if data.get(field) is not None)
261
- if items: # Count items as an extracted field
262
- extracted_fields += 1
263
-
264
- data['extraction_confidence'] = int((extracted_fields / (len(fields_to_check) + 1)) * 100)
265
-
266
- # A more advanced validation
267
- items_total = sum(item.get('total', 0) for item in items)
268
- data['validation_passed'] = False
269
- if total is not None and abs(total - items_total) < 0.01: # Check if total matches sum of items
270
- data['validation_passed'] = True
271
-
272
- return data
273
-
 
1
+ # src/extraction.py
2
+
3
+ import re
4
+ from typing import List, Dict, Optional, Any
5
+
6
def extract_dates(text: str) -> List[str]:
    """Return every date-like token in *text*, de-duplicated, in order.

    Recognizes DD/MM/YYYY (or '-' separated, 2-4 digit year) and
    ISO-style YYYY-MM-DD forms.
    """
    if not text:
        return []
    patterns = (
        r'\b\d{1,2}[/-]\d{1,2}[/-]\d{2,4}\b',   # DD/MM/YYYY, D-M-YY, ...
        r'\b\d{4}[/-]\d{1,2}[/-]\d{1,2}\b',     # YYYY-MM-DD
    )
    found: List[str] = []
    for pattern in patterns:
        found.extend(re.findall(pattern, text))
    # dict.fromkeys de-duplicates while preserving first-seen order.
    return list(dict.fromkeys(found))
17
+
18
def extract_amounts(text: str) -> List[float]:
    """Extract all monetary-looking values (e.g. '1,234.56') as floats."""
    if not text:
        return []
    money_re = r'\b\d{1,3}(?:,\d{3})*\.\d{2}\b'
    results: List[float] = []
    for token in re.findall(money_re, text):
        try:
            results.append(float(token.replace(',', '')))
        except ValueError:
            # Shouldn't happen given the regex, but stay defensive.
            continue
    return results
32
+
33
+ def extract_total(text: str) -> Optional[float]:
34
+ """
35
+ Robust total extraction looking for keywords + largest number context.
36
+ """
37
+ if not text: return None
38
+
39
+ # 1. Try specific "Total" keywords first
40
+ # Looks for "Total: 123.45" or "Total Amount $123.45"
41
+ pattern = r'(?:TOTAL|AMOUNT DUE|GRAND TOTAL|BALANCE|PAYABLE)[\w\s]*[:$]?\s*([\d,]+\.\d{2})'
42
+ matches = re.findall(pattern, text, re.IGNORECASE)
43
+
44
+ if matches:
45
+ # Return the last match (often the grand total at bottom)
46
+ try:
47
+ return float(matches[-1].replace(',', ''))
48
+ except ValueError:
49
+ pass
50
+
51
+ # 2. Fallback: Find the largest monetary value in the bottom half of text
52
+ # (Risky, but better than None)
53
+ amounts = extract_amounts(text)
54
+ if amounts:
55
+ return max(amounts)
56
+
57
+ return None
58
+
59
def extract_vendor(text: str) -> Optional[str]:
    """Guess the vendor name from the top of the receipt.

    Prefers a top-10 line carrying a company suffix (LTD, INC, ...);
    otherwise falls back to the first substantial, non-date line.
    """
    if not text:
        return None

    lines = text.strip().split('\n')
    suffixes = ('SDN BHD', 'INC', 'LTD', 'LLC', 'PLC', 'CORP', 'PTY', 'PVT', 'LIMITED')

    for line in lines[:10]:
        if any(sfx in line.upper() for sfx in suffixes):
            return line.strip()

    # Fallback: first non-trivial line in the header that has no date.
    for line in lines[:5]:
        if len(line.strip()) > 3 and not re.search(r'\d{2}/\d{2}', line):
            return line.strip()
    return None
74
+
75
def extract_invoice_number(text: str) -> Optional[str]:
    """Extract an invoice/receipt identifier from OCR text.

    Strategy 1 looks for an explicitly labelled number
    ("Invoice No: 12345", "Receipt # AB-123"); Strategy 2 scans the
    first 20 lines for an uppercase alphanumeric token near an
    invoice-related keyword. Returns None if nothing plausible is found.
    """
    if not text:
        return None

    # Strategy 1: keyword-labelled identifier.
    keyword_pattern = r'(?:INVOICE|BILL|RECEIPT)\s*(?:NO|NUMBER|#|NUM)?[\s\.:-]*([A-Z0-9\-/]{3,})'
    for match in re.finditer(keyword_pattern, text, re.IGNORECASE):
        candidate = match.group(1)
        # BUGFIX: the first capture used to be returned unchecked, so
        # "Invoice Date" yielded "Date". An identifier must contain at
        # least one digit.
        if any(c.isdigit() for c in candidate):
            return candidate

    # Strategy 2: standalone uppercase token on an invoice-related line.
    for line in text.split('\n')[:20]:
        if any(k in line.lower() for k in ['invoice', 'no', '#']):
            token_match = re.search(r'\b([A-Z0-9-]{4,})\b', line)
            if token_match and any(c.isdigit() for c in token_match.group(1)):
                return token_match.group(1)

    return None
100
+
101
def extract_bill_to(text: str) -> Optional[Dict[str, str]]:
    """Pull the customer name from a 'Bill To:' line, if present.

    Returns {"name": ..., "email": None} or None when no such line exists.
    """
    if not text:
        return None
    hit = re.search(r'(?:BILL|BILLED)\s*TO[:\s]+([^\n]+)', text, re.IGNORECASE)
    if hit is None:
        return None
    return {"name": hit.group(1).strip(), "email": None}
110
+
111
def extract_line_items(text: str) -> List[Dict[str, Any]]:
    """Placeholder: line-item parsing is not implemented yet.

    Intentionally returns no items; kept so callers have a stable API.
    """
    return []
114
+
115
def structure_output(text: str) -> Dict[str, Any]:
    """Run the rule-based extractors over *text* and bundle the results.

    Legacy wrapper kept for the rule-based-only pipeline; returns keys
    receipt_number, date, total_amount, vendor and raw_text.
    """
    # FIX: extract_dates was previously called twice (once for the
    # truthiness check, once for the value); compute it a single time.
    dates = extract_dates(text)
    return {
        "receipt_number": extract_invoice_number(text),
        "date": dates[0] if dates else None,
        "total_amount": extract_total(text),
        "vendor": extract_vendor(text),
        "raw_text": text
    }
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
src/ml_extraction.py CHANGED
@@ -1,176 +1,144 @@
1
- # src/ml_extraction.py
2
-
3
- import torch
4
- from transformers import LayoutLMv3Processor, LayoutLMv3ForTokenClassification
5
- from PIL import Image
6
- import pytesseract
7
- from typing import List, Dict, Any
8
- import re
9
-
10
- # --- CONFIGURATION ---
11
- # The local path where we expect to find/save the model
12
- LOCAL_MODEL_PATH = "./models/layoutlmv3-sroie-best"
13
- # The Hugging Face Hub ID for the model to download if not found locally
14
- HUB_MODEL_ID = "GSoumyajit2005/layoutlmv3-sroie-invoice-extraction"
15
-
16
- # --- Function to load the model ---
17
- def load_model_and_processor(model_path, hub_id):
18
- """
19
- Tries to load the model from a local path. If it fails,
20
- it downloads it from the Hugging Face Hub.
21
- """
22
- try:
23
- # Try loading from local path first
24
- print(f"Attempting to load model from local path: {model_path}...")
25
- processor = LayoutLMv3Processor.from_pretrained(model_path)
26
- model = LayoutLMv3ForTokenClassification.from_pretrained(model_path)
27
- print("✅ Model loaded successfully from local path.")
28
- except OSError:
29
- # If it fails, download from the Hub
30
- print(f"Model not found locally. Downloading from Hugging Face Hub: {hub_id}...")
31
- from huggingface_hub import snapshot_download
32
- # Download the model files and save them to the local path
33
- snapshot_download(repo_id=hub_id, local_dir=model_path, local_dir_use_symlinks=False)
34
- # Now load from the local path again
35
- processor = LayoutLMv3Processor.from_pretrained(model_path)
36
- model = LayoutLMv3ForTokenClassification.from_pretrained(model_path)
37
- print("✅ Model downloaded and loaded successfully from the Hub.")
38
-
39
- return model, processor
40
-
41
- # --- Load the model and processor only ONCE when the module is imported ---
42
- MODEL, PROCESSOR = load_model_and_processor(LOCAL_MODEL_PATH, HUB_MODEL_ID)
43
-
44
- if MODEL and PROCESSOR:
45
- DEVICE = torch.device("cuda" if torch.cuda.is_available() else "cpu")
46
- MODEL.to(DEVICE)
47
- MODEL.eval()
48
- print(f"ML Model is ready on device: {DEVICE}")
49
- else:
50
- DEVICE = None
51
- print("❌ Could not load ML model.")
52
-
53
- # --- Helper Function to group entities ---
54
- def _process_predictions(words: List[str], unnormalized_boxes: List[List[int]], encoding, predictions: List[int], id2label: Dict[int, str]) -> Dict[str, Any]:
55
- word_ids = encoding.word_ids(batch_index=0)
56
-
57
- word_level_preds = {}
58
- for idx, word_id in enumerate(word_ids):
59
- if word_id is not None:
60
- label_id = predictions[idx]
61
- if label_id != -100:
62
- if word_id not in word_level_preds:
63
- word_level_preds[word_id] = id2label[label_id]
64
-
65
- entities = {}
66
- for word_idx, label in word_level_preds.items():
67
- if label == 'O': continue
68
-
69
- entity_type = label[2:]
70
- word = words[word_idx]
71
-
72
- if label.startswith('B-'):
73
- entities[entity_type] = {"text": word, "bbox": [unnormalized_boxes[word_idx]]}
74
- elif label.startswith('I-') and entity_type in entities:
75
- if word_idx > 0 and word_level_preds.get(word_idx - 1) in (f'B-{entity_type}', f'I-{entity_type}'):
76
- entities[entity_type]['text'] += " " + word
77
- entities[entity_type]['bbox'].append(unnormalized_boxes[word_idx])
78
- else:
79
- entities[entity_type] = {"text": word, "bbox": [unnormalized_boxes[word_idx]]}
80
-
81
- # Clean up the final text field
82
- for entity in entities.values():
83
- entity['text'] = entity['text'].strip()
84
-
85
- return entities
86
-
87
- # --- Main Function to be called from the pipeline ---
88
- def extract_ml_based(image_path: str) -> Dict[str, Any]:
89
- """
90
- Performs end-to-end ML-based extraction on a single image.
91
-
92
- Args:
93
- image_path: The path to the invoice image.
94
-
95
- Returns:
96
- A dictionary containing the extracted entities.
97
- """
98
- if not MODEL or not PROCESSOR:
99
- raise RuntimeError("ML model is not loaded. Cannot perform extraction.")
100
-
101
- # 1. Load Image
102
- image = Image.open(image_path).convert("RGB")
103
- width, height = image.size
104
-
105
- # 2. Perform OCR
106
- ocr_data = pytesseract.image_to_data(image, output_type=pytesseract.Output.DICT)
107
- n_boxes = len(ocr_data['level'])
108
- words = []
109
- unnormalized_boxes = []
110
- for i in range(n_boxes):
111
- if int(ocr_data['conf'][i]) > 60 and ocr_data['text'][i].strip() != '':
112
- word = ocr_data['text'][i]
113
- (x, y, w, h) = (ocr_data['left'][i], ocr_data['top'][i], ocr_data['width'][i], ocr_data['height'][i])
114
- words.append(word)
115
- unnormalized_boxes.append([x, y, x + w, y + h])
116
-
117
- # 3. Normalize Boxes and Prepare Inputs
118
- normalized_boxes = []
119
- for box in unnormalized_boxes:
120
- normalized_boxes.append([
121
- int(1000 * (box[0] / width)),
122
- int(1000 * (box[1] / height)),
123
- int(1000 * (box[2] / width)),
124
- int(1000 * (box[3] / height)),
125
- ])
126
-
127
- # 4. Process with LayoutLMv3 Processor
128
- encoding = PROCESSOR(
129
- image,
130
- text=words,
131
- boxes=normalized_boxes,
132
- truncation=True,
133
- max_length=512,
134
- return_tensors="pt"
135
- ).to(DEVICE)
136
-
137
- # 5. Run Inference
138
- with torch.no_grad():
139
- outputs = MODEL(**encoding)
140
-
141
- predictions = outputs.logits.argmax(-1).squeeze().tolist()
142
-
143
- # 6. Post-process to get final entities
144
- extracted_entities = _process_predictions(words, unnormalized_boxes, encoding, predictions, MODEL.config.id2label)
145
-
146
- # 7. Format the output to be consistent with your rule-based output
147
- # Format the output to be consistent with the desired UI structure
148
- # Format the output to be a superset of all possible fields
149
- final_output = {
150
- # --- Standard UI Fields ---
151
- "receipt_number": None, # SROIE doesn't train for this. Your regex model will provide it.
152
- "date": extracted_entities.get("DATE", {}).get("text"),
153
- "bill_to": None, # SROIE doesn't train for this. Your regex model will provide it.
154
- "items": [], # SROIE doesn't train for line items.
155
- "total_amount": None,
156
-
157
- # --- Additional Fields from ML Model ---
158
- "vendor": extracted_entities.get("COMPANY", {}).get("text"), # The ML model finds 'COMPANY'
159
- "address": extracted_entities.get("ADDRESS", {}).get("text"),
160
-
161
- # --- Debugging Info ---
162
- "raw_text": " ".join(words),
163
- "raw_ocr_words": words,
164
- "raw_predictions": extracted_entities
165
- }
166
-
167
- # Safely extract and convert total
168
- total_text = extracted_entities.get("TOTAL", {}).get("text")
169
- if total_text:
170
- try:
171
- cleaned_total = re.sub(r'[^\d.]', '', total_text)
172
- final_output["total_amount"] = float(cleaned_total)
173
- except (ValueError, TypeError):
174
- final_output["total_amount"] = None
175
-
176
  return final_output
 
1
+ # src/ml_extraction.py
2
+
3
+ import torch
4
+ from transformers import LayoutLMv3Processor, LayoutLMv3ForTokenClassification
5
+ from PIL import Image
6
+ import pytesseract
7
+ from typing import List, Dict, Any
8
+ import re
9
+ import numpy as np
10
+ from extraction import extract_invoice_number, extract_total
11
+
12
+ # --- CONFIGURATION ---
13
+ LOCAL_MODEL_PATH = "./models/layoutlmv3-generalized"
14
+ HUB_MODEL_ID = "GSoumyajit2005/layoutlmv3-sroie-invoice-extraction"
15
+
16
+ # --- Load Model ---
17
def load_model_and_processor(model_path, hub_id):
    """Load the LayoutLMv3 processor/model pair, downloading if needed.

    Tries *model_path* on disk first; on OSError pulls *hub_id* from the
    Hugging Face Hub into *model_path* and loads from there.
    """
    def _from_disk(path):
        # Both pieces must come from the same directory.
        return (LayoutLMv3Processor.from_pretrained(path),
                LayoutLMv3ForTokenClassification.from_pretrained(path))

    try:
        print(f"Attempting to load model from local path: {model_path}...")
        processor, model = _from_disk(model_path)
        print("✅ Model loaded successfully from local path.")
    except OSError:
        print(f"Model not found locally. Downloading from Hub: {hub_id}...")
        from huggingface_hub import snapshot_download
        snapshot_download(repo_id=hub_id, local_dir=model_path, local_dir_use_symlinks=False)
        processor, model = _from_disk(model_path)
        print("✅ Model downloaded and loaded successfully.")
    return model, processor
31
+
32
# Load the model/processor exactly once at import time so every caller
# reuses the same weights.
MODEL, PROCESSOR = load_model_and_processor(LOCAL_MODEL_PATH, HUB_MODEL_ID)

if MODEL and PROCESSOR:
    # Prefer GPU when available; the model is frozen for inference only.
    DEVICE = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    MODEL.to(DEVICE)
    MODEL.eval()
    print(f"ML Model is ready on device: {DEVICE}")
else:
    # NOTE(review): load_model_and_processor either returns both objects
    # or raises, so this branch appears unreachable as written — confirm
    # whether it is still needed.
    DEVICE = None
    print("❌ Could not load ML model.")
42
+
43
+ def _process_predictions(words, unnormalized_boxes, encoding, predictions, id2label):
44
+ word_ids = encoding.word_ids(batch_index=0)
45
+ word_level_preds = {}
46
+ for idx, word_id in enumerate(word_ids):
47
+ if word_id is not None:
48
+ label_id = predictions[idx]
49
+ if label_id != -100:
50
+ if word_id not in word_level_preds:
51
+ word_level_preds[word_id] = id2label[label_id]
52
+
53
+ entities = {}
54
+ for word_idx, label in word_level_preds.items():
55
+ if label == 'O': continue
56
+ entity_type = label[2:]
57
+ word = words[word_idx]
58
+
59
+ if label.startswith('B-'):
60
+ entities[entity_type] = {"text": word, "bbox": [unnormalized_boxes[word_idx]]}
61
+ elif label.startswith('I-') and entity_type in entities:
62
+ entities[entity_type]['text'] += " " + word
63
+ entities[entity_type]['bbox'].append(unnormalized_boxes[word_idx])
64
+
65
+ for entity in entities.values():
66
+ entity['text'] = entity['text'].strip()
67
+
68
+ return entities
69
+
70
def extract_ml_based(image_path: str) -> Dict[str, Any]:
    """End-to-end ML extraction: OCR an image, run LayoutLMv3, and
    post-process predictions into a flat result dict.

    Falls back to the rule-based extractors for the total and the
    receipt number when the model yields neither.

    Raises RuntimeError when the module-level model failed to load.
    """
    if not MODEL or not PROCESSOR:
        raise RuntimeError("ML model is not loaded.")

    # 1. Load image and OCR it with Tesseract.
    image = Image.open(image_path).convert("RGB")
    width, height = image.size
    ocr_data = pytesseract.image_to_data(image, output_type=pytesseract.Output.DICT)

    words = []
    unnormalized_boxes = []
    for i in range(len(ocr_data['level'])):
        # Confidence cut-off of 30 keeps low-confidence words too.
        if int(ocr_data['conf'][i]) > 30 and ocr_data['text'][i].strip() != '':
            words.append(ocr_data['text'][i])
            # NOTE(review): this stores [left, top, width, height], not
            # corner coordinates — the "bbox" values exposed through
            # _process_predictions are therefore in x/y/w/h form, unlike
            # the normalized corner boxes below. Confirm consumers expect
            # that.
            unnormalized_boxes.append([
                ocr_data['left'][i], ocr_data['top'][i],
                ocr_data['width'][i], ocr_data['height'][i]
            ])

    raw_text = " ".join(words)

    # 2. Normalize boxes to LayoutLMv3's [0, 1000] space, clamped so a
    #    rounding overshoot can never leave the valid range.
    normalized_boxes = []
    for box in unnormalized_boxes:
        x, y, w, h = box
        x0, y0, x1, y1 = x, y, x + w, y + h

        normalized_boxes.append([
            max(0, min(1000, int(1000 * (x0 / width)))),
            max(0, min(1000, int(1000 * (y0 / height)))),
            max(0, min(1000, int(1000 * (x1 / width)))),
            max(0, min(1000, int(1000 * (y1 / height)))),
        ])

    # 3. Inference (inputs truncated to the model's 512-token window).
    encoding = PROCESSOR(
        image, text=words, boxes=normalized_boxes,
        truncation=True, max_length=512, return_tensors="pt"
    ).to(DEVICE)

    with torch.no_grad():
        outputs = MODEL(**encoding)

    predictions = outputs.logits.argmax(-1).squeeze().tolist()
    extracted_entities = _process_predictions(words, unnormalized_boxes, encoding, predictions, MODEL.config.id2label)

    # 4. Flatten model entities into the pipeline's output schema.
    final_output = {
        "vendor": extracted_entities.get("COMPANY", {}).get("text"),
        "date": extracted_entities.get("DATE", {}).get("text"),
        "address": extracted_entities.get("ADDRESS", {}).get("text"),
        "receipt_number": extracted_entities.get("INVOICE_NO", {}).get("text"),
        "bill_to": extracted_entities.get("BILL_TO", {}).get("text"),
        "total_amount": None,
        "items": [],
        "raw_text": raw_text
    }

    # Parse the model's TOTAL text ("$1,234.56" -> 1234.56); on failure
    # leave total_amount as None so the rule-based fallback can run.
    ml_total = extracted_entities.get("TOTAL", {}).get("text")
    if ml_total:
        try:
            cleaned = re.sub(r'[^\d.,]', '', ml_total).replace(',', '.')
            final_output["total_amount"] = float(cleaned)
        except (ValueError, TypeError):
            pass

    # Rule-based fallbacks over the raw OCR text.
    if final_output["total_amount"] is None:
        final_output["total_amount"] = extract_total(raw_text)

    if not final_output["receipt_number"]:
        final_output["receipt_number"] = extract_invoice_number(raw_text)

    return final_output
src/ocr.py CHANGED
@@ -1,15 +1,15 @@
1
- import pytesseract
2
- import numpy as np
3
- from typing import Optional
4
-
5
- pytesseract.pytesseract.tesseract_cmd = r'C:\\Program Files\\Tesseract-OCR\\tesseract.exe'
6
-
7
- def extract_text(image: np.ndarray, lang: str='eng', config: str='--psm 11') -> str:
8
- if image is None:
9
- raise ValueError("Input image is None")
10
- text = pytesseract.image_to_string(image, lang=lang, config=config)
11
- return text.strip()
12
-
13
- def extract_text_with_boxes(image):
14
- pass
15
-
 
1
+ import pytesseract
2
+ import numpy as np
3
+ from typing import Optional
4
+
5
+ #pytesseract.pytesseract.tesseract_cmd = r'C:\\Program Files\\Tesseract-OCR\\tesseract.exe'
6
+
7
def extract_text(image: np.ndarray, lang: str = 'eng', config: str = '--psm 11') -> str:
    """Run Tesseract OCR over *image* and return the recognized text, stripped.

    Args:
        image: Image array to OCR (as produced by the preprocessing module).
        lang: Tesseract language pack to use.
        config: Extra Tesseract CLI flags (page-segmentation mode by default).

    Raises:
        ValueError: If *image* is None.
    """
    if image is None:
        raise ValueError("Input image is None")
    recognized = pytesseract.image_to_string(image, lang=lang, config=config)
    return recognized.strip()
12
+
13
def extract_text_with_boxes(image):
    """Placeholder for word-level OCR with bounding boxes (not yet implemented).

    Always returns None, exactly like the original stub.
    """
    return None
15
+
src/pipeline.py CHANGED
@@ -1,151 +1,151 @@
1
- """
2
- Main invoice processing pipeline
3
- Orchestrates preprocessing, OCR, and extraction
4
- """
5
-
6
- from typing import Dict, Any, Optional
7
- from pathlib import Path
8
- import json
9
-
10
- # Make sure all your modules are imported
11
- from preprocessing import load_image, convert_to_grayscale, remove_noise
12
- from ocr import extract_text
13
- from extraction import structure_output
14
- from ml_extraction import extract_ml_based
15
-
16
- def process_invoice(image_path: str,
17
- method: str = 'ml', # <-- New parameter: 'ml' or 'rules'
18
- save_results: bool = False,
19
- output_dir: str = 'outputs') -> Dict[str, Any]:
20
- """
21
- Process an invoice image using either rule-based or ML-based extraction.
22
-
23
- Args:
24
- image_path: Path to the invoice image.
25
- method: The extraction method to use ('ml' or 'rules'). Default is 'ml'.
26
- save_results: Whether to save JSON results to a file.
27
- output_dir: Directory to save results.
28
-
29
- Returns:
30
- A dictionary with the extracted invoice data.
31
- """
32
- if not Path(image_path).exists():
33
- raise FileNotFoundError(f"Image not found at path: {image_path}")
34
-
35
- print(f"Processing with '{method}' method...")
36
-
37
- if method == 'ml':
38
- # --- ML-Based Extraction ---
39
- try:
40
- # The ml_extraction function handles everything internally
41
- structured_data = extract_ml_based(image_path)
42
- except Exception as e:
43
- raise ValueError(f"Error during ML-based extraction: {e}")
44
-
45
- elif method == 'rules':
46
- # --- Rule-Based Extraction (Your original logic) ---
47
- try:
48
- image = load_image(image_path)
49
- gray_image = convert_to_grayscale(image)
50
- preprocessed_image = remove_noise(gray_image, kernel_size=3)
51
- text = extract_text(preprocessed_image, config='--psm 6')
52
- structured_data = structure_output(text) # Calls your old extraction.py
53
- except Exception as e:
54
- raise ValueError(f"Error during rule-based extraction: {e}")
55
-
56
- else:
57
- raise ValueError(f"Unknown extraction method: '{method}'. Choose 'ml' or 'rules'.")
58
-
59
- # --- Saving Logic (remains the same) ---
60
- if save_results:
61
- output_path = Path(output_dir)
62
- output_path.mkdir(parents=True, exist_ok=True)
63
- json_path = output_path / (Path(image_path).stem + f"_{method}.json") # Add method to filename
64
- try:
65
- with open(json_path, 'w', encoding='utf-8') as f:
66
- json.dump(structured_data, f, indent=2, ensure_ascii=False)
67
- except Exception as e:
68
- raise IOError(f"Error saving results to {json_path}: {e}")
69
-
70
- return structured_data
71
-
72
-
73
- def process_batch(image_folder: str, output_dir: str = 'outputs') -> list:
74
- """Process multiple invoices in a folder""" # Corrected indentation
75
- results = []
76
-
77
- supported_extensions = ['*.jpg', '*.png', '*.jpeg']
78
-
79
- for ext in supported_extensions:
80
- for img_file in Path(image_folder).glob(ext):
81
- print(f"🔄 Processing: {img_file}")
82
- try:
83
- result = process_invoice(str(img_file), save_results=True, output_dir=output_dir)
84
- results.append(result)
85
- except Exception as e:
86
- print(f"❌ Error processing {img_file}: {e}")
87
-
88
- print(f"\n🎉 Batch processing complete! {len(results)} invoices processed.")
89
- return results
90
-
91
-
92
- def main():
93
- """Command-line interface for invoice processing"""
94
- import argparse
95
-
96
- parser = argparse.ArgumentParser(
97
- description='Process invoice images or folders and extract structured data.',
98
- formatter_class=argparse.RawDescriptionHelpFormatter,
99
- epilog="""
100
- Examples:
101
- # Process a single invoice
102
- python src/pipeline.py data/raw/receipt1.jpg
103
-
104
- # Process and save a single invoice
105
- python src/pipeline.py data/raw/receipt1.jpg --save
106
-
107
- # Process an entire folder of invoices
108
- python src/pipeline.py data/raw --save --output results/
109
- """
110
- )
111
-
112
- # Corrected: Single 'path' argument
113
- parser.add_argument('path', help='Path to an invoice image or a folder of images')
114
- parser.add_argument('--save', action='store_true', help='Save results to JSON files')
115
- parser.add_argument('--output', default='outputs', help='Output directory for JSON files')
116
- parser.add_argument('--method', default='ml', choices=['ml', 'rules'], help="Extraction method: 'ml' or 'rules'")
117
-
118
- args = parser.parse_args()
119
-
120
- try:
121
- # Check if path is a directory or a file
122
- if Path(args.path).is_dir():
123
- process_batch(args.path, output_dir=args.output)
124
- elif Path(args.path).is_file():
125
- # Corrected: Use args.path
126
- print(f"🔄 Processing: {args.path}")
127
- result = process_invoice(args.path, method=args.method, save_results=args.save, output_dir=args.output)
128
-
129
- print("\n📊 Extracted Data:")
130
- print("=" * 60)
131
- print(f"Vendor: {result.get('vendor', 'N/A')}")
132
- print(f"Invoice Number: {result.get('invoice_number', 'N/A')}")
133
- print(f"Date: {result.get('date', 'N/A')}")
134
- print(f"Total: ${result.get('total', 0.0)}")
135
- print("=" * 60)
136
-
137
- if args.save:
138
- print(f"\n💾 JSON saved to: {args.output}/{Path(args.path).stem}.json")
139
- else:
140
- raise FileNotFoundError(f"Path does not exist: {args.path}")
141
-
142
- except Exception as e:
143
- print(f"❌ An error occurred: {e}")
144
- return 1
145
-
146
- return 0
147
-
148
-
149
- if __name__ == '__main__':
150
- import sys
151
  sys.exit(main())
 
1
+ """
2
+ Main invoice processing pipeline
3
+ Orchestrates preprocessing, OCR, and extraction
4
+ """
5
+
6
+ from typing import Dict, Any, Optional
7
+ from pathlib import Path
8
+ import json
9
+
10
+ # Make sure all your modules are imported
11
+ from preprocessing import load_image, convert_to_grayscale, remove_noise
12
+ from ocr import extract_text
13
+ from extraction import structure_output
14
+ from ml_extraction import extract_ml_based
15
+
16
def process_invoice(image_path: str,
                    method: str = 'ml',  # extraction backend: 'ml' or 'rules'
                    save_results: bool = False,
                    output_dir: str = 'outputs') -> Dict[str, Any]:
    """
    Process an invoice image using either rule-based or ML-based extraction.

    Args:
        image_path: Path to the invoice image.
        method: The extraction method to use ('ml' or 'rules'). Default is 'ml'.
        save_results: Whether to save JSON results to a file.
        output_dir: Directory to save results.

    Returns:
        A dictionary with the extracted invoice data.

    Raises:
        FileNotFoundError: If *image_path* does not exist.
        ValueError: If extraction fails or *method* is unknown.
        IOError: If the results cannot be written to disk.
    """
    if not Path(image_path).exists():
        raise FileNotFoundError(f"Image not found at path: {image_path}")

    print(f"Processing with '{method}' method...")

    if method == 'ml':
        # --- ML-Based Extraction ---
        try:
            # The ml_extraction function handles everything internally
            structured_data = extract_ml_based(image_path)
        except Exception as e:
            # Chain the original exception so the real cause stays visible.
            raise ValueError(f"Error during ML-based extraction: {e}") from e

    elif method == 'rules':
        # --- Rule-Based Extraction: preprocess -> OCR -> regex structuring ---
        try:
            image = load_image(image_path)
            gray_image = convert_to_grayscale(image)
            preprocessed_image = remove_noise(gray_image, kernel_size=3)
            text = extract_text(preprocessed_image, config='--psm 6')
            structured_data = structure_output(text)
        except Exception as e:
            raise ValueError(f"Error during rule-based extraction: {e}") from e

    else:
        raise ValueError(f"Unknown extraction method: '{method}'. Choose 'ml' or 'rules'.")

    # --- Saving Logic ---
    if save_results:
        output_path = Path(output_dir)
        output_path.mkdir(parents=True, exist_ok=True)
        # Method is embedded in the filename so 'ml' and 'rules' runs don't clobber each other.
        json_path = output_path / (Path(image_path).stem + f"_{method}.json")
        try:
            with open(json_path, 'w', encoding='utf-8') as f:
                json.dump(structured_data, f, indent=2, ensure_ascii=False)
        except Exception as e:
            raise IOError(f"Error saving results to {json_path}: {e}") from e

    return structured_data
71
+
72
+
73
def process_batch(image_folder: str, output_dir: str = 'outputs', method: str = 'ml') -> list:
    """Process every supported invoice image found in *image_folder*.

    Args:
        image_folder: Directory containing invoice images (.jpg/.jpeg/.png).
        output_dir: Directory where per-invoice JSON results are written.
        method: Extraction method forwarded to process_invoice ('ml' or 'rules').
            New keyword with the previous implicit default, so existing callers
            are unaffected; the batch path previously could not select 'rules'.

    Returns:
        List of result dictionaries for invoices that processed successfully.
    """
    results = []

    supported_extensions = ['*.jpg', '*.png', '*.jpeg']

    for ext in supported_extensions:
        for img_file in Path(image_folder).glob(ext):
            print(f"🔄 Processing: {img_file}")
            try:
                result = process_invoice(str(img_file), method=method, save_results=True, output_dir=output_dir)
                results.append(result)
            except Exception as e:
                # Best-effort batch: one bad scan must not abort the whole run.
                print(f"❌ Error processing {img_file}: {e}")

    print(f"\n🎉 Batch processing complete! {len(results)} invoices processed.")
    return results
90
+
91
+
92
def main():
    """Command-line interface for invoice processing.

    Returns:
        0 on success, 1 on any error (suitable for sys.exit).
    """
    import argparse

    parser = argparse.ArgumentParser(
        description='Process invoice images or folders and extract structured data.',
        formatter_class=argparse.RawDescriptionHelpFormatter,
        epilog="""
Examples:
  # Process a single invoice
  python src/pipeline.py data/raw/receipt1.jpg

  # Process and save a single invoice
  python src/pipeline.py data/raw/receipt1.jpg --save

  # Process an entire folder of invoices
  python src/pipeline.py data/raw --save --output results/
"""
    )

    parser.add_argument('path', help='Path to an invoice image or a folder of images')
    parser.add_argument('--save', action='store_true', help='Save results to JSON files')
    parser.add_argument('--output', default='outputs', help='Output directory for JSON files')
    parser.add_argument('--method', default='ml', choices=['ml', 'rules'], help="Extraction method: 'ml' or 'rules'")

    args = parser.parse_args()

    try:
        # Dispatch on whether the path is a folder (batch) or a single file.
        if Path(args.path).is_dir():
            process_batch(args.path, output_dir=args.output)
        elif Path(args.path).is_file():
            print(f"🔄 Processing: {args.path}")
            result = process_invoice(args.path, method=args.method, save_results=args.save, output_dir=args.output)

            print("\n📊 Extracted Data:")
            print("=" * 60)
            print(f"Vendor: {result.get('vendor', 'N/A')}")
            # FIX: the ML extractor reports the number under 'receipt_number',
            # so fall back to it when 'invoice_number' is absent.
            print(f"Invoice Number: {result.get('invoice_number') or result.get('receipt_number', 'N/A')}")
            print(f"Date: {result.get('date', 'N/A')}")
            print(f"Total: ${result.get('total_amount', 0.0)}")
            print("=" * 60)

            if args.save:
                # FIX: process_invoice appends '_<method>' to the saved filename;
                # print the path of the file that was actually written.
                print(f"\n💾 JSON saved to: {args.output}/{Path(args.path).stem}_{args.method}.json")
        else:
            raise FileNotFoundError(f"Path does not exist: {args.path}")

    except Exception as e:
        print(f"❌ An error occurred: {e}")
        return 1

    return 0
147
+
148
+
149
if __name__ == '__main__':
    # Script entry point: propagate main()'s exit code (0/1) to the shell.
    import sys
    sys.exit(main())
src/preprocessing.py CHANGED
@@ -1,78 +1,78 @@
1
- import cv2
2
- import numpy as np
3
- from pathlib import Path
4
-
5
-
6
-
7
- def load_image(image_path: str) -> np.ndarray:
8
- if not Path(image_path).exists():
9
- raise FileNotFoundError(f"Image not found : {image_path}")
10
- image = cv2.imread(image_path)
11
- if image is None:
12
- raise ValueError(f"Could not load image: {image_path}")
13
- image = cv2.cvtColor(image, cv2.COLOR_BGR2RGB)
14
- return image
15
-
16
-
17
- def convert_to_grayscale(image: np.ndarray) -> np.ndarray:
18
- if image is None:
19
- raise ValueError(f"Image is None, cannot convert to grayscale")
20
- if len(image.shape) ==2:
21
- return image
22
- return cv2.cvtColor(image, cv2.COLOR_RGB2GRAY)
23
-
24
-
25
- def remove_noise(image: np.ndarray, kernel_size: int = 3) -> np.ndarray:
26
- if image is None:
27
- raise ValueError(f"Image is None, cannot remove noise")
28
- if kernel_size <= 0:
29
- raise ValueError("Kernel size must be positive")
30
- if kernel_size % 2 == 0:
31
- raise ValueError("Kernel size must be odd")
32
- denoised_image = cv2.GaussianBlur(image, (kernel_size, kernel_size), 0)
33
- return denoised_image
34
-
35
-
36
- def binarize(image: np.ndarray, method: str = 'adaptive', block_size: int=11, C: int=2) -> np.ndarray:
37
- if image is None:
38
- raise ValueError(f"Image is None, cannot binarize")
39
- if image.ndim != 2:
40
- raise ValueError("Input image must be grayscale for binarization")
41
- if method == 'simple':
42
- _, binary_image = cv2.threshold(image, 127, 255, cv2.THRESH_BINARY)
43
- elif method == 'adaptive':
44
- binary_image = cv2.adaptiveThreshold(image, 255, cv2.ADAPTIVE_THRESH_GAUSSIAN_C,cv2.THRESH_BINARY, block_size, C)
45
- else:
46
- raise ValueError(f"Unknown binarization method: {method}")
47
- return binary_image
48
-
49
-
50
- def deskew(image):
51
- pass
52
-
53
-
54
- def preprocess_pipeline(image: np.ndarray,
55
- steps: list = ['grayscale', 'denoise', 'binarize'],
56
- denoise_kernel: int = 3,
57
- binarize_method: str = 'adaptive',
58
- binarize_block_size: int = 11,
59
- binarize_C: int = 2) -> np.ndarray:
60
- if image is None:
61
- raise ValueError("Input image is None")
62
-
63
- processed = image
64
-
65
- for step in steps:
66
- if step == 'grayscale':
67
- processed = convert_to_grayscale(processed)
68
- elif step == 'denoise':
69
- processed = remove_noise(processed, kernel_size=denoise_kernel)
70
- elif step == 'binarize':
71
- processed = binarize(processed,
72
- method=binarize_method,
73
- block_size=binarize_block_size,
74
- C=binarize_C)
75
- else:
76
- raise ValueError(f"Unknown preprocessing step: {step}")
77
-
78
- return processed
 
1
+ import cv2
2
+ import numpy as np
3
+ from pathlib import Path
4
+
5
+
6
+
7
def load_image(image_path: str) -> np.ndarray:
    """Load an image from disk and return it as an RGB numpy array.

    Raises:
        FileNotFoundError: If the path does not exist.
        ValueError: If OpenCV cannot decode the file.
    """
    if not Path(image_path).exists():
        raise FileNotFoundError(f"Image not found : {image_path}")
    bgr = cv2.imread(image_path)
    if bgr is None:
        raise ValueError(f"Could not load image: {image_path}")
    # OpenCV decodes to BGR; convert so downstream code sees RGB.
    return cv2.cvtColor(bgr, cv2.COLOR_BGR2RGB)
15
+
16
+
17
def convert_to_grayscale(image: np.ndarray) -> np.ndarray:
    """Return a single-channel version of *image*; a 2-D input is returned as-is.

    Raises:
        ValueError: If *image* is None.
    """
    if image is None:
        raise ValueError("Image is None, cannot convert to grayscale")
    # Already grayscale (height x width only) — nothing to do.
    if image.ndim == 2:
        return image
    return cv2.cvtColor(image, cv2.COLOR_RGB2GRAY)
23
+
24
+
25
def remove_noise(image: np.ndarray, kernel_size: int = 3) -> np.ndarray:
    """Suppress noise with a square Gaussian blur of side *kernel_size*.

    Raises:
        ValueError: If *image* is None, or *kernel_size* is not a positive odd int.
    """
    if image is None:
        raise ValueError("Image is None, cannot remove noise")
    if kernel_size <= 0:
        raise ValueError("Kernel size must be positive")
    # GaussianBlur requires an odd kernel side length.
    if kernel_size % 2 == 0:
        raise ValueError("Kernel size must be odd")
    return cv2.GaussianBlur(image, (kernel_size, kernel_size), 0)
34
+
35
+
36
def binarize(image: np.ndarray, method: str = 'adaptive', block_size: int = 11, C: int = 2) -> np.ndarray:
    """Threshold a grayscale image to black/white.

    Args:
        image: 2-D grayscale array.
        method: 'simple' (fixed 127 threshold) or 'adaptive' (Gaussian local mean).
        block_size: Neighbourhood size for the adaptive method.
        C: Constant subtracted from the local mean in the adaptive method.

    Raises:
        ValueError: If the image is None, not 2-D, or *method* is unknown.
    """
    if image is None:
        raise ValueError("Image is None, cannot binarize")
    if image.ndim != 2:
        raise ValueError("Input image must be grayscale for binarization")
    if method == 'simple':
        _, thresholded = cv2.threshold(image, 127, 255, cv2.THRESH_BINARY)
        return thresholded
    if method == 'adaptive':
        return cv2.adaptiveThreshold(image, 255, cv2.ADAPTIVE_THRESH_GAUSSIAN_C,
                                     cv2.THRESH_BINARY, block_size, C)
    raise ValueError(f"Unknown binarization method: {method}")
48
+
49
+
50
def deskew(image):
    """Placeholder for skew correction (not yet implemented).

    Always returns None, exactly like the original stub.
    """
    return None
52
+
53
+
54
def preprocess_pipeline(image: np.ndarray,
                        steps: list = None,
                        denoise_kernel: int = 3,
                        binarize_method: str = 'adaptive',
                        binarize_block_size: int = 11,
                        binarize_C: int = 2) -> np.ndarray:
    """Run a configurable chain of preprocessing steps over *image*.

    Args:
        image: Input image array.
        steps: Ordered step names among 'grayscale', 'denoise', 'binarize'.
            Defaults to all three. (Previously a mutable list default — replaced
            with a None sentinel; behavior for callers is unchanged.)
        denoise_kernel: Kernel size forwarded to remove_noise.
        binarize_method: Method forwarded to binarize ('simple' or 'adaptive').
        binarize_block_size: Block size forwarded to binarize.
        binarize_C: Constant forwarded to binarize.

    Returns:
        The processed image (the input itself when *steps* is empty).

    Raises:
        ValueError: If *image* is None or a step name is unknown.
    """
    if image is None:
        raise ValueError("Input image is None")

    if steps is None:
        steps = ['grayscale', 'denoise', 'binarize']

    processed = image

    for step in steps:
        if step == 'grayscale':
            processed = convert_to_grayscale(processed)
        elif step == 'denoise':
            processed = remove_noise(processed, kernel_size=denoise_kernel)
        elif step == 'binarize':
            processed = binarize(processed,
                                 method=binarize_method,
                                 block_size=binarize_block_size,
                                 C=binarize_C)
        else:
            raise ValueError(f"Unknown preprocessing step: {step}")

    return processed
tests/test_extraction.py CHANGED
@@ -1,41 +1,41 @@
1
- import sys
2
- sys.path.append('src')
3
-
4
- from extraction import extract_dates, extract_amounts, extract_total, extract_vendor, extract_invoice_number
5
-
6
- receipt_text = """
7
- tan chay yee
8
-
9
- *** COPY ***
10
-
11
- OJC MARKETING SDN BHD.
12
-
13
- ROC NO: 538358-H
14
-
15
- TAX INVOICE
16
-
17
- Invoice No: PEGIV-1030765
18
- Date: 15/01/2019 11:05:16 AM
19
-
20
- TOTAL: 193.00
21
- """
22
-
23
- print("🧪 Testing Extraction Functions")
24
- print("=" * 60)
25
-
26
- dates = extract_dates(receipt_text)
27
- print(f"\n📅 Date: {dates}")
28
-
29
- amounts = extract_amounts(receipt_text)
30
- print(f"\n💰 Amounts: {amounts}")
31
-
32
- total = extract_total(receipt_text)
33
- print(f"\n💵 Total: {total}")
34
-
35
- vendor = extract_vendor(receipt_text)
36
- print(f"\n🏢 Vendor: {vendor}")
37
-
38
- invoice_num = extract_invoice_number(receipt_text)
39
- print(f"\n📄 Invoice Number: {invoice_num}")
40
-
41
  print("\n✅ All extraction tests complete!")
 
1
"""Manual smoke test for the regex-based helpers in src/extraction.py.

Prints each extractor's output for eyeballing; it is a demo script, not an
asserting pytest test.
"""
import sys
sys.path.append('src')

from extraction import extract_dates, extract_amounts, extract_total, extract_vendor, extract_invoice_number

# Sample OCR output from a real receipt (SROIE-style layout); the string is
# program data and is kept verbatim.
receipt_text = """
tan chay yee

*** COPY ***

OJC MARKETING SDN BHD.

ROC NO: 538358-H

TAX INVOICE

Invoice No: PEGIV-1030765
Date: 15/01/2019 11:05:16 AM

TOTAL: 193.00
"""

print("🧪 Testing Extraction Functions")
print("=" * 60)

# Exercise each extractor independently on the same fixture text.
dates = extract_dates(receipt_text)
print(f"\n📅 Date: {dates}")

amounts = extract_amounts(receipt_text)
print(f"\n💰 Amounts: {amounts}")

total = extract_total(receipt_text)
print(f"\n💵 Total: {total}")

vendor = extract_vendor(receipt_text)
print(f"\n🏢 Vendor: {vendor}")

invoice_num = extract_invoice_number(receipt_text)
print(f"\n📄 Invoice Number: {invoice_num}")

print("\n✅ All extraction tests complete!")
tests/test_full_pipeline.py CHANGED
@@ -1,42 +1,42 @@
1
- import sys
2
- sys.path.append('src')
3
-
4
- from preprocessing import load_image, convert_to_grayscale, remove_noise
5
- from ocr import extract_text
6
- from extraction import structure_output
7
- import json
8
-
9
- print("=" * 60)
10
- print("🎯 FULL INVOICE PROCESSING PIPELINE TEST")
11
- print("=" * 60)
12
-
13
- # Step 1: Load and preprocess image
14
- print("\n1️⃣ Loading and preprocessing image...")
15
- image = load_image('data/raw/receipt3.jpg')
16
- gray = convert_to_grayscale(image)
17
- denoised = remove_noise(gray, kernel_size=3)
18
- print("✅ Image preprocessed")
19
-
20
- # Step 2: Extract text with OCR
21
- print("\n2️⃣ Extracting text with OCR...")
22
- text = extract_text(denoised, config='--psm 6')
23
- print(f"✅ Extracted {len(text)} characters")
24
-
25
- # Step 3: Extract structured information
26
- print("\n3️⃣ Extracting structured information...")
27
- result = structure_output(text)
28
- print("✅ Information extracted")
29
-
30
- # Step 4: Display results
31
- print("\n" + "=" * 60)
32
- print("📊 EXTRACTED INVOICE DATA (JSON)")
33
- print("=" * 60)
34
- print(json.dumps(result, indent=2, ensure_ascii=False))
35
- print("=" * 60)
36
-
37
- print("\n🎉 PIPELINE COMPLETE!")
38
- print("\n📋 Summary:")
39
- print(f" Vendor: {result['vendor']}")
40
- print(f" Invoice #: {result['invoice_number']}")
41
- print(f" Date: {result['date']}")
42
  print(f" Total: ${result['total']}")
 
1
"""Smoke test of the rule-based pipeline: preprocess -> OCR -> structuring."""
import sys
sys.path.append('src')

from preprocessing import load_image, convert_to_grayscale, remove_noise
from ocr import extract_text
from extraction import structure_output
import json

print("=" * 60)
print("🎯 FULL INVOICE PROCESSING PIPELINE TEST")
print("=" * 60)

# Step 1: Load and preprocess image
print("\n1️⃣ Loading and preprocessing image...")
image = load_image('data/raw/receipt3.jpg')
gray = convert_to_grayscale(image)
denoised = remove_noise(gray, kernel_size=3)
print("✅ Image preprocessed")

# Step 2: Extract text with OCR
print("\n2️⃣ Extracting text with OCR...")
text = extract_text(denoised, config='--psm 6')
print(f"✅ Extracted {len(text)} characters")

# Step 3: Extract structured information
print("\n3️⃣ Extracting structured information...")
result = structure_output(text)
print("✅ Information extracted")

# Step 4: Display results
print("\n" + "=" * 60)
print("📊 EXTRACTED INVOICE DATA (JSON)")
print("=" * 60)
print(json.dumps(result, indent=2, ensure_ascii=False))
print("=" * 60)

print("\n🎉 PIPELINE COMPLETE!")
print("\n📋 Summary:")
# FIX: use .get() so a missing key degrades to 'N/A' instead of crashing the demo.
print(f" Vendor: {result.get('vendor', 'N/A')}")
print(f" Invoice #: {result.get('invoice_number', 'N/A')}")
print(f" Date: {result.get('date', 'N/A')}")
# NOTE(review): the rest of the repo reports totals under 'total_amount';
# fall back to it in case structure_output uses that key — confirm schema.
print(f" Total: ${result.get('total', result.get('total_amount', 'N/A'))}")
tests/test_ocr.py CHANGED
@@ -1,101 +1,101 @@
1
- import sys
2
- sys.path.append('src')
3
-
4
- from preprocessing import load_image, convert_to_grayscale, remove_noise
5
- from ocr import extract_text
6
- import matplotlib.pyplot as plt
7
- import numpy as np
8
-
9
- print("=" * 60)
10
- print("🎯 OPTIMIZING GRAYSCALE OCR")
11
- print("=" * 60)
12
-
13
- # Load and convert to grayscale
14
- image = load_image('data/raw/receipt3.jpg')
15
- gray = convert_to_grayscale(image)
16
-
17
- # Test 1: Different PSM modes
18
- print("\n📊 Testing different Tesseract PSM modes...\n")
19
-
20
- psm_configs = [
21
- ('', 'Default'),
22
- ('--psm 3', 'Automatic page segmentation'),
23
- ('--psm 4', 'Single column of text'),
24
- ('--psm 6', 'Uniform block of text'),
25
- ('--psm 11', 'Sparse text, find as much as possible'),
26
- ('--psm 12', 'Sparse text with OSD (Orientation and Script Detection)'),
27
- ]
28
-
29
- results = {}
30
- for config, desc in psm_configs:
31
- text = extract_text(gray, config=config)
32
- results[desc] = text
33
- print(f"{desc:50s} → {len(text):4d} chars")
34
-
35
- # Find best result
36
- best_desc = max(results, key=lambda k: len(results[k]))
37
- best_text = results[best_desc]
38
-
39
- print(f"\n✅ WINNER: {best_desc} ({len(best_text)} chars)")
40
-
41
- # Test 2: With slight denoising
42
- print("\n📊 Testing with light denoising...\n")
43
-
44
- denoised = remove_noise(gray, kernel_size=3)
45
- text_denoised = extract_text(denoised, config='--psm 6')
46
- print(f"Grayscale + Denoise (psm 6): {len(text_denoised)} chars")
47
-
48
-
49
- # Display best result
50
- print("\n" + "=" * 60)
51
- print("📄 BEST EXTRACTED TEXT:")
52
- print("=" * 60)
53
- print(best_text)
54
- print("=" * 60)
55
-
56
- # Visualize
57
- fig, axes = plt.subplots(1, 3, figsize=(15, 5))
58
-
59
- axes[0].imshow(image)
60
- axes[0].set_title("Original")
61
- axes[0].axis('off')
62
-
63
- axes[1].imshow(gray, cmap='gray')
64
- axes[1].set_title(f"Grayscale\n({len(best_text)} chars - {best_desc})")
65
- axes[1].axis('off')
66
-
67
- axes[2].imshow(denoised, cmap='gray')
68
- axes[2].set_title(f"Denoised\n({len(text_denoised)} chars)")
69
- axes[2].axis('off')
70
-
71
- plt.tight_layout()
72
- plt.show()
73
-
74
- print(f"\n💡 Recommended pipeline: Grayscale + {best_desc}")
75
-
76
- # Test the combination we missed!
77
- print("\n📊 Testing BEST combination...\n")
78
-
79
- denoised = remove_noise(gray, kernel_size=3)
80
-
81
- # Test PSM 11 on denoised
82
- text_denoised_psm11 = extract_text(denoised, config='--psm 11')
83
- text_denoised_psm6 = extract_text(denoised, config='--psm 6')
84
-
85
- print(f"Denoised + PSM 6: {len(text_denoised_psm6)} chars")
86
- print(f"Denoised + PSM 11: {len(text_denoised_psm11)} chars")
87
-
88
- if len(text_denoised_psm11) > len(text_denoised_psm6):
89
- print(f"\n✅ PSM 11 wins! ({len(text_denoised_psm11)} chars)")
90
- best_config = '--psm 11'
91
- best_text_final = text_denoised_psm11
92
- else:
93
- print(f"\n✅ PSM 6 wins! ({len(text_denoised_psm6)} chars)")
94
- best_config = '--psm 6'
95
- best_text_final = text_denoised_psm6
96
-
97
- print(f"\n🏆 FINAL WINNER: Denoised + {best_config}")
98
- print("\nFull text:")
99
- print("=" * 60)
100
- print(best_text_final)
101
  print("=" * 60)
 
1
"""Experiment script: compare Tesseract PSM modes and light denoising on one
receipt, to pick the best grayscale OCR configuration."""
import sys
sys.path.append('src')

from preprocessing import load_image, convert_to_grayscale, remove_noise
from ocr import extract_text
import matplotlib.pyplot as plt
import numpy as np

print("=" * 60)
print("🎯 OPTIMIZING GRAYSCALE OCR")
print("=" * 60)

# Load and convert to grayscale
image = load_image('data/raw/receipt3.jpg')
gray = convert_to_grayscale(image)

# Test 1: Different PSM modes
print("\n📊 Testing different Tesseract PSM modes...\n")

psm_configs = [
    ('', 'Default'),
    ('--psm 3', 'Automatic page segmentation'),
    ('--psm 4', 'Single column of text'),
    ('--psm 6', 'Uniform block of text'),
    ('--psm 11', 'Sparse text, find as much as possible'),
    ('--psm 12', 'Sparse text with OSD (Orientation and Script Detection)'),
]

results = {}
for config, desc in psm_configs:
    text = extract_text(gray, config=config)
    results[desc] = text
    print(f"{desc:50s} → {len(text):4d} chars")

# Find best result — "best" is approximated as "most characters extracted".
best_desc = max(results, key=lambda k: len(results[k]))
best_text = results[best_desc]

print(f"\n✅ WINNER: {best_desc} ({len(best_text)} chars)")

# Test 2: With slight denoising
print("\n📊 Testing with light denoising...\n")

denoised = remove_noise(gray, kernel_size=3)
text_denoised = extract_text(denoised, config='--psm 6')
print(f"Grayscale + Denoise (psm 6): {len(text_denoised)} chars")


# Display best result
print("\n" + "=" * 60)
print("📄 BEST EXTRACTED TEXT:")
print("=" * 60)
print(best_text)
print("=" * 60)

# Visualize original vs grayscale vs denoised side by side.
fig, axes = plt.subplots(1, 3, figsize=(15, 5))

axes[0].imshow(image)
axes[0].set_title("Original")
axes[0].axis('off')

axes[1].imshow(gray, cmap='gray')
axes[1].set_title(f"Grayscale\n({len(best_text)} chars - {best_desc})")
axes[1].axis('off')

axes[2].imshow(denoised, cmap='gray')
axes[2].set_title(f"Denoised\n({len(text_denoised)} chars)")
axes[2].axis('off')

plt.tight_layout()
plt.show()

print(f"\n💡 Recommended pipeline: Grayscale + {best_desc}")

# Test the combination we missed!
print("\n📊 Testing BEST combination...\n")

denoised = remove_noise(gray, kernel_size=3)

# Test PSM 11 on denoised
text_denoised_psm11 = extract_text(denoised, config='--psm 11')
text_denoised_psm6 = extract_text(denoised, config='--psm 6')

print(f"Denoised + PSM 6: {len(text_denoised_psm6)} chars")
print(f"Denoised + PSM 11: {len(text_denoised_psm11)} chars")

# Pick the denoised configuration that recovered more text.
if len(text_denoised_psm11) > len(text_denoised_psm6):
    print(f"\n✅ PSM 11 wins! ({len(text_denoised_psm11)} chars)")
    best_config = '--psm 11'
    best_text_final = text_denoised_psm11
else:
    print(f"\n✅ PSM 6 wins! ({len(text_denoised_psm6)} chars)")
    best_config = '--psm 6'
    best_text_final = text_denoised_psm6

print(f"\n🏆 FINAL WINNER: Denoised + {best_config}")
print("\nFull text:")
print("=" * 60)
print(best_text_final)
print("=" * 60)
tests/test_pipeline.py CHANGED
@@ -1,96 +1,96 @@
1
- import sys
2
- import json
3
- from pathlib import Path
4
-
5
- # Add the 'src' directory to the Python path
6
- sys.path.append('src')
7
-
8
- from pipeline import process_invoice
9
-
10
- def test_full_pipeline():
11
- """
12
- Tests the full invoice processing pipeline on a sample receipt
13
- and prints the advanced JSON structure.
14
- """
15
- print("=" * 60)
16
- print("🎯 ADVANCED INVOICE PROCESSING PIPELINE TEST")
17
- print("=" * 60)
18
-
19
- # --- Configuration ---
20
- image_path = 'data/raw/receipt1.jpg'
21
- save_output = True
22
- output_dir = 'outputs'
23
-
24
- # Check if the image exists
25
- if not Path(image_path).exists():
26
- print(f"❌ ERROR: Test image not found at '{image_path}'")
27
- return
28
-
29
- # --- Processing ---
30
- print(f"\n🔄 Processing invoice: {image_path}...")
31
- try:
32
- # Call the main processing function
33
- result = process_invoice(image_path, save_results=save_output, output_dir=output_dir)
34
- print("✅ Invoice processed successfully!")
35
- except Exception as e:
36
- print(f"❌ An error occurred during processing: {e}")
37
- # Print traceback for detailed debugging
38
- import traceback
39
- traceback.print_exc()
40
- return
41
-
42
- # --- Display Results ---
43
- print("\n" + "=" * 60)
44
- print("📊 EXTRACTED INVOICE DATA (Advanced JSON)")
45
- print("=" * 60)
46
-
47
- # Pretty-print the JSON to the console
48
- print(json.dumps(result, indent=2, ensure_ascii=False))
49
-
50
- print("\n" + "=" * 60)
51
- print("📋 SUMMARY OF KEY EXTRACTED FIELDS")
52
- print("=" * 60)
53
-
54
- # --- Print a clean summary ---
55
- print(f"📄 Receipt Number: {result.get('receipt_number', 'N/A')}")
56
- print(f"📅 Date: {result.get('date', 'N/A')}")
57
-
58
- # Print Bill To info safely
59
- bill_to = result.get('bill_to')
60
- if bill_to and isinstance(bill_to, dict):
61
- print(f"👤 Bill To: {bill_to.get('name', 'N/A')}")
62
- else:
63
- print("👤 Bill To: N/A")
64
-
65
- # Print line items
66
- print("\n🛒 Line Items:")
67
- items = result.get('items', [])
68
- if items:
69
- for i, item in enumerate(items, 1):
70
- desc = item.get('description', 'No Description')
71
- qty = item.get('quantity', 1)
72
- total = item.get('total', 0.0)
73
- print(f" - Item {i}: {desc[:40]:<40} | Qty: {qty} | Total: {total:.2f}")
74
- else:
75
- print(" - No line items extracted.")
76
-
77
- # Print total and validation status
78
- print(f"\n💵 Total Amount: ${result.get('total_amount', 0.0):.2f}")
79
-
80
- confidence = result.get('extraction_confidence', 0)
81
- print(f"📈 Confidence: {confidence}%")
82
-
83
- validation = "✅ Passed" if result.get('validation_passed', False) else "❌ Failed"
84
- print(f"✔️ Validation: {validation}")
85
-
86
- print("\n" + "=" * 60)
87
-
88
- if save_output:
89
- json_path = Path(output_dir) / (Path(image_path).stem + '.json')
90
- print(f"\n💾 Full JSON output saved to: {json_path}")
91
-
92
- print("\n🎉 PIPELINE TEST COMPLETE!")
93
-
94
-
95
- if __name__ == '__main__':
96
  test_full_pipeline()
 
1
+ import sys
2
+ import json
3
+ from pathlib import Path
4
+
5
+ # Add the 'src' directory to the Python path
6
+ sys.path.append('src')
7
+
8
+ from pipeline import process_invoice
9
+
10
def test_full_pipeline():
    """
    Tests the full invoice processing pipeline on a sample receipt
    and prints the advanced JSON structure.
    """
    print("=" * 60)
    print("🎯 ADVANCED INVOICE PROCESSING PIPELINE TEST")
    print("=" * 60)

    # --- Configuration ---
    image_path = 'data/raw/receipt1.jpg'
    method = 'ml'  # made explicit so the saved-file path below can use it
    save_output = True
    output_dir = 'outputs'

    # Check if the image exists
    if not Path(image_path).exists():
        print(f"❌ ERROR: Test image not found at '{image_path}'")
        return

    # --- Processing ---
    print(f"\n🔄 Processing invoice: {image_path}...")
    try:
        # Call the main processing function
        result = process_invoice(image_path, method=method, save_results=save_output, output_dir=output_dir)
        print("✅ Invoice processed successfully!")
    except Exception as e:
        print(f"❌ An error occurred during processing: {e}")
        # Print traceback for detailed debugging
        import traceback
        traceback.print_exc()
        return

    # --- Display Results ---
    print("\n" + "=" * 60)
    print("📊 EXTRACTED INVOICE DATA (Advanced JSON)")
    print("=" * 60)

    # Pretty-print the JSON to the console
    print(json.dumps(result, indent=2, ensure_ascii=False))

    print("\n" + "=" * 60)
    print("📋 SUMMARY OF KEY EXTRACTED FIELDS")
    print("=" * 60)

    # --- Print a clean summary ---
    print(f"📄 Receipt Number: {result.get('receipt_number', 'N/A')}")
    print(f"📅 Date: {result.get('date', 'N/A')}")

    # Print Bill To info safely
    bill_to = result.get('bill_to')
    if bill_to and isinstance(bill_to, dict):
        print(f"👤 Bill To: {bill_to.get('name', 'N/A')}")
    else:
        print("👤 Bill To: N/A")

    # Print line items
    print("\n🛒 Line Items:")
    items = result.get('items', [])
    if items:
        for i, item in enumerate(items, 1):
            desc = item.get('description', 'No Description')
            qty = item.get('quantity', 1)
            total = item.get('total', 0.0)
            print(f" - Item {i}: {desc[:40]:<40} | Qty: {qty} | Total: {total:.2f}")
    else:
        print(" - No line items extracted.")

    # Print total and validation status.
    # FIX: the ML extractor may legitimately return total_amount=None, and
    # formatting None with :.2f raises — coerce to 0.0 first.
    print(f"\n💵 Total Amount: ${(result.get('total_amount') or 0.0):.2f}")

    confidence = result.get('extraction_confidence', 0)
    print(f"📈 Confidence: {confidence}%")

    validation = "✅ Passed" if result.get('validation_passed', False) else "❌ Failed"
    print(f"✔️ Validation: {validation}")

    print("\n" + "=" * 60)

    if save_output:
        # FIX: process_invoice saves '<stem>_<method>.json'; point at the file
        # that was actually written instead of '<stem>.json'.
        json_path = Path(output_dir) / (Path(image_path).stem + f'_{method}.json')
        print(f"\n💾 Full JSON output saved to: {json_path}")

    print("\n🎉 PIPELINE TEST COMPLETE!")
93
+
94
+
95
if __name__ == '__main__':
    # Allow running this smoke test directly as a standalone script.
    test_full_pipeline()
tests/test_preprocessing.py CHANGED
@@ -1,177 +1,177 @@
1
- import sys
2
- sys.path.append('src') # So Python can find our modules
3
-
4
- from preprocessing import load_image, convert_to_grayscale, remove_noise, binarize, preprocess_pipeline
5
- import numpy as np
6
- import matplotlib.pyplot as plt
7
-
8
- # Test 1: Load a valid image
9
- print("Test 1: Loading receipt1.jpg...")
10
- image = load_image('data/raw/receipt1.jpg')
11
- print(f"✅ Success! Image shape: {image.shape}")
12
- print(f" Data type: {image.dtype}")
13
- print(f" Value range: {image.min()} to {image.max()}")
14
-
15
- # Test 2: Visualize it
16
- print("\nTest 2: Displaying image...")
17
- plt.imshow(image)
18
- plt.title("Loaded Receipt")
19
- plt.axis('off')
20
- plt.show()
21
- print("✅ If you see the receipt image, it worked!")
22
-
23
- # Test 3: Try loading non-existent file
24
- print("\nTest 3: Testing error handling...")
25
- try:
26
- load_image('data/raw/fake_image.jpg')
27
- print("❌ Should have raised FileNotFoundError!")
28
- except FileNotFoundError as e:
29
- print(f"✅ Correctly raised error: {e}")
30
-
31
- # Test 4: Grayscale conversion
32
- print("\nTest 4: Converting to grayscale...")
33
- gray = convert_to_grayscale(image)
34
- print(f"✅ Success! Grayscale shape: {gray.shape}")
35
- print(f" Original had 3 channels, now has: {len(gray.shape)} dimensions")
36
-
37
- # Visualize side-by-side
38
- fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(10, 4))
39
- ax1.imshow(image)
40
- ax1.set_title("Original (RGB)")
41
- ax1.axis('off')
42
-
43
- ax2.imshow(gray, cmap='gray') # cmap='gray' tells matplotlib to display in grayscale
44
- ax2.set_title("Grayscale")
45
- ax2.axis('off')
46
-
47
- plt.tight_layout()
48
- plt.show()
49
-
50
- # Test 5: Already grayscale (should return as-is)
51
- print("\nTest 5: Converting already-grayscale image...")
52
- gray_again = convert_to_grayscale(gray)
53
- print(f"✅ Returned without error: {gray_again.shape}")
54
- assert gray_again is gray, "Should return same object if already grayscale"
55
- print("✅ Correctly returned the same image!")
56
-
57
- print("\n🎉 Grayscale tests passed!")
58
-
59
- # Test 6: Binarization - Simple method
60
- print("\nTest 6: Simple binarization...")
61
- binary_simple = binarize(gray, method='simple')
62
- print(f"✅ Success! Binary shape: {binary_simple.shape}")
63
- print(f" Unique values: {np.unique(binary_simple)}") # Should be [0, 255]
64
-
65
- # Test 7: Binarization - Adaptive method
66
- print("\nTest 7: Adaptive binarization...")
67
- binary_adaptive = binarize(gray, method='adaptive', block_size=11, C=2)
68
- print(f"✅ Success! Binary shape: {binary_adaptive.shape}")
69
- print(f" Unique values: {np.unique(binary_adaptive)}")
70
-
71
- # Visualize comparison
72
- fig, axes = plt.subplots(2, 2, figsize=(12, 10))
73
-
74
- axes[0, 0].imshow(image)
75
- axes[0, 0].set_title("1. Original (RGB)")
76
- axes[0, 0].axis('off')
77
-
78
- axes[0, 1].imshow(gray, cmap='gray')
79
- axes[0, 1].set_title("2. Grayscale")
80
- axes[0, 1].axis('off')
81
-
82
- axes[1, 0].imshow(binary_simple, cmap='gray')
83
- axes[1, 0].set_title("3. Simple Threshold")
84
- axes[1, 0].axis('off')
85
-
86
- axes[1, 1].imshow(binary_adaptive, cmap='gray')
87
- axes[1, 1].set_title("4. Adaptive Threshold")
88
- axes[1, 1].axis('off')
89
-
90
- plt.tight_layout()
91
- plt.show()
92
-
93
- # Test 8: Error handling
94
- print("\nTest 8: Testing error handling...")
95
- try:
96
- binarize(image, method='adaptive') # RGB image (3D) should fail
97
- print("❌ Should have raised ValueError!")
98
- except ValueError as e:
99
- print(f"✅ Correctly raised error: {e}")
100
-
101
- print("\n🎉 Binarization tests passed!")
102
-
103
- # Test 9: Noise removal
104
- print("\nTest 9: Noise removal...")
105
- denoised = remove_noise(gray, kernel_size=3)
106
- print(f"✅ Success! Denoised shape: {denoised.shape}")
107
-
108
- # Test different kernel sizes
109
- denoised_light = remove_noise(gray, kernel_size=3)
110
- denoised_heavy = remove_noise(gray, kernel_size=7)
111
-
112
- # Visualize comparison
113
- fig, axes = plt.subplots(1, 3, figsize=(15, 5))
114
-
115
- axes[0].imshow(gray, cmap='gray')
116
- axes[0].set_title("Original Grayscale")
117
- axes[0].axis('off')
118
-
119
- axes[1].imshow(denoised_light, cmap='gray')
120
- axes[1].set_title("Denoised (kernel=3)")
121
- axes[1].axis('off')
122
-
123
- axes[2].imshow(denoised_heavy, cmap='gray')
124
- axes[2].set_title("Denoised (kernel=7)")
125
- axes[2].axis('off')
126
-
127
- plt.tight_layout()
128
- plt.show()
129
- print(" Notice: kernel=7 is blurrier but removes more noise")
130
-
131
- # Test 10: Error handling
132
- print("\nTest 10: Noise removal error handling...")
133
- try:
134
- remove_noise(gray, kernel_size=4) # Even number
135
- print("❌ Should have raised ValueError!")
136
- except ValueError as e:
137
- print(f"✅ Correctly raised error: {e}")
138
-
139
- print("\n🎉 Noise removal tests passed!")
140
-
141
- # Test 11: Full pipeline
142
- print("\nTest 11: Full preprocessing pipeline...")
143
-
144
- # Test with all steps
145
- full_processed = preprocess_pipeline(image,
146
- steps=['grayscale', 'denoise', 'binarize'],
147
- denoise_kernel=3,
148
- binarize_method='adaptive')
149
- print(f"✅ Full pipeline success! Shape: {full_processed.shape}")
150
-
151
- # Test with selective steps (your clean images)
152
- clean_processed = preprocess_pipeline(image,
153
- steps=['grayscale', 'binarize'],
154
- binarize_method='adaptive')
155
- print(f"✅ Clean pipeline success! Shape: {clean_processed.shape}")
156
-
157
- # Visualize comparison
158
- fig, axes = plt.subplots(1, 3, figsize=(15, 5))
159
-
160
- axes[0].imshow(image)
161
- axes[0].set_title("Original")
162
- axes[0].axis('off')
163
-
164
- axes[1].imshow(full_processed, cmap='gray')
165
- axes[1].set_title("Full Pipeline\n(grayscale → denoise → binarize)")
166
- axes[1].axis('off')
167
-
168
- axes[2].imshow(clean_processed, cmap='gray')
169
- axes[2].set_title("Clean Pipeline\n(grayscale → binarize)")
170
- axes[2].axis('off')
171
-
172
- plt.tight_layout()
173
- plt.show()
174
-
175
- print("\n🎉 Pipeline tests passed!")
176
-
177
- print("\n🎉 All tests passed!")
 
1
"""Interactive smoke tests for the preprocessing module.

Runs top-to-bottom as a script (NOT under pytest): each "Test N" section
transforms data/raw/receipt1.jpg and opens matplotlib windows for a human
to inspect. plt.show() blocks until each window is closed.
"""
import sys
sys.path.append('src')  # So Python can find our modules

from preprocessing import load_image, convert_to_grayscale, remove_noise, binarize, preprocess_pipeline
import numpy as np
import matplotlib.pyplot as plt

# Test 1: Load a valid image
print("Test 1: Loading receipt1.jpg...")
image = load_image('data/raw/receipt1.jpg')
print(f"✅ Success! Image shape: {image.shape}")
print(f" Data type: {image.dtype}")
print(f" Value range: {image.min()} to {image.max()}")

# Test 2: Visualize it (visual check — requires a human looking at the window)
print("\nTest 2: Displaying image...")
plt.imshow(image)
plt.title("Loaded Receipt")
plt.axis('off')
plt.show()
print("✅ If you see the receipt image, it worked!")

# Test 3: Try loading non-existent file (must raise FileNotFoundError)
print("\nTest 3: Testing error handling...")
try:
    load_image('data/raw/fake_image.jpg')
    print("❌ Should have raised FileNotFoundError!")
except FileNotFoundError as e:
    print(f"✅ Correctly raised error: {e}")

# Test 4: Grayscale conversion (RGB HxWx3 -> single-channel HxW)
print("\nTest 4: Converting to grayscale...")
gray = convert_to_grayscale(image)
print(f"✅ Success! Grayscale shape: {gray.shape}")
print(f" Original had 3 channels, now has: {len(gray.shape)} dimensions")

# Visualize side-by-side
fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(10, 4))
ax1.imshow(image)
ax1.set_title("Original (RGB)")
ax1.axis('off')

ax2.imshow(gray, cmap='gray')  # cmap='gray' tells matplotlib to display in grayscale
ax2.set_title("Grayscale")
ax2.axis('off')

plt.tight_layout()
plt.show()

# Test 5: Already grayscale (must return the SAME object, not a copy)
print("\nTest 5: Converting already-grayscale image...")
gray_again = convert_to_grayscale(gray)
print(f"✅ Returned without error: {gray_again.shape}")
assert gray_again is gray, "Should return same object if already grayscale"
print("✅ Correctly returned the same image!")

print("\n🎉 Grayscale tests passed!")

# Test 6: Binarization - Simple (global threshold) method
print("\nTest 6: Simple binarization...")
binary_simple = binarize(gray, method='simple')
print(f"✅ Success! Binary shape: {binary_simple.shape}")
print(f" Unique values: {np.unique(binary_simple)}")  # Should be [0, 255]

# Test 7: Binarization - Adaptive (local threshold) method
print("\nTest 7: Adaptive binarization...")
binary_adaptive = binarize(gray, method='adaptive', block_size=11, C=2)
print(f"✅ Success! Binary shape: {binary_adaptive.shape}")
print(f" Unique values: {np.unique(binary_adaptive)}")

# Visualize all four stages in a 2x2 grid for comparison
fig, axes = plt.subplots(2, 2, figsize=(12, 10))

axes[0, 0].imshow(image)
axes[0, 0].set_title("1. Original (RGB)")
axes[0, 0].axis('off')

axes[0, 1].imshow(gray, cmap='gray')
axes[0, 1].set_title("2. Grayscale")
axes[0, 1].axis('off')

axes[1, 0].imshow(binary_simple, cmap='gray')
axes[1, 0].set_title("3. Simple Threshold")
axes[1, 0].axis('off')

axes[1, 1].imshow(binary_adaptive, cmap='gray')
axes[1, 1].set_title("4. Adaptive Threshold")
axes[1, 1].axis('off')

plt.tight_layout()
plt.show()

# Test 8: binarize() must reject RGB (3D) input with ValueError
print("\nTest 8: Testing error handling...")
try:
    binarize(image, method='adaptive')  # RGB image (3D) should fail
    print("❌ Should have raised ValueError!")
except ValueError as e:
    print(f"✅ Correctly raised error: {e}")

print("\n🎉 Binarization tests passed!")

# Test 9: Noise removal
print("\nTest 9: Noise removal...")
denoised = remove_noise(gray, kernel_size=3)
print(f"✅ Success! Denoised shape: {denoised.shape}")

# Test different kernel sizes (larger kernel = stronger smoothing)
denoised_light = remove_noise(gray, kernel_size=3)
denoised_heavy = remove_noise(gray, kernel_size=7)

# Visualize comparison
fig, axes = plt.subplots(1, 3, figsize=(15, 5))

axes[0].imshow(gray, cmap='gray')
axes[0].set_title("Original Grayscale")
axes[0].axis('off')

axes[1].imshow(denoised_light, cmap='gray')
axes[1].set_title("Denoised (kernel=3)")
axes[1].axis('off')

axes[2].imshow(denoised_heavy, cmap='gray')
axes[2].set_title("Denoised (kernel=7)")
axes[2].axis('off')

plt.tight_layout()
plt.show()
print(" Notice: kernel=7 is blurrier but removes more noise")

# Test 10: remove_noise() must reject even kernel sizes with ValueError
print("\nTest 10: Noise removal error handling...")
try:
    remove_noise(gray, kernel_size=4)  # Even number
    print("❌ Should have raised ValueError!")
except ValueError as e:
    print(f"✅ Correctly raised error: {e}")

print("\n🎉 Noise removal tests passed!")

# Test 11: Full pipeline (steps applied in the listed order)
print("\nTest 11: Full preprocessing pipeline...")

# Test with all steps
full_processed = preprocess_pipeline(image,
                                     steps=['grayscale', 'denoise', 'binarize'],
                                     denoise_kernel=3,
                                     binarize_method='adaptive')
print(f"✅ Full pipeline success! Shape: {full_processed.shape}")

# Test with selective steps (your clean images)
clean_processed = preprocess_pipeline(image,
                                      steps=['grayscale', 'binarize'],
                                      binarize_method='adaptive')
print(f"✅ Clean pipeline success! Shape: {clean_processed.shape}")

# Visualize comparison
fig, axes = plt.subplots(1, 3, figsize=(15, 5))

axes[0].imshow(image)
axes[0].set_title("Original")
axes[0].axis('off')

axes[1].imshow(full_processed, cmap='gray')
axes[1].set_title("Full Pipeline\n(grayscale → denoise → binarize)")
axes[1].axis('off')

axes[2].imshow(clean_processed, cmap='gray')
axes[2].set_title("Clean Pipeline\n(grayscale → binarize)")
axes[2].axis('off')

plt.tight_layout()
plt.show()

print("\n🎉 Pipeline tests passed!")

print("\n🎉 All tests passed!")
tests/utils.py CHANGED
@@ -1,7 +1,7 @@
1
- def save_image(image, path):
2
-
3
- def visualize_boxes(image, boxes, text):
4
-
5
- def validate_output(data):
6
-
7
  def format_currency(amount):
 
1
def save_image(image, path):
    """Persist *image* to *path*.

    Not implemented yet.

    Raises:
        NotImplementedError: always, until implemented.
    """
    # FIX: the original file had `def` statements with no bodies, which is a
    # SyntaxError — the module could not even be imported. Explicit
    # NotImplementedError stubs keep the file importable and fail loudly.
    raise NotImplementedError("save_image is not implemented yet")


def visualize_boxes(image, boxes, text):
    """Draw OCR bounding boxes and recognized text over *image*.

    Not implemented yet.

    Raises:
        NotImplementedError: always, until implemented.
    """
    raise NotImplementedError("visualize_boxes is not implemented yet")


def validate_output(data):
    """Validate an extraction-result mapping.

    Not implemented yet.

    Raises:
        NotImplementedError: always, until implemented.
    """
    raise NotImplementedError("validate_output is not implemented yet")


def format_currency(amount):
    """Format *amount* as a currency string.

    Not implemented yet.

    Raises:
        NotImplementedError: always, until implemented.
    """
    raise NotImplementedError("format_currency is not implemented yet")
train_combined.py ADDED
@@ -0,0 +1,187 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import torch
2
+ from torch.utils.data import Dataset, DataLoader
3
+ from transformers import LayoutLMv3ForTokenClassification, LayoutLMv3Processor, DataCollatorForTokenClassification
4
+ from PIL import Image
5
+ from tqdm import tqdm
6
+ from seqeval.metrics import f1_score
7
+ from pathlib import Path
8
+ import numpy as np
9
+ import random
10
+ import os
11
+
12
+ # --- IMPORTS ---
13
+ from load_sroie_dataset import load_sroie
14
+ from src.data_loader import load_unified_dataset
15
+
16
# --- CONFIGURATION ---
# Points to your local SROIE copy
SROIE_DATA_PATH = "data/sroie"
# Base HuggingFace checkpoint to fine-tune.
MODEL_CHECKPOINT = "microsoft/layoutlmv3-base"
# Where the best (highest validation F1) model/processor pair is saved.
OUTPUT_DIR = "models/layoutlmv3-generalized"

# Standard Label Set: BIO tags covering both SROIE fields and the extra
# general-invoice fields (invoice number, bill-to).
LABEL_LIST = ['O', 'B-COMPANY', 'I-COMPANY', 'B-DATE', 'I-DATE',
              'B-ADDRESS', 'I-ADDRESS', 'B-TOTAL', 'I-TOTAL',
              'B-INVOICE_NO', 'I-INVOICE_NO','B-BILL_TO', 'I-BILL_TO']
# Bidirectional tag <-> integer-id maps shared by the model and the dataset.
label2id = {label: idx for idx, label in enumerate(LABEL_LIST)}
id2label = {idx: label for idx, label in enumerate(LABEL_LIST)}
29
class UnifiedDataset(Dataset):
    """Adapter feeding mixed SROIE / general-invoice examples to LayoutLMv3.

    Each example is a dict carrying 'words', 'bboxes' (already on the
    0-1000 LayoutLM grid), 'ner_tags', and either an in-memory PIL 'image'
    or an 'image_path' on disk.
    """

    def __init__(self, data, processor, label2id):
        self.data = data              # list of example dicts
        self.processor = processor    # LayoutLMv3Processor (apply_ocr=False)
        self.label2id = label2id      # tag string -> integer id

    def __len__(self):
        return len(self.data)

    @staticmethod
    def _resolve_image(example):
        """Return the example's RGB page, or a blank white page on any failure."""
        try:
            candidate = example.get('image')
            if isinstance(candidate, Image.Image):
                return candidate
            if 'image_path' in example:
                return Image.open(example['image_path']).convert("RGB")
        except Exception:
            pass
        return Image.new('RGB', (224, 224), color='white')

    @staticmethod
    def _clamp(box):
        """Clip a 4-coordinate box onto the 0-1000 grid as plain ints."""
        return [max(0, min(int(box[i]), 1000)) for i in range(4)]

    def __getitem__(self, idx):
        example = self.data[idx]

        page = self._resolve_image(example)

        # Boxes arrive pre-normalized; clamp defensively anyway.
        safe_boxes = [self._clamp(raw) for raw in example['bboxes']]

        # Unknown tag strings fall back to id 0 ('O').
        tag_ids = [self.label2id.get(tag, 0) for tag in example['ner_tags']]

        encoded = self.processor(
            page,
            text=example['words'],
            boxes=safe_boxes,
            word_labels=tag_ids,
            truncation=True,
            padding="max_length",
            max_length=512,
            return_tensors="pt",
        )

        # Drop the singleton batch dimension added by the processor.
        return {name: tensor.squeeze(0) for name, tensor in encoded.items()}
83
+
84
def train():
    """Fine-tune LayoutLMv3 on the combined SROIE + general-invoice corpus.

    Loads the local SROIE copy and a sampled general-invoice dataset, merges
    them into one train/eval pool, trains for NUM_EPOCHS epochs with AdamW,
    and saves the checkpoint with the best validation F1 to OUTPUT_DIR.
    Returns early (with a message) if SROIE_DATA_PATH does not exist.
    """
    print(f"{'='*40}\n🚀 STARTING HYBRID TRAINING\n{'='*40}")

    # Check SROIE path before doing any heavy work.
    if not os.path.exists(SROIE_DATA_PATH):
        print(f"❌ Error: SROIE path not found at {SROIE_DATA_PATH}")
        print("Please make sure you copied the 'sroie' folder into 'data/'.")
        return

    # 1. Load SROIE (returns a dict with 'train' / 'test' example lists).
    print("📦 Loading SROIE dataset...")
    sroie_data = load_sroie(SROIE_DATA_PATH)
    print(f" - SROIE Train: {len(sroie_data['train'])}")
    print(f" - SROIE Test: {len(sroie_data['test'])}")

    # 2. Load New Dataset and carve out a 90/10 train/test split.
    print("📦 Loading General Invoice dataset...")
    # Reduced sample size slightly to stay safe on RAM
    new_data = load_unified_dataset(split='train', sample_size=600)

    # NOTE(review): shuffle is unseeded, so the split differs across runs.
    random.shuffle(new_data)
    split_idx = int(len(new_data) * 0.9)
    new_train = new_data[:split_idx]
    new_test = new_data[split_idx:]

    print(f" - General Train: {len(new_train)}")
    print(f" - General Test: {len(new_test)}")

    # 3. Merge both sources.
    full_train_data = sroie_data['train'] + new_train
    full_test_data = sroie_data['test'] + new_test
    print(f"\n🔗 COMBINED DATASET SIZE: {len(full_train_data)} Training Images")

    # 4. Setup model + processor (apply_ocr=False: we supply words/boxes).
    processor = LayoutLMv3Processor.from_pretrained(MODEL_CHECKPOINT, apply_ocr=False)
    model = LayoutLMv3ForTokenClassification.from_pretrained(
        MODEL_CHECKPOINT, num_labels=len(LABEL_LIST),
        id2label=id2label, label2id=label2id
    )

    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    model.to(device)
    print(f" - Device: {device}")

    # 5. Dataloaders (collator pads each batch to its longest sequence).
    train_ds = UnifiedDataset(full_train_data, processor, label2id)
    test_ds = UnifiedDataset(full_test_data, processor, label2id)

    collator = DataCollatorForTokenClassification(processor.tokenizer, padding=True, return_tensors="pt")
    train_loader = DataLoader(train_ds, batch_size=2, shuffle=True, collate_fn=collator)
    test_loader = DataLoader(test_ds, batch_size=2, collate_fn=collator)

    # 6. Optimize & Train
    optimizer = torch.optim.AdamW(model.parameters(), lr=3e-5)
    best_f1 = 0.0
    NUM_EPOCHS = 5

    print("\n🔥 Beginning Fine-Tuning...")
    for epoch in range(NUM_EPOCHS):
        model.train()
        total_loss = 0

        progress = tqdm(train_loader, desc=f"Epoch {epoch+1}/{NUM_EPOCHS}")
        for batch in progress:
            batch = {k: v.to(device) for k, v in batch.items()}

            optimizer.zero_grad()
            outputs = model(**batch)
            loss = outputs.loss
            loss.backward()
            optimizer.step()

            total_loss += loss.item()
            progress.set_postfix({"loss": f"{loss.item():.4f}"})

        # FIX: total_loss was accumulated but never used — report the epoch's
        # average training loss (matches train_layoutlm.py's behavior).
        avg_loss = total_loss / max(len(train_loader), 1)
        print(f" 📉 Epoch {epoch+1} Avg Train Loss: {avg_loss:.4f}")

        # --- Evaluation ---
        model.eval()
        all_preds, all_labels = [], []
        print(" Running Validation...")
        with torch.no_grad():
            for batch in test_loader:
                batch = {k: v.to(device) for k, v in batch.items()}
                outputs = model(**batch)
                predictions = outputs.logits.argmax(dim=-1)
                labels = batch['labels']

                # Drop positions labelled -100 (special tokens / padding).
                for i in range(len(labels)):
                    true_labels = [id2label[l.item()] for l in labels[i] if l.item() != -100]
                    pred_labels = [id2label[p.item()] for p, l in zip(predictions[i], labels[i]) if l.item() != -100]
                    all_labels.append(true_labels)
                    all_preds.append(pred_labels)

        f1 = f1_score(all_labels, all_preds)
        print(f" 📊 Epoch {epoch+1} F1 Score: {f1:.4f}")

        # Persist only when the validation F1 improves.
        if f1 > best_f1:
            best_f1 = f1
            print(f" 💾 Saving Improved Model to {OUTPUT_DIR}")
            Path(OUTPUT_DIR).mkdir(parents=True, exist_ok=True)
            model.save_pretrained(OUTPUT_DIR)
            processor.save_pretrained(OUTPUT_DIR)

if __name__ == "__main__":
    train()
train_layoutlm.py ADDED
@@ -0,0 +1,185 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import torch
2
+ from torch.utils.data import Dataset, DataLoader
3
+ from transformers import LayoutLMv3ForTokenClassification, LayoutLMv3Processor, DataCollatorForTokenClassification
4
+ from load_sroie_dataset import load_sroie # Assumes your helper script is in the root
5
+ from PIL import Image
6
+ from tqdm import tqdm
7
+ from seqeval.metrics import f1_score, precision_score, recall_score
8
+ from pathlib import Path
9
+
10
# --- 1. Global Configuration & Label Mapping ---
print("Setting up configuration...")
# BIO tag set for the four SROIE entity types (company, date, address, total).
label_list = ['O', 'B-COMPANY', 'I-COMPANY', 'B-DATE', 'I-DATE',
              'B-ADDRESS', 'I-ADDRESS', 'B-TOTAL', 'I-TOTAL']
# Bidirectional tag <-> integer-id maps shared by model and dataset.
label2id = {label: idx for idx, label in enumerate(label_list)}
id2label = {idx: label for idx, label in enumerate(label_list)}

MODEL_CHECKPOINT = "microsoft/layoutlmv3-base"
# NOTE(review): hardcoded, machine-specific absolute Windows path — replace
# with a relative path or environment variable before sharing this script.
SROIE_DATA_PATH = "C:\\Users\\Soumyajit Ghosh\\Downloads\\sroie\\sroie" # Make sure this path is correct
19
+
20
+ # --- 2. PyTorch Dataset Class ---
21
class SROIEDataset(Dataset):
    """PyTorch Dataset that encodes raw SROIE examples for LayoutLMv3.

    Each example dict carries 'image_path', 'words', 'bboxes' in
    (x, y, w, h) pixel coordinates, and BIO 'ner_tags'.
    """

    def __init__(self, data, processor, label2id):
        self.data = data              # list of raw example dicts
        self.processor = processor    # LayoutLMv3Processor (apply_ocr=False)
        self.label2id = label2id      # tag string -> integer id

    def __len__(self):
        return len(self.data)

    @staticmethod
    def _normalize_box(raw_box, width, height):
        """Convert an (x, y, w, h) pixel box to a clipped [x0, y0, x1, y1]
        box on LayoutLM's 0-1000 coordinate grid."""
        x, y, w, h = raw_box
        corners = (x, y, x + w, y + h)
        extents = (width, height, width, height)
        return [max(0, min(int((c / e) * 1000), 1000))
                for c, e in zip(corners, extents)]

    def __getitem__(self, idx):
        sample = self.data[idx]

        # Load the page image and grab its pixel dimensions for scaling.
        page = Image.open(sample['image_path']).convert("RGB")
        page_w, page_h = page.size

        norm_boxes = [self._normalize_box(b, page_w, page_h)
                      for b in sample['bboxes']]

        # Map BIO tag strings to integer ids (unknown tags raise KeyError).
        tag_ids = [self.label2id[tag] for tag in sample['ner_tags']]

        encoded = self.processor(
            page,
            text=sample['words'],
            boxes=norm_boxes,
            word_labels=tag_ids,
            truncation=True,
            max_length=512,
            return_tensors="pt",
        )

        # Strip the singleton batch dimension so the collator can re-batch.
        return {name: tensor.squeeze(0) for name, tensor in encoded.items()}
74
+
75
# --- 3. Main Training Script ---
def train():
    """Fine-tune LayoutLMv3 on SROIE and keep the best-F1 checkpoint.

    Loads the raw dataset, wraps it in SROIEDataset, trains for NUM_EPOCHS
    epochs with AdamW (lr=5e-5, batch size 2), evaluates entity-level
    precision/recall/F1 with seqeval after each epoch, and saves the model
    and processor to ./models/layoutlmv3-sroie-best whenever validation F1
    improves.
    """
    # --- Load Data ---
    print("Loading SROIE dataset...")
    raw_dataset = load_sroie(SROIE_DATA_PATH)

    # --- Load Processor ---
    # apply_ocr=False: we supply words and boxes ourselves.
    print("Creating processor...")
    processor = LayoutLMv3Processor.from_pretrained(MODEL_CHECKPOINT, apply_ocr=False)

    # --- Create PyTorch Datasets and DataLoaders ---
    print("Creating PyTorch datasets and dataloaders...")
    train_dataset = SROIEDataset(raw_dataset['train'], processor, label2id)
    test_dataset = SROIEDataset(raw_dataset['test'], processor, label2id)

    # Collator pads each batch dynamically to its longest sequence.
    data_collator = DataCollatorForTokenClassification(
        tokenizer=processor.tokenizer,
        padding=True,
        return_tensors="pt"
    )

    train_dataloader = DataLoader(train_dataset, batch_size=2, shuffle=True, collate_fn=data_collator)
    test_dataloader = DataLoader(test_dataset, batch_size=2, shuffle=False, collate_fn=data_collator)

    # --- Load Model ---
    print("Loading LayoutLMv3 model for fine-tuning...")
    model = LayoutLMv3ForTokenClassification.from_pretrained(
        MODEL_CHECKPOINT,
        num_labels=len(label_list),
        id2label=id2label,
        label2id=label2id
    )

    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    model.to(device)
    print(f"Training on: {device}")

    # --- Setup Optimizer ---
    optimizer = torch.optim.AdamW(model.parameters(), lr=5e-5)

    # --- Training Loop ---
    best_f1 = 0
    NUM_EPOCHS = 10

    for epoch in range(NUM_EPOCHS):
        print(f"\n{'='*60}\nEpoch {epoch + 1}/{NUM_EPOCHS}\n{'='*60}")

        # --- Training Step ---
        model.train()
        total_train_loss = 0
        train_progress_bar = tqdm(train_dataloader, desc=f"Training Epoch {epoch+1}")
        for batch in train_progress_bar:
            batch = {k: v.to(device) for k, v in batch.items()}

            outputs = model(**batch)
            loss = outputs.loss

            # backward -> step -> zero_grad: grads are cleared after each
            # update, ready for the next batch's backward pass.
            loss.backward()
            optimizer.step()
            optimizer.zero_grad()

            total_train_loss += loss.item()
            train_progress_bar.set_postfix({'loss': f'{loss.item():.4f}'})

        avg_train_loss = total_train_loss / len(train_dataloader)

        # --- Validation Step ---
        model.eval()
        all_predictions = []
        all_labels = []
        with torch.no_grad():
            for batch in tqdm(test_dataloader, desc="Validation"):
                batch = {k: v.to(device) for k, v in batch.items()}
                outputs = model(**batch)

                predictions = outputs.logits.argmax(dim=-1)
                labels = batch['labels']

                # Drop positions labelled -100 (special tokens / padding)
                # before handing tag sequences to seqeval.
                for i in range(labels.shape[0]):
                    true_labels_i = [id2label[l.item()] for l in labels[i] if l.item() != -100]
                    pred_labels_i = [id2label[p.item()] for p, l in zip(predictions[i], labels[i]) if l.item() != -100]
                    all_labels.append(true_labels_i)
                    all_predictions.append(pred_labels_i)

        # --- Calculate Metrics (entity-level, via seqeval) ---
        f1 = f1_score(all_labels, all_predictions)
        precision = precision_score(all_labels, all_predictions)
        recall = recall_score(all_labels, all_predictions)

        print(f"\n📊 Epoch {epoch + 1} Results:")
        print(f"  Train Loss: {avg_train_loss:.4f}")
        print(f"  F1 Score: {f1:.4f}")
        print(f"  Precision: {precision:.4f}")
        print(f"  Recall: {recall:.4f}")

        # --- Save Best Model ---
        if f1 > best_f1:
            best_f1 = f1
            print(f"  🌟 New best F1! Saving model...")
            save_path = Path("./models/layoutlmv3-sroie-best")
            save_path.mkdir(parents=True, exist_ok=True)
            model.save_pretrained(save_path)
            processor.save_pretrained(save_path)

    print(f"\n🎉 TRAINING COMPLETE! Best F1 Score: {best_f1:.4f}")
    print(f"Model saved to: ./models/layoutlmv3-sroie-best")


if __name__ == '__main__':
    train()