Commit
·
42e1c04
1
Parent(s):
566dc81
feat: LayoutLMv3 integration, Streamlit UI toggle, README refresh, .gitignore
Browse files- .gitignore +21 -1
- README.md +158 -112
- app.py +24 -6
- requirements.txt +0 -0
- src/ml_extraction.py +176 -0
- src/pipeline.py +45 -20
.gitignore
CHANGED
|
@@ -17,10 +17,18 @@ credentials.json
|
|
| 17 |
*.swp
|
| 18 |
*.swo
|
| 19 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 20 |
# OS
|
| 21 |
.DS_Store
|
| 22 |
Thumbs.db
|
| 23 |
ehthumbs.db
|
|
|
|
| 24 |
Desktop.ini
|
| 25 |
|
| 26 |
# Streamlit temp folder
|
|
@@ -47,4 +55,16 @@ data/processed/*
|
|
| 47 |
!data/processed/.gitkeep
|
| 48 |
|
| 49 |
!requirements.txt
|
| 50 |
-
!README.md
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 17 |
*.swp
|
| 18 |
*.swo
|
| 19 |
|
| 20 |
+
# Notebooks / caches / logs
|
| 21 |
+
.ipynb_checkpoints/
|
| 22 |
+
.pytest_cache/
|
| 23 |
+
*.log
|
| 24 |
+
logs/
|
| 25 |
+
.cache/
|
| 26 |
+
|
| 27 |
# OS
|
| 28 |
.DS_Store
|
| 29 |
Thumbs.db
|
| 30 |
ehthumbs.db
|
| 31 |
+
*.code-workspace
|
| 32 |
Desktop.ini
|
| 33 |
|
| 34 |
# Streamlit temp folder
|
|
|
|
| 55 |
!data/processed/.gitkeep
|
| 56 |
|
| 57 |
!requirements.txt
|
| 58 |
+
!README.md
|
| 59 |
+
|
| 60 |
+
datasets/
|
| 61 |
+
checkpoints/
|
| 62 |
+
lightning_logs/
|
| 63 |
+
wandb/
|
| 64 |
+
mlruns/
|
| 65 |
+
|
| 66 |
+
|
| 67 |
+
# Ignore all files in the models directory
|
| 68 |
+
models/*
|
| 69 |
+
!models/.gitkeep
|
| 70 |
+
!models/README.md
|
README.md
CHANGED
|
@@ -1,37 +1,45 @@
|
|
| 1 |
# 📄 Smart Invoice Processor
|
| 2 |
|
| 3 |
-
|
| 4 |
|
| 5 |

|
| 6 |

|
| 7 |

|
|
|
|
|
|
|
|
|
|
|
|
|
| 8 |
|
| 9 |
## 🎯 Features
|
| 10 |
|
| 11 |
-
- ✅
|
| 12 |
-
- ✅
|
| 13 |
-
- ✅
|
| 14 |
-
- ✅
|
| 15 |
-
- ✅
|
| 16 |
-
- ✅
|
| 17 |
-
- ✅
|
| 18 |
-
- ✅
|
|
|
|
|
|
|
|
|
|
|
|
|
| 19 |
|
| 20 |
## 📊 Demo
|
| 21 |
|
| 22 |
### Web Interface
|
| 23 |

|
| 24 |
-
*Clean
|
| 25 |
|
| 26 |
-
### Successful Extraction (
|
| 27 |

|
| 28 |
-
*
|
| 29 |
|
| 30 |
-
### Format Detection
|
| 31 |

|
| 32 |
-
*
|
| 33 |
|
| 34 |
-
###
|
| 35 |
```json
|
| 36 |
{
|
| 37 |
"receipt_number": "PEGIV-1030765",
|
|
@@ -40,17 +48,32 @@ An end-to-end invoice processing system that automatically extracts structured d
|
|
| 40 |
"name": "THE PEAK QUARRY WORKS",
|
| 41 |
"email": null
|
| 42 |
},
|
| 43 |
-
"items": [
|
| 44 |
-
{
|
| 45 |
-
"description": "SR",
|
| 46 |
-
"quantity": 111,
|
| 47 |
-
"unit_price": 1193.0,
|
| 48 |
-
"total": 193.0
|
| 49 |
-
}
|
| 50 |
-
],
|
| 51 |
"total_amount": 193.0,
|
| 52 |
"extraction_confidence": 100,
|
| 53 |
-
"validation_passed":
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 54 |
}
|
| 55 |
```
|
| 56 |
|
|
@@ -59,12 +82,13 @@ An end-to-end invoice processing system that automatically extracts structured d
|
|
| 59 |
### Prerequisites
|
| 60 |
- Python 3.10+
|
| 61 |
- Tesseract OCR
|
|
|
|
| 62 |
|
| 63 |
### Installation
|
| 64 |
|
| 65 |
1. Clone the repository
|
| 66 |
```bash
|
| 67 |
-
git clone https://github.com/
|
| 68 |
cd invoice-processor-ml
|
| 69 |
```
|
| 70 |
|
|
@@ -78,7 +102,12 @@ pip install -r requirements.txt
|
|
| 78 |
- **Mac**: `brew install tesseract`
|
| 79 |
- **Linux**: `sudo apt install tesseract-ocr`
|
| 80 |
|
| 81 |
-
4.
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 82 |
```bash
|
| 83 |
streamlit run app.py
|
| 84 |
```
|
|
@@ -92,7 +121,11 @@ The easiest way to use the processor is via the web interface.
|
|
| 92 |
```bash
|
| 93 |
streamlit run app.py
|
| 94 |
```
|
| 95 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
| 96 |
|
| 97 |
### Command-Line Interface (CLI)
|
| 98 |
|
|
@@ -103,12 +136,9 @@ You can also process invoices directly from the command line.
|
|
| 103 |
This command processes the provided sample invoice and prints the results to the console.
|
| 104 |
|
| 105 |
```bash
|
| 106 |
-
python src/pipeline.py data/samples/sample_invoice.jpg
|
| 107 |
-
|
| 108 |
-
|
| 109 |
-
|
| 110 |
-
```bash
|
| 111 |
-
python src/pipeline.py data/samples/sample_invoice.jpg --save
|
| 112 |
```
|
| 113 |
|
| 114 |
#### 2. Batch Processing a Folder
|
|
@@ -117,10 +147,10 @@ The CLI can process an entire folder of images at once.
|
|
| 117 |
|
| 118 |
First, place your own invoice images (e.g., `my_invoice1.jpg`, `my_invoice2.png`) into the `data/raw/` folder.
|
| 119 |
|
| 120 |
-
Then, run the following command. It will process all images in `data/raw
|
| 121 |
|
| 122 |
```bash
|
| 123 |
-
python src/pipeline.py data/raw --save
|
| 124 |
```
|
| 125 |
|
| 126 |
### Python API
|
|
@@ -131,47 +161,45 @@ You can integrate the pipeline directly into your own Python scripts.
|
|
| 131 |
from src.pipeline import process_invoice
|
| 132 |
import json
|
| 133 |
|
| 134 |
-
|
| 135 |
-
|
| 136 |
-
|
| 137 |
-
# The function handles everything: loading, OCR, and extraction
|
| 138 |
-
result_data = process_invoice(image_path)
|
| 139 |
-
|
| 140 |
-
# Pretty-print the final structured JSON
|
| 141 |
-
print(json.dumps(result_data, indent=2))
|
| 142 |
```
|
| 143 |
|
| 144 |
## 🏗️ Architecture
|
| 145 |
|
| 146 |
```
|
| 147 |
-
|
| 148 |
-
│
|
| 149 |
-
|
| 150 |
-
|
| 151 |
-
|
| 152 |
-
|
| 153 |
-
│
|
| 154 |
-
|
| 155 |
-
|
| 156 |
-
|
| 157 |
-
|
| 158 |
-
│
|
| 159 |
-
|
| 160 |
-
|
| 161 |
-
|
| 162 |
-
|
| 163 |
-
|
| 164 |
-
|
| 165 |
-
|
| 166 |
-
|
| 167 |
-
|
| 168 |
-
│
|
| 169 |
-
|
| 170 |
-
|
| 171 |
-
|
| 172 |
-
|
| 173 |
-
│
|
| 174 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
| 175 |
```
|
| 176 |
|
| 177 |
## 📁 Project Structure
|
|
@@ -183,58 +211,74 @@ invoice-processor-ml/
|
|
| 183 |
│ ├── raw/ # Input invoice images for processing
|
| 184 |
│ └── processed/ # (Reserved for future use)
|
| 185 |
│
|
|
|
|
|
|
|
|
|
|
|
|
|
| 186 |
├── docs/
|
| 187 |
-
│ └── screenshots/
|
| 188 |
│
|
| 189 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
| 190 |
│
|
| 191 |
├── src/
|
| 192 |
-
│ ├── preprocessing.py
|
| 193 |
-
│ ├── ocr.py
|
| 194 |
-
│ ├── extraction.py
|
| 195 |
-
│
|
|
|
|
| 196 |
│
|
| 197 |
│
|
| 198 |
├── tests/ # <-- ADD THIS FOLDER
|
| 199 |
-
│ ├── test_preprocessing.py
|
| 200 |
-
│ ├── test_ocr.py
|
| 201 |
-
│ └── test_pipeline.py
|
| 202 |
│
|
| 203 |
├── app.py # Streamlit web interface
|
| 204 |
├── requirements.txt # Python dependencies
|
| 205 |
└── README.md # You are Here!
|
| 206 |
```
|
| 207 |
|
| 208 |
-
##
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 209 |
|
| 210 |
-
|
| 211 |
-
|
| 212 |
-
|
| 213 |
-
| **Template B** (Professional) | 10-20% | ⚠️ Limited Support |
|
| 214 |
-
| Other formats | Variable | ❌ Not Optimized |
|
| 215 |
|
| 216 |
## 📈 Performance
|
| 217 |
|
| 218 |
-
- **
|
| 219 |
-
- **
|
| 220 |
-
- **
|
|
|
|
|
|
|
| 221 |
|
| 222 |
## ⚠️ Known Limitations
|
| 223 |
|
| 224 |
-
1. **
|
| 225 |
-
2. **
|
| 226 |
-
3. **
|
| 227 |
-
4. **
|
| 228 |
|
| 229 |
## 🔮 Future Enhancements
|
| 230 |
|
| 231 |
-
- [ ] Add
|
| 232 |
-
- [ ]
|
| 233 |
-
- [ ]
|
| 234 |
-
- [ ] Table detection for
|
| 235 |
-
- [ ] PDF support
|
| 236 |
-
- [ ]
|
| 237 |
-
- [ ]
|
|
|
|
| 238 |
|
| 239 |
## 🛠️ Tech Stack
|
| 240 |
|
|
@@ -242,25 +286,27 @@ invoice-processor-ml/
|
|
| 242 |
|-----------|------------|
|
| 243 |
| OCR | Tesseract 5.0+ |
|
| 244 |
| Image Processing | OpenCV, Pillow |
|
| 245 |
-
|
|
|
|
|
| 246 |
| Web Interface | Streamlit |
|
| 247 |
| Data Format | JSON |
|
| 248 |
|
| 249 |
## 📚 What I Learned
|
| 250 |
|
| 251 |
-
-
|
| 252 |
-
-
|
| 253 |
-
-
|
| 254 |
-
-
|
| 255 |
-
-
|
|
|
|
| 256 |
|
| 257 |
## 🤝 Contributing
|
| 258 |
|
| 259 |
Contributions welcome! Areas needing improvement:
|
| 260 |
-
-
|
| 261 |
-
- Better
|
| 262 |
-
-
|
| 263 |
-
-
|
| 264 |
|
| 265 |
## 📝 License
|
| 266 |
|
|
@@ -270,8 +316,8 @@ MIT License - See LICENSE file for details
|
|
| 270 |
|
| 271 |
**Soumyajit Ghosh** - 3rd Year BTech Student
|
| 272 |
- Exploring AI/ML and practical applications
|
| 273 |
-
- [LinkedIn](https://www.linkedin.com/in/soumyajit-ghosh-49a5b02b2?utm_source=share&utm_campaign) | [GitHub](https://github.com/GSoumyajit2005) | [Portfolio](
|
| 274 |
|
| 275 |
---
|
| 276 |
|
| 277 |
-
**Note**: This is a learning project demonstrating end-to-end ML pipeline
|
|
|
|
| 1 |
# 📄 Smart Invoice Processor
|
| 2 |
|
| 3 |
+
End-to-end invoice/receipt processing with OCR + Rule-based extraction and a fine‑tuned LayoutLMv3 model. Upload an image or run via CLI to get clean, structured JSON (vendor, date, totals, address, etc.).
|
| 4 |
|
| 5 |

|
| 6 |

|
| 7 |

|
| 8 |
+

|
| 9 |
+

|
| 10 |
+
|
| 11 |
+
---
|
| 12 |
|
| 13 |
## 🎯 Features
|
| 14 |
|
| 15 |
+
- ✅ OCR using Tesseract (configurable, fast, multi-platform)
|
| 16 |
+
- ✅ Rule-based extraction (regex baselines)
|
| 17 |
+
- ✅ ML-based extraction (LayoutLMv3 fine‑tuned on SROIE) for robust field detection
|
| 18 |
+
- ✅ Clean JSON output (date, total, vendor, address, receipt number*)
|
| 19 |
+
- ✅ Confidence and simple validation (e.g., total found among amounts)
|
| 20 |
+
- ✅ Streamlit web UI with method toggle (ML vs Regex)
|
| 21 |
+
- ✅ CLI for single/batch processing with saving to JSON
|
| 22 |
+
- ✅ Tests for preprocessing/OCR/pipeline
|
| 23 |
+
|
| 24 |
+
> Note: SROIE does not include invoice/receipt number labels; the ML model won’t output it unless you add labeled data. The rule-based extractor can still provide it when formats allow.
|
| 25 |
+
|
| 26 |
+
---
|
| 27 |
|
| 28 |
## 📊 Demo
|
| 29 |
|
| 30 |
### Web Interface
|
| 31 |

|
| 32 |
+
*Clean upload → extract flow with method selector (ML vs Regex).*
|
| 33 |
|
| 34 |
+
### Successful Extraction (ML-based)
|
| 35 |

|
| 36 |
+
*Fields extracted with LayoutLMv3.*
|
| 37 |
|
| 38 |
+
### Format Detection (simulated)
|
| 39 |

|
| 40 |
+
*UI shows simple format hints and confidence.*
|
| 41 |
|
| 42 |
+
### Example JSON (Rule-based)
|
| 43 |
```json
|
| 44 |
{
|
| 45 |
"receipt_number": "PEGIV-1030765",
|
|
|
|
| 48 |
"name": "THE PEAK QUARRY WORKS",
|
| 49 |
"email": null
|
| 50 |
},
|
| 51 |
+
"items": [],
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 52 |
"total_amount": 193.0,
|
| 53 |
"extraction_confidence": 100,
|
| 54 |
+
"validation_passed": true,
|
| 55 |
+
"vendor": "OJC MARKETING SDN BHD",
|
| 56 |
+
"address": "NO JALAN BAYU 4, BANDAR SERI ALAM, 81750 MASAI, JOHOR"
|
| 57 |
+
}
|
| 58 |
+
```
|
| 59 |
+
### Example JSON (ML-based)
|
| 60 |
+
```json
|
| 61 |
+
{
|
| 62 |
+
"receipt_number": null,
|
| 63 |
+
"date": "15/01/2019",
|
| 64 |
+
"bill_to": null,
|
| 65 |
+
"items": [],
|
| 66 |
+
"total_amount": 193.0,
|
| 67 |
+
"vendor": "OJC MARKETING SDN BHD",
|
| 68 |
+
"address": "NO JALAN BAYU 4, BANDAR SERI ALAM, 81750 MASAI, JOHOR",
|
| 69 |
+
"raw_text": "…",
|
| 70 |
+
"raw_ocr_words": ["…"],
|
| 71 |
+
"raw_predictions": {
|
| 72 |
+
"DATE": {"text": "15/01/2019", "bbox": [[…]]},
|
| 73 |
+
"TOTAL": {"text": "193.00", "bbox": [[…]]},
|
| 74 |
+
"COMPANY": {"text": "OJC MARKETING SDN BHD", "bbox": [[…]]},
|
| 75 |
+
"ADDRESS": {"text": "…", "bbox": [[…]]}
|
| 76 |
+
}
|
| 77 |
}
|
| 78 |
```
|
| 79 |
|
|
|
|
| 82 |
### Prerequisites
|
| 83 |
- Python 3.10+
|
| 84 |
- Tesseract OCR
|
| 85 |
+
- (Optional) CUDA-capable GPU for training/inference speed
|
| 86 |
|
| 87 |
### Installation
|
| 88 |
|
| 89 |
1. Clone the repository
|
| 90 |
```bash
|
| 91 |
+
git clone https://github.com/GSoumyajit2005/invoice-processor-ml
|
| 92 |
cd invoice-processor-ml
|
| 93 |
```
|
| 94 |
|
|
|
|
| 102 |
- **Mac**: `brew install tesseract`
|
| 103 |
- **Linux**: `sudo apt install tesseract-ocr`
|
| 104 |
|
| 105 |
+
4. (Optional, Windows) Set Tesseract path in src/ocr.py if needed:
|
| 106 |
+
```bash
|
| 107 |
+
pytesseract.pytesseract.tesseract_cmd = r'C:\Program Files\Tesseract-OCR\tesseract.exe'
|
| 108 |
+
```
|
| 109 |
+
|
| 110 |
+
5. Run the web app
|
| 111 |
```bash
|
| 112 |
streamlit run app.py
|
| 113 |
```
|
|
|
|
| 121 |
```bash
|
| 122 |
streamlit run app.py
|
| 123 |
```
|
| 124 |
+
- Upload an invoice image (PNG/JPG).
|
| 125 |
+
- Choose extraction method in sidebar:
|
| 126 |
+
- ML-Based (LayoutLMv3)
|
| 127 |
+
- Rule-Based (Regex)
|
| 128 |
+
- View JSON, download results.
|
| 129 |
|
| 130 |
### Command-Line Interface (CLI)
|
| 131 |
|
|
|
|
| 136 |
This command processes the provided sample invoice and prints the results to the console.
|
| 137 |
|
| 138 |
```bash
|
| 139 |
+
python src/pipeline.py data/samples/sample_invoice.jpg --save --method ml
|
| 140 |
+
# or
|
| 141 |
+
python src/pipeline.py data/samples/sample_invoice.jpg --save --method rules
|
|
|
|
|
|
|
|
|
|
| 142 |
```
|
| 143 |
|
| 144 |
#### 2. Batch Processing a Folder
|
|
|
|
| 147 |
|
| 148 |
First, place your own invoice images (e.g., `my_invoice1.jpg`, `my_invoice2.png`) into the `data/raw/` folder.
|
| 149 |
|
| 150 |
+
Then, run the following command. It will process all images in `data/raw/`. Saved files are written to `outputs/{stem}_{method}.json`.
|
| 151 |
|
| 152 |
```bash
|
| 153 |
+
python src/pipeline.py data/raw --save --method ml
|
| 154 |
```
|
| 155 |
|
| 156 |
### Python API
|
|
|
|
| 161 |
from src.pipeline import process_invoice
|
| 162 |
import json
|
| 163 |
|
| 164 |
+
result = process_invoice('data/samples/sample_invoice.jpg', method='ml')
|
| 165 |
+
print(json.dumps(result, indent=2))
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 166 |
```
|
| 167 |
|
| 168 |
## 🏗️ Architecture
|
| 169 |
|
| 170 |
```
|
| 171 |
+
┌────────────────┐
|
| 172 |
+
│ Upload Image │
|
| 173 |
+
└───────┬────────┘
|
| 174 |
+
│
|
| 175 |
+
▼
|
| 176 |
+
┌────────────────────┐
|
| 177 |
+
│ Preprocessing │ (OpenCV grayscale/denoise)
|
| 178 |
+
└────────┬───────────┘
|
| 179 |
+
│
|
| 180 |
+
▼
|
| 181 |
+
┌───────────────┐
|
| 182 |
+
│ OCR │ (Tesseract)
|
| 183 |
+
└───────┬───────┘
|
| 184 |
+
│
|
| 185 |
+
┌──────────────┴──────────────┐
|
| 186 |
+
│ │
|
| 187 |
+
▼ ▼
|
| 188 |
+
┌──────────────────┐ ┌────────────────────────┐
|
| 189 |
+
│ Rule-based IE │ │ ML-based IE (NER) │
|
| 190 |
+
│ (regex, heur.) │ │ LayoutLMv3 token-class │
|
| 191 |
+
└────────┬─────────┘ └───────────┬────────────┘
|
| 192 |
+
│ │
|
| 193 |
+
└──────────────┬──────────────────┘
|
| 194 |
+
▼
|
| 195 |
+
┌──────────────────┐
|
| 196 |
+
│ Post-process │
|
| 197 |
+
│ validate, scores │
|
| 198 |
+
└────────┬─────────┘
|
| 199 |
+
▼
|
| 200 |
+
┌──────────────────┐
|
| 201 |
+
│ JSON Output │
|
| 202 |
+
└───────��──────────┘
|
| 203 |
```
|
| 204 |
|
| 205 |
## 📁 Project Structure
|
|
|
|
| 211 |
│ ├── raw/ # Input invoice images for processing
|
| 212 |
│ └── processed/ # (Reserved for future use)
|
| 213 |
│
|
| 214 |
+
│
|
| 215 |
+
├── data/samples/
|
| 216 |
+
│ └── sample_invoice.jpg # Public sample for quick testing
|
| 217 |
+
│
|
| 218 |
├── docs/
|
| 219 |
+
│ └── screenshots/ # UI Screenshots for the README demo
|
| 220 |
│
|
| 221 |
+
│
|
| 222 |
+
├── models/
|
| 223 |
+
│ └── layoutlmv3-sroie-best/ # Fine-tuned model (created after training)
|
| 224 |
+
│
|
| 225 |
+
├── outputs/ # Default folder for saved JSON results
|
| 226 |
│
|
| 227 |
├── src/
|
| 228 |
+
│ ├── preprocessing.py # Image preprocessing functions (grayscale, denoise)
|
| 229 |
+
│ ├── ocr.py # Tesseract OCR integration
|
| 230 |
+
│ ├── extraction.py # Regex-based information extraction logic
|
| 231 |
+
│ ├── ml_extraction.py # ML-based extraction (LayoutLMv3)
|
| 232 |
+
│ └── pipeline.py # Main orchestrator for the pipeline and CLI
|
| 233 |
│
|
| 234 |
│
|
| 235 |
├── tests/ # <-- ADD THIS FOLDER
|
| 236 |
+
│ ├── test_preprocessing.py # Tests for the preprocessing module
|
| 237 |
+
│ ├── test_ocr.py # Tests for the OCR module
|
| 238 |
+
│ └── test_pipeline.py # End-to-end pipeline tests
|
| 239 |
│
|
| 240 |
├── app.py # Streamlit web interface
|
| 241 |
├── requirements.txt # Python dependencies
|
| 242 |
└── README.md # You are Here!
|
| 243 |
```
|
| 244 |
|
| 245 |
+
## 🧠 Model & Training
|
| 246 |
+
|
| 247 |
+
- **Model**: `microsoft/layoutlmv3-base` (125M params)
|
| 248 |
+
- **Task**: Token Classification (NER) with 9 labels: `O, B/I-COMPANY, B/I-ADDRESS, B/I-DATE, B/I-TOTAL`
|
| 249 |
+
- **Dataset**: SROIE (ICDAR 2019, English retail receipts)
|
| 250 |
+
- **Training**: RTX 3050 6GB, PyTorch 2.x, Transformers 4.x
|
| 251 |
+
- **Result**: Best F1 ≈ 0.922 on validation (epoch 5 saved)
|
| 252 |
|
| 253 |
+
- Training scripts(local):
|
| 254 |
+
- `train_layoutlm.py` (data prep, training loop with validation + model save)
|
| 255 |
+
- Model saved to: `models/layoutlmv3-sroie-best/`
|
|
|
|
|
|
|
| 256 |
|
| 257 |
## 📈 Performance
|
| 258 |
|
| 259 |
+
- **OCR accuracy (clear images)**: High with Tesseract
|
| 260 |
+
- **Rule-based extraction**: Strong on simple retail receipts
|
| 261 |
+
- **ML-based extraction (SROIE-style)**:
|
| 262 |
+
- COMPANY / ADDRESS / DATE / TOTAL: High F1 on simple receipts
|
| 263 |
+
- Complex business invoices: Partial extraction unless further fine-tuned
|
| 264 |
|
| 265 |
## ⚠️ Known Limitations
|
| 266 |
|
| 267 |
+
1. **Layout Sensitivity**: The ML model was fine‑tuned only on SROIE (retail receipts). Professional multi-column invoices may underperform until you fine‑tune on more diverse datasets.
|
| 268 |
+
2. **Invoice Number (ML)**: SROIE lacks invoice number labels; the ML model won’t output it unless you add labeled data. The rule-based method can still recover it on many formats.
|
| 269 |
+
3. **Line Items/Tables**: Not trained for table extraction yet. Rule-based supports simple totals; table extraction comes later.
|
| 270 |
+
4. **OCR Variability**: Tesseract outputs can vary; preprocessing and thresholds can impact ML results.
|
| 271 |
|
| 272 |
## 🔮 Future Enhancements
|
| 273 |
|
| 274 |
+
- [ ] Add and fine‑tune on mychen76/invoices-and-receipts_ocr_v1 (English) for broader invoice formats
|
| 275 |
+
- [ ] (Optional) Add FATURA (table-focused) for line-item extraction
|
| 276 |
+
- [ ] Sliding-window chunking for >512 token documents (to avoid truncation)
|
| 277 |
+
- [ ] Table detection (Camelot/Tabula/DeepDeSRT) for line items
|
| 278 |
+
- [ ] PDF support (pdf2image) for multipage invoices
|
| 279 |
+
- [ ] FastAPI backend + Docker
|
| 280 |
+
- [ ] Multilingual OCR (PaddleOCR) and multilingual fine‑tuning
|
| 281 |
+
- [ ] Confidence calibration and better validation rules
|
| 282 |
|
| 283 |
## 🛠️ Tech Stack
|
| 284 |
|
|
|
|
| 286 |
|-----------|------------|
|
| 287 |
| OCR | Tesseract 5.0+ |
|
| 288 |
| Image Processing | OpenCV, Pillow |
|
| 289 |
+
| ML/NLP | PyTorch 2.x, Transformers |
|
| 290 |
+
| Model | LayoutLMv3 (token class.) |
|
| 291 |
| Web Interface | Streamlit |
|
| 292 |
| Data Format | JSON |
|
| 293 |
|
| 294 |
## 📚 What I Learned
|
| 295 |
|
| 296 |
+
- OCR challenges (confusable characters, confidence-based filtering)
|
| 297 |
+
- Layout-aware NER with LayoutLMv3 (text + bbox + pixels)
|
| 298 |
+
- Data normalization (bbox to 0–1000 scale)
|
| 299 |
+
- End-to-end pipelines (UI + CLI + JSON output)
|
| 300 |
+
- When regex is enough vs when ML is needed
|
| 301 |
+
- Evaluation (seqeval F1 for NER)
|
| 302 |
|
| 303 |
## 🤝 Contributing
|
| 304 |
|
| 305 |
Contributions welcome! Areas needing improvement:
|
| 306 |
+
- New patterns for regex extractor
|
| 307 |
+
- Better preprocessing for OCR
|
| 308 |
+
- New datasets and training configs
|
| 309 |
+
- Tests and CI
|
| 310 |
|
| 311 |
## 📝 License
|
| 312 |
|
|
|
|
| 316 |
|
| 317 |
**Soumyajit Ghosh** - 3rd Year BTech Student
|
| 318 |
- Exploring AI/ML and practical applications
|
| 319 |
+
- [LinkedIn](https://www.linkedin.com/in/soumyajit-ghosh-49a5b02b2?utm_source=share&utm_campaign) | [GitHub](https://github.com/GSoumyajit2005) | [Portfolio](Coming Soon)
|
| 320 |
|
| 321 |
---
|
| 322 |
|
| 323 |
+
**Note**: "This is a learning project demonstrating an end-to-end ML pipeline. Not recommended for production use without further validation, retraining on diverse datasets, and security hardening."
|
app.py
CHANGED
|
@@ -114,6 +114,13 @@ with st.sidebar:
|
|
| 114 |
st.session_state.processed_count = 0
|
| 115 |
st.metric("Invoices Processed Today", st.session_state.processed_count)
|
| 116 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 117 |
# Main content
|
| 118 |
tab1, tab2, tab3 = st.tabs(["📤 Upload & Process", "📚 Sample Invoices", "ℹ️ How It Works"])
|
| 119 |
|
|
@@ -150,7 +157,12 @@ with tab1:
|
|
| 150 |
|
| 151 |
# Step 1: Call YOUR full pipeline function
|
| 152 |
st.write("✅ Calling `process_invoice`...")
|
| 153 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 154 |
|
| 155 |
# Step 2: Simulate format detection using the extracted data
|
| 156 |
st.write("✅ Simulating format detection...")
|
|
@@ -206,12 +218,18 @@ with tab1:
|
|
| 206 |
st.warning("⚠️ Validation Failed: Total amount could not be verified against other numbers.")
|
| 207 |
|
| 208 |
# Key metrics display
|
|
|
|
|
|
|
|
|
|
| 209 |
res_col1, res_col2, res_col3 = st.columns(3)
|
| 210 |
-
res_col1.metric("Receipt Number", data.get('receipt_number') or "N/A")
|
| 211 |
-
res_col2.metric("Date", data.get('date') or "N/A")
|
| 212 |
-
res_col3.metric("Total Amount", f"${data.get('total_amount'):.2f}" if data.get('total_amount') is not None else "N/A")
|
| 213 |
-
|
| 214 |
-
|
|
|
|
|
|
|
|
|
|
| 215 |
|
| 216 |
# Line items table
|
| 217 |
if data.get('items'):
|
|
|
|
| 114 |
st.session_state.processed_count = 0
|
| 115 |
st.metric("Invoices Processed Today", st.session_state.processed_count)
|
| 116 |
|
| 117 |
+
st.header("⚙️ Configuration")
|
| 118 |
+
extraction_method = st.selectbox(
|
| 119 |
+
"Choose Extraction Method:",
|
| 120 |
+
('ML-Based (LayoutLMv3)', 'Rule-Based (Regex)'),
|
| 121 |
+
help="ML-Based is more robust but may miss fields not in its training data. Rule-Based is faster but more fragile."
|
| 122 |
+
)
|
| 123 |
+
|
| 124 |
# Main content
|
| 125 |
tab1, tab2, tab3 = st.tabs(["📤 Upload & Process", "📚 Sample Invoices", "ℹ️ How It Works"])
|
| 126 |
|
|
|
|
| 157 |
|
| 158 |
# Step 1: Call YOUR full pipeline function
|
| 159 |
st.write("✅ Calling `process_invoice`...")
|
| 160 |
+
# Map the user-friendly name from the dropdown to the actual method parameter
|
| 161 |
+
method = 'ml' if extraction_method == 'ML-Based (LayoutLMv3)' else 'rules'
|
| 162 |
+
st.write(f"⚙️ Using **{method.upper()}** extraction method...")
|
| 163 |
+
|
| 164 |
+
# Call the pipeline with the selected method
|
| 165 |
+
extracted_data = process_invoice(temp_path, method=method)
|
| 166 |
|
| 167 |
# Step 2: Simulate format detection using the extracted data
|
| 168 |
st.write("✅ Simulating format detection...")
|
|
|
|
| 218 |
st.warning("⚠️ Validation Failed: Total amount could not be verified against other numbers.")
|
| 219 |
|
| 220 |
# Key metrics display
|
| 221 |
+
# Key metrics display
|
| 222 |
+
st.metric("🏢 Vendor", data.get('vendor') or "N/A") # <-- ADD THIS
|
| 223 |
+
|
| 224 |
res_col1, res_col2, res_col3 = st.columns(3)
|
| 225 |
+
res_col1.metric("📄 Receipt Number", data.get('receipt_number') or "N/A")
|
| 226 |
+
res_col2.metric("📅 Date", data.get('date') or "N/A")
|
| 227 |
+
res_col3.metric("💵 Total Amount", f"${data.get('total_amount'):.2f}" if data.get('total_amount') is not None else "N/A")
|
| 228 |
+
|
| 229 |
+
# Use an expander for longer text fields like address
|
| 230 |
+
with st.expander("Show More Details"):
|
| 231 |
+
st.markdown(f"**👤 Bill To:** {data.get('bill_to', {}).get('name') if data.get('bill_to') else 'N/A'}")
|
| 232 |
+
st.markdown(f"**📍 Vendor Address:** {data.get('address') or 'N/A'}")
|
| 233 |
|
| 234 |
# Line items table
|
| 235 |
if data.get('items'):
|
requirements.txt
CHANGED
|
Binary files a/requirements.txt and b/requirements.txt differ
|
|
|
src/ml_extraction.py
ADDED
|
@@ -0,0 +1,176 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# src/ml_extraction.py
|
| 2 |
+
|
| 3 |
+
import torch
|
| 4 |
+
from transformers import LayoutLMv3Processor, LayoutLMv3ForTokenClassification
|
| 5 |
+
from PIL import Image
|
| 6 |
+
import pytesseract
|
| 7 |
+
from typing import List, Dict, Any
|
| 8 |
+
import re
|
| 9 |
+
|
| 10 |
+
# --- CONFIGURATION ---
|
| 11 |
+
# The local path where we expect to find/save the model
|
| 12 |
+
LOCAL_MODEL_PATH = "./models/layoutlmv3-sroie-best"
|
| 13 |
+
# The Hugging Face Hub ID for the model to download if not found locally
|
| 14 |
+
HUB_MODEL_ID = "GSoumyajit2005/layoutlmv3-sroie-invoice-extraction"
|
| 15 |
+
|
| 16 |
+
# --- Function to load the model ---
|
| 17 |
+
def load_model_and_processor(model_path, hub_id):
|
| 18 |
+
"""
|
| 19 |
+
Tries to load the model from a local path. If it fails,
|
| 20 |
+
it downloads it from the Hugging Face Hub.
|
| 21 |
+
"""
|
| 22 |
+
try:
|
| 23 |
+
# Try loading from local path first
|
| 24 |
+
print(f"Attempting to load model from local path: {model_path}...")
|
| 25 |
+
processor = LayoutLMv3Processor.from_pretrained(model_path)
|
| 26 |
+
model = LayoutLMv3ForTokenClassification.from_pretrained(model_path)
|
| 27 |
+
print("✅ Model loaded successfully from local path.")
|
| 28 |
+
except OSError:
|
| 29 |
+
# If it fails, download from the Hub
|
| 30 |
+
print(f"Model not found locally. Downloading from Hugging Face Hub: {hub_id}...")
|
| 31 |
+
from huggingface_hub import snapshot_download
|
| 32 |
+
# Download the model files and save them to the local path
|
| 33 |
+
snapshot_download(repo_id=hub_id, local_dir=model_path, local_dir_use_symlinks=False)
|
| 34 |
+
# Now load from the local path again
|
| 35 |
+
processor = LayoutLMv3Processor.from_pretrained(model_path)
|
| 36 |
+
model = LayoutLMv3ForTokenClassification.from_pretrained(model_path)
|
| 37 |
+
print("✅ Model downloaded and loaded successfully from the Hub.")
|
| 38 |
+
|
| 39 |
+
return model, processor
|
| 40 |
+
|
| 41 |
+
# --- Load the model and processor only ONCE when the module is imported ---
|
| 42 |
+
MODEL, PROCESSOR = load_model_and_processor(LOCAL_MODEL_PATH, HUB_MODEL_ID)
|
| 43 |
+
|
| 44 |
+
if MODEL and PROCESSOR:
|
| 45 |
+
DEVICE = torch.device("cuda" if torch.cuda.is_available() else "cpu")
|
| 46 |
+
MODEL.to(DEVICE)
|
| 47 |
+
MODEL.eval()
|
| 48 |
+
print(f"ML Model is ready on device: {DEVICE}")
|
| 49 |
+
else:
|
| 50 |
+
DEVICE = None
|
| 51 |
+
print("❌ Could not load ML model.")
|
| 52 |
+
|
| 53 |
+
# --- Helper Function to group entities ---
|
| 54 |
+
def _process_predictions(words: List[str], unnormalized_boxes: List[List[int]], encoding, predictions: List[int], id2label: Dict[int, str]) -> Dict[str, Any]:
|
| 55 |
+
word_ids = encoding.word_ids(batch_index=0)
|
| 56 |
+
|
| 57 |
+
word_level_preds = {}
|
| 58 |
+
for idx, word_id in enumerate(word_ids):
|
| 59 |
+
if word_id is not None:
|
| 60 |
+
label_id = predictions[idx]
|
| 61 |
+
if label_id != -100:
|
| 62 |
+
if word_id not in word_level_preds:
|
| 63 |
+
word_level_preds[word_id] = id2label[label_id]
|
| 64 |
+
|
| 65 |
+
entities = {}
|
| 66 |
+
for word_idx, label in word_level_preds.items():
|
| 67 |
+
if label == 'O': continue
|
| 68 |
+
|
| 69 |
+
entity_type = label[2:]
|
| 70 |
+
word = words[word_idx]
|
| 71 |
+
|
| 72 |
+
if label.startswith('B-'):
|
| 73 |
+
entities[entity_type] = {"text": word, "bbox": [unnormalized_boxes[word_idx]]}
|
| 74 |
+
elif label.startswith('I-') and entity_type in entities:
|
| 75 |
+
if word_idx > 0 and word_level_preds.get(word_idx - 1) in (f'B-{entity_type}', f'I-{entity_type}'):
|
| 76 |
+
entities[entity_type]['text'] += " " + word
|
| 77 |
+
entities[entity_type]['bbox'].append(unnormalized_boxes[word_idx])
|
| 78 |
+
else:
|
| 79 |
+
entities[entity_type] = {"text": word, "bbox": [unnormalized_boxes[word_idx]]}
|
| 80 |
+
|
| 81 |
+
# Clean up the final text field
|
| 82 |
+
for entity in entities.values():
|
| 83 |
+
entity['text'] = entity['text'].strip()
|
| 84 |
+
|
| 85 |
+
return entities
|
| 86 |
+
|
| 87 |
+
# --- Main Function to be called from the pipeline ---
|
| 88 |
+
def extract_ml_based(image_path: str) -> Dict[str, Any]:
|
| 89 |
+
"""
|
| 90 |
+
Performs end-to-end ML-based extraction on a single image.
|
| 91 |
+
|
| 92 |
+
Args:
|
| 93 |
+
image_path: The path to the invoice image.
|
| 94 |
+
|
| 95 |
+
Returns:
|
| 96 |
+
A dictionary containing the extracted entities.
|
| 97 |
+
"""
|
| 98 |
+
if not MODEL or not PROCESSOR:
|
| 99 |
+
raise RuntimeError("ML model is not loaded. Cannot perform extraction.")
|
| 100 |
+
|
| 101 |
+
# 1. Load Image
|
| 102 |
+
image = Image.open(image_path).convert("RGB")
|
| 103 |
+
width, height = image.size
|
| 104 |
+
|
| 105 |
+
# 2. Perform OCR
|
| 106 |
+
ocr_data = pytesseract.image_to_data(image, output_type=pytesseract.Output.DICT)
|
| 107 |
+
n_boxes = len(ocr_data['level'])
|
| 108 |
+
words = []
|
| 109 |
+
unnormalized_boxes = []
|
| 110 |
+
for i in range(n_boxes):
|
| 111 |
+
if int(ocr_data['conf'][i]) > 60 and ocr_data['text'][i].strip() != '':
|
| 112 |
+
word = ocr_data['text'][i]
|
| 113 |
+
(x, y, w, h) = (ocr_data['left'][i], ocr_data['top'][i], ocr_data['width'][i], ocr_data['height'][i])
|
| 114 |
+
words.append(word)
|
| 115 |
+
unnormalized_boxes.append([x, y, x + w, y + h])
|
| 116 |
+
|
| 117 |
+
# 3. Normalize Boxes and Prepare Inputs
|
| 118 |
+
normalized_boxes = []
|
| 119 |
+
for box in unnormalized_boxes:
|
| 120 |
+
normalized_boxes.append([
|
| 121 |
+
int(1000 * (box[0] / width)),
|
| 122 |
+
int(1000 * (box[1] / height)),
|
| 123 |
+
int(1000 * (box[2] / width)),
|
| 124 |
+
int(1000 * (box[3] / height)),
|
| 125 |
+
])
|
| 126 |
+
|
| 127 |
+
# 4. Process with LayoutLMv3 Processor
|
| 128 |
+
encoding = PROCESSOR(
|
| 129 |
+
image,
|
| 130 |
+
text=words,
|
| 131 |
+
boxes=normalized_boxes,
|
| 132 |
+
truncation=True,
|
| 133 |
+
max_length=512,
|
| 134 |
+
return_tensors="pt"
|
| 135 |
+
).to(DEVICE)
|
| 136 |
+
|
| 137 |
+
# 5. Run Inference
|
| 138 |
+
with torch.no_grad():
|
| 139 |
+
outputs = MODEL(**encoding)
|
| 140 |
+
|
| 141 |
+
predictions = outputs.logits.argmax(-1).squeeze().tolist()
|
| 142 |
+
|
| 143 |
+
# 6. Post-process to get final entities
|
| 144 |
+
extracted_entities = _process_predictions(words, unnormalized_boxes, encoding, predictions, MODEL.config.id2label)
|
| 145 |
+
|
| 146 |
+
# 7. Format the output to be consistent with your rule-based output
|
| 147 |
+
# Format the output to be consistent with the desired UI structure
|
| 148 |
+
# Format the output to be a superset of all possible fields
|
| 149 |
+
final_output = {
|
| 150 |
+
# --- Standard UI Fields ---
|
| 151 |
+
"receipt_number": None, # SROIE doesn't train for this. Your regex model will provide it.
|
| 152 |
+
"date": extracted_entities.get("DATE", {}).get("text"),
|
| 153 |
+
"bill_to": None, # SROIE doesn't train for this. Your regex model will provide it.
|
| 154 |
+
"items": [], # SROIE doesn't train for line items.
|
| 155 |
+
"total_amount": None,
|
| 156 |
+
|
| 157 |
+
# --- Additional Fields from ML Model ---
|
| 158 |
+
"vendor": extracted_entities.get("COMPANY", {}).get("text"), # The ML model finds 'COMPANY'
|
| 159 |
+
"address": extracted_entities.get("ADDRESS", {}).get("text"),
|
| 160 |
+
|
| 161 |
+
# --- Debugging Info ---
|
| 162 |
+
"raw_text": " ".join(words),
|
| 163 |
+
"raw_ocr_words": words,
|
| 164 |
+
"raw_predictions": extracted_entities
|
| 165 |
+
}
|
| 166 |
+
|
| 167 |
+
# Safely extract and convert total
|
| 168 |
+
total_text = extracted_entities.get("TOTAL", {}).get("text")
|
| 169 |
+
if total_text:
|
| 170 |
+
try:
|
| 171 |
+
cleaned_total = re.sub(r'[^\d.]', '', total_text)
|
| 172 |
+
final_output["total_amount"] = float(cleaned_total)
|
| 173 |
+
except (ValueError, TypeError):
|
| 174 |
+
final_output["total_amount"] = None
|
| 175 |
+
|
| 176 |
+
return final_output
|
src/pipeline.py
CHANGED
|
@@ -11,37 +11,61 @@ import json
|
|
| 11 |
from preprocessing import load_image, convert_to_grayscale, remove_noise
|
| 12 |
from ocr import extract_text
|
| 13 |
from extraction import structure_output
|
|
|
|
| 14 |
|
| 15 |
-
|
| 16 |
-
|
|
|
|
|
|
|
| 17 |
"""
|
| 18 |
-
Process an invoice image
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 19 |
"""
|
| 20 |
if not Path(image_path).exists():
|
| 21 |
raise FileNotFoundError(f"Image not found at path: {image_path}")
|
| 22 |
|
| 23 |
-
|
| 24 |
|
| 25 |
-
|
| 26 |
-
|
| 27 |
-
|
| 28 |
-
|
| 29 |
-
|
| 30 |
-
|
| 31 |
-
|
| 32 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 33 |
|
|
|
|
| 34 |
if save_results:
|
| 35 |
output_path = Path(output_dir)
|
| 36 |
output_path.mkdir(parents=True, exist_ok=True)
|
| 37 |
-
json_path = output_path / (Path(image_path).stem +
|
| 38 |
try:
|
| 39 |
-
with open(json_path, 'w', encoding='utf-8') as
|
| 40 |
-
json.dump(structured_data,
|
| 41 |
-
except
|
| 42 |
-
raise
|
| 43 |
-
except OSError as e:
|
| 44 |
-
raise IOError(f"Error saving results to {json_path}:\n {e}")
|
| 45 |
|
| 46 |
return structured_data
|
| 47 |
|
|
@@ -89,6 +113,7 @@ Examples:
|
|
| 89 |
parser.add_argument('path', help='Path to an invoice image or a folder of images')
|
| 90 |
parser.add_argument('--save', action='store_true', help='Save results to JSON files')
|
| 91 |
parser.add_argument('--output', default='outputs', help='Output directory for JSON files')
|
|
|
|
| 92 |
|
| 93 |
args = parser.parse_args()
|
| 94 |
|
|
@@ -99,7 +124,7 @@ Examples:
|
|
| 99 |
elif Path(args.path).is_file():
|
| 100 |
# Corrected: Use args.path
|
| 101 |
print(f"🔄 Processing: {args.path}")
|
| 102 |
-
result = process_invoice(args.path, save_results=args.save, output_dir=args.output)
|
| 103 |
|
| 104 |
print("\n📊 Extracted Data:")
|
| 105 |
print("=" * 60)
|
|
|
|
| 11 |
from preprocessing import load_image, convert_to_grayscale, remove_noise
|
| 12 |
from ocr import extract_text
|
| 13 |
from extraction import structure_output
|
| 14 |
+
from ml_extraction import extract_ml_based
|
| 15 |
|
| 16 |
+
def process_invoice(image_path: str,
                    method: str = 'ml',  # 'ml' (LayoutLMv3) or 'rules' (regex/heuristics)
                    save_results: bool = False,
                    output_dir: str = 'outputs') -> Dict[str, Any]:
    """
    Process an invoice image using either rule-based or ML-based extraction.

    Args:
        image_path: Path to the invoice image.
        method: The extraction method to use ('ml' or 'rules'). Default is 'ml'.
        save_results: Whether to save JSON results to a file.
        output_dir: Directory to save results.

    Returns:
        A dictionary with the extracted invoice data.

    Raises:
        FileNotFoundError: If `image_path` does not exist.
        ValueError: If extraction fails, or `method` is not 'ml' or 'rules'.
        IOError: If `save_results` is True and the JSON file cannot be written.
    """
    if not Path(image_path).exists():
        raise FileNotFoundError(f"Image not found at path: {image_path}")

    print(f"Processing with '{method}' method...")

    if method == 'ml':
        # --- ML-based extraction ---
        # extract_ml_based handles OCR + LayoutLMv3 inference internally.
        try:
            structured_data = extract_ml_based(image_path)
        except Exception as e:
            # Re-raise as ValueError (the documented failure type) but chain the
            # original exception so the real cause stays in the traceback.
            raise ValueError(f"Error during ML-based extraction: {e}") from e

    elif method == 'rules':
        # --- Rule-based extraction: preprocess -> OCR -> regex structuring ---
        try:
            image = load_image(image_path)
            gray_image = convert_to_grayscale(image)
            preprocessed_image = remove_noise(gray_image, kernel_size=3)
            text = extract_text(preprocessed_image, config='--psm 6')
            structured_data = structure_output(text)
        except Exception as e:
            raise ValueError(f"Error during rule-based extraction: {e}") from e

    else:
        raise ValueError(f"Unknown extraction method: '{method}'. Choose 'ml' or 'rules'.")

    # --- Saving logic ---
    # Embed the method in the filename so 'ml' and 'rules' runs on the same
    # image do not overwrite each other.
    if save_results:
        output_path = Path(output_dir)
        output_path.mkdir(parents=True, exist_ok=True)
        json_path = output_path / (Path(image_path).stem + f"_{method}.json")
        try:
            with open(json_path, 'w', encoding='utf-8') as f:
                json.dump(structured_data, f, indent=2, ensure_ascii=False)
        except Exception as e:
            raise IOError(f"Error saving results to {json_path}: {e}") from e

    return structured_data
|
| 71 |
|
|
|
|
| 113 |
parser.add_argument('path', help='Path to an invoice image or a folder of images')
|
| 114 |
parser.add_argument('--save', action='store_true', help='Save results to JSON files')
|
| 115 |
parser.add_argument('--output', default='outputs', help='Output directory for JSON files')
|
| 116 |
+
parser.add_argument('--method', default='ml', choices=['ml', 'rules'], help="Extraction method: 'ml' or 'rules'")
|
| 117 |
|
| 118 |
args = parser.parse_args()
|
| 119 |
|
|
|
|
| 124 |
elif Path(args.path).is_file():
|
| 125 |
# Corrected: Use args.path
|
| 126 |
print(f"🔄 Processing: {args.path}")
|
| 127 |
+
result = process_invoice(args.path, method=args.method, save_results=args.save, output_dir=args.output)
|
| 128 |
|
| 129 |
print("\n📊 Extracted Data:")
|
| 130 |
print("=" * 60)
|