Spaces:
Sleeping
Sleeping
Commit
·
aa4f954
1
Parent(s):
31af52a
feat: Add .dockerignore, enhance UI to display receipt number and robustly handle bill-to, and update README with an additional dataset.
Browse files
- .dockerignore +53 -0
- README.md +2 -2
- app.py +13 -1
- src/extraction.py +0 -1
.dockerignore
ADDED
|
@@ -0,0 +1,53 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# Git
|
| 2 |
+
.git
|
| 3 |
+
.gitignore
|
| 4 |
+
|
| 5 |
+
# Python
|
| 6 |
+
__pycache__
|
| 7 |
+
*.py[cod]
|
| 8 |
+
*.pyo
|
| 9 |
+
*.pyd
|
| 10 |
+
.Python
|
| 11 |
+
*.so
|
| 12 |
+
.eggs
|
| 13 |
+
*.egg-info
|
| 14 |
+
.mypy_cache
|
| 15 |
+
.pytest_cache
|
| 16 |
+
|
| 17 |
+
# Virtual environments
|
| 18 |
+
venv
|
| 19 |
+
.venv
|
| 20 |
+
env
|
| 21 |
+
|
| 22 |
+
# IDE
|
| 23 |
+
.vscode
|
| 24 |
+
.idea
|
| 25 |
+
*.swp
|
| 26 |
+
*.swo
|
| 27 |
+
|
| 28 |
+
# Data and outputs (large files)
|
| 29 |
+
data/
|
| 30 |
+
outputs/
|
| 31 |
+
temp/
|
| 32 |
+
|
| 33 |
+
# Tests (not needed in production)
|
| 34 |
+
tests/
|
| 35 |
+
|
| 36 |
+
# Documentation
|
| 37 |
+
docs/
|
| 38 |
+
*.md
|
| 39 |
+
!README.md
|
| 40 |
+
|
| 41 |
+
# Jupyter notebooks
|
| 42 |
+
*.ipynb
|
| 43 |
+
.ipynb_checkpoints
|
| 44 |
+
|
| 45 |
+
# Docker
|
| 46 |
+
Dockerfile
|
| 47 |
+
docker-compose*.yml
|
| 48 |
+
.dockerignore
|
| 49 |
+
|
| 50 |
+
# Misc
|
| 51 |
+
.env
|
| 52 |
+
.env.*
|
| 53 |
+
*.log
|
README.md
CHANGED
|
@@ -278,7 +278,7 @@ invoice-processor-ml/
|
|
| 278 |
│ └── pipeline.py # Main orchestrator for the pipeline and CLI
|
| 279 |
│
|
| 280 |
│
|
| 281 |
-
├── tests/
|
| 282 |
│ ├── test_preprocessing.py # Tests for the preprocessing module
|
| 283 |
│ ├── test_ocr.py # Tests for the OCR module
|
| 284 |
│ └── test_pipeline.py # End-to-end pipeline tests
|
|
@@ -292,7 +292,7 @@ invoice-processor-ml/
|
|
| 292 |
|
| 293 |
- **Model**: `microsoft/layoutlmv3-base` (125M params)
|
| 294 |
- **Task**: Token Classification (NER) with 9 labels: `O, B/I-COMPANY, B/I-ADDRESS, B/I-DATE, B/I-TOTAL`
|
| 295 |
-
- **Dataset**: SROIE (ICDAR 2019, English retail receipts)
|
| 296 |
- **Training**: RTX 3050 6GB, PyTorch 2.x, Transformers 4.x
|
| 297 |
- **Result**: Best F1 ≈ 0.922 on validation (epoch 5 saved)
|
| 298 |
|
|
|
|
| 278 |
│ └── pipeline.py # Main orchestrator for the pipeline and CLI
|
| 279 |
│
|
| 280 |
│
|
| 281 |
+
├── tests/
|
| 282 |
│ ├── test_preprocessing.py # Tests for the preprocessing module
|
| 283 |
│ ├── test_ocr.py # Tests for the OCR module
|
| 284 |
│ └── test_pipeline.py # End-to-end pipeline tests
|
|
|
|
| 292 |
|
| 293 |
- **Model**: `microsoft/layoutlmv3-base` (125M params)
|
| 294 |
- **Task**: Token Classification (NER) with 9 labels: `O, B/I-COMPANY, B/I-ADDRESS, B/I-DATE, B/I-TOTAL`
|
| 295 |
+
- **Dataset**: SROIE (ICDAR 2019, English retail receipts), mychen76/invoices-and-receipts_ocr_v1 (English)
|
| 296 |
- **Training**: RTX 3050 6GB, PyTorch 2.x, Transformers 4.x
|
| 297 |
- **Result**: Best F1 ≈ 0.922 on validation (epoch 5 saved)
|
| 298 |
|
app.py
CHANGED
|
@@ -228,7 +228,19 @@ with tab1:
|
|
| 228 |
|
| 229 |
# Use an expander for longer text fields like address
|
| 230 |
with st.expander("Show More Details"):
|
| 231 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 232 |
st.markdown(f"**📍 Vendor Address:** {data.get('address') or 'N/A'}")
|
| 233 |
|
| 234 |
# Line items table
|
|
|
|
| 228 |
|
| 229 |
# Use an expander for longer text fields like address
|
| 230 |
with st.expander("Show More Details"):
|
| 231 |
+
# Handle receipt_number
|
| 232 |
+
st.markdown(f"**🧾 Receipt Number:** {data.get('receipt_number') or 'N/A'}")
|
| 233 |
+
|
| 234 |
+
# Handle bill_to (can be string from ML or dict from rules)
|
| 235 |
+
bill_to = data.get('bill_to')
|
| 236 |
+
if isinstance(bill_to, dict):
|
| 237 |
+
bill_to_display = bill_to.get('name') or 'N/A'
|
| 238 |
+
elif isinstance(bill_to, str):
|
| 239 |
+
bill_to_display = bill_to
|
| 240 |
+
else:
|
| 241 |
+
bill_to_display = 'N/A'
|
| 242 |
+
st.markdown(f"**👤 Bill To:** {bill_to_display}")
|
| 243 |
+
|
| 244 |
st.markdown(f"**📍 Vendor Address:** {data.get('address') or 'N/A'}")
|
| 245 |
|
| 246 |
# Line items table
|
src/extraction.py
CHANGED
|
@@ -138,7 +138,6 @@ def extract_bill_to(text: str) -> Optional[Dict[str, str]]:
|
|
| 138 |
return None
|
| 139 |
|
| 140 |
def extract_line_items(text: str) -> List[Dict[str, Any]]:
|
| 141 |
-
# (Keeping your existing logic simple for now)
|
| 142 |
return []
|
| 143 |
|
| 144 |
def structure_output(text: str) -> Dict[str, Any]:
|
|
|
|
| 138 |
return None
|
| 139 |
|
| 140 |
def extract_line_items(text: str) -> List[Dict[str, Any]]:
|
|
|
|
| 141 |
return []
|
| 142 |
|
| 143 |
def structure_output(text: str) -> Dict[str, Any]:
|