adding url support
Browse files- .dockerignore +28 -0
- .github/workflows/ci-cd.yml +82 -0
- .gitignore +57 -0
- COMPLETE_GUIDE.md +488 -0
- DEPLOYMENT.md +298 -0
- Dockerfile +36 -0
- LICENSE +21 -0
- QUICKSTART.md +271 -0
- README.md +160 -3
- STRUCTURE.md +269 -0
- app/__init__.py +6 -0
- app/redaction.py +316 -0
- client_example.py +142 -0
- client_supabase.py +9 -0
- docker-compose.yml +48 -0
- main.py +332 -0
- model/.gitattributes +35 -0
- model/README.md +54 -0
- model/config.json +59 -0
- model/pytorch_model.bin +3 -0
- model/special_tokens_map.json +7 -0
- model/tokenizer.json +0 -0
- model/tokenizer_config.json +13 -0
- model/vocab.txt +0 -0
- outputs/.gitkeep +0 -0
- requirements.txt +13 -0
- tests/test_api.py +67 -0
- uploads/.gitkeep +0 -0
.dockerignore
ADDED
|
@@ -0,0 +1,28 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
__pycache__
|
| 2 |
+
*.pyc
|
| 3 |
+
*.pyo
|
| 4 |
+
*.pyd
|
| 5 |
+
.Python
|
| 6 |
+
*.so
|
| 7 |
+
*.egg
|
| 8 |
+
*.egg-info
|
| 9 |
+
dist
|
| 10 |
+
build
|
| 11 |
+
.git
|
| 12 |
+
.gitignore
|
| 13 |
+
.env
|
| 14 |
+
.venv
|
| 15 |
+
venv/
|
| 16 |
+
env/
|
| 17 |
+
*.log
|
| 18 |
+
.DS_Store
|
| 19 |
+
.pytest_cache
|
| 20 |
+
.coverage
|
| 21 |
+
htmlcov/
|
| 22 |
+
uploads/*
|
| 23 |
+
outputs/*
|
| 24 |
+
!uploads/.gitkeep
|
| 25 |
+
!outputs/.gitkeep
|
| 26 |
+
*.pdf
|
| 27 |
+
README.md
|
| 28 |
+
.github/
|
.github/workflows/ci-cd.yml
ADDED
|
@@ -0,0 +1,82 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
name: CI/CD Pipeline
|
| 2 |
+
|
| 3 |
+
on:
|
| 4 |
+
push:
|
| 5 |
+
branches: [ main, develop ]
|
| 6 |
+
pull_request:
|
| 7 |
+
branches: [ main ]
|
| 8 |
+
|
| 9 |
+
jobs:
|
| 10 |
+
test:
|
| 11 |
+
runs-on: ubuntu-latest
|
| 12 |
+
|
| 13 |
+
steps:
|
| 14 |
+
- uses: actions/checkout@v3
|
| 15 |
+
|
| 16 |
+
- name: Set up Python
|
| 17 |
+
uses: actions/setup-python@v4
|
| 18 |
+
with:
|
| 19 |
+
python-version: '3.10'
|
| 20 |
+
|
| 21 |
+
- name: Install system dependencies
|
| 22 |
+
run: |
|
| 23 |
+
sudo apt-get update
|
| 24 |
+
sudo apt-get install -y tesseract-ocr poppler-utils
|
| 25 |
+
|
| 26 |
+
- name: Install Python dependencies
|
| 27 |
+
run: |
|
| 28 |
+
python -m pip install --upgrade pip
|
| 29 |
+
pip install -r requirements.txt
|
| 30 |
+
pip install pytest pytest-cov httpx
|
| 31 |
+
|
| 32 |
+
- name: Run tests
|
| 33 |
+
run: |
|
| 34 |
+
pytest tests/ -v --cov=app --cov-report=xml
|
| 35 |
+
|
| 36 |
+
- name: Upload coverage
|
| 37 |
+
uses: codecov/codecov-action@v3
|
| 38 |
+
with:
|
| 39 |
+
file: ./coverage.xml
|
| 40 |
+
fail_ci_if_error: false
|
| 41 |
+
|
| 42 |
+
docker-build:
|
| 43 |
+
runs-on: ubuntu-latest
|
| 44 |
+
needs: test
|
| 45 |
+
|
| 46 |
+
steps:
|
| 47 |
+
- uses: actions/checkout@v3
|
| 48 |
+
|
| 49 |
+
- name: Set up Docker Buildx
|
| 50 |
+
uses: docker/setup-buildx-action@v2
|
| 51 |
+
|
| 52 |
+
- name: Build Docker image
|
| 53 |
+
run: |
|
| 54 |
+
docker build -t pdf-redaction-api:test .
|
| 55 |
+
|
| 56 |
+
- name: Test Docker image
|
| 57 |
+
run: |
|
| 58 |
+
docker run -d -p 7860:7860 --name test-api pdf-redaction-api:test
|
| 59 |
+
sleep 10
|
| 60 |
+
curl -f http://localhost:7860/health || exit 1
|
| 61 |
+
docker stop test-api
|
| 62 |
+
|
| 63 |
+
deploy-huggingface:
|
| 64 |
+
runs-on: ubuntu-latest
|
| 65 |
+
needs: [test, docker-build]
|
| 66 |
+
if: github.ref == 'refs/heads/main'
|
| 67 |
+
|
| 68 |
+
steps:
|
| 69 |
+
- uses: actions/checkout@v3
|
| 70 |
+
|
| 71 |
+
- name: Deploy to HuggingFace Spaces
|
| 72 |
+
env:
|
| 73 |
+
HF_TOKEN: ${{ secrets.HF_TOKEN }}
|
| 74 |
+
run: |
|
| 75 |
+
git config --global user.email "github-actions@github.com"
|
| 76 |
+
git config --global user.name "GitHub Actions"
|
| 77 |
+
|
| 78 |
+
# Add HuggingFace remote if it doesn't exist
|
| 79 |
+
git remote add hf https://user:$HF_TOKEN@huggingface.co/spaces/${{ secrets.HF_SPACE }} || true
|
| 80 |
+
|
| 81 |
+
# Push to HuggingFace
|
| 82 |
+
git push hf main:main
|
.gitignore
ADDED
|
@@ -0,0 +1,57 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# Python
|
| 2 |
+
__pycache__/
|
| 3 |
+
*.py[cod]
|
| 4 |
+
*$py.class
|
| 5 |
+
*.so
|
| 6 |
+
.Python
|
| 7 |
+
build/
|
| 8 |
+
develop-eggs/
|
| 9 |
+
dist/
|
| 10 |
+
downloads/
|
| 11 |
+
eggs/
|
| 12 |
+
.eggs/
|
| 13 |
+
lib/
|
| 14 |
+
lib64/
|
| 15 |
+
parts/
|
| 16 |
+
sdist/
|
| 17 |
+
var/
|
| 18 |
+
wheels/
|
| 19 |
+
*.egg-info/
|
| 20 |
+
.installed.cfg
|
| 21 |
+
*.egg
|
| 22 |
+
|
| 23 |
+
# Virtual environments
|
| 24 |
+
redact/
|
| 25 |
+
venv/
|
| 26 |
+
env/
|
| 27 |
+
ENV/
|
| 28 |
+
.venv
|
| 29 |
+
|
| 30 |
+
# IDE
|
| 31 |
+
.vscode/
|
| 32 |
+
.idea/
|
| 33 |
+
*.swp
|
| 34 |
+
*.swo
|
| 35 |
+
*~
|
| 36 |
+
|
| 37 |
+
# OS
|
| 38 |
+
.DS_Store
|
| 39 |
+
Thumbs.db
|
| 40 |
+
|
| 41 |
+
# Project specific
|
| 42 |
+
uploads/*.pdf
|
| 43 |
+
outputs/*.pdf
|
| 44 |
+
*.log
|
| 45 |
+
|
| 46 |
+
# Environment
|
| 47 |
+
.env
|
| 48 |
+
.env.local
|
| 49 |
+
|
| 50 |
+
# Testing
|
| 51 |
+
.pytest_cache/
|
| 52 |
+
.coverage
|
| 53 |
+
htmlcov/
|
| 54 |
+
|
| 55 |
+
# Model cache
|
| 56 |
+
cache/
|
| 57 |
+
models/
|
COMPLETE_GUIDE.md
ADDED
|
@@ -0,0 +1,488 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# 🚀 Complete FastAPI Deployment Package
|
| 2 |
+
|
| 3 |
+
## 📦 What You've Got
|
| 4 |
+
|
| 5 |
+
A production-ready FastAPI application for PDF redaction with Named Entity Recognition, ready to deploy on HuggingFace Spaces or any cloud platform.
|
| 6 |
+
|
| 7 |
+
---
|
| 8 |
+
|
| 9 |
+
## 📁 Directory Structure
|
| 10 |
+
|
| 11 |
+
```
|
| 12 |
+
pdf-redaction-api/
|
| 13 |
+
│
|
| 14 |
+
├── 📄 main.py # FastAPI application
|
| 15 |
+
├── 🐳 Dockerfile # Production container
|
| 16 |
+
├── 🐳 docker-compose.yml # Local development
|
| 17 |
+
├── 📋 requirements.txt # Python dependencies
|
| 18 |
+
│
|
| 19 |
+
├── 📱 app/
|
| 20 |
+
│ ├── __init__.py
|
| 21 |
+
│ └── redaction.py # Core redaction engine
|
| 22 |
+
│
|
| 23 |
+
├── 📂 uploads/ # Temporary uploads
|
| 24 |
+
│ └── .gitkeep
|
| 25 |
+
│
|
| 26 |
+
├── 📂 outputs/ # Redacted PDFs
|
| 27 |
+
│ └── .gitkeep
|
| 28 |
+
│
|
| 29 |
+
├── 🧪 tests/
|
| 30 |
+
│ └── test_api.py # API tests
|
| 31 |
+
│
|
| 32 |
+
├── 📚 Documentation/
|
| 33 |
+
│ ├── README.md # Main docs (for HF Spaces)
|
| 34 |
+
│ ├── DEPLOYMENT.md # Deployment guide
|
| 35 |
+
│ ├── QUICKSTART.md # Quick start guide
|
| 36 |
+
│ └── STRUCTURE.md # Project structure
|
| 37 |
+
│
|
| 38 |
+
├── 🔧 Configuration/
|
| 39 |
+
│ ├── .env.example # Environment variables
|
| 40 |
+
│ ├── .gitignore # Git ignore
|
| 41 |
+
│ └── .dockerignore # Docker ignore
|
| 42 |
+
│
|
| 43 |
+
├── 🤖 .github/
|
| 44 |
+
│ └── workflows/
|
| 45 |
+
│ └── ci-cd.yml # GitHub Actions CI/CD
|
| 46 |
+
│
|
| 47 |
+
├── 📝 client_example.py # Example API client
|
| 48 |
+
└── 📜 LICENSE # MIT License
|
| 49 |
+
```
|
| 50 |
+
|
| 51 |
+
---
|
| 52 |
+
|
| 53 |
+
## ✨ Features
|
| 54 |
+
|
| 55 |
+
### Core Functionality
|
| 56 |
+
✅ PDF upload and processing
|
| 57 |
+
✅ OCR with pytesseract (configurable DPI)
|
| 58 |
+
✅ Named Entity Recognition (NER)
|
| 59 |
+
✅ Accurate coordinate-based redaction
|
| 60 |
+
✅ Multiple entity type support
|
| 61 |
+
✅ Downloadable redacted PDFs
|
| 62 |
+
|
| 63 |
+
### API Features
|
| 64 |
+
✅ RESTful API with FastAPI
|
| 65 |
+
✅ Automatic OpenAPI documentation
|
| 66 |
+
✅ File upload handling
|
| 67 |
+
✅ Background task cleanup
|
| 68 |
+
✅ Health checks
|
| 69 |
+
✅ Statistics endpoint
|
| 70 |
+
✅ CORS support
|
| 71 |
+
|
| 72 |
+
### DevOps
|
| 73 |
+
✅ Docker containerization
|
| 74 |
+
✅ Docker Compose for local dev
|
| 75 |
+
✅ GitHub Actions CI/CD
|
| 76 |
+
✅ HuggingFace Spaces ready
|
| 77 |
+
✅ Comprehensive testing
|
| 78 |
+
✅ Logging and monitoring
|
| 79 |
+
|
| 80 |
+
---
|
| 81 |
+
|
| 82 |
+
## 🎯 Quick Deployment Paths
|
| 83 |
+
|
| 84 |
+
### Option 1: HuggingFace Spaces (Recommended for Demo)
|
| 85 |
+
|
| 86 |
+
**Time: 10 minutes**
|
| 87 |
+
|
| 88 |
+
```bash
|
| 89 |
+
# 1. Create Space on HuggingFace (select Docker SDK)
|
| 90 |
+
# 2. Clone your space
|
| 91 |
+
git clone https://huggingface.co/spaces/YOUR_USERNAME/pdf-redaction-api
|
| 92 |
+
cd pdf-redaction-api
|
| 93 |
+
|
| 94 |
+
# 3. Copy all files
|
| 95 |
+
cp -r /path/to/pdf-redaction-api/* .
|
| 96 |
+
|
| 97 |
+
# 4. Deploy
|
| 98 |
+
git add .
|
| 99 |
+
git commit -m "Initial deployment"
|
| 100 |
+
git push
|
| 101 |
+
```
|
| 102 |
+
|
| 103 |
+
**Your API will be at:** `https://YOUR_USERNAME-pdf-redaction-api.hf.space`
|
| 104 |
+
|
| 105 |
+
**Cost:** FREE (with CPU Basic tier)
|
| 106 |
+
|
| 107 |
+
---
|
| 108 |
+
|
| 109 |
+
### Option 2: Docker Locally
|
| 110 |
+
|
| 111 |
+
**Time: 5 minutes**
|
| 112 |
+
|
| 113 |
+
```bash
|
| 114 |
+
# Build
|
| 115 |
+
docker build -t pdf-redaction-api .
|
| 116 |
+
|
| 117 |
+
# Run
|
| 118 |
+
docker run -p 7860:7860 pdf-redaction-api
|
| 119 |
+
|
| 120 |
+
# Test
|
| 121 |
+
curl http://localhost:7860/health
|
| 122 |
+
```
|
| 123 |
+
|
| 124 |
+
---
|
| 125 |
+
|
| 126 |
+
### Option 3: Direct Python
|
| 127 |
+
|
| 128 |
+
**Time: 3 minutes**
|
| 129 |
+
|
| 130 |
+
```bash
|
| 131 |
+
# Install dependencies
|
| 132 |
+
sudo apt-get install tesseract-ocr poppler-utils
|
| 133 |
+
pip install -r requirements.txt
|
| 134 |
+
|
| 135 |
+
# Run
|
| 136 |
+
python main.py
|
| 137 |
+
|
| 138 |
+
# Access at http://localhost:7860
|
| 139 |
+
```
|
| 140 |
+
|
| 141 |
+
---
|
| 142 |
+
|
| 143 |
+
## 🔌 API Endpoints
|
| 144 |
+
|
| 145 |
+
### Core Endpoints
|
| 146 |
+
|
| 147 |
+
| Method | Endpoint | Description |
|
| 148 |
+
|--------|----------|-------------|
|
| 149 |
+
| POST | `/redact` | Upload and redact PDF |
|
| 150 |
+
| GET | `/download/{job_id}` | Download redacted PDF |
|
| 151 |
+
| GET | `/health` | Health check |
|
| 152 |
+
| GET | `/stats` | API statistics |
|
| 153 |
+
| DELETE | `/cleanup/{job_id}` | Manual cleanup |
|
| 154 |
+
| GET | `/docs` | Interactive API docs |
|
| 155 |
+
|
| 156 |
+
### Example Usage
|
| 157 |
+
|
| 158 |
+
**cURL:**
|
| 159 |
+
```bash
|
| 160 |
+
curl -X POST "http://localhost:7860/redact" \
|
| 161 |
+
-F "file=@document.pdf" \
|
| 162 |
+
-F "dpi=300"
|
| 163 |
+
```
|
| 164 |
+
|
| 165 |
+
**Python:**
|
| 166 |
+
```python
|
| 167 |
+
import requests
|
| 168 |
+
|
| 169 |
+
response = requests.post(
|
| 170 |
+
"http://localhost:7860/redact",
|
| 171 |
+
files={"file": open("document.pdf", "rb")},
|
| 172 |
+
params={"dpi": 300}
|
| 173 |
+
)
|
| 174 |
+
|
| 175 |
+
job_id = response.json()["job_id"]
|
| 176 |
+
redacted = requests.get(f"http://localhost:7860/download/{job_id}")
|
| 177 |
+
```
|
| 178 |
+
|
| 179 |
+
---
|
| 180 |
+
|
| 181 |
+
## 🎨 Architecture
|
| 182 |
+
|
| 183 |
+
```
|
| 184 |
+
┌─────────────────────────────────────────────────────────┐
|
| 185 |
+
│ CLIENT REQUEST │
|
| 186 |
+
│ (Upload PDF via POST /redact) │
|
| 187 |
+
└─────────────────────────────────────────────────────────┘
|
| 188 |
+
↓
|
| 189 |
+
┌─────────────────────────────────────────────────────────┐
|
| 190 |
+
│ FASTAPI (main.py) │
|
| 191 |
+
│ • Validate file │
|
| 192 |
+
│ • Generate job_id │
|
| 193 |
+
│ • Save to uploads/ │
|
| 194 |
+
└─────────────────────────────────────────────────────────┘
|
| 195 |
+
↓
|
| 196 |
+
┌─────────────────────────────────────────────────────────┐
|
| 197 |
+
│ PDFRedactor (app/redaction.py) │
|
| 198 |
+
│ │
|
| 199 |
+
│ ┌─────────────────────────────────────────┐ │
|
| 200 |
+
│ │ 1. OCR (pytesseract) │ │
|
| 201 |
+
│ │ • Convert PDF → Images (pdf2image) │ │
|
| 202 |
+
│ │ • Extract text + bounding boxes │ │
|
| 203 |
+
│ │ • Store image dimensions │ │
|
| 204 |
+
│ └─────────────────────────────────────────┘ │
|
| 205 |
+
│ ↓ │
|
| 206 |
+
│ ┌─────────────────────────────────────────┐ │
|
| 207 |
+
│ │ 2. NER (HuggingFace Transformers) │ │
|
| 208 |
+
│ │ • Load model │ │
|
| 209 |
+
│ │ • Identify entities in text │ │
|
| 210 |
+
│ │ • Return entity types + positions │ │
|
| 211 |
+
│ └─────────────────────────────────────────┘ │
|
| 212 |
+
│ ↓ │
|
| 213 |
+
│ ┌─────────────────────────────────────────┐ │
|
| 214 |
+
│ │ 3. Mapping │ │
|
| 215 |
+
│ │ • Create character span index │ │
|
| 216 |
+
│ │ • Match NER entities to OCR boxes │ │
|
| 217 |
+
│ └─────────────────────────────────────────┘ │
|
| 218 |
+
│ ↓ │
|
| 219 |
+
│ ┌─────────────────────────────────────────┐ │
|
| 220 |
+
│ │ 4. Redaction (pypdf) │ │
|
| 221 |
+
│ │ • Scale image coords → PDF coords │ │
|
| 222 |
+
│ │ • Create black rectangle annotations │ │
|
| 223 |
+
│ │ • Write redacted PDF │ │
|
| 224 |
+
│ └─────────────────────────────────────────┘ │
|
| 225 |
+
└─────────────────────────────────────────────────────────┘
|
| 226 |
+
↓
|
| 227 |
+
┌─────────────────────────────────────────────────────────┐
|
| 228 |
+
│ RESPONSE │
|
| 229 |
+
│ • job_id │
|
| 230 |
+
│ • List of entities │
|
| 231 |
+
│ • Download URL │
|
| 232 |
+
└─────────────────────────────────────────────────────────┘
|
| 233 |
+
```
|
| 234 |
+
|
| 235 |
+
---
|
| 236 |
+
|
| 237 |
+
## 🔐 Security Considerations
|
| 238 |
+
|
| 239 |
+
### Current Implementation
|
| 240 |
+
- ✅ File validation (PDF only)
|
| 241 |
+
- ✅ Temporary file cleanup
|
| 242 |
+
- ✅ CORS middleware
|
| 243 |
+
- ✅ Error handling
|
| 244 |
+
|
| 245 |
+
### For Production (TODO)
|
| 246 |
+
- ⚠️ Add API key authentication
|
| 247 |
+
- ⚠️ Implement rate limiting
|
| 248 |
+
- ⚠️ Add file size limits
|
| 249 |
+
- ⚠️ Use HTTPS only
|
| 250 |
+
- ⚠️ Implement user quotas
|
| 251 |
+
- ⚠️ Add input sanitization
|
| 252 |
+
|
| 253 |
+
**Example API Key Auth:**
|
| 254 |
+
```python
|
| 255 |
+
# Add to main.py
|
| 256 |
+
from fastapi import Security, HTTPException
|
| 257 |
+
from fastapi.security import APIKeyHeader
|
| 258 |
+
|
| 259 |
+
API_KEY = "your-secret-key"
|
| 260 |
+
api_key_header = APIKeyHeader(name="X-API-Key")
|
| 261 |
+
|
| 262 |
+
def verify_api_key(key: str = Security(api_key_header)):
|
| 263 |
+
if key != API_KEY:
|
| 264 |
+
raise HTTPException(401, "Invalid API Key")
|
| 265 |
+
```
|
| 266 |
+
|
| 267 |
+
---
|
| 268 |
+
|
| 269 |
+
## 📊 Performance Tuning
|
| 270 |
+
|
| 271 |
+
### DPI Settings
|
| 272 |
+
|
| 273 |
+
| DPI | Quality | Speed | Use Case |
|
| 274 |
+
|-----|---------|-------|----------|
|
| 275 |
+
| 150 | Low | Fast | Quick previews |
|
| 276 |
+
| 200 | Medium | Medium | General use |
|
| 277 |
+
| 300 | High | Slow | **Recommended** |
|
| 278 |
+
| 600 | Very High | Very Slow | Critical documents |
|
| 279 |
+
|
| 280 |
+
### Hardware Requirements
|
| 281 |
+
|
| 282 |
+
**Minimum (Free Tier):**
|
| 283 |
+
- CPU: 2 cores
|
| 284 |
+
- RAM: 2GB
|
| 285 |
+
- Storage: 1GB
|
| 286 |
+
|
| 287 |
+
**Recommended (Production):**
|
| 288 |
+
- CPU: 4+ cores
|
| 289 |
+
- RAM: 8GB
|
| 290 |
+
- Storage: 10GB
|
| 291 |
+
- GPU: Optional (speeds up NER)
|
| 292 |
+
|
| 293 |
+
---
|
| 294 |
+
|
| 295 |
+
## 🧪 Testing
|
| 296 |
+
|
| 297 |
+
```bash
|
| 298 |
+
# Install test dependencies
|
| 299 |
+
pip install pytest pytest-cov httpx
|
| 300 |
+
|
| 301 |
+
# Run tests
|
| 302 |
+
pytest tests/ -v
|
| 303 |
+
|
| 304 |
+
# With coverage
|
| 305 |
+
pytest tests/ --cov=app --cov-report=html
|
| 306 |
+
|
| 307 |
+
# View coverage report
|
| 308 |
+
open htmlcov/index.html
|
| 309 |
+
```
|
| 310 |
+
|
| 311 |
+
---
|
| 312 |
+
|
| 313 |
+
## 📈 Monitoring
|
| 314 |
+
|
| 315 |
+
### Built-in Endpoints
|
| 316 |
+
|
| 317 |
+
**Health Check:**
|
| 318 |
+
```bash
|
| 319 |
+
curl http://localhost:7860/health
|
| 320 |
+
```
|
| 321 |
+
|
| 322 |
+
**Statistics:**
|
| 323 |
+
```bash
|
| 324 |
+
curl http://localhost:7860/stats
|
| 325 |
+
```
|
| 326 |
+
|
| 327 |
+
### Logs
|
| 328 |
+
|
| 329 |
+
**Development:**
|
| 330 |
+
```bash
|
| 331 |
+
python main.py
|
| 332 |
+
# Logs appear in console
|
| 333 |
+
```
|
| 334 |
+
|
| 335 |
+
**Docker:**
|
| 336 |
+
```bash
|
| 337 |
+
docker logs -f container_name
|
| 338 |
+
```
|
| 339 |
+
|
| 340 |
+
**HuggingFace Spaces:**
|
| 341 |
+
- View in Space dashboard → Logs tab
|
| 342 |
+
|
| 343 |
+
---
|
| 344 |
+
|
| 345 |
+
## 💰 Cost Estimation
|
| 346 |
+
|
| 347 |
+
### HuggingFace Spaces
|
| 348 |
+
|
| 349 |
+
| Tier | CPU | RAM | Price | Use Case |
|
| 350 |
+
|------|-----|-----|-------|----------|
|
| 351 |
+
| Basic | 2 | 16GB | **FREE** | Demo, testing |
|
| 352 |
+
| CPU Upgrade | 4 | 32GB | $0.50/hr | Production |
|
| 353 |
+
| GPU T4 | - | - | $0.60/hr | Heavy load |
|
| 354 |
+
| GPU A10G | - | - | $1.50/hr | Enterprise |
|
| 355 |
+
|
| 356 |
+
**Monthly Costs (if always on):**
|
| 357 |
+
- Free: $0
|
| 358 |
+
- CPU Upgrade: ~$360/month
|
| 359 |
+
- GPU T4: ~$432/month
|
| 360 |
+
|
| 361 |
+
**Recommendation:** Start free, upgrade based on usage
|
| 362 |
+
|
| 363 |
+
### Alternatives
|
| 364 |
+
|
| 365 |
+
**AWS ECS Fargate:** ~$30-100/month
|
| 366 |
+
**Google Cloud Run:** Pay per request (~$10-50/month)
|
| 367 |
+
**DigitalOcean App:** $12-24/month
|
| 368 |
+
**Self-hosted VPS:** $5-20/month
|
| 369 |
+
|
| 370 |
+
---
|
| 371 |
+
|
| 372 |
+
## 🔄 CI/CD Pipeline
|
| 373 |
+
|
| 374 |
+
### Automated with GitHub Actions
|
| 375 |
+
|
| 376 |
+
```
|
| 377 |
+
Push to GitHub
|
| 378 |
+
↓
|
| 379 |
+
[Run Tests]
|
| 380 |
+
↓
|
| 381 |
+
[Build Docker]
|
| 382 |
+
↓
|
| 383 |
+
[Test Container]
|
| 384 |
+
↓
|
| 385 |
+
[Deploy to HuggingFace]
|
| 386 |
+
```
|
| 387 |
+
|
| 388 |
+
**Setup:**
|
| 389 |
+
1. Add secrets in GitHub repo settings:
|
| 390 |
+
- `HF_TOKEN`: HuggingFace access token
|
| 391 |
+
- `HF_SPACE`: Your space name (username/space-name)
|
| 392 |
+
|
| 393 |
+
2. Push to main branch → Auto-deploy! ✨
|
| 394 |
+
|
| 395 |
+
---
|
| 396 |
+
|
| 397 |
+
## 📚 Documentation Access
|
| 398 |
+
|
| 399 |
+
| Document | Purpose |
|
| 400 |
+
|----------|---------|
|
| 401 |
+
| `README.md` | Overview, API docs, usage examples |
|
| 402 |
+
| `QUICKSTART.md` | 5-minute setup guide |
|
| 403 |
+
| `DEPLOYMENT.md` | Production deployment |
|
| 404 |
+
| `STRUCTURE.md` | Code organization |
|
| 405 |
+
| `/docs` endpoint | Interactive API documentation |
|
| 406 |
+
|
| 407 |
+
---
|
| 408 |
+
|
| 409 |
+
## 🎓 Learning Resources
|
| 410 |
+
|
| 411 |
+
### FastAPI
|
| 412 |
+
- Docs: https://fastapi.tiangolo.com
|
| 413 |
+
- Tutorial: https://fastapi.tiangolo.com/tutorial
|
| 414 |
+
|
| 415 |
+
### HuggingFace
|
| 416 |
+
- Spaces: https://huggingface.co/docs/hub/spaces
|
| 417 |
+
- Transformers: https://huggingface.co/docs/transformers
|
| 418 |
+
|
| 419 |
+
### Docker
|
| 420 |
+
- Getting Started: https://docs.docker.com/get-started
|
| 421 |
+
|
| 422 |
+
---
|
| 423 |
+
|
| 424 |
+
## 🐛 Troubleshooting
|
| 425 |
+
|
| 426 |
+
### Common Issues
|
| 427 |
+
|
| 428 |
+
**Problem:** "Tesseract not found"
|
| 429 |
+
**Solution:** `apt-get install tesseract-ocr`
|
| 430 |
+
|
| 431 |
+
**Problem:** "Poppler not found"
|
| 432 |
+
**Solution:** `apt-get install poppler-utils`
|
| 433 |
+
|
| 434 |
+
**Problem:** Slow processing
|
| 435 |
+
**Solution:** Lower DPI to 150-200
|
| 436 |
+
|
| 437 |
+
**Problem:** Out of memory
|
| 438 |
+
**Solution:** Upgrade hardware or reduce DPI
|
| 439 |
+
|
| 440 |
+
**Problem:** Model not loading
|
| 441 |
+
**Solution:** Check internet, wait for download
|
| 442 |
+
|
| 443 |
+
### Debug Mode
|
| 444 |
+
|
| 445 |
+
```python
|
| 446 |
+
# In main.py, add debug mode
|
| 447 |
+
if __name__ == "__main__":
|
| 448 |
+
uvicorn.run("main:app", host="0.0.0.0", port=7860, reload=True, log_level="debug")
|
| 449 |
+
```
|
| 450 |
+
|
| 451 |
+
---
|
| 452 |
+
|
| 453 |
+
## ✅ Checklist for Production
|
| 454 |
+
|
| 455 |
+
- [ ] Test all endpoints thoroughly
|
| 456 |
+
- [ ] Add API key authentication
|
| 457 |
+
- [ ] Implement rate limiting
|
| 458 |
+
- [ ] Set up monitoring (Sentry, DataDog, etc.)
|
| 459 |
+
- [ ] Configure auto-scaling
|
| 460 |
+
- [ ] Set up backups
|
| 461 |
+
- [ ] Add usage analytics
|
| 462 |
+
- [ ] Create user documentation
|
| 463 |
+
- [ ] Set up SSL/TLS (HF provides by default)
|
| 464 |
+
- [ ] Test with large files
|
| 465 |
+
- [ ] Load testing
|
| 466 |
+
- [ ] Security audit
|
| 467 |
+
- [ ] Legal compliance (GDPR, etc.)
|
| 468 |
+
|
| 469 |
+
---
|
| 470 |
+
|
| 471 |
+
## 🎉 You're Ready!
|
| 472 |
+
|
| 473 |
+
Your FastAPI PDF Redaction application is complete and ready to deploy!
|
| 474 |
+
|
| 475 |
+
### Next Steps:
|
| 476 |
+
1. ✨ Deploy to HuggingFace Spaces (easiest)
|
| 477 |
+
2. 🧪 Test with real PDFs
|
| 478 |
+
3. 📊 Monitor usage
|
| 479 |
+
4. 🔒 Add security for production
|
| 480 |
+
5. 🚀 Scale as needed
|
| 481 |
+
|
| 482 |
+
### Support:
|
| 483 |
+
- 📖 Read the documentation
|
| 484 |
+
- 🐛 Check troubleshooting guide
|
| 485 |
+
- 💬 HuggingFace community forums
|
| 486 |
+
- 📧 Create issues on your repo
|
| 487 |
+
|
| 488 |
+
**Happy Deploying! 🚀**
|
DEPLOYMENT.md
ADDED
|
@@ -0,0 +1,298 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# Deployment Guide for HuggingFace Spaces
|
| 2 |
+
|
| 3 |
+
## Prerequisites
|
| 4 |
+
|
| 5 |
+
1. **HuggingFace Account**: Sign up at https://huggingface.co/
|
| 6 |
+
2. **Git**: Installed on your local machine
|
| 7 |
+
3. **Git LFS**: For large file storage (optional)
|
| 8 |
+
|
| 9 |
+
## Step-by-Step Deployment
|
| 10 |
+
|
| 11 |
+
### 1. Create a New Space
|
| 12 |
+
|
| 13 |
+
1. Go to https://huggingface.co/spaces
|
| 14 |
+
2. Click "Create new Space"
|
| 15 |
+
3. Fill in the details:
|
| 16 |
+
- **Space name**: `pdf-redaction-api` (or your preferred name)
|
| 17 |
+
- **License**: MIT
|
| 18 |
+
- **SDK**: Docker
|
| 19 |
+
- **Hardware**: CPU Basic (free tier) or upgrade if needed
|
| 20 |
+
4. Click "Create Space"
|
| 21 |
+
|
| 22 |
+
### 2. Clone Your Space Repository
|
| 23 |
+
|
| 24 |
+
```bash
|
| 25 |
+
git clone https://huggingface.co/spaces/YOUR_USERNAME/pdf-redaction-api
|
| 26 |
+
cd pdf-redaction-api
|
| 27 |
+
```
|
| 28 |
+
|
| 29 |
+
### 3. Copy All Files to the Repository
|
| 30 |
+
|
| 31 |
+
Copy all files from this project to your cloned space:
|
| 32 |
+
|
| 33 |
+
```bash
|
| 34 |
+
# Copy all files
|
| 35 |
+
cp -r /path/to/pdf-redaction-api/* .
|
| 36 |
+
|
| 37 |
+
# Check the files
|
| 38 |
+
ls -la
|
| 39 |
+
```
|
| 40 |
+
|
| 41 |
+
You should see:
|
| 42 |
+
- `main.py`
|
| 43 |
+
- `app/`
|
| 44 |
+
- `Dockerfile`
|
| 45 |
+
- `requirements.txt`
|
| 46 |
+
- `README.md`
|
| 47 |
+
- `.gitignore`
|
| 48 |
+
- `.dockerignore`
|
| 49 |
+
- `uploads/` (with .gitkeep)
|
| 50 |
+
- `outputs/` (with .gitkeep)
|
| 51 |
+
|
| 52 |
+
### 4. Commit and Push
|
| 53 |
+
|
| 54 |
+
```bash
|
| 55 |
+
# Add all files
|
| 56 |
+
git add .
|
| 57 |
+
|
| 58 |
+
# Commit
|
| 59 |
+
git commit -m "Initial deployment of PDF Redaction API"
|
| 60 |
+
|
| 61 |
+
# Push to HuggingFace
|
| 62 |
+
git push
|
| 63 |
+
```
|
| 64 |
+
|
| 65 |
+
### 5. Monitor Deployment
|
| 66 |
+
|
| 67 |
+
1. Go to your Space URL: `https://huggingface.co/spaces/YOUR_USERNAME/pdf-redaction-api`
|
| 68 |
+
2. You'll see the build logs
|
| 69 |
+
3. Wait for the build to complete (usually 5-10 minutes)
|
| 70 |
+
4. Once complete, your API will be live!
|
| 71 |
+
|
| 72 |
+
### 6. Test Your Deployment
|
| 73 |
+
|
| 74 |
+
```bash
|
| 75 |
+
# Check health
|
| 76 |
+
curl https://YOUR_USERNAME-pdf-redaction-api.hf.space/health
|
| 77 |
+
|
| 78 |
+
# Test with a PDF
|
| 79 |
+
curl -X POST "https://YOUR_USERNAME-pdf-redaction-api.hf.space/redact" \
|
| 80 |
+
-F "file=@test.pdf" \
|
| 81 |
+
-F "dpi=300"
|
| 82 |
+
```
|
| 83 |
+
|
| 84 |
+
## Configuration Options
|
| 85 |
+
|
| 86 |
+
### Hardware Upgrades
|
| 87 |
+
|
| 88 |
+
For better performance, consider upgrading your Space hardware:
|
| 89 |
+
|
| 90 |
+
1. Go to Space Settings
|
| 91 |
+
2. Click on "Hardware"
|
| 92 |
+
3. Choose:
|
| 93 |
+
- **CPU Basic** (Free): Good for testing, slower processing
|
| 94 |
+
- **CPU Upgrade** (~$0.50/hour): Faster processing
|
| 95 |
+
- **GPU** (~$0.60-3/hour): Best for large documents
|
| 96 |
+
|
| 97 |
+
### Environment Variables
|
| 98 |
+
|
| 99 |
+
Add environment variables in Space Settings if needed:
|
| 100 |
+
|
| 101 |
+
```bash
|
| 102 |
+
HF_HOME=/app/cache
|
| 103 |
+
PYTHONUNBUFFERED=1
|
| 104 |
+
```
|
| 105 |
+
|
| 106 |
+
### Persistent Storage
|
| 107 |
+
|
| 108 |
+
For persistent file storage:
|
| 109 |
+
|
| 110 |
+
1. Go to Space Settings
|
| 111 |
+
2. Enable "Persistent Storage"
|
| 112 |
+
3. This keeps uploaded/processed files between restarts
|
| 113 |
+
|
| 114 |
+
## Custom Domain (Optional)
|
| 115 |
+
|
| 116 |
+
To use a custom domain:
|
| 117 |
+
|
| 118 |
+
1. Go to Space Settings
|
| 119 |
+
2. Click "Domains"
|
| 120 |
+
3. Add your custom domain
|
| 121 |
+
4. Follow DNS configuration instructions
|
| 122 |
+
|
| 123 |
+
## Monitoring and Logs
|
| 124 |
+
|
| 125 |
+
### View Logs
|
| 126 |
+
|
| 127 |
+
1. Go to your Space page
|
| 128 |
+
2. Click on "Logs" tab
|
| 129 |
+
3. Monitor real-time logs
|
| 130 |
+
|
| 131 |
+
### Check Resource Usage
|
| 132 |
+
|
| 133 |
+
1. Click on "Insights" tab
|
| 134 |
+
2. View CPU/Memory usage
|
| 135 |
+
3. Monitor request patterns
|
| 136 |
+
|
| 137 |
+
## Security Considerations
|
| 138 |
+
|
| 139 |
+
### For Production Use
|
| 140 |
+
|
| 141 |
+
1. **Add Authentication**:
|
| 142 |
+
- Implement API key authentication
|
| 143 |
+
- Use OAuth2 for user management
|
| 144 |
+
|
| 145 |
+
2. **Rate Limiting**:
|
| 146 |
+
- Add rate limiting to prevent abuse
|
| 147 |
+
- Use slowapi or similar libraries
|
| 148 |
+
|
| 149 |
+
3. **File Size Limits**:
|
| 150 |
+
- Restrict upload file sizes
|
| 151 |
+
- Implement timeout for long-running requests
|
| 152 |
+
|
| 153 |
+
4. **HTTPS Only**:
|
| 154 |
+
- HuggingFace Spaces provides HTTPS by default
|
| 155 |
+
- Ensure all requests use HTTPS
|
| 156 |
+
|
| 157 |
+
Example with API key authentication:
|
| 158 |
+
|
| 159 |
+
```python
|
| 160 |
+
from fastapi import Security, HTTPException, status
|
| 161 |
+
from fastapi.security import APIKeyHeader
|
| 162 |
+
|
| 163 |
+
API_KEY = "your-secret-key"
|
| 164 |
+
api_key_header = APIKeyHeader(name="X-API-Key")
|
| 165 |
+
|
| 166 |
+
def verify_api_key(api_key: str = Security(api_key_header)):
|
| 167 |
+
if api_key != API_KEY:
|
| 168 |
+
raise HTTPException(
|
| 169 |
+
status_code=status.HTTP_401_UNAUTHORIZED,
|
| 170 |
+
detail="Invalid API Key"
|
| 171 |
+
)
|
| 172 |
+
return api_key
|
| 173 |
+
|
| 174 |
+
# Add to endpoints
|
| 175 |
+
@app.post("/redact")
|
| 176 |
+
async def redact_pdf(
|
| 177 |
+
file: UploadFile = File(...),
|
| 178 |
+
api_key: str = Security(verify_api_key)
|
| 179 |
+
):
|
| 180 |
+
# Your code here
|
| 181 |
+
```
|
| 182 |
+
|
| 183 |
+
## Troubleshooting
|
| 184 |
+
|
| 185 |
+
### Build Fails
|
| 186 |
+
|
| 187 |
+
**Problem**: Docker build fails
|
| 188 |
+
|
| 189 |
+
**Solution**:
|
| 190 |
+
- Check Dockerfile syntax
|
| 191 |
+
- Ensure all dependencies are in requirements.txt
|
| 192 |
+
- Review build logs for specific errors
|
| 193 |
+
|
| 194 |
+
### Out of Memory
|
| 195 |
+
|
| 196 |
+
**Problem**: API crashes with OOM errors
|
| 197 |
+
|
| 198 |
+
**Solution**:
|
| 199 |
+
- Reduce default DPI to 200
|
| 200 |
+
- Upgrade to larger hardware
|
| 201 |
+
- Implement request queuing
|
| 202 |
+
|
| 203 |
+
### Slow Processing
|
| 204 |
+
|
| 205 |
+
**Problem**: Redaction takes too long
|
| 206 |
+
|
| 207 |
+
**Solution**:
|
| 208 |
+
- Lower DPI (150-200 for faster processing)
|
| 209 |
+
- Upgrade to GPU hardware
|
| 210 |
+
- Optimize batch processing
|
| 211 |
+
|
| 212 |
+
### Model Download Issues
|
| 213 |
+
|
| 214 |
+
**Problem**: Model fails to download
|
| 215 |
+
|
| 216 |
+
**Solution**:
|
| 217 |
+
- Check HuggingFace model availability
|
| 218 |
+
- Verify internet access in Space
|
| 219 |
+
- Pre-download model and include in Docker image
|
| 220 |
+
|
| 221 |
+
## Updating Your Space
|
| 222 |
+
|
| 223 |
+
To update your deployed API:
|
| 224 |
+
|
| 225 |
+
```bash
|
| 226 |
+
# Make changes locally
|
| 227 |
+
# Test changes
|
| 228 |
+
|
| 229 |
+
# Commit and push
|
| 230 |
+
git add .
|
| 231 |
+
git commit -m "Update: description of changes"
|
| 232 |
+
git push
|
| 233 |
+
|
| 234 |
+
# HuggingFace will automatically rebuild
|
| 235 |
+
```
|
| 236 |
+
|
| 237 |
+
## Cost Estimation
|
| 238 |
+
|
| 239 |
+
### Free Tier
|
| 240 |
+
- CPU Basic
|
| 241 |
+
- Limited to 2 CPU cores
|
| 242 |
+
- 16GB RAM
|
| 243 |
+
- Good for: Testing, low-traffic demos
|
| 244 |
+
|
| 245 |
+
### Paid Tiers
|
| 246 |
+
- CPU Upgrade: ~$0.50/hour (~$360/month if always on)
|
| 247 |
+
- GPU T4: ~$0.60/hour (~$432/month)
|
| 248 |
+
- GPU A10G: ~$1.50/hour (~$1,080/month)
|
| 249 |
+
|
| 250 |
+
**Recommendation**: Start with free tier, upgrade based on usage
|
| 251 |
+
|
| 252 |
+
## Alternative Deployment Options
|
| 253 |
+
|
| 254 |
+
### 1. Deploy on Your Own Server
|
| 255 |
+
|
| 256 |
+
```bash
|
| 257 |
+
# Build Docker image
|
| 258 |
+
docker build -t pdf-redaction-api .
|
| 259 |
+
|
| 260 |
+
# Run container
|
| 261 |
+
docker run -p 7860:7860 pdf-redaction-api
|
| 262 |
+
```
|
| 263 |
+
|
| 264 |
+
### 2. Deploy on Cloud Platforms
|
| 265 |
+
|
| 266 |
+
- **AWS ECS/Fargate**: For scalable production
|
| 267 |
+
- **Google Cloud Run**: Serverless container deployment
|
| 268 |
+
- **Azure Container Instances**: Easy container deployment
|
| 269 |
+
- **DigitalOcean App Platform**: Simple PaaS deployment
|
| 270 |
+
|
| 271 |
+
### 3. Deploy on Render.com
|
| 272 |
+
|
| 273 |
+
1. Connect your GitHub repo
|
| 274 |
+
2. Select "Docker" as environment
|
| 275 |
+
3. Deploy automatically
|
| 276 |
+
|
| 277 |
+
## Support
|
| 278 |
+
|
| 279 |
+
For issues:
|
| 280 |
+
1. Check HuggingFace Spaces documentation
|
| 281 |
+
2. Review logs in Space dashboard
|
| 282 |
+
3. Test locally with Docker first
|
| 283 |
+
4. Open issue on your repository
|
| 284 |
+
|
| 285 |
+
## Next Steps
|
| 286 |
+
|
| 287 |
+
After successful deployment:
|
| 288 |
+
|
| 289 |
+
1. ✅ Test all API endpoints
|
| 290 |
+
2. ✅ Set up monitoring
|
| 291 |
+
3. ✅ Configure custom domain (optional)
|
| 292 |
+
4. ✅ Add authentication for production
|
| 293 |
+
5. ✅ Implement rate limiting
|
| 294 |
+
6. ✅ Set up error tracking (e.g., Sentry)
|
| 295 |
+
7. ✅ Create API documentation with examples
|
| 296 |
+
8. ✅ Add usage analytics
|
| 297 |
+
|
| 298 |
+
Your API is now live and ready to use! 🚀
|
Dockerfile
ADDED
|
@@ -0,0 +1,36 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
FROM python:3.12-slim
|
| 2 |
+
|
| 3 |
+
# Set working directory
|
| 4 |
+
WORKDIR /app
|
| 5 |
+
|
| 6 |
+
# Install system dependencies
|
| 7 |
+
RUN apt-get update && apt-get install -y \
|
| 8 |
+
tesseract-ocr \
|
| 9 |
+
tesseract-ocr-eng \
|
| 10 |
+
poppler-utils \
|
| 11 |
+
libgl1 \
|
| 12 |
+
libglib2.0-0 \
|
| 13 |
+
&& rm -rf /var/lib/apt/lists/*
|
| 14 |
+
|
| 15 |
+
# Copy requirements first for better caching
|
| 16 |
+
COPY requirements.txt .
|
| 17 |
+
|
| 18 |
+
# Install Python dependencies
|
| 19 |
+
RUN pip install --no-cache-dir -r requirements.txt
|
| 20 |
+
|
| 21 |
+
# Copy application code
|
| 22 |
+
COPY . .
|
| 23 |
+
|
| 24 |
+
# Create necessary directories
|
| 25 |
+
RUN mkdir -p uploads outputs
|
| 26 |
+
|
| 27 |
+
# Expose port (HuggingFace Spaces uses 7860)
|
| 28 |
+
EXPOSE 7860
|
| 29 |
+
|
| 30 |
+
# Set environment variables
|
| 31 |
+
ENV PYTHONUNBUFFERED=1
|
| 32 |
+
ENV HF_HOME=/app/cache
|
| 33 |
+
|
| 34 |
+
# Run the application
|
| 35 |
+
CMD ["uvicorn", "main:app", "--host", "0.0.0.0", "--port", "7860"]
|
| 36 |
+
|
LICENSE
ADDED
|
@@ -0,0 +1,21 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
MIT License
|
| 2 |
+
|
| 3 |
+
Copyright (c) 2025 PDF Redaction API
|
| 4 |
+
|
| 5 |
+
Permission is hereby granted, free of charge, to any person obtaining a copy
|
| 6 |
+
of this software and associated documentation files (the "Software"), to deal
|
| 7 |
+
in the Software without restriction, including without limitation the rights
|
| 8 |
+
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
| 9 |
+
copies of the Software, and to permit persons to whom the Software is
|
| 10 |
+
furnished to do so, subject to the following conditions:
|
| 11 |
+
|
| 12 |
+
The above copyright notice and this permission notice shall be included in all
|
| 13 |
+
copies or substantial portions of the Software.
|
| 14 |
+
|
| 15 |
+
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
| 16 |
+
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
| 17 |
+
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
| 18 |
+
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
| 19 |
+
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
| 20 |
+
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
|
| 21 |
+
SOFTWARE.
|
QUICKSTART.md
ADDED
|
@@ -0,0 +1,271 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# Quick Start Guide 🚀
|
| 2 |
+
|
| 3 |
+
## Local Development (5 minutes)
|
| 4 |
+
|
| 5 |
+
### 1. Install System Dependencies
|
| 6 |
+
|
| 7 |
+
**Ubuntu/Debian:**
|
| 8 |
+
```bash
|
| 9 |
+
sudo apt-get update
|
| 10 |
+
sudo apt-get install -y tesseract-ocr poppler-utils
|
| 11 |
+
```
|
| 12 |
+
|
| 13 |
+
**macOS:**
|
| 14 |
+
```bash
|
| 15 |
+
brew install tesseract poppler
|
| 16 |
+
```
|
| 17 |
+
|
| 18 |
+
**Windows:**
|
| 19 |
+
- Download Tesseract: https://github.com/UB-Mannheim/tesseract/wiki
|
| 20 |
+
- Download Poppler: https://github.com/oschwartz10612/poppler-windows/releases
|
| 21 |
+
|
| 22 |
+
### 2. Install Python Dependencies
|
| 23 |
+
|
| 24 |
+
```bash
|
| 25 |
+
pip install -r requirements.txt
|
| 26 |
+
```
|
| 27 |
+
|
| 28 |
+
### 3. Run the Server
|
| 29 |
+
|
| 30 |
+
```bash
|
| 31 |
+
python main.py
|
| 32 |
+
```
|
| 33 |
+
|
| 34 |
+
The API will be available at: `http://localhost:7860`
|
| 35 |
+
|
| 36 |
+
### 4. Test with cURL
|
| 37 |
+
|
| 38 |
+
```bash
|
| 39 |
+
# Health check
|
| 40 |
+
curl http://localhost:7860/health
|
| 41 |
+
|
| 42 |
+
# Redact a PDF
|
| 43 |
+
curl -X POST "http://localhost:7860/redact" \
|
| 44 |
+
-F "file=@your_document.pdf" \
|
| 45 |
+
-F "dpi=300"
|
| 46 |
+
```
|
| 47 |
+
|
| 48 |
+
### 5. Access API Documentation
|
| 49 |
+
|
| 50 |
+
Open in browser: `http://localhost:7860/docs`
|
| 51 |
+
|
| 52 |
+
## Using Docker (3 minutes)
|
| 53 |
+
|
| 54 |
+
### 1. Build Image
|
| 55 |
+
|
| 56 |
+
```bash
|
| 57 |
+
docker build -t pdf-redaction-api .
|
| 58 |
+
```
|
| 59 |
+
|
| 60 |
+
### 2. Run Container
|
| 61 |
+
|
| 62 |
+
```bash
|
| 63 |
+
docker run -p 7860:7860 pdf-redaction-api
|
| 64 |
+
```
|
| 65 |
+
|
| 66 |
+
### 3. Test
|
| 67 |
+
|
| 68 |
+
```bash
|
| 69 |
+
curl http://localhost:7860/health
|
| 70 |
+
```
|
| 71 |
+
|
| 72 |
+
## Deploy to HuggingFace Spaces (10 minutes)
|
| 73 |
+
|
| 74 |
+
### 1. Create Space
|
| 75 |
+
|
| 76 |
+
1. Go to https://huggingface.co/spaces
|
| 77 |
+
2. Click "Create new Space"
|
| 78 |
+
3. Name: `pdf-redaction-api`
|
| 79 |
+
4. SDK: **Docker**
|
| 80 |
+
5. Click "Create Space"
|
| 81 |
+
|
| 82 |
+
### 2. Push Code
|
| 83 |
+
|
| 84 |
+
```bash
|
| 85 |
+
# Clone your space
|
| 86 |
+
git clone https://huggingface.co/spaces/YOUR_USERNAME/pdf-redaction-api
|
| 87 |
+
cd pdf-redaction-api
|
| 88 |
+
|
| 89 |
+
# Copy all project files
|
| 90 |
+
cp -r /path/to/project/* .
|
| 91 |
+
|
| 92 |
+
# Commit and push
|
| 93 |
+
git add .
|
| 94 |
+
git commit -m "Initial deployment"
|
| 95 |
+
git push
|
| 96 |
+
```
|
| 97 |
+
|
| 98 |
+
### 3. Wait for Build
|
| 99 |
+
|
| 100 |
+
Monitor at: `https://huggingface.co/spaces/YOUR_USERNAME/pdf-redaction-api`
|
| 101 |
+
|
| 102 |
+
### 4. Test Your Deployed API
|
| 103 |
+
|
| 104 |
+
```bash
|
| 105 |
+
curl https://YOUR_USERNAME-pdf-redaction-api.hf.space/health
|
| 106 |
+
```
|
| 107 |
+
|
| 108 |
+
## Example Usage
|
| 109 |
+
|
| 110 |
+
### Python Client
|
| 111 |
+
|
| 112 |
+
```python
|
| 113 |
+
import requests
|
| 114 |
+
|
| 115 |
+
# Upload and redact
|
| 116 |
+
files = {"file": open("document.pdf", "rb")}
|
| 117 |
+
response = requests.post(
|
| 118 |
+
"http://localhost:7860/redact",
|
| 119 |
+
files=files,
|
| 120 |
+
params={"dpi": 300}
|
| 121 |
+
)
|
| 122 |
+
|
| 123 |
+
result = response.json()
|
| 124 |
+
job_id = result["job_id"]
|
| 125 |
+
|
| 126 |
+
# Download redacted PDF
|
| 127 |
+
redacted = requests.get(f"http://localhost:7860/download/{job_id}")
|
| 128 |
+
with open("redacted.pdf", "wb") as f:
|
| 129 |
+
f.write(redacted.content)
|
| 130 |
+
|
| 131 |
+
print(f"Redacted {len(result['entities'])} entities")
|
| 132 |
+
```
|
| 133 |
+
|
| 134 |
+
### JavaScript/Node.js
|
| 135 |
+
|
| 136 |
+
```javascript
|
| 137 |
+
const FormData = require('form-data');
|
| 138 |
+
const fs = require('fs');
|
| 139 |
+
const axios = require('axios');
|
| 140 |
+
|
| 141 |
+
async function redactPDF() {
|
| 142 |
+
const form = new FormData();
|
| 143 |
+
form.append('file', fs.createReadStream('document.pdf'));
|
| 144 |
+
|
| 145 |
+
// Upload and redact
|
| 146 |
+
const response = await axios.post(
|
| 147 |
+
'http://localhost:7860/redact',
|
| 148 |
+
form,
|
| 149 |
+
{
|
| 150 |
+
headers: form.getHeaders(),
|
| 151 |
+
params: { dpi: 300 }
|
| 152 |
+
}
|
| 153 |
+
);
|
| 154 |
+
|
| 155 |
+
const { job_id } = response.data;
|
| 156 |
+
|
| 157 |
+
// Download redacted PDF
|
| 158 |
+
const redacted = await axios.get(
|
| 159 |
+
`http://localhost:7860/download/${job_id}`,
|
| 160 |
+
{ responseType: 'arraybuffer' }
|
| 161 |
+
);
|
| 162 |
+
|
| 163 |
+
fs.writeFileSync('redacted.pdf', redacted.data);
|
| 164 |
+
console.log('Redaction complete!');
|
| 165 |
+
}
|
| 166 |
+
|
| 167 |
+
redactPDF();
|
| 168 |
+
```
|
| 169 |
+
|
| 170 |
+
### cURL Advanced
|
| 171 |
+
|
| 172 |
+
```bash
|
| 173 |
+
# Redact only specific entity types
|
| 174 |
+
curl -X POST "http://localhost:7860/redact" \
|
| 175 |
+
-F "file=@document.pdf" \
|
| 176 |
+
-F "dpi=300" \
|
| 177 |
+
-F "entity_types=PER,ORG"
|
| 178 |
+
|
| 179 |
+
# Get statistics
|
| 180 |
+
curl http://localhost:7860/stats
|
| 181 |
+
|
| 182 |
+
# Download specific file
|
| 183 |
+
curl -O -J http://localhost:7860/download/JOB_ID_HERE
|
| 184 |
+
```
|
| 185 |
+
|
| 186 |
+
## Common Use Cases
|
| 187 |
+
|
| 188 |
+
### 1. Redact All Personal Information
|
| 189 |
+
|
| 190 |
+
```python
|
| 191 |
+
response = requests.post(
|
| 192 |
+
"http://localhost:7860/redact",
|
| 193 |
+
files={"file": open("resume.pdf", "rb")},
|
| 194 |
+
params={"dpi": 300}
|
| 195 |
+
)
|
| 196 |
+
```
|
| 197 |
+
|
| 198 |
+
### 2. Redact Only Names and Organizations
|
| 199 |
+
|
| 200 |
+
```python
|
| 201 |
+
response = requests.post(
|
| 202 |
+
"http://localhost:7860/redact",
|
| 203 |
+
files={"file": open("contract.pdf", "rb")},
|
| 204 |
+
params={
|
| 205 |
+
"dpi": 300,
|
| 206 |
+
"entity_types": "PER,ORG"
|
| 207 |
+
}
|
| 208 |
+
)
|
| 209 |
+
```
|
| 210 |
+
|
| 211 |
+
### 3. Fast Processing (Lower Quality)
|
| 212 |
+
|
| 213 |
+
```python
|
| 214 |
+
response = requests.post(
|
| 215 |
+
"http://localhost:7860/redact",
|
| 216 |
+
files={"file": open("large_doc.pdf", "rb")},
|
| 217 |
+
params={"dpi": 150} # Faster but less accurate
|
| 218 |
+
)
|
| 219 |
+
```
|
| 220 |
+
|
| 221 |
+
### 4. High Quality (Slower)
|
| 222 |
+
|
| 223 |
+
```python
|
| 224 |
+
response = requests.post(
|
| 225 |
+
"http://localhost:7860/redact",
|
| 226 |
+
files={"file": open("important.pdf", "rb")},
|
| 227 |
+
params={"dpi": 600} # Best quality, slowest
|
| 228 |
+
)
|
| 229 |
+
```
|
| 230 |
+
|
| 231 |
+
## Troubleshooting
|
| 232 |
+
|
| 233 |
+
### "Model not loaded"
|
| 234 |
+
**Problem**: NER model failed to load
|
| 235 |
+
**Solution**: Check internet connection, wait for model download
|
| 236 |
+
|
| 237 |
+
### "Tesseract not found"
|
| 238 |
+
**Problem**: OCR engine not installed
|
| 239 |
+
**Solution**: Install tesseract-ocr system package
|
| 240 |
+
|
| 241 |
+
### "Poppler not found"
|
| 242 |
+
**Problem**: PDF converter not installed
|
| 243 |
+
**Solution**: Install poppler-utils system package
|
| 244 |
+
|
| 245 |
+
### Slow processing
|
| 246 |
+
**Problem**: Redaction takes too long
|
| 247 |
+
**Solution**: Lower DPI to 150-200
|
| 248 |
+
|
| 249 |
+
### Out of memory
|
| 250 |
+
**Problem**: Large PDF crashes the API
|
| 251 |
+
**Solution**:
|
| 252 |
+
- Process one page at a time
|
| 253 |
+
- Increase container memory
|
| 254 |
+
- Lower DPI
|
| 255 |
+
|
| 256 |
+
## Next Steps
|
| 257 |
+
|
| 258 |
+
- ✅ Read full [README.md](README.md) for API details
|
| 259 |
+
- ✅ Check [DEPLOYMENT.md](DEPLOYMENT.md) for production setup
|
| 260 |
+
- ✅ Review [STRUCTURE.md](STRUCTURE.md) for code organization
|
| 261 |
+
- ✅ Run tests: `pytest tests/`
|
| 262 |
+
- ✅ Add authentication for production use
|
| 263 |
+
- ✅ Set up monitoring and logging
|
| 264 |
+
|
| 265 |
+
## Support
|
| 266 |
+
|
| 267 |
+
- 📖 API Docs: `http://localhost:7860/docs`
|
| 268 |
+
- 🐛 Issues: Create on your repository
|
| 269 |
+
- 💬 HuggingFace: Community forums
|
| 270 |
+
|
| 271 |
+
Happy redacting! 🔒
|
README.md
CHANGED
|
@@ -1,10 +1,167 @@
|
|
| 1 |
---
|
| 2 |
title: PDF Redaction API
|
| 3 |
-
emoji:
|
| 4 |
colorFrom: blue
|
| 5 |
-
colorTo:
|
| 6 |
sdk: docker
|
| 7 |
pinned: false
|
|
|
|
| 8 |
---
|
| 9 |
|
| 10 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
---
|
| 2 |
title: PDF Redaction API
|
| 3 |
+
emoji: 🔒
|
| 4 |
colorFrom: blue
|
| 5 |
+
colorTo: green
|
| 6 |
sdk: docker
|
| 7 |
pinned: false
|
| 8 |
+
license: mit
|
| 9 |
---
|
| 10 |
|
| 11 |
+
# PDF Redaction API 🔒
|
| 12 |
+
|
| 13 |
+
Automatically redact sensitive information from PDF documents using Named Entity Recognition (NER).
|
| 14 |
+
|
| 15 |
+
## Features
|
| 16 |
+
|
| 17 |
+
- 🤖 **Powered by NER**: Uses state-of-the-art Named Entity Recognition
|
| 18 |
+
- 📄 **PDF Support**: Upload and process PDF documents
|
| 19 |
+
- 🎯 **Accurate Redaction**: Correctly positioned black rectangles over sensitive text
|
| 20 |
+
- 🚀 **Fast Processing**: Optimized OCR and NER pipeline
|
| 21 |
+
- 🔧 **Configurable**: Adjust DPI and filter entity types
|
| 22 |
+
|
| 23 |
+
## API Endpoints
|
| 24 |
+
|
| 25 |
+
### `POST /redact`
|
| 26 |
+
|
| 27 |
+
Upload a PDF file and get it redacted.
|
| 28 |
+
|
| 29 |
+
**Parameters:**
|
| 30 |
+
- `file`: PDF file (required)
|
| 31 |
+
- `dpi`: OCR quality (default: 300)
|
| 32 |
+
- `entity_types`: Comma-separated entity types to redact (optional)
|
| 33 |
+
|
| 34 |
+
**Example using cURL:**
|
| 35 |
+
|
| 36 |
+
```bash
|
| 37 |
+
curl -X POST "https://your-space.hf.space/redact" \
|
| 38 |
+
-F "file=@document.pdf" \
|
| 39 |
+
-F "dpi=300"
|
| 40 |
+
```
|
| 41 |
+
|
| 42 |
+
**Example using Python:**
|
| 43 |
+
|
| 44 |
+
```python
|
| 45 |
+
import requests
|
| 46 |
+
|
| 47 |
+
url = "https://your-space.hf.space/redact"
|
| 48 |
+
files = {"file": open("document.pdf", "rb")}
|
| 49 |
+
params = {"dpi": 300}
|
| 50 |
+
|
| 51 |
+
response = requests.post(url, files=files, params=params)
|
| 52 |
+
result = response.json()
|
| 53 |
+
|
| 54 |
+
# Download redacted file
|
| 55 |
+
job_id = result["job_id"]
|
| 56 |
+
download_url = f"https://your-space.hf.space/download/{job_id}"
|
| 57 |
+
redacted_pdf = requests.get(download_url)
|
| 58 |
+
|
| 59 |
+
with open("redacted.pdf", "wb") as f:
|
| 60 |
+
f.write(redacted_pdf.content)
|
| 61 |
+
```
|
| 62 |
+
|
| 63 |
+
### `GET /download/{job_id}`
|
| 64 |
+
|
| 65 |
+
Download the redacted PDF file.
|
| 66 |
+
|
| 67 |
+
### `GET /health`
|
| 68 |
+
|
| 69 |
+
Check API health and model status.
|
| 70 |
+
|
| 71 |
+
### `GET /stats`
|
| 72 |
+
|
| 73 |
+
Get API statistics.
|
| 74 |
+
|
| 75 |
+
## Response Format
|
| 76 |
+
|
| 77 |
+
```json
|
| 78 |
+
{
|
| 79 |
+
"job_id": "uuid-here",
|
| 80 |
+
"status": "completed",
|
| 81 |
+
"message": "Successfully redacted 5 entities",
|
| 82 |
+
"entities": [
|
| 83 |
+
{
|
| 84 |
+
"entity_type": "PER",
|
| 85 |
+
"entity_text": "John Doe",
|
| 86 |
+
"page": 1,
|
| 87 |
+
"word_count": 2
|
| 88 |
+
}
|
| 89 |
+
],
|
| 90 |
+
"redacted_file_url": "/download/uuid-here"
|
| 91 |
+
}
|
| 92 |
+
```
|
| 93 |
+
|
| 94 |
+
## Entity Types
|
| 95 |
+
|
| 96 |
+
Common entity types detected:
|
| 97 |
+
- `PER`: Person names
|
| 98 |
+
- `ORG`: Organizations
|
| 99 |
+
- `LOC`: Locations
|
| 100 |
+
- `DATE`: Dates
|
| 101 |
+
- `EMAIL`: Email addresses
|
| 102 |
+
- `PHONE`: Phone numbers
|
| 103 |
+
- And more...
|
| 104 |
+
|
| 105 |
+
## Local Development
|
| 106 |
+
|
| 107 |
+
### Prerequisites
|
| 108 |
+
|
| 109 |
+
- Python 3.10+
|
| 110 |
+
- Tesseract OCR
|
| 111 |
+
- Poppler utils
|
| 112 |
+
|
| 113 |
+
### Installation
|
| 114 |
+
|
| 115 |
+
```bash
|
| 116 |
+
# Install system dependencies (Ubuntu/Debian)
|
| 117 |
+
sudo apt-get install tesseract-ocr poppler-utils
|
| 118 |
+
|
| 119 |
+
# Install Python dependencies
|
| 120 |
+
pip install -r requirements.txt
|
| 121 |
+
|
| 122 |
+
# Run the server
|
| 123 |
+
python main.py
|
| 124 |
+
```
|
| 125 |
+
|
| 126 |
+
The API will be available at `http://localhost:7860`
|
| 127 |
+
|
| 128 |
+
### Using Docker
|
| 129 |
+
|
| 130 |
+
```bash
|
| 131 |
+
# Build the image
|
| 132 |
+
docker build -t pdf-redaction-api .
|
| 133 |
+
|
| 134 |
+
# Run the container
|
| 135 |
+
docker run -p 7860:7860 pdf-redaction-api
|
| 136 |
+
```
|
| 137 |
+
|
| 138 |
+
## Configuration
|
| 139 |
+
|
| 140 |
+
Adjust the DPI parameter based on your needs:
|
| 141 |
+
- `150`: Fast processing, lower quality
|
| 142 |
+
- `300`: Recommended balance (default)
|
| 143 |
+
- `600`: High quality, slower processing
|
| 144 |
+
|
| 145 |
+
## Limitations
|
| 146 |
+
|
| 147 |
+
- Maximum file size: Dependent on Space resources
|
| 148 |
+
- Processing time increases with page count and DPI
|
| 149 |
+
- Files are automatically cleaned up after processing
|
| 150 |
+
|
| 151 |
+
## Privacy
|
| 152 |
+
|
| 153 |
+
- Uploaded files are processed in-memory and deleted after redaction
|
| 154 |
+
- No data is stored permanently
|
| 155 |
+
- Use your own deployment for sensitive documents
|
| 156 |
+
|
| 157 |
+
## Credits
|
| 158 |
+
|
| 159 |
+
Built with:
|
| 160 |
+
- [FastAPI](https://fastapi.tiangolo.com/)
|
| 161 |
+
- [Transformers](https://huggingface.co/transformers/)
|
| 162 |
+
- [PyPDF](https://github.com/py-pdf/pypdf)
|
| 163 |
+
- [Tesseract OCR](https://github.com/tesseract-ocr/tesseract)
|
| 164 |
+
|
| 165 |
+
## License
|
| 166 |
+
|
| 167 |
+
MIT License - See LICENSE file for details
|
STRUCTURE.md
ADDED
|
@@ -0,0 +1,269 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# Project Structure
|
| 2 |
+
|
| 3 |
+
```
|
| 4 |
+
pdf-redaction-api/
|
| 5 |
+
│
|
| 6 |
+
├── main.py # FastAPI application entry point
|
| 7 |
+
├── Dockerfile # Docker configuration for deployment
|
| 8 |
+
├── requirements.txt # Python dependencies
|
| 9 |
+
├── README.md # Project documentation (for HuggingFace)
|
| 10 |
+
├── DEPLOYMENT.md # Deployment guide
|
| 11 |
+
├── .gitignore # Git ignore rules
|
| 12 |
+
├── .dockerignore # Docker ignore rules
|
| 13 |
+
│
|
| 14 |
+
├── app/ # Application modules
|
| 15 |
+
│ ├── __init__.py # Package initialization
|
| 16 |
+
│ └── redaction.py # Core redaction logic (PDFRedactor class)
|
| 17 |
+
│
|
| 18 |
+
├── uploads/ # Temporary upload directory
|
| 19 |
+
│ └── .gitkeep # Keep directory in git
|
| 20 |
+
│
|
| 21 |
+
├── outputs/ # Redacted PDF output directory
|
| 22 |
+
│ └── .gitkeep # Keep directory in git
|
| 23 |
+
│
|
| 24 |
+
├── tests/ # Test suite
|
| 25 |
+
│ ├── __init__.py
|
| 26 |
+
│ └── test_api.py # API endpoint tests
|
| 27 |
+
│
|
| 28 |
+
└── client_example.py # Example client for API usage
|
| 29 |
+
```
|
| 30 |
+
|
| 31 |
+
## File Descriptions
|
| 32 |
+
|
| 33 |
+
### Core Files
|
| 34 |
+
|
| 35 |
+
#### `main.py`
|
| 36 |
+
FastAPI application with endpoints:
|
| 37 |
+
- `POST /redact` - Upload and redact PDF
|
| 38 |
+
- `GET /download/{job_id}` - Download redacted PDF
|
| 39 |
+
- `GET /health` - Health check
|
| 40 |
+
- `GET /stats` - API statistics
|
| 41 |
+
- `DELETE /cleanup/{job_id}` - Manual cleanup
|
| 42 |
+
|
| 43 |
+
#### `app/redaction.py`
|
| 44 |
+
Core redaction logic:
|
| 45 |
+
- `PDFRedactor` class
|
| 46 |
+
- OCR processing with pytesseract
|
| 47 |
+
- NER using HuggingFace transformers
|
| 48 |
+
- Entity-to-box mapping
|
| 49 |
+
- PDF redaction with coordinate scaling
|
| 50 |
+
|
| 51 |
+
### Configuration Files
|
| 52 |
+
|
| 53 |
+
#### `requirements.txt`
|
| 54 |
+
Python dependencies:
|
| 55 |
+
- FastAPI & Uvicorn (API framework)
|
| 56 |
+
- Transformers & Torch (NER model)
|
| 57 |
+
- PyPDF (PDF manipulation)
|
| 58 |
+
- pdf2image (PDF to image conversion)
|
| 59 |
+
- pytesseract (OCR)
|
| 60 |
+
- Pillow (Image processing)
|
| 61 |
+
|
| 62 |
+
#### `Dockerfile`
|
| 63 |
+
Single-stage build:
|
| 64 |
+
1. Install system dependencies (tesseract, poppler)
|
| 65 |
+
2. Install Python dependencies
|
| 66 |
+
3. Copy application code
|
| 67 |
+
4. Configure for port 7860 (HuggingFace default)
|
| 68 |
+
|
| 69 |
+
### Documentation
|
| 70 |
+
|
| 71 |
+
#### `README.md`
|
| 72 |
+
HuggingFace Space documentation:
|
| 73 |
+
- Features overview
|
| 74 |
+
- API endpoint documentation
|
| 75 |
+
- Usage examples (cURL, Python)
|
| 76 |
+
- Response format
|
| 77 |
+
- Local development setup
|
| 78 |
+
|
| 79 |
+
#### `DEPLOYMENT.md`
|
| 80 |
+
Step-by-step deployment guide:
|
| 81 |
+
- HuggingFace Spaces setup
|
| 82 |
+
- Git workflow
|
| 83 |
+
- Configuration options
|
| 84 |
+
- Security considerations
|
| 85 |
+
- Troubleshooting
|
| 86 |
+
- Cost estimation
|
| 87 |
+
|
| 88 |
+
### Testing & Examples
|
| 89 |
+
|
| 90 |
+
#### `tests/test_api.py`
|
| 91 |
+
Unit tests for API endpoints:
|
| 92 |
+
- Health check tests
|
| 93 |
+
- Upload validation tests
|
| 94 |
+
- Error handling tests
|
| 95 |
+
|
| 96 |
+
#### `client_example.py`
|
| 97 |
+
Example client implementation:
|
| 98 |
+
- Upload PDF
|
| 99 |
+
- Download redacted file
|
| 100 |
+
- Health check
|
| 101 |
+
- Statistics
|
| 102 |
+
|
| 103 |
+
## Data Flow
|
| 104 |
+
|
| 105 |
+
```
|
| 106 |
+
┌─────────────────────────────────────────────────────────┐
|
| 107 |
+
│ 1. Client uploads PDF │
|
| 108 |
+
│ POST /redact with file │
|
| 109 |
+
└─────────────────────────────────────────────────────────┘
|
| 110 |
+
↓
|
| 111 |
+
┌─────────────────────────────────────────────────────────┐
|
| 112 |
+
│ 2. FastAPI (main.py) │
|
| 113 |
+
│ - Validates file │
|
| 114 |
+
│ - Generates job_id │
|
| 115 |
+
│ - Saves to uploads/ │
|
| 116 |
+
└─────────────────────────────────────────────────────────┘
|
| 117 |
+
↓
|
| 118 |
+
┌─────────────────────────────────────────────────────────┐
|
| 119 |
+
│ 3. PDFRedactor (app/redaction.py) │
|
| 120 |
+
│ - perform_ocr() → Extract text + boxes │
|
| 121 |
+
│ - run_ner() → Identify entities │
|
| 122 |
+
│ - map_entities_to_boxes() → Link entities to coords │
|
| 123 |
+
│ - create_redacted_pdf() → Generate output │
|
| 124 |
+
└─────────────────────────────────────────────────────────┘
|
| 125 |
+
↓
|
| 126 |
+
┌─────────────────────────────────────────────────────────┐
|
| 127 |
+
│ 4. Response │
|
| 128 |
+
│ - Return job_id and entity list │
|
| 129 |
+
│ - Save redacted PDF to outputs/ │
|
| 130 |
+
└─────────────────────────────────────────────────────────┘
|
| 131 |
+
↓
|
| 132 |
+
┌─────────────────────────────────────────────────────────┐
|
| 133 |
+
│ 5. Client downloads │
|
| 134 |
+
│ GET /download/{job_id} │
|
| 135 |
+
└─────────────────────────────────────────────────────────┘
|
| 136 |
+
```
|
| 137 |
+
|
| 138 |
+
## Key Components
|
| 139 |
+
|
| 140 |
+
### 1. FastAPI Application (`main.py`)
|
| 141 |
+
|
| 142 |
+
**Endpoints:**
|
| 143 |
+
- RESTful API design
|
| 144 |
+
- File upload handling
|
| 145 |
+
- Background task cleanup
|
| 146 |
+
- CORS middleware for web access
|
| 147 |
+
|
| 148 |
+
**Features:**
|
| 149 |
+
- Automatic OpenAPI documentation at `/docs`
|
| 150 |
+
- JSON response models with Pydantic
|
| 151 |
+
- Error handling with HTTP exceptions
|
| 152 |
+
- Request validation
|
| 153 |
+
|
| 154 |
+
### 2. Redaction Engine (`app/redaction.py`)
|
| 155 |
+
|
| 156 |
+
**Pipeline Steps:**
|
| 157 |
+
|
| 158 |
+
1. **OCR Processing**
|
| 159 |
+
- Convert PDF pages to images (pdf2image)
|
| 160 |
+
- Extract text and bounding boxes (pytesseract)
|
| 161 |
+
- Store image dimensions for coordinate scaling
|
| 162 |
+
|
| 163 |
+
2. **NER Processing**
|
| 164 |
+
- Load HuggingFace model
|
| 165 |
+
- Identify entities in text
|
| 166 |
+
- Return entity types and character positions
|
| 167 |
+
|
| 168 |
+
3. **Mapping**
|
| 169 |
+
- Create character span index for OCR words
|
| 170 |
+
- Match NER entities to OCR bounding boxes
|
| 171 |
+
- Handle partial word matches
|
| 172 |
+
|
| 173 |
+
4. **Redaction**
|
| 174 |
+
- Scale OCR image coordinates to PDF points
|
| 175 |
+
- Create black rectangle annotations
|
| 176 |
+
- Write redacted PDF with pypdf
|
| 177 |
+
|
| 178 |
+
### 3. Docker Container
|
| 179 |
+
|
| 180 |
+
**Layers:**
|
| 181 |
+
- Base: Python 3.12 slim
|
| 182 |
+
- System packages: tesseract-ocr, poppler-utils
|
| 183 |
+
- Python packages: From requirements.txt
|
| 184 |
+
- Application code: Copied last for better caching
|
| 185 |
+
|
| 186 |
+
**Optimizations:**
|
| 187 |
+
- Multi-stage build (not used here, but possible)
|
| 188 |
+
- Minimal base image
|
| 189 |
+
- Cached dependency layers
|
| 190 |
+
- .dockerignore to reduce context size
|
| 191 |
+
|
| 192 |
+
## Environment Variables
|
| 193 |
+
|
| 194 |
+
Default configuration (can be overridden):
|
| 195 |
+
|
| 196 |
+
```bash
|
| 197 |
+
PYTHONUNBUFFERED=1 # Immediate log output
|
| 198 |
+
HF_HOME=/app/cache # HuggingFace cache directory
|
| 199 |
+
```
|
| 200 |
+
|
| 201 |
+
## Port Configuration
|
| 202 |
+
|
| 203 |
+
- **Development**: 7860 (configurable in main.py)
|
| 204 |
+
- **Production (HF Spaces)**: 7860 (required)
|
| 205 |
+
|
| 206 |
+
## Directory Permissions
|
| 207 |
+
|
| 208 |
+
Ensure write permissions for:
|
| 209 |
+
- `uploads/` - Temporary PDF storage
|
| 210 |
+
- `outputs/` - Redacted PDF storage
|
| 211 |
+
- `cache/` - Model cache (created automatically)
|
| 212 |
+
|
| 213 |
+
## Adding New Features
|
| 214 |
+
|
| 215 |
+
### Add New Endpoint
|
| 216 |
+
|
| 217 |
+
1. Define in `main.py`:
|
| 218 |
+
```python
|
| 219 |
+
@app.get("/new-endpoint")
|
| 220 |
+
async def new_endpoint():
|
| 221 |
+
return {"message": "Hello"}
|
| 222 |
+
```
|
| 223 |
+
|
| 224 |
+
2. Add response model if needed
|
| 225 |
+
3. Update README.md documentation
|
| 226 |
+
4. Add tests in `tests/test_api.py`
|
| 227 |
+
|
| 228 |
+
### Add New Redaction Option
|
| 229 |
+
|
| 230 |
+
1. Modify `PDFRedactor` class in `app/redaction.py`
|
| 231 |
+
2. Add parameter to `redact_document()` method
|
| 232 |
+
3. Update API endpoint in `main.py`
|
| 233 |
+
4. Document in README.md
|
| 234 |
+
|
| 235 |
+
### Add Authentication
|
| 236 |
+
|
| 237 |
+
1. Install: `pip install python-jose passlib`
|
| 238 |
+
2. Create `app/auth.py` with JWT logic
|
| 239 |
+
3. Add middleware to `main.py`
|
| 240 |
+
4. Protect endpoints with dependencies
|
| 241 |
+
|
| 242 |
+
## Best Practices
|
| 243 |
+
|
| 244 |
+
1. **Logging**: Use `logger` for all important events
|
| 245 |
+
2. **Error Handling**: Catch exceptions and return meaningful errors
|
| 246 |
+
3. **Validation**: Use Pydantic models for request/response validation
|
| 247 |
+
4. **Cleanup**: Always clean up temporary files
|
| 248 |
+
5. **Documentation**: Keep README.md and code comments updated
|
| 249 |
+
6. **Testing**: Add tests for new features
|
| 250 |
+
|
| 251 |
+
## Performance Considerations
|
| 252 |
+
|
| 253 |
+
### Bottlenecks
|
| 254 |
+
1. OCR processing (most time-consuming)
|
| 255 |
+
2. Model inference (NER)
|
| 256 |
+
3. File I/O
|
| 257 |
+
|
| 258 |
+
### Optimizations
|
| 259 |
+
- Lower DPI for faster OCR (trade-off with accuracy)
|
| 260 |
+
- Cache loaded models in memory
|
| 261 |
+
- Use async file operations
|
| 262 |
+
- Implement request queuing for high load
|
| 263 |
+
- Consider GPU for NER model
|
| 264 |
+
|
| 265 |
+
### Scaling
|
| 266 |
+
- Horizontal: Multiple container instances
|
| 267 |
+
- Vertical: Larger CPU/RAM allocation
|
| 268 |
+
- Caching: Redis for temporary results
|
| 269 |
+
- Queue: Celery for background processing
|
app/__init__.py
ADDED
|
@@ -0,0 +1,6 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""
|
| 2 |
+
App module for PDF redaction API
|
| 3 |
+
"""
|
| 4 |
+
from .redaction import PDFRedactor
|
| 5 |
+
|
| 6 |
+
__all__ = ['PDFRedactor']
|
app/redaction.py
ADDED
|
@@ -0,0 +1,316 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""
|
| 2 |
+
PDF Redaction module using NER
|
| 3 |
+
"""
|
| 4 |
+
from pdf2image import convert_from_path
|
| 5 |
+
import pytesseract
|
| 6 |
+
from pypdf import PdfReader, PdfWriter
|
| 7 |
+
from pypdf.generic import DictionaryObject, ArrayObject, NameObject, NumberObject
|
| 8 |
+
from transformers import pipeline, AutoTokenizer, AutoModelForTokenClassification
|
| 9 |
+
from typing import List, Dict, Optional
|
| 10 |
+
import logging
|
| 11 |
+
|
| 12 |
+
logger = logging.getLogger(__name__)
|
| 13 |
+
|
| 14 |
+
|
| 15 |
+
class PDFRedactor:
    """PDF Redaction using Named Entity Recognition.

    Pipeline:
        1. Rasterize the PDF to page images (pdf2image / poppler).
        2. OCR each page with Tesseract to get word-level bounding boxes.
        3. Run a HuggingFace token-classification model over the full text.
        4. Map entity character spans back onto OCR word boxes.
        5. Draw filled black /Square annotations over those boxes with pypdf.

    NOTE(review): the redactions are drawn as annotations *on top of* the
    page content; the underlying text stream is left intact, so the original
    text may still be extractable from the output PDF.
    """

    def __init__(self, model_name: str = "./model"):
        """
        Initialize the PDF Redactor.

        Args:
            model_name: HuggingFace model name or local path for NER
        """
        self.model_name = model_name
        self.ner_pipeline = None  # set by _load_model(); None => not loaded
        self._load_model()

    def _load_model(self):
        """Load the NER model and build the token-classification pipeline."""
        try:
            logger.info(f"Loading NER model: {self.model_name}")
            tokenizer = AutoTokenizer.from_pretrained(self.model_name)
            model = AutoModelForTokenClassification.from_pretrained(self.model_name)

            self.ner_pipeline = pipeline("token-classification", model=model,
                                         tokenizer=tokenizer)
            logger.info("NER model loaded successfully")
        except Exception as e:
            # Re-raise so the service fails loudly at startup instead of
            # limping along without a model.
            logger.error(f"Error loading NER model: {str(e)}")
            raise

    def is_model_loaded(self) -> bool:
        """Check if the model is loaded."""
        return self.ner_pipeline is not None

    def perform_ocr(self, pdf_path: str, dpi: int = 300) -> List[Dict]:
        """
        Perform OCR on PDF and extract word bounding boxes.

        Args:
            pdf_path: Path to the PDF file
            dpi: DPI for PDF-to-image conversion (higher = better OCR,
                slower and more memory)

        Returns:
            List of word dicts with keys: 'text', 'box' (left, top, width,
            height in image pixels), 'page' (1-based), 'confidence', plus
            the source 'image_width'/'image_height' needed later to scale
            pixel coordinates to PDF points.
        """
        logger.info(f"Starting OCR on {pdf_path} at {dpi} DPI")
        all_words_data = []

        try:
            images = convert_from_path(pdf_path, dpi=dpi)
            logger.info(f"Converted PDF to {len(images)} images")

            for page_num, image in enumerate(images):
                # Get image dimensions (needed for pixel -> point scaling)
                image_width, image_height = image.size

                # Perform OCR
                data = pytesseract.image_to_data(image, output_type=pytesseract.Output.DICT)

                num_words = len(data['text'])
                for i in range(num_words):
                    word_text = data['text'][i].strip()
                    confidence = int(data['conf'][i])

                    # Filter out empty or low-confidence words
                    # (Tesseract reports conf == -1 for non-word layout boxes)
                    if word_text and confidence > 0:
                        all_words_data.append({
                            'text': word_text,
                            'box': (data['left'][i], data['top'][i],
                                    data['width'][i], data['height'][i]),
                            'page': page_num + 1,
                            'confidence': confidence,
                            'image_width': image_width,
                            'image_height': image_height
                        })

                logger.info(f"Processed page {page_num + 1}: {len([w for w in all_words_data if w['page'] == page_num + 1])} words")

            logger.info(f"OCR complete: {len(all_words_data)} total words extracted")
            return all_words_data

        except Exception as e:
            logger.error(f"Error during OCR: {str(e)}")
            raise

    def run_ner(self, text: str) -> List[Dict]:
        """
        Run NER on text.

        Args:
            text: Input text

        Returns:
            List of identified entities; each carries at least 'entity',
            'word', and 'start'/'end' character offsets into *text*.

        Raises:
            RuntimeError: If the model has not been loaded.
        """
        if not self.ner_pipeline:
            raise RuntimeError("NER model not loaded")

        logger.info(f"Running NER on text of length {len(text)}")

        try:
            results = self.ner_pipeline(text)
            logger.info(f"NER identified {len(results)} entities")
            return results
        except Exception as e:
            logger.error(f"Error during NER: {str(e)}")
            raise

    def map_entities_to_boxes(self, ner_results: List[Dict],
                              ocr_data: List[Dict]) -> List[Dict]:
        """
        Map NER entities to OCR bounding boxes.

        Character offsets in *ner_results* must refer to the text built by
        joining the OCR words with single spaces (see redact_document), so
        each word occupies [start_char, end_char) with a one-char gap.

        Args:
            ner_results: List of NER entities
            ocr_data: List of OCR word data

        Returns:
            List of mapped entities with bounding boxes
        """
        logger.info("Mapping NER entities to OCR bounding boxes")
        mapped_entities = []

        # Create character span mapping for each OCR word
        ocr_word_char_spans = []
        current_char_index = 0

        for ocr_data_idx, word_info in enumerate(ocr_data):
            word_text = word_info['text']
            length = len(word_text)

            ocr_word_char_spans.append({
                'ocr_data_idx': ocr_data_idx,
                'start_char': current_char_index,
                'end_char': current_char_index + length
            })
            current_char_index += length + 1  # +1 for the joining space

        # Map each NER entity to OCR words
        for ner_entity in ner_results:
            ner_entity_type = ner_entity['entity']
            ner_start = ner_entity['start']
            ner_end = ner_entity['end']
            ner_word = ner_entity['word']

            matching_ocr_words = []

            for ocr_word_span in ocr_word_char_spans:
                ocr_start = ocr_word_span['start_char']
                ocr_end = ocr_word_span['end_char']

                # Check for overlap (half-open interval intersection); a
                # partial overlap redacts the whole word's box
                if max(ocr_start, ner_start) < min(ocr_end, ner_end):
                    matching_ocr_words.append(ocr_data[ocr_word_span['ocr_data_idx']])

            if matching_ocr_words:
                mapped_entities.append({
                    'entity_type': ner_entity_type,
                    'entity_text': ner_word,
                    'words': matching_ocr_words
                })

        logger.info(f"Mapped {len(mapped_entities)} entities to bounding boxes")
        return mapped_entities

    def create_redacted_pdf(self, original_pdf_path: str,
                            mapped_entities: List[Dict],
                            output_path: str) -> str:
        """
        Create redacted PDF with black rectangles over entities.

        Args:
            original_pdf_path: Path to original PDF
            mapped_entities: List of entities with bounding boxes
            output_path: Path for output PDF

        Returns:
            Path to redacted PDF
        """
        logger.info(f"Creating redacted PDF: {output_path}")

        try:
            reader = PdfReader(original_pdf_path)
            writer = PdfWriter()

            for page_num in range(len(reader.pages)):
                page = reader.pages[page_num]
                media_box = page.mediabox
                page_width = float(media_box.width)
                page_height = float(media_box.height)

                writer.add_page(page)

                page_entities = 0
                for entity_info in mapped_entities:
                    for word_info in entity_info['words']:
                        if word_info['page'] == page_num + 1:
                            x, y, w, h = word_info['box']

                            # Get the dimensions of the OCR source image
                            image_width = word_info['image_width']
                            image_height = word_info['image_height']

                            # Scale image pixels -> PDF points
                            scale_x = page_width / image_width
                            scale_y = page_height / image_height

                            x_scaled = x * scale_x
                            y_scaled = y * scale_y
                            w_scaled = w * scale_x
                            h_scaled = h * scale_y

                            # Convert to PDF coordinates: PDF origin is
                            # bottom-left with y pointing up, so flip the
                            # image's top-left y values
                            llx = x_scaled
                            lly = page_height - (y_scaled + h_scaled)
                            urx = x_scaled + w_scaled
                            ury = page_height - y_scaled

                            # Create a filled black square annotation:
                            # /C = border color, /IC = interior fill color,
                            # /BS with /W 0 = no border line
                            redaction_annotation = DictionaryObject()
                            redaction_annotation.update({
                                NameObject("/Type"): NameObject("/Annot"),
                                NameObject("/Subtype"): NameObject("/Square"),
                                NameObject("/Rect"): ArrayObject([
                                    NumberObject(llx),
                                    NumberObject(lly),
                                    NumberObject(urx),
                                    NumberObject(ury),
                                ]),
                                NameObject("/C"): ArrayObject([
                                    NumberObject(0), NumberObject(0), NumberObject(0)
                                ]),
                                NameObject("/IC"): ArrayObject([
                                    NumberObject(0), NumberObject(0), NumberObject(0)
                                ]),
                                NameObject("/BS"): DictionaryObject({
                                    NameObject("/W"): NumberObject(0)
                                })
                            })

                            writer.add_annotation(page_number=page_num,
                                                  annotation=redaction_annotation)
                            page_entities += 1

                logger.info(f"Page {page_num + 1}: Added {page_entities} redactions")

            # Write output
            with open(output_path, "wb") as output_file:
                writer.write(output_file)

            logger.info(f"Redacted PDF created successfully: {output_path}")
            return output_path

        except Exception as e:
            logger.error(f"Error creating redacted PDF: {str(e)}")
            raise

    def redact_document(self, pdf_path: str, output_path: str,
                        dpi: int = 300,
                        entity_filter: Optional[List[str]] = None) -> Dict:
        """
        Complete redaction pipeline.

        Args:
            pdf_path: Path to input PDF
            output_path: Path for output PDF
            dpi: DPI for OCR
            entity_filter: List of entity types to redact (None = all)

        Returns:
            Dictionary with redaction results: 'output_path', 'total_words',
            'total_entities' (pre-filter), 'redacted_entities' (post-filter),
            and the mapped 'entities' themselves.
        """
        logger.info(f"Starting redaction pipeline for {pdf_path}")

        # Step 1: OCR
        ocr_data = self.perform_ocr(pdf_path, dpi)

        # Step 2: Rebuild document text; single-space joining must match the
        # span arithmetic in map_entities_to_boxes
        full_text = " ".join([word['text'] for word in ocr_data])

        # Step 3: NER
        ner_results = self.run_ner(full_text)

        # Step 4: Map entities to boxes
        mapped_entities = self.map_entities_to_boxes(ner_results, ocr_data)

        # Step 5: Filter entities if requested
        if entity_filter:
            mapped_entities = [
                e for e in mapped_entities
                if e['entity_type'] in entity_filter
            ]
            logger.info(f"Filtered to {len(mapped_entities)} entities of types: {entity_filter}")

        # Step 6: Create redacted PDF
        self.create_redacted_pdf(pdf_path, mapped_entities, output_path)

        return {
            'output_path': output_path,
            'total_words': len(ocr_data),
            'total_entities': len(ner_results),
            'redacted_entities': len(mapped_entities),
            'entities': mapped_entities
        }
|
client_example.py
ADDED
|
@@ -0,0 +1,142 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""
|
| 2 |
+
Example client for PDF Redaction API
|
| 3 |
+
"""
|
| 4 |
+
import requests
|
| 5 |
+
from pathlib import Path
|
| 6 |
+
import sys
|
| 7 |
+
|
| 8 |
+
|
| 9 |
+
def redact_pdf(api_url: str, pdf_path: str, output_path: str = "redacted.pdf",
               dpi: int = 300, entity_types: str = None):
    """
    Redact a PDF file using the API.

    Uploads the PDF to the /redact endpoint, prints the entities that were
    redacted, then downloads the resulting file from /download/<job_id>.

    Args:
        api_url: Base URL of the API
        pdf_path: Path to the PDF file to redact
        output_path: Path to save the redacted PDF
        dpi: DPI for OCR processing
        entity_types: Comma-separated list of entity types to redact
            (None redacts every entity type)

    Returns:
        True on success, False on any error.
    """
    # Check if file exists
    if not Path(pdf_path).exists():
        print(f"Error: File {pdf_path} not found")
        return False

    print(f"Uploading {pdf_path}...")

    # Prepare request
    params = {"dpi": dpi}
    if entity_types:
        params["entity_types"] = entity_types

    try:
        # Context manager guarantees the handle is closed even on errors
        # (the original opened the file manually and closed it in finally)
        with open(pdf_path, "rb") as pdf_file:
            files = {"file": pdf_file}
            # Upload and redact
            response = requests.post(f"{api_url}/redact", files=files, params=params)
        response.raise_for_status()

        result = response.json()
        print(f"\nStatus: {result['status']}")
        print(f"Message: {result['message']}")

        # Display found entities
        if result.get('entities'):
            print("\nEntities redacted:")
            for i, entity in enumerate(result['entities'], 1):
                print(f"  {i}. {entity['entity_type']}: {entity['entity_text']} "
                      f"(Page {entity['page']}, {entity['word_count']} words)")

        # Download redacted file
        job_id = result['job_id']
        print(f"\nDownloading redacted PDF...")

        download_response = requests.get(f"{api_url}/download/{job_id}")
        download_response.raise_for_status()

        # Save file
        with open(output_path, "wb") as f:
            f.write(download_response.content)

        print(f"✓ Redacted PDF saved to: {output_path}")

        # Cleanup (optional)
        # requests.delete(f"{api_url}/cleanup/{job_id}")

        return True

    except requests.exceptions.RequestException as e:
        print(f"Error: {e}")
        return False
|
| 74 |
+
|
| 75 |
+
|
| 76 |
+
def check_health(api_url: str):
    """Query the /health endpoint and print the service's reported state.

    Returns True when the endpoint responded successfully, False otherwise.
    """
    try:
        resp = requests.get(f"{api_url}/health")
        resp.raise_for_status()
        payload = resp.json()

        # Print each reported field with its display label
        for label, key in (("API Status", "status"),
                           ("Version", "version"),
                           ("Model Loaded", "model_loaded")):
            print(f"{label}: {payload[key]}")

        return True
    except requests.exceptions.RequestException as exc:
        print(f"Error checking health: {exc}")
        return False
|
| 91 |
+
|
| 92 |
+
|
| 93 |
+
def get_stats(api_url: str):
    """Fetch /stats and print the server's upload/processing counters.

    Returns True when the endpoint responded successfully, False otherwise.
    """
    try:
        resp = requests.get(f"{api_url}/stats")
        resp.raise_for_status()
        stats = resp.json()

        print("API Statistics:")
        print(f"  Pending uploads: {stats['pending_uploads']}")
        print(f"  Processed files: {stats['processed_files']}")
        print(f"  Model loaded: {stats['model_loaded']}")

        return True
    except requests.exceptions.RequestException as exc:
        print(f"Error getting stats: {exc}")
        return False
|
| 109 |
+
|
| 110 |
+
|
| 111 |
+
if __name__ == "__main__":
    # Example usage: thin CLI around the functions above.

    # For local development
    API_URL = "http://localhost:7860"

    # For HuggingFace Spaces (replace with your space URL)
    # API_URL = "https://your-username-pdf-redaction-api.hf.space"

    # No arguments -> print usage and exit non-zero
    if len(sys.argv) < 2:
        print("Usage:")
        print("  python client_example.py <pdf_file> [output_file] [dpi]")
        print("\nOr check health:")
        print("  python client_example.py --health")
        print("\nOr get stats:")
        print("  python client_example.py --stats")
        sys.exit(1)

    if sys.argv[1] == "--health":
        check_health(API_URL)
    elif sys.argv[1] == "--stats":
        get_stats(API_URL)
    else:
        # Positional args: pdf_file [output_file] [dpi]
        pdf_path = sys.argv[1]
        output_path = sys.argv[2] if len(sys.argv) > 2 else "redacted.pdf"
        dpi = int(sys.argv[3]) if len(sys.argv) > 3 else 300

        # Optional: Filter specific entity types
        # entity_types = "PER,ORG"  # Only redact persons and organizations
        entity_types = None  # Redact all entity types

        redact_pdf(API_URL, pdf_path, output_path, dpi, entity_types)
|
client_supabase.py
ADDED
|
@@ -0,0 +1,9 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""Module-level Supabase client shared by the API (server-side credentials)."""
from supabase import create_client, Client
import os
from dotenv import load_dotenv

load_dotenv()

SUPABASE_URL = os.getenv("SUPABASE_URL")
SUPABASE_KEY = os.getenv("SERVICE_ROLE_KEY")  # server-side key — never expose to browsers

# Fail fast with a clear message instead of letting create_client choke on
# None and produce an opaque error deep inside the library.
if not SUPABASE_URL or not SUPABASE_KEY:
    raise RuntimeError(
        "SUPABASE_URL and SERVICE_ROLE_KEY must be set (e.g. in a .env file) "
        "before importing client_supabase"
    )

supabase: Client = create_client(SUPABASE_URL, SUPABASE_KEY)
|
docker-compose.yml
ADDED
|
@@ -0,0 +1,48 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version: '3.8'
|
| 2 |
+
|
| 3 |
+
services:
|
| 4 |
+
api:
|
| 5 |
+
build:
|
| 6 |
+
context: .
|
| 7 |
+
dockerfile: Dockerfile
|
| 8 |
+
ports:
|
| 9 |
+
- "7860:7860"
|
| 10 |
+
volumes:
|
| 11 |
+
# Mount code for development (hot reload)
|
| 12 |
+
- .:/app
|
| 13 |
+
# Persistent storage for uploads/outputs
|
| 14 |
+
- ./uploads:/app/uploads
|
| 15 |
+
- ./outputs:/app/outputs
|
| 16 |
+
environment:
|
| 17 |
+
- PYTHONUNBUFFERED=1
|
| 18 |
+
- HF_HOME=/app/cache
|
| 19 |
+
- LOG_LEVEL=DEBUG
|
| 20 |
+
command: uvicorn main:app --host 0.0.0.0 --port 7860 --reload
|
| 21 |
+
restart: unless-stopped
|
| 22 |
+
healthcheck:
|
| 23 |
+
test: ["CMD", "curl", "-f", "http://localhost:7860/health"]
|
| 24 |
+
interval: 30s
|
| 25 |
+
timeout: 10s
|
| 26 |
+
retries: 3
|
| 27 |
+
start_period: 40s
|
| 28 |
+
|
| 29 |
+
# Optional: Add nginx for production
|
| 30 |
+
# nginx:
|
| 31 |
+
# image: nginx:alpine
|
| 32 |
+
# ports:
|
| 33 |
+
# - "80:80"
|
| 34 |
+
# volumes:
|
| 35 |
+
# - ./nginx.conf:/etc/nginx/nginx.conf
|
| 36 |
+
# depends_on:
|
| 37 |
+
# - api
|
| 38 |
+
|
| 39 |
+
# Optional: Add Redis for caching
|
| 40 |
+
# redis:
|
| 41 |
+
# image: redis:alpine
|
| 42 |
+
# ports:
|
| 43 |
+
# - "6379:6379"
|
| 44 |
+
# volumes:
|
| 45 |
+
# - redis-data:/data
|
| 46 |
+
|
| 47 |
+
# volumes:
|
| 48 |
+
# redis-data:
|
main.py
ADDED
|
@@ -0,0 +1,332 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""
|
| 2 |
+
FastAPI application for PDF redaction using NER
|
| 3 |
+
"""
|
| 4 |
+
from fastapi import FastAPI, File, UploadFile, HTTPException, BackgroundTasks
|
| 5 |
+
from fastapi.responses import FileResponse
|
| 6 |
+
from fastapi.middleware.cors import CORSMiddleware
|
| 7 |
+
from pydantic import BaseModel
|
| 8 |
+
from typing import List, Optional, Dict
|
| 9 |
+
import uvicorn
|
| 10 |
+
import os
|
| 11 |
+
import uuid
|
| 12 |
+
import shutil
|
| 13 |
+
from pathlib import Path
|
| 14 |
+
import logging
|
| 15 |
+
|
| 16 |
+
from app.redaction import PDFRedactor
|
| 17 |
+
from client_supabase import supabase # Supabase client in separate file
|
| 18 |
+
|
| 19 |
+
# Configure logging
|
| 20 |
+
logging.basicConfig(level=logging.INFO)
|
| 21 |
+
logger = logging.getLogger(__name__)
|
| 22 |
+
|
| 23 |
+
# Initialize FastAPI app
app = FastAPI(
    title="PDF Redaction API",
    description="Redact sensitive information from PDFs using Named Entity Recognition",
    version="1.0.0"
)

# CORS middleware
# NOTE(review): wildcard origins combined with allow_credentials=True is very
# permissive; tighten allow_origins for production deployments.
app.add_middleware(
    CORSMiddleware,
    allow_origins=["*"],
    allow_credentials=True,
    allow_methods=["*"],
    allow_headers=["*"],
)

# Create directories for temporary input/output PDFs
UPLOAD_DIR = Path("uploads")
OUTPUT_DIR = Path("outputs")
UPLOAD_DIR.mkdir(exist_ok=True)
OUTPUT_DIR.mkdir(exist_ok=True)

# Initialize redactor (loads the NER model at import time; startup fails if
# the model cannot be loaded)
redactor = PDFRedactor()

# ---------------- In-Memory Redaction Status Tracker ----------------
# request_id -> status (pending | processing | completed | failed)
# NOTE(review): in-memory only — state is lost on restart and is not shared
# across multiple worker processes.
redaction_status: Dict[str, str] = {}
|
| 51 |
+
|
| 52 |
+
# ---------------- Response Models ----------------
|
| 53 |
+
class RedactionEntity(BaseModel):
    """One redacted entity as reported in the /redact response."""
    entity_type: str  # NER label produced by the model (e.g. "PER", "ORG")
    entity_text: str  # entity text as recognized by the model
    page: int         # 1-based page number of the entity's first word
    word_count: int   # number of OCR words covered by the redaction
|
| 58 |
+
|
| 59 |
+
class RedactionResponse(BaseModel):
    """Response body for POST /redact."""
    job_id: str    # UUID identifying this job (used by /download/<job_id>)
    status: str
    message: str
    entities: Optional[List[RedactionEntity]] = None  # entities that were redacted
    redacted_file_url: Optional[str] = None           # public URL when stored remotely
|
| 65 |
+
|
| 66 |
+
class RedactionStatusResponse(BaseModel):
    """Status of an asynchronous (Supabase-backed) redaction request."""
    request_id: str
    status: str       # pending | processing | completed | failed
    files: List[str]  # storage paths belonging to the request
    message: str
|
| 71 |
+
|
| 72 |
+
|
| 73 |
+
class HealthResponse(BaseModel):
    """Payload for the / and /health endpoints."""
    status: str
    version: str
    model_loaded: bool  # True once the NER pipeline finished loading
|
| 77 |
+
|
| 78 |
+
# ---------------- Helper Functions ----------------
|
| 79 |
+
def get_public_url(bucket: str, storage_path: str) -> str:
    """Build the public-object URL for a file in Supabase storage.

    Args:
        bucket: Storage bucket name.
        storage_path: Path of the object inside the bucket.

    Returns:
        Absolute URL under /storage/v1/object/public.

    Raises:
        RuntimeError: If SUPABASE_URL is not configured (previously this
            silently produced an invalid "None/..." URL).
    """
    base_url = os.getenv("SUPABASE_URL")
    if not base_url:
        raise RuntimeError("SUPABASE_URL environment variable is not set")
    return f"{base_url}/storage/v1/object/public/{bucket}/{storage_path}"
|
| 81 |
+
def cleanup_files(job_id: str):
    """Best-effort removal of the temporary upload belonging to *job_id*.

    Errors are logged rather than raised so cleanup never breaks a request.
    """
    target = UPLOAD_DIR / f"{job_id}.pdf"
    try:
        if target.exists():
            target.unlink()
        logger.info(f"Cleaned up files for job {job_id}")
    except Exception as e:
        logger.error(f"Error cleaning up files for job {job_id}: {str(e)}")
|
| 90 |
+
|
| 91 |
+
def cleanup_temp_files(paths: List[Path]):
    """Delete the given temporary files if they exist.

    Uses ``unlink(missing_ok=True)`` so a file that disappears between a
    check and the unlink (or is already gone) cannot raise
    FileNotFoundError — the original's ``exists()`` + ``unlink()`` pair
    had that race.

    Args:
        paths: Local filesystem paths to remove.
    """
    for path in paths:
        path.unlink(missing_ok=True)
|
| 95 |
+
|
| 96 |
+
def download_file_from_supabase(bucket: str, storage_path: str, local_path: Path):
    """Fetch an object from Supabase storage and write it to *local_path*.

    Raises a plain Exception when the storage client returns no payload.
    """
    logger.info(f"Downloading {storage_path} to {local_path}")
    payload = supabase.storage.from_(bucket).download(storage_path)
    if not payload:
        raise Exception(f"Failed to download {storage_path}")
    local_path.write_bytes(payload)
|
| 103 |
+
|
| 104 |
+
def upload_file_to_supabase(bucket: str, storage_path: str, local_path: Path):
    """Upload *local_path* to Supabase storage, overwriting any existing object."""
    logger.info(f"Uploading {local_path} to {storage_path}")

    content = local_path.read_bytes()

    options = {
        "upsert": "true",  # overwrite if the object already exists
        "content-type": "application/pdf"
    }
    supabase.storage.from_(bucket).upload(
        path=storage_path,
        file=content,
        file_options=options
    )
|
| 118 |
+
|
| 119 |
+
def redact_request(request_id: str, bucket: str = "doc_storage"):
    """Background task: redact all files for a given request_id.

    Downloads each file recorded in the ``request_files`` table from Supabase
    storage, runs the redaction pipeline on it, uploads the redacted PDF back
    to the *same* storage path (overwriting the original), and updates the
    in-memory ``redaction_status`` tracker.

    Fix: the original contained a duplicated ``if not files: raise`` block;
    the second copy was dead code and has been removed.

    Args:
        request_id: Identifier grouping the files to redact.
        bucket: Supabase storage bucket holding the files.
    """
    try:
        redaction_status[request_id] = "processing"

        # Fetch all files for this request_id
        response = (
            supabase
            .from_("request_files")
            .select("id, storage_path")
            .eq("request_id", request_id)
            .execute()
        )

        files = response.data

        if not files:
            raise Exception(f"No files found for request {request_id}")

        for file in files:
            storage_path = file["storage_path"]
            local_upload = UPLOAD_DIR / f"{uuid.uuid4()}.pdf"
            local_output = OUTPUT_DIR / f"{uuid.uuid4()}_redacted.pdf"

            # Download from Supabase
            download_file_from_supabase(bucket, storage_path, local_upload)

            # Redact
            redactor.redact_document(pdf_path=str(local_upload), output_path=str(local_output))

            # Upload redacted back to same path (overwrites the original)
            upload_file_to_supabase(bucket, storage_path, local_output)

            # Cleanup local files
            cleanup_temp_files([local_upload, local_output])

        redaction_status[request_id] = "completed"

    except Exception as e:
        logger.error(f"Redaction failed for {request_id}: {str(e)}")
        redaction_status[request_id] = "failed"
|
| 162 |
+
|
| 163 |
+
# ----------------- Existing Endpoints -----------------
|
| 164 |
+
@app.get("/", response_model=HealthResponse)
async def root():
    """Report service liveness and model readiness at the API root."""
    payload = {
        "status": "healthy",
        "version": "1.0.0",
        "model_loaded": redactor.is_model_loaded(),
    }
    return HealthResponse(**payload)
|
| 171 |
+
|
| 172 |
+
@app.get("/health", response_model=HealthResponse)
async def health_check():
    """Health probe endpoint; mirrors the root endpoint's payload."""
    payload = {
        "status": "healthy",
        "version": "1.0.0",
        "model_loaded": redactor.is_model_loaded(),
    }
    return HealthResponse(**payload)
|
| 179 |
+
|
| 180 |
+
@app.post("/redact", response_model=RedactionResponse)
async def redact_pdf(
    background_tasks: BackgroundTasks,
    file: UploadFile = File(...),
    dpi: int = 300,
    entity_types: Optional[str] = None
):
    """Redact a single uploaded PDF and return detected entities.

    Args:
        background_tasks: FastAPI task queue used to schedule cleanup.
        file: The PDF to redact (multipart upload).
        dpi: Rendering resolution passed through to the redactor.
        entity_types: Optional comma-separated entity-type filter.

    Raises:
        HTTPException: 400 for non-PDF uploads, 500 on processing failure.
    """
    # Guard against a missing filename (AttributeError -> 500 previously)
    # and accept uppercase extensions such as "SCAN.PDF".
    if not file.filename or not file.filename.lower().endswith('.pdf'):
        raise HTTPException(status_code=400, detail="Only PDF files are supported")
    job_id = str(uuid.uuid4())
    upload_path = UPLOAD_DIR / f"{job_id}.pdf"
    output_path = OUTPUT_DIR / f"{job_id}_redacted.pdf"
    try:
        # Persist the upload to disk for the redactor to consume.
        with upload_path.open("wb") as buffer:
            shutil.copyfileobj(file.file, buffer)

        entity_filter = None
        if entity_types:
            # Drop empty fragments produced by inputs like "a,,b" or "a, ".
            entity_filter = [et.strip() for et in entity_types.split(',') if et.strip()]

        result = redactor.redact_document(
            pdf_path=str(upload_path),
            output_path=str(output_path),
            dpi=dpi,
            entity_filter=entity_filter
        )

        response_entities = [
            RedactionEntity(
                entity_type=e['entity_type'],
                entity_text=e['entity_text'],
                # Fall back to page 0 when the entity carries no word boxes.
                page=e['words'][0]['page'] if e['words'] else 0,
                word_count=len(e['words'])
            ) for e in result['entities']
        ]

        # Remove the temp files after the response has been sent.
        background_tasks.add_task(cleanup_files, job_id)

        return RedactionResponse(
            job_id=job_id,
            status="completed",
            message=f"Successfully redacted {len(result['entities'])} entities",
            entities=response_entities,
            redacted_file_url=f"/download/{job_id}"
        )

    except Exception as e:
        logger.error(f"Error processing job {job_id}: {str(e)}")
        # Best-effort cleanup of partial artifacts before surfacing the error.
        if upload_path.exists():
            upload_path.unlink()
        if output_path.exists():
            output_path.unlink()
        raise HTTPException(status_code=500, detail=f"Error processing PDF: {str(e)}")
|
| 233 |
+
|
| 234 |
+
@app.get("/download/{job_id}")
async def download_redacted_pdf(job_id: str):
    """Stream the redacted PDF produced for *job_id*, or 404 if absent."""
    redacted_pdf = OUTPUT_DIR / f"{job_id}_redacted.pdf"
    if redacted_pdf.exists():
        return FileResponse(
            path=redacted_pdf,
            media_type="application/pdf",
            filename=f"redacted_{job_id}.pdf",
        )
    raise HTTPException(status_code=404, detail="Redacted file not found")
|
| 244 |
+
|
| 245 |
+
@app.delete("/cleanup/{job_id}")
async def cleanup_job(job_id: str):
    """Remove the uploaded and redacted files associated with *job_id*."""
    try:
        cleanup_files(job_id)
        redacted_pdf = OUTPUT_DIR / f"{job_id}_redacted.pdf"
        if redacted_pdf.exists():
            redacted_pdf.unlink()
    except Exception as e:
        raise HTTPException(status_code=500, detail=f"Error cleaning up: {str(e)}")
    return {"message": f"Successfully cleaned up files for job {job_id}"}
|
| 255 |
+
|
| 256 |
+
@app.get("/stats")
async def get_stats():
    """Report queue sizes and whether the NER model is loaded."""
    return {
        "pending_uploads": sum(1 for _ in UPLOAD_DIR.glob("*.pdf")),
        "processed_files": sum(1 for _ in OUTPUT_DIR.glob("*.pdf")),
        "model_loaded": redactor.is_model_loaded(),
    }
|
| 265 |
+
|
| 266 |
+
# ----------------- NEW Endpoints -----------------
|
| 267 |
+
@app.post("/redact_by_request/{request_id}", response_model=RedactionStatusResponse)
async def redact_by_request(request_id: str, background_tasks: BackgroundTasks):
    """Kick off background redaction for a request unless one is already running."""
    if redaction_status.get(request_id) == "processing":
        status, message = "processing", "Redaction already in progress"
    else:
        redaction_status[request_id] = "pending"
        background_tasks.add_task(redact_request, request_id)
        status, message = "pending", "Redaction started in background"
    return RedactionStatusResponse(
        request_id=request_id,
        status=status,
        files=[],
        message=message,
    )
|
| 284 |
+
|
| 285 |
+
@app.get("/redaction_status/{request_id}", response_model=RedactionStatusResponse)
async def get_redaction_status(request_id: str):
    """Return the redaction status for *request_id*.

    When the job has completed, the response also carries the public URLs
    of the (now redacted) files stored for the request.
    """
    status = redaction_status.get(request_id, "not_found")

    # Default empty file list; only populated for completed jobs.
    files: List[str] = []

    if status == "completed":
        # Fetch the stored paths so we can hand back public URLs.
        response = (
            supabase
            .from_("request_files")
            .select("storage_path")
            .eq("request_id", request_id)
            .execute()
        )

        if response.data:
            files = [
                get_public_url("doc_storage", row["storage_path"])
                for row in response.data
            ]

    # Map every known status to a message.  The original conditional chain
    # skipped "processing", so an active job reported "Request not found".
    status_messages = {
        "completed": "Redaction completed",
        "processing": "Redaction in progress",
        "pending": "Redaction pending",
        "failed": "Redaction failed",
    }
    message = status_messages.get(status, "Request not found")

    return RedactionStatusResponse(
        request_id=request_id,
        status=status,
        files=files,
        message=message
    )
|
| 324 |
+
|
| 325 |
+
# ----------------- Run Server -----------------
if __name__ == "__main__":
    # Development entry point.  NOTE(review): host="localhost" is only
    # reachable from the same machine — confirm this is intended, since the
    # repo also ships a Dockerfile (containers usually need 0.0.0.0).
    uvicorn.run(
        "main:app",
        host="localhost",
        port=2700,
        reload=False
    )
|
model/.gitattributes
ADDED
|
@@ -0,0 +1,35 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
*.7z filter=lfs diff=lfs merge=lfs -text
|
| 2 |
+
*.arrow filter=lfs diff=lfs merge=lfs -text
|
| 3 |
+
*.bin filter=lfs diff=lfs merge=lfs -text
|
| 4 |
+
*.bz2 filter=lfs diff=lfs merge=lfs -text
|
| 5 |
+
*.ckpt filter=lfs diff=lfs merge=lfs -text
|
| 6 |
+
*.ftz filter=lfs diff=lfs merge=lfs -text
|
| 7 |
+
*.gz filter=lfs diff=lfs merge=lfs -text
|
| 8 |
+
*.h5 filter=lfs diff=lfs merge=lfs -text
|
| 9 |
+
*.joblib filter=lfs diff=lfs merge=lfs -text
|
| 10 |
+
*.lfs.* filter=lfs diff=lfs merge=lfs -text
|
| 11 |
+
*.mlmodel filter=lfs diff=lfs merge=lfs -text
|
| 12 |
+
*.model filter=lfs diff=lfs merge=lfs -text
|
| 13 |
+
*.msgpack filter=lfs diff=lfs merge=lfs -text
|
| 14 |
+
*.npy filter=lfs diff=lfs merge=lfs -text
|
| 15 |
+
*.npz filter=lfs diff=lfs merge=lfs -text
|
| 16 |
+
*.onnx filter=lfs diff=lfs merge=lfs -text
|
| 17 |
+
*.ot filter=lfs diff=lfs merge=lfs -text
|
| 18 |
+
*.parquet filter=lfs diff=lfs merge=lfs -text
|
| 19 |
+
*.pb filter=lfs diff=lfs merge=lfs -text
|
| 20 |
+
*.pickle filter=lfs diff=lfs merge=lfs -text
|
| 21 |
+
*.pkl filter=lfs diff=lfs merge=lfs -text
|
| 22 |
+
*.pt filter=lfs diff=lfs merge=lfs -text
|
| 23 |
+
*.pth filter=lfs diff=lfs merge=lfs -text
|
| 24 |
+
*.rar filter=lfs diff=lfs merge=lfs -text
|
| 25 |
+
*.safetensors filter=lfs diff=lfs merge=lfs -text
|
| 26 |
+
saved_model/**/* filter=lfs diff=lfs merge=lfs -text
|
| 27 |
+
*.tar.* filter=lfs diff=lfs merge=lfs -text
|
| 28 |
+
*.tar filter=lfs diff=lfs merge=lfs -text
|
| 29 |
+
*.tflite filter=lfs diff=lfs merge=lfs -text
|
| 30 |
+
*.tgz filter=lfs diff=lfs merge=lfs -text
|
| 31 |
+
*.wasm filter=lfs diff=lfs merge=lfs -text
|
| 32 |
+
*.xz filter=lfs diff=lfs merge=lfs -text
|
| 33 |
+
*.zip filter=lfs diff=lfs merge=lfs -text
|
| 34 |
+
*.zst filter=lfs diff=lfs merge=lfs -text
|
| 35 |
+
*tfevents* filter=lfs diff=lfs merge=lfs -text
|
model/README.md
ADDED
|
@@ -0,0 +1,54 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
---
|
| 2 |
+
pipeline_tag: token-classification
|
| 3 |
+
---
|
| 4 |
+
# Model Card for Model ID
|
| 5 |
+
|
| 6 |
+
<!-- Provide a quick summary of what the model is/does. -->
|
| 7 |
+
|
| 8 |
+
The NER model developed using BERT is designed to recognize named entities in text for multiple languages, including Arabic, French, and English. It is adaptable to new labels, allowing users to extend its capabilities beyond the initial set of 11 predefined labels, which are: 'Person_Name', 'Brand_vehicule', 'Model_vehicule', 'Organization_Name', 'location', 'phone_number', 'IBAN', 'credit_card', 'date_time', 'email', 'Identification_Number'
|
| 9 |
+
## Model Details
|
| 10 |
+
|
| 11 |
+
### Model Description
|
| 12 |
+
|
| 13 |
+
<!-- Provide a longer summary of what this model is. -->
|
| 14 |
+
|
| 15 |
+
|
| 16 |
+
|
| 17 |
+
- **Developed by:** yahya mdarhri
|
| 18 |
+
- **Model type:** TOKEN CLASSIFICATION
|
| 19 |
+
- **Fine-tuned from model:** bert-base-multilingual-cased
|
| 20 |
+
- **License:** OPEN SOURCE
|
| 21 |
+
|
| 22 |
+
|
| 23 |
+
## Uses
|
| 24 |
+
|
| 25 |
+
<!-- Address questions around how the model is intended to be used, including the foreseeable users of the model and those affected by the model. -->
|
| 26 |
+
Named Entity Recognition (NER): The primary purpose of this model is to perform Named Entity Recognition (NER) in text data. It identifies and categorizes entities such as names of people, organizations, locations, dates, and more.
|
| 27 |
+
Multilingual Support: The model is designed to support multiple languages, including Arabic, French, and English. It can be used by NLP practitioners, researchers, and developers working with text data in these languages.
|
| 28 |
+
Adaptability: Users can adapt the model to recognize new entity labels by providing labeled training data for the desired categories. This feature makes it versatile for various NER tasks.
|
| 29 |
+
|
| 30 |
+
|
| 31 |
+
|
| 32 |
+
|
| 33 |
+
## Bias, Risks, and Limitations
|
| 34 |
+
|
| 35 |
+
<!-- This section is meant to convey both technical and sociotechnical limitations. -->
|
| 36 |
+
|
| 37 |
+
Bias and Fairness: Users and affected parties should be aware of potential biases in entity recognition, especially when it comes to personal names or other sensitive categories. Efforts should be made to minimize bias and ensure fairness in entity recognition.
|
| 38 |
+
|
| 39 |
+
Privacy: The model should be used responsibly to protect the privacy of individuals and organizations. When handling personally identifiable information (PII), data protection laws and privacy guidelines should be followed.
|
| 40 |
+
|
| 41 |
+
Transparency: Transparency in how the model operates, including its training data and evaluation metrics, is crucial to build trust with users and affected parties.
|
| 42 |
+
|
| 43 |
+
User Consent: If the model is used in applications where user data is processed, obtaining informed consent from users for data processing is essential.
|
| 44 |
+
|
| 45 |
+
|
| 46 |
+
We value your feedback! Please share your thoughts on this model. Thank you!
|
| 47 |
+
|
| 48 |
+
|
| 49 |
+
## Model Card Contact
|
| 50 |
+
I build custom AI models and solutions. If you're interested in collaboration or have specific requirements, feel free to reach out.
|
| 51 |
+
yahyamdarhri00@gmail.com
|
| 52 |
+
|
| 53 |
+
|
| 54 |
+
|
model/config.json
ADDED
|
@@ -0,0 +1,59 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"_name_or_path": "C:\\Users\\pc\\OneDrive\\Documents\\GitHub\\apprentissage_actif\\./results",
|
| 3 |
+
"architectures": [
|
| 4 |
+
"BertForTokenClassification"
|
| 5 |
+
],
|
| 6 |
+
"attention_probs_dropout_prob": 0.1,
|
| 7 |
+
"classifier_dropout": null,
|
| 8 |
+
"directionality": "bidi",
|
| 9 |
+
"hidden_act": "gelu",
|
| 10 |
+
"hidden_dropout_prob": 0.1,
|
| 11 |
+
"hidden_size": 768,
|
| 12 |
+
"id2label": {
|
| 13 |
+
"0": "O",
|
| 14 |
+
"1": "Person_Name",
|
| 15 |
+
"2": "Brand_vehicule",
|
| 16 |
+
"3": "Model_vehicule",
|
| 17 |
+
"4": "Organization_Name",
|
| 18 |
+
"5": "location",
|
| 19 |
+
"6": "phone_number",
|
| 20 |
+
"7": "IBAN",
|
| 21 |
+
"8": "credit_card",
|
| 22 |
+
"9": "date_time",
|
| 23 |
+
"10": "email",
|
| 24 |
+
"11": "Identification_Number"
|
| 25 |
+
},
|
| 26 |
+
"initializer_range": 0.02,
|
| 27 |
+
"intermediate_size": 3072,
|
| 28 |
+
"label2id": {
|
| 29 |
+
"Brand_vehicule": 2,
|
| 30 |
+
"IBAN": 7,
|
| 31 |
+
"Identification_Number": 11,
|
| 32 |
+
"Model_vehicule": 3,
|
| 33 |
+
"O": 0,
|
| 34 |
+
"Organization_Name": 4,
|
| 35 |
+
"Person_Name": 1,
|
| 36 |
+
"credit_card": 8,
|
| 37 |
+
"date_time": 9,
|
| 38 |
+
"email": 10,
|
| 39 |
+
"location": 5,
|
| 40 |
+
"phone_number": 6
|
| 41 |
+
},
|
| 42 |
+
"layer_norm_eps": 1e-12,
|
| 43 |
+
"max_position_embeddings": 512,
|
| 44 |
+
"model_type": "bert",
|
| 45 |
+
"num_attention_heads": 12,
|
| 46 |
+
"num_hidden_layers": 12,
|
| 47 |
+
"pad_token_id": 0,
|
| 48 |
+
"pooler_fc_size": 768,
|
| 49 |
+
"pooler_num_attention_heads": 12,
|
| 50 |
+
"pooler_num_fc_layers": 3,
|
| 51 |
+
"pooler_size_per_head": 128,
|
| 52 |
+
"pooler_type": "first_token_transform",
|
| 53 |
+
"position_embedding_type": "absolute",
|
| 54 |
+
"torch_dtype": "float32",
|
| 55 |
+
"transformers_version": "4.30.2",
|
| 56 |
+
"type_vocab_size": 2,
|
| 57 |
+
"use_cache": true,
|
| 58 |
+
"vocab_size": 119547
|
| 59 |
+
}
|
model/pytorch_model.bin
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:985fcceb62b1be6e40a5dcca2789694fe6f933ca310591b852f6a074479f4b5a
|
| 3 |
+
size 709160429
|
model/special_tokens_map.json
ADDED
|
@@ -0,0 +1,7 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"cls_token": "[CLS]",
|
| 3 |
+
"mask_token": "[MASK]",
|
| 4 |
+
"pad_token": "[PAD]",
|
| 5 |
+
"sep_token": "[SEP]",
|
| 6 |
+
"unk_token": "[UNK]"
|
| 7 |
+
}
|
model/tokenizer.json
ADDED
|
The diff for this file is too large to render.
See raw diff
|
|
|
model/tokenizer_config.json
ADDED
|
@@ -0,0 +1,13 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"clean_up_tokenization_spaces": true,
|
| 3 |
+
"cls_token": "[CLS]",
|
| 4 |
+
"do_lower_case": false,
|
| 5 |
+
"mask_token": "[MASK]",
|
| 6 |
+
"model_max_length": 512,
|
| 7 |
+
"pad_token": "[PAD]",
|
| 8 |
+
"sep_token": "[SEP]",
|
| 9 |
+
"strip_accents": null,
|
| 10 |
+
"tokenize_chinese_chars": true,
|
| 11 |
+
"tokenizer_class": "BertTokenizer",
|
| 12 |
+
"unk_token": "[UNK]"
|
| 13 |
+
}
|
model/vocab.txt
ADDED
|
The diff for this file is too large to render.
See raw diff
|
|
|
outputs/.gitkeep
ADDED
|
File without changes
|
requirements.txt
ADDED
|
@@ -0,0 +1,13 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
fastapi==0.109.0
|
| 2 |
+
uvicorn[standard]==0.27.0
|
| 3 |
+
python-multipart==0.0.6
|
| 4 |
+
transformers==4.38
|
| 5 |
+
torch==2.2.2
|
| 6 |
+
pypdf==4.0.1
|
| 7 |
+
pdf2image==1.17.0
|
| 8 |
+
pytesseract==0.3.10
|
| 9 |
+
Pillow==10.2.0
|
| 10 |
+
pydantic==2.5.3
|
| 11 |
+
python-dotenv==1.0.0
|
| 12 |
+
supabase
|
| 13 |
+
numpy==1.26.4
|
tests/test_api.py
ADDED
|
@@ -0,0 +1,67 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""
|
| 2 |
+
Test cases for PDF Redaction API
|
| 3 |
+
"""
|
| 4 |
+
import pytest
|
| 5 |
+
from fastapi.testclient import TestClient
|
| 6 |
+
from pathlib import Path
|
| 7 |
+
import sys
|
| 8 |
+
|
| 9 |
+
# Add parent directory to path
|
| 10 |
+
sys.path.insert(0, str(Path(__file__).parent.parent))
|
| 11 |
+
|
| 12 |
+
from main import app
|
| 13 |
+
|
| 14 |
+
client = TestClient(app)
|
| 15 |
+
|
| 16 |
+
|
| 17 |
+
def test_health_check():
    """The /health endpoint reports a healthy service with model info."""
    resp = client.get("/health")
    assert resp.status_code == 200
    body = resp.json()
    assert body["status"] == "healthy"
    assert "model_loaded" in body
|
| 24 |
+
|
| 25 |
+
|
| 26 |
+
def test_root():
    """The root endpoint mirrors the health check."""
    resp = client.get("/")
    assert resp.status_code == 200
    assert resp.json()["status"] == "healthy"
|
| 32 |
+
|
| 33 |
+
|
| 34 |
+
def test_stats():
    """The /stats endpoint exposes queue counters and model state."""
    resp = client.get("/stats")
    assert resp.status_code == 200
    body = resp.json()
    for key in ("pending_uploads", "processed_files", "model_loaded"):
        assert key in body
|
| 42 |
+
|
| 43 |
+
|
| 44 |
+
def test_redact_no_file():
    """POSTing /redact without a file is rejected as unprocessable (422)."""
    assert client.post("/redact").status_code == 422
|
| 48 |
+
|
| 49 |
+
|
| 50 |
+
def test_redact_wrong_file_type():
    """Non-PDF uploads are rejected with a 400."""
    upload = {"file": ("test.txt", b"test content", "text/plain")}
    resp = client.post("/redact", files=upload)
    assert resp.status_code == 400
|
| 55 |
+
|
| 56 |
+
|
| 57 |
+
def test_download_nonexistent():
    """Downloading an unknown job id yields a 404."""
    assert client.get("/download/nonexistent-id").status_code == 404
|
| 61 |
+
|
| 62 |
+
|
| 63 |
+
# Add more tests as needed
|
| 64 |
+
# - Test with actual PDF file
|
| 65 |
+
# - Test with different DPI values
|
| 66 |
+
# - Test with entity type filtering
|
| 67 |
+
# - Test cleanup functionality
|
uploads/.gitkeep
ADDED
|
File without changes
|