Resolving conflicts
Browse files
- .dockerignore +28 -0
- .github/workflows/ci-cd.yml +82 -0
- .gitignore +59 -0
- COMPLETE_GUIDE.md +488 -0
- DEPLOYMENT.md +298 -0
- Dockerfile +36 -0
- LICENSE +21 -0
- QUICKSTART.md +271 -0
- README.md +162 -5
- STRUCTURE.md +269 -0
- app/__init__.py +6 -0
- app/redaction.py +327 -0
- client_example.py +142 -0
- client_supabase.py +9 -0
- docker-compose.yml +48 -0
- main.py +344 -0
- outputs/.gitkeep +0 -0
- requirements.txt +14 -0
- uploads/.gitkeep +0 -0
.dockerignore
ADDED
|
@@ -0,0 +1,28 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
__pycache__
|
| 2 |
+
*.pyc
|
| 3 |
+
*.pyo
|
| 4 |
+
*.pyd
|
| 5 |
+
.Python
|
| 6 |
+
*.so
|
| 7 |
+
*.egg
|
| 8 |
+
*.egg-info
|
| 9 |
+
dist
|
| 10 |
+
build
|
| 11 |
+
.git
|
| 12 |
+
.gitignore
|
| 13 |
+
.env
|
| 14 |
+
.venv
|
| 15 |
+
venv/
|
| 16 |
+
env/
|
| 17 |
+
*.log
|
| 18 |
+
.DS_Store
|
| 19 |
+
.pytest_cache
|
| 20 |
+
.coverage
|
| 21 |
+
htmlcov/
|
| 22 |
+
uploads/*
|
| 23 |
+
outputs/*
|
| 24 |
+
!uploads/.gitkeep
|
| 25 |
+
!outputs/.gitkeep
|
| 26 |
+
*.pdf
|
| 27 |
+
README.md
|
| 28 |
+
.github/
|
.github/workflows/ci-cd.yml
ADDED
|
@@ -0,0 +1,82 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
name: CI/CD Pipeline
|
| 2 |
+
|
| 3 |
+
on:
|
| 4 |
+
push:
|
| 5 |
+
branches: [ main, develop ]
|
| 6 |
+
pull_request:
|
| 7 |
+
branches: [ main ]
|
| 8 |
+
|
| 9 |
+
jobs:
|
| 10 |
+
test:
|
| 11 |
+
runs-on: ubuntu-latest
|
| 12 |
+
|
| 13 |
+
steps:
|
| 14 |
+
- uses: actions/checkout@v3
|
| 15 |
+
|
| 16 |
+
- name: Set up Python
|
| 17 |
+
uses: actions/setup-python@v4
|
| 18 |
+
with:
|
| 19 |
+
python-version: '3.10'
|
| 20 |
+
|
| 21 |
+
- name: Install system dependencies
|
| 22 |
+
run: |
|
| 23 |
+
sudo apt-get update
|
| 24 |
+
sudo apt-get install -y tesseract-ocr poppler-utils
|
| 25 |
+
|
| 26 |
+
- name: Install Python dependencies
|
| 27 |
+
run: |
|
| 28 |
+
python -m pip install --upgrade pip
|
| 29 |
+
pip install -r requirements.txt
|
| 30 |
+
pip install pytest pytest-cov httpx
|
| 31 |
+
|
| 32 |
+
- name: Run tests
|
| 33 |
+
run: |
|
| 34 |
+
pytest tests/ -v --cov=app --cov-report=xml
|
| 35 |
+
|
| 36 |
+
- name: Upload coverage
|
| 37 |
+
uses: codecov/codecov-action@v3
|
| 38 |
+
with:
|
| 39 |
+
file: ./coverage.xml
|
| 40 |
+
fail_ci_if_error: false
|
| 41 |
+
|
| 42 |
+
docker-build:
|
| 43 |
+
runs-on: ubuntu-latest
|
| 44 |
+
needs: test
|
| 45 |
+
|
| 46 |
+
steps:
|
| 47 |
+
- uses: actions/checkout@v3
|
| 48 |
+
|
| 49 |
+
- name: Set up Docker Buildx
|
| 50 |
+
uses: docker/setup-buildx-action@v2
|
| 51 |
+
|
| 52 |
+
- name: Build Docker image
|
| 53 |
+
run: |
|
| 54 |
+
docker build -t pdf-redaction-api:test .
|
| 55 |
+
|
| 56 |
+
- name: Test Docker image
|
| 57 |
+
run: |
|
| 58 |
+
docker run -d -p 7860:7860 --name test-api pdf-redaction-api:test
|
| 59 |
+
sleep 10
|
| 60 |
+
curl -f http://localhost:7860/health || exit 1
|
| 61 |
+
docker stop test-api
|
| 62 |
+
|
| 63 |
+
deploy-huggingface:
|
| 64 |
+
runs-on: ubuntu-latest
|
| 65 |
+
needs: [test, docker-build]
|
| 66 |
+
if: github.ref == 'refs/heads/main'
|
| 67 |
+
|
| 68 |
+
steps:
|
| 69 |
+
- uses: actions/checkout@v3
|
| 70 |
+
|
| 71 |
+
- name: Deploy to HuggingFace Spaces
|
| 72 |
+
env:
|
| 73 |
+
HF_TOKEN: ${{ secrets.HF_TOKEN }}
|
| 74 |
+
run: |
|
| 75 |
+
git config --global user.email "github-actions@github.com"
|
| 76 |
+
git config --global user.name "GitHub Actions"
|
| 77 |
+
|
| 78 |
+
# Add HuggingFace remote if it doesn't exist
|
| 79 |
+
git remote add hf https://user:$HF_TOKEN@huggingface.co/spaces/${{ secrets.HF_SPACE }} || true
|
| 80 |
+
|
| 81 |
+
# Push to HuggingFace
|
| 82 |
+
git push hf main:main
|
.gitignore
ADDED
|
@@ -0,0 +1,59 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# Python
|
| 2 |
+
__pycache__/
|
| 3 |
+
*.py[cod]
|
| 4 |
+
*$py.class
|
| 5 |
+
*.so
|
| 6 |
+
.Python
|
| 7 |
+
build/
|
| 8 |
+
develop-eggs/
|
| 9 |
+
dist/
|
| 10 |
+
downloads/
|
| 11 |
+
eggs/
|
| 12 |
+
.eggs/
|
| 13 |
+
lib/
|
| 14 |
+
lib64/
|
| 15 |
+
parts/
|
| 16 |
+
sdist/
|
| 17 |
+
var/
|
| 18 |
+
wheels/
|
| 19 |
+
*.egg-info/
|
| 20 |
+
.installed.cfg
|
| 21 |
+
*.egg
|
| 22 |
+
|
| 23 |
+
# Virtual environments
|
| 24 |
+
redact/
|
| 25 |
+
venv/
|
| 26 |
+
env/
|
| 27 |
+
ENV/
|
| 28 |
+
.venv
|
| 29 |
+
|
| 30 |
+
# IDE
|
| 31 |
+
.vscode/
|
| 32 |
+
.idea/
|
| 33 |
+
*.swp
|
| 34 |
+
*.swo
|
| 35 |
+
*~
|
| 36 |
+
|
| 37 |
+
# OS
|
| 38 |
+
.DS_Store
|
| 39 |
+
Thumbs.db
|
| 40 |
+
|
| 41 |
+
# Project specific
|
| 42 |
+
uploads/*.pdf
|
| 43 |
+
outputs/*.pdf
|
| 44 |
+
*.log
|
| 45 |
+
|
| 46 |
+
# Environment
|
| 47 |
+
.env
|
| 48 |
+
.env.local
|
| 49 |
+
|
| 50 |
+
# Testing
|
| 51 |
+
.pytest_cache/
|
| 52 |
+
.coverage
|
| 53 |
+
htmlcov/
|
| 54 |
+
|
| 55 |
+
# Model cache
|
| 56 |
+
cache/
|
| 57 |
+
models/
|
| 58 |
+
|
| 59 |
+
tests
|
COMPLETE_GUIDE.md
ADDED
|
@@ -0,0 +1,488 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# 🚀 Complete FastAPI Deployment Package
|
| 2 |
+
|
| 3 |
+
## 📦 What You've Got
|
| 4 |
+
|
| 5 |
+
A production-ready FastAPI application for PDF redaction with Named Entity Recognition, ready to deploy on HuggingFace Spaces or any cloud platform.
|
| 6 |
+
|
| 7 |
+
---
|
| 8 |
+
|
| 9 |
+
## 📁 Directory Structure
|
| 10 |
+
|
| 11 |
+
```
|
| 12 |
+
pdf-redaction-api/
|
| 13 |
+
│
|
| 14 |
+
├── 📄 main.py # FastAPI application
|
| 15 |
+
├── 🐳 Dockerfile # Production container
|
| 16 |
+
├── 🐳 docker-compose.yml # Local development
|
| 17 |
+
├── 📋 requirements.txt # Python dependencies
|
| 18 |
+
│
|
| 19 |
+
├── 📱 app/
|
| 20 |
+
│ ├── __init__.py
|
| 21 |
+
│ └── redaction.py # Core redaction engine
|
| 22 |
+
│
|
| 23 |
+
├── 📂 uploads/ # Temporary uploads
|
| 24 |
+
│ └── .gitkeep
|
| 25 |
+
│
|
| 26 |
+
├── 📂 outputs/ # Redacted PDFs
|
| 27 |
+
│ └── .gitkeep
|
| 28 |
+
│
|
| 29 |
+
├── 🧪 tests/
|
| 30 |
+
│ └── test_api.py # API tests
|
| 31 |
+
│
|
| 32 |
+
├── 📚 Documentation (logical grouping — these files live in the repo root)
|
| 33 |
+
│ ├── README.md # Main docs (for HF Spaces)
|
| 34 |
+
│ ├── DEPLOYMENT.md # Deployment guide
|
| 35 |
+
│ ├── QUICKSTART.md # Quick start guide
|
| 36 |
+
│ └── STRUCTURE.md # Project structure
|
| 37 |
+
│
|
| 38 |
+
├── 🔧 Configuration (logical grouping — these files live in the repo root)
|
| 39 |
+
│ ├── .env.example # Environment variables
|
| 40 |
+
│ ├── .gitignore # Git ignore
|
| 41 |
+
│ └── .dockerignore # Docker ignore
|
| 42 |
+
│
|
| 43 |
+
├── 🤖 .github/
|
| 44 |
+
│ └── workflows/
|
| 45 |
+
│ └── ci-cd.yml # GitHub Actions CI/CD
|
| 46 |
+
│
|
| 47 |
+
├── 📝 client_example.py # Example API client
|
| 48 |
+
└── 📜 LICENSE # MIT License
|
| 49 |
+
```
|
| 50 |
+
|
| 51 |
+
---
|
| 52 |
+
|
| 53 |
+
## ✨ Features
|
| 54 |
+
|
| 55 |
+
### Core Functionality
|
| 56 |
+
✅ PDF upload and processing
|
| 57 |
+
✅ OCR with pytesseract (configurable DPI)
|
| 58 |
+
✅ Named Entity Recognition (NER)
|
| 59 |
+
✅ Accurate coordinate-based redaction
|
| 60 |
+
✅ Multiple entity type support
|
| 61 |
+
✅ Downloadable redacted PDFs
|
| 62 |
+
|
| 63 |
+
### API Features
|
| 64 |
+
✅ RESTful API with FastAPI
|
| 65 |
+
✅ Automatic OpenAPI documentation
|
| 66 |
+
✅ File upload handling
|
| 67 |
+
✅ Background task cleanup
|
| 68 |
+
✅ Health checks
|
| 69 |
+
✅ Statistics endpoint
|
| 70 |
+
✅ CORS support
|
| 71 |
+
|
| 72 |
+
### DevOps
|
| 73 |
+
✅ Docker containerization
|
| 74 |
+
✅ Docker Compose for local dev
|
| 75 |
+
✅ GitHub Actions CI/CD
|
| 76 |
+
✅ HuggingFace Spaces ready
|
| 77 |
+
✅ Comprehensive testing
|
| 78 |
+
✅ Logging and monitoring
|
| 79 |
+
|
| 80 |
+
---
|
| 81 |
+
|
| 82 |
+
## 🎯 Quick Deployment Paths
|
| 83 |
+
|
| 84 |
+
### Option 1: HuggingFace Spaces (Recommended for Demo)
|
| 85 |
+
|
| 86 |
+
**Time: 10 minutes**
|
| 87 |
+
|
| 88 |
+
```bash
|
| 89 |
+
# 1. Create Space on HuggingFace (select Docker SDK)
|
| 90 |
+
# 2. Clone your space
|
| 91 |
+
git clone https://huggingface.co/spaces/YOUR_USERNAME/pdf-redaction-api
|
| 92 |
+
cd pdf-redaction-api
|
| 93 |
+
|
| 94 |
+
# 3. Copy all files
|
| 95 |
+
rsync -a --exclude='.git/' /path/to/pdf-redaction-api/ .  # unlike "cp -r …/*", this also copies dotfiles (.gitignore, .dockerignore)
|
| 96 |
+
|
| 97 |
+
# 4. Deploy
|
| 98 |
+
git add .
|
| 99 |
+
git commit -m "Initial deployment"
|
| 100 |
+
git push
|
| 101 |
+
```
|
| 102 |
+
|
| 103 |
+
**Your API will be at:** `https://YOUR_USERNAME-pdf-redaction-api.hf.space`
|
| 104 |
+
|
| 105 |
+
**Cost:** FREE (with CPU Basic tier)
|
| 106 |
+
|
| 107 |
+
---
|
| 108 |
+
|
| 109 |
+
### Option 2: Docker Locally
|
| 110 |
+
|
| 111 |
+
**Time: 5 minutes**
|
| 112 |
+
|
| 113 |
+
```bash
|
| 114 |
+
# Build
|
| 115 |
+
docker build -t pdf-redaction-api .
|
| 116 |
+
|
| 117 |
+
# Run
|
| 118 |
+
docker run -p 7860:7860 pdf-redaction-api
|
| 119 |
+
|
| 120 |
+
# Test
|
| 121 |
+
curl http://localhost:7860/health
|
| 122 |
+
```
|
| 123 |
+
|
| 124 |
+
---
|
| 125 |
+
|
| 126 |
+
### Option 3: Direct Python
|
| 127 |
+
|
| 128 |
+
**Time: 3 minutes**
|
| 129 |
+
|
| 130 |
+
```bash
|
| 131 |
+
# Install dependencies
|
| 132 |
+
sudo apt-get install tesseract-ocr poppler-utils
|
| 133 |
+
pip install -r requirements.txt
|
| 134 |
+
|
| 135 |
+
# Run
|
| 136 |
+
python main.py
|
| 137 |
+
|
| 138 |
+
# Access at http://localhost:7860
|
| 139 |
+
```
|
| 140 |
+
|
| 141 |
+
---
|
| 142 |
+
|
| 143 |
+
## 🔌 API Endpoints
|
| 144 |
+
|
| 145 |
+
### Core Endpoints
|
| 146 |
+
|
| 147 |
+
| Method | Endpoint | Description |
|
| 148 |
+
|--------|----------|-------------|
|
| 149 |
+
| POST | `/redact` | Upload and redact PDF |
|
| 150 |
+
| GET | `/download/{job_id}` | Download redacted PDF |
|
| 151 |
+
| GET | `/health` | Health check |
|
| 152 |
+
| GET | `/stats` | API statistics |
|
| 153 |
+
| DELETE | `/cleanup/{job_id}` | Manual cleanup |
|
| 154 |
+
| GET | `/docs` | Interactive API docs |
|
| 155 |
+
|
| 156 |
+
### Example Usage
|
| 157 |
+
|
| 158 |
+
**cURL:**
|
| 159 |
+
```bash
|
| 160 |
+
curl -X POST "http://localhost:7860/redact" \
|
| 161 |
+
-F "file=@document.pdf" \
|
| 162 |
+
-F "dpi=300"
|
| 163 |
+
```
|
| 164 |
+
|
| 165 |
+
**Python:**
|
| 166 |
+
```python
|
| 167 |
+
import requests
|
| 168 |
+
|
| 169 |
+
response = requests.post(
|
| 170 |
+
"http://localhost:7860/redact",
|
| 171 |
+
files={"file": open("document.pdf", "rb")},
|
| 172 |
+
params={"dpi": 300}
|
| 173 |
+
)
|
| 174 |
+
|
| 175 |
+
job_id = response.json()["job_id"]
|
| 176 |
+
redacted = requests.get(f"http://localhost:7860/download/{job_id}")
|
| 177 |
+
```
|
| 178 |
+
|
| 179 |
+
---
|
| 180 |
+
|
| 181 |
+
## 🎨 Architecture
|
| 182 |
+
|
| 183 |
+
```
|
| 184 |
+
┌─────────────────────────────────────────────────────────┐
|
| 185 |
+
│ CLIENT REQUEST │
|
| 186 |
+
│ (Upload PDF via POST /redact) │
|
| 187 |
+
└─────────────────────────────────────────────────────────┘
|
| 188 |
+
↓
|
| 189 |
+
┌─────────────────────────────────────────────────────────┐
|
| 190 |
+
│ FASTAPI (main.py) │
|
| 191 |
+
│ • Validate file │
|
| 192 |
+
│ • Generate job_id │
|
| 193 |
+
│ • Save to uploads/ │
|
| 194 |
+
└─────────────────────────────────────────────────────────┘
|
| 195 |
+
↓
|
| 196 |
+
┌─────────────────────────────────────────────────────────┐
|
| 197 |
+
│ PDFRedactor (app/redaction.py) │
|
| 198 |
+
│ │
|
| 199 |
+
│ ┌─────────────────────────────────────────┐ │
|
| 200 |
+
│ │ 1. OCR (pytesseract) │ │
|
| 201 |
+
│ │ • Convert PDF → Images (pdf2image) │ │
|
| 202 |
+
│ │ • Extract text + bounding boxes │ │
|
| 203 |
+
│ │ • Store image dimensions │ │
|
| 204 |
+
│ └─────────────────────────────────────────┘ │
|
| 205 |
+
│ ↓ │
|
| 206 |
+
│ ┌─────────────────────────────────────────┐ │
|
| 207 |
+
│ │ 2. NER (HuggingFace Transformers) │ │
|
| 208 |
+
│ │ • Load model │ │
|
| 209 |
+
│ │ • Identify entities in text │ │
|
| 210 |
+
│ │ • Return entity types + positions │ │
|
| 211 |
+
│ └─────────────────────────────────────────┘ │
|
| 212 |
+
│ ↓ │
|
| 213 |
+
│ ┌─────────────────────────────────────────┐ │
|
| 214 |
+
│ │ 3. Mapping │ │
|
| 215 |
+
│ │ • Create character span index │ │
|
| 216 |
+
│ │ • Match NER entities to OCR boxes │ │
|
| 217 |
+
│ └─────────────────────────────────────────┘ │
|
| 218 |
+
│ ↓ │
|
| 219 |
+
│ ┌─────────────────────────────────────────┐ │
|
| 220 |
+
│ │ 4. Redaction (pypdf) │ │
|
| 221 |
+
│ │ • Scale image coords → PDF coords │ │
|
| 222 |
+
│ │ • Create black rectangle annotations │ │
|
| 223 |
+
│ │ • Write redacted PDF │ │
|
| 224 |
+
│ └─────────────────────────────────────────┘ │
|
| 225 |
+
└─────────────────────────────────────────────────────────┘
|
| 226 |
+
↓
|
| 227 |
+
┌─────────────────────────────────────────────────────────┐
|
| 228 |
+
│ RESPONSE │
|
| 229 |
+
│ • job_id │
|
| 230 |
+
│ • List of entities │
|
| 231 |
+
│ • Download URL │
|
| 232 |
+
└─────────────────────────────────────────────────────────┘
|
| 233 |
+
```
|
| 234 |
+
|
| 235 |
+
---
|
| 236 |
+
|
| 237 |
+
## 🔐 Security Considerations
|
| 238 |
+
|
| 239 |
+
### Current Implementation
|
| 240 |
+
- ✅ File validation (PDF only)
|
| 241 |
+
- ✅ Temporary file cleanup
|
| 242 |
+
- ✅ CORS middleware
|
| 243 |
+
- ✅ Error handling
|
| 244 |
+
|
| 245 |
+
### For Production (TODO)
|
| 246 |
+
- ⚠️ Add API key authentication
|
| 247 |
+
- ⚠️ Implement rate limiting
|
| 248 |
+
- ⚠️ Add file size limits
|
| 249 |
+
- ⚠️ Use HTTPS only
|
| 250 |
+
- ⚠️ Implement user quotas
|
| 251 |
+
- ⚠️ Add input sanitization
|
| 252 |
+
|
| 253 |
+
**Example API Key Auth:**
|
| 254 |
+
```python
|
| 255 |
+
# Add to main.py
|
| 256 |
+
from fastapi import Security, HTTPException
|
| 257 |
+
from fastapi.security import APIKeyHeader
|
| 258 |
+
|
| 259 |
+
API_KEY = "your-secret-key"
|
| 260 |
+
api_key_header = APIKeyHeader(name="X-API-Key")
|
| 261 |
+
|
| 262 |
+
def verify_api_key(key: str = Security(api_key_header)):
|
| 263 |
+
if key != API_KEY:
|
| 264 |
+
raise HTTPException(401, "Invalid API Key")
|
| 265 |
+
```
|
| 266 |
+
|
| 267 |
+
---
|
| 268 |
+
|
| 269 |
+
## 📊 Performance Tuning
|
| 270 |
+
|
| 271 |
+
### DPI Settings
|
| 272 |
+
|
| 273 |
+
| DPI | Quality | Speed | Use Case |
|
| 274 |
+
|-----|---------|-------|----------|
|
| 275 |
+
| 150 | Low | Fast | Quick previews |
|
| 276 |
+
| 200 | Medium | Medium | General use |
|
| 277 |
+
| 300 | High | Slow | **Recommended** |
|
| 278 |
+
| 600 | Very High | Very Slow | Critical documents |
|
| 279 |
+
|
| 280 |
+
### Hardware Requirements
|
| 281 |
+
|
| 282 |
+
**Minimum (Free Tier):**
|
| 283 |
+
- CPU: 2 cores
|
| 284 |
+
- RAM: 2GB
|
| 285 |
+
- Storage: 1GB
|
| 286 |
+
|
| 287 |
+
**Recommended (Production):**
|
| 288 |
+
- CPU: 4+ cores
|
| 289 |
+
- RAM: 8GB
|
| 290 |
+
- Storage: 10GB
|
| 291 |
+
- GPU: Optional (speeds up NER)
|
| 292 |
+
|
| 293 |
+
---
|
| 294 |
+
|
| 295 |
+
## 🧪 Testing
|
| 296 |
+
|
| 297 |
+
```bash
|
| 298 |
+
# Install test dependencies
|
| 299 |
+
pip install pytest pytest-cov httpx
|
| 300 |
+
|
| 301 |
+
# Run tests
|
| 302 |
+
pytest tests/ -v
|
| 303 |
+
|
| 304 |
+
# With coverage
|
| 305 |
+
pytest tests/ --cov=app --cov-report=html
|
| 306 |
+
|
| 307 |
+
# View coverage report
|
| 308 |
+
open htmlcov/index.html
|
| 309 |
+
```
|
| 310 |
+
|
| 311 |
+
---
|
| 312 |
+
|
| 313 |
+
## 📈 Monitoring
|
| 314 |
+
|
| 315 |
+
### Built-in Endpoints
|
| 316 |
+
|
| 317 |
+
**Health Check:**
|
| 318 |
+
```bash
|
| 319 |
+
curl http://localhost:7860/health
|
| 320 |
+
```
|
| 321 |
+
|
| 322 |
+
**Statistics:**
|
| 323 |
+
```bash
|
| 324 |
+
curl http://localhost:7860/stats
|
| 325 |
+
```
|
| 326 |
+
|
| 327 |
+
### Logs
|
| 328 |
+
|
| 329 |
+
**Development:**
|
| 330 |
+
```bash
|
| 331 |
+
python main.py
|
| 332 |
+
# Logs appear in console
|
| 333 |
+
```
|
| 334 |
+
|
| 335 |
+
**Docker:**
|
| 336 |
+
```bash
|
| 337 |
+
docker logs -f container_name
|
| 338 |
+
```
|
| 339 |
+
|
| 340 |
+
**HuggingFace Spaces:**
|
| 341 |
+
- View in Space dashboard → Logs tab
|
| 342 |
+
|
| 343 |
+
---
|
| 344 |
+
|
| 345 |
+
## 💰 Cost Estimation
|
| 346 |
+
|
| 347 |
+
### HuggingFace Spaces
|
| 348 |
+
|
| 349 |
+
| Tier | CPU | RAM | Price | Use Case |
|
| 350 |
+
|------|-----|-----|-------|----------|
|
| 351 |
+
| Basic | 2 | 16GB | **FREE** | Demo, testing |
|
| 352 |
+
| CPU Upgrade | 4 | 32GB | $0.50/hr | Production |
|
| 353 |
+
| GPU T4 | - | - | $0.60/hr | Heavy load |
|
| 354 |
+
| GPU A10G | - | - | $1.50/hr | Enterprise |
|
| 355 |
+
|
| 356 |
+
**Monthly Costs (if always on):**
|
| 357 |
+
- Free: $0
|
| 358 |
+
- CPU Upgrade: ~$360/month
|
| 359 |
+
- GPU T4: ~$432/month
|
| 360 |
+
|
| 361 |
+
**Recommendation:** Start free, upgrade based on usage
|
| 362 |
+
|
| 363 |
+
### Alternatives
|
| 364 |
+
|
| 365 |
+
**AWS ECS Fargate:** ~$30-100/month
|
| 366 |
+
**Google Cloud Run:** Pay per request (~$10-50/month)
|
| 367 |
+
**DigitalOcean App:** $12-24/month
|
| 368 |
+
**Self-hosted VPS:** $5-20/month
|
| 369 |
+
|
| 370 |
+
---
|
| 371 |
+
|
| 372 |
+
## 🔄 CI/CD Pipeline
|
| 373 |
+
|
| 374 |
+
### Automated with GitHub Actions
|
| 375 |
+
|
| 376 |
+
```
|
| 377 |
+
Push to GitHub
|
| 378 |
+
↓
|
| 379 |
+
[Run Tests]
|
| 380 |
+
↓
|
| 381 |
+
[Build Docker]
|
| 382 |
+
↓
|
| 383 |
+
[Test Container]
|
| 384 |
+
↓
|
| 385 |
+
[Deploy to HuggingFace]
|
| 386 |
+
```
|
| 387 |
+
|
| 388 |
+
**Setup:**
|
| 389 |
+
1. Add secrets in GitHub repo settings:
|
| 390 |
+
- `HF_TOKEN`: HuggingFace access token
|
| 391 |
+
- `HF_SPACE`: Your space name (username/space-name)
|
| 392 |
+
|
| 393 |
+
2. Push to main branch → Auto-deploy! ✨
|
| 394 |
+
|
| 395 |
+
---
|
| 396 |
+
|
| 397 |
+
## 📚 Documentation Access
|
| 398 |
+
|
| 399 |
+
| Document | Purpose |
|
| 400 |
+
|----------|---------|
|
| 401 |
+
| `README.md` | Overview, API docs, usage examples |
|
| 402 |
+
| `QUICKSTART.md` | 5-minute setup guide |
|
| 403 |
+
| `DEPLOYMENT.md` | Production deployment |
|
| 404 |
+
| `STRUCTURE.md` | Code organization |
|
| 405 |
+
| `/docs` endpoint | Interactive API documentation |
|
| 406 |
+
|
| 407 |
+
---
|
| 408 |
+
|
| 409 |
+
## 🎓 Learning Resources
|
| 410 |
+
|
| 411 |
+
### FastAPI
|
| 412 |
+
- Docs: https://fastapi.tiangolo.com
|
| 413 |
+
- Tutorial: https://fastapi.tiangolo.com/tutorial
|
| 414 |
+
|
| 415 |
+
### HuggingFace
|
| 416 |
+
- Spaces: https://huggingface.co/docs/hub/spaces
|
| 417 |
+
- Transformers: https://huggingface.co/docs/transformers
|
| 418 |
+
|
| 419 |
+
### Docker
|
| 420 |
+
- Getting Started: https://docs.docker.com/get-started
|
| 421 |
+
|
| 422 |
+
---
|
| 423 |
+
|
| 424 |
+
## 🐛 Troubleshooting
|
| 425 |
+
|
| 426 |
+
### Common Issues
|
| 427 |
+
|
| 428 |
+
**Problem:** "Tesseract not found"
|
| 429 |
+
**Solution:** `apt-get install tesseract-ocr`
|
| 430 |
+
|
| 431 |
+
**Problem:** "Poppler not found"
|
| 432 |
+
**Solution:** `apt-get install poppler-utils`
|
| 433 |
+
|
| 434 |
+
**Problem:** Slow processing
|
| 435 |
+
**Solution:** Lower DPI to 150-200
|
| 436 |
+
|
| 437 |
+
**Problem:** Out of memory
|
| 438 |
+
**Solution:** Upgrade hardware or reduce DPI
|
| 439 |
+
|
| 440 |
+
**Problem:** Model not loading
|
| 441 |
+
**Solution:** Check internet, wait for download
|
| 442 |
+
|
| 443 |
+
### Debug Mode
|
| 444 |
+
|
| 445 |
+
```python
|
| 446 |
+
# In main.py, add debug mode
|
| 447 |
+
if __name__ == "__main__":
|
| 448 |
+
uvicorn.run("main:app", host="0.0.0.0", port=7860, reload=True, log_level="debug")
|
| 449 |
+
```
|
| 450 |
+
|
| 451 |
+
---
|
| 452 |
+
|
| 453 |
+
## ✅ Checklist for Production
|
| 454 |
+
|
| 455 |
+
- [ ] Test all endpoints thoroughly
|
| 456 |
+
- [ ] Add API key authentication
|
| 457 |
+
- [ ] Implement rate limiting
|
| 458 |
+
- [ ] Set up monitoring (Sentry, DataDog, etc.)
|
| 459 |
+
- [ ] Configure auto-scaling
|
| 460 |
+
- [ ] Set up backups
|
| 461 |
+
- [ ] Add usage analytics
|
| 462 |
+
- [ ] Create user documentation
|
| 463 |
+
- [ ] Set up SSL/TLS (HF provides by default)
|
| 464 |
+
- [ ] Test with large files
|
| 465 |
+
- [ ] Load testing
|
| 466 |
+
- [ ] Security audit
|
| 467 |
+
- [ ] Legal compliance (GDPR, etc.)
|
| 468 |
+
|
| 469 |
+
---
|
| 470 |
+
|
| 471 |
+
## 🎉 You're Ready!
|
| 472 |
+
|
| 473 |
+
Your FastAPI PDF Redaction application is complete and ready to deploy!
|
| 474 |
+
|
| 475 |
+
### Next Steps:
|
| 476 |
+
1. ✨ Deploy to HuggingFace Spaces (easiest)
|
| 477 |
+
2. 🧪 Test with real PDFs
|
| 478 |
+
3. 📊 Monitor usage
|
| 479 |
+
4. 🔒 Add security for production
|
| 480 |
+
5. 🚀 Scale as needed
|
| 481 |
+
|
| 482 |
+
### Support:
|
| 483 |
+
- 📖 Read the documentation
|
| 484 |
+
- 🐛 Check troubleshooting guide
|
| 485 |
+
- 💬 HuggingFace community forums
|
| 486 |
+
- 📧 Create issues on your repo
|
| 487 |
+
|
| 488 |
+
**Happy Deploying! 🚀**
|
DEPLOYMENT.md
ADDED
|
@@ -0,0 +1,298 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# Deployment Guide for HuggingFace Spaces
|
| 2 |
+
|
| 3 |
+
## Prerequisites
|
| 4 |
+
|
| 5 |
+
1. **HuggingFace Account**: Sign up at https://huggingface.co/
|
| 6 |
+
2. **Git**: Installed on your local machine
|
| 7 |
+
3. **Git LFS**: For large file storage (optional)
|
| 8 |
+
|
| 9 |
+
## Step-by-Step Deployment
|
| 10 |
+
|
| 11 |
+
### 1. Create a New Space
|
| 12 |
+
|
| 13 |
+
1. Go to https://huggingface.co/spaces
|
| 14 |
+
2. Click "Create new Space"
|
| 15 |
+
3. Fill in the details:
|
| 16 |
+
- **Space name**: `pdf-redaction-api` (or your preferred name)
|
| 17 |
+
- **License**: MIT
|
| 18 |
+
- **SDK**: Docker
|
| 19 |
+
- **Hardware**: CPU Basic (free tier) or upgrade if needed
|
| 20 |
+
4. Click "Create Space"
|
| 21 |
+
|
| 22 |
+
### 2. Clone Your Space Repository
|
| 23 |
+
|
| 24 |
+
```bash
|
| 25 |
+
git clone https://huggingface.co/spaces/YOUR_USERNAME/pdf-redaction-api
|
| 26 |
+
cd pdf-redaction-api
|
| 27 |
+
```
|
| 28 |
+
|
| 29 |
+
### 3. Copy All Files to the Repository
|
| 30 |
+
|
| 31 |
+
Copy all files from this project to your cloned space:
|
| 32 |
+
|
| 33 |
+
```bash
|
| 34 |
+
# Copy all files
|
| 35 |
+
rsync -a --exclude='.git/' /path/to/pdf-redaction-api/ .  # unlike "cp -r …/*", this also copies dotfiles (.gitignore, .dockerignore)
|
| 36 |
+
|
| 37 |
+
# Check the files
|
| 38 |
+
ls -la
|
| 39 |
+
```
|
| 40 |
+
|
| 41 |
+
You should see:
|
| 42 |
+
- `main.py`
|
| 43 |
+
- `app/`
|
| 44 |
+
- `Dockerfile`
|
| 45 |
+
- `requirements.txt`
|
| 46 |
+
- `README.md`
|
| 47 |
+
- `.gitignore`
|
| 48 |
+
- `.dockerignore`
|
| 49 |
+
- `uploads/` (with .gitkeep)
|
| 50 |
+
- `outputs/` (with .gitkeep)
|
| 51 |
+
|
| 52 |
+
### 4. Commit and Push
|
| 53 |
+
|
| 54 |
+
```bash
|
| 55 |
+
# Add all files
|
| 56 |
+
git add .
|
| 57 |
+
|
| 58 |
+
# Commit
|
| 59 |
+
git commit -m "Initial deployment of PDF Redaction API"
|
| 60 |
+
|
| 61 |
+
# Push to HuggingFace
|
| 62 |
+
git push
|
| 63 |
+
```
|
| 64 |
+
|
| 65 |
+
### 5. Monitor Deployment
|
| 66 |
+
|
| 67 |
+
1. Go to your Space URL: `https://huggingface.co/spaces/YOUR_USERNAME/pdf-redaction-api`
|
| 68 |
+
2. You'll see the build logs
|
| 69 |
+
3. Wait for the build to complete (usually 5-10 minutes)
|
| 70 |
+
4. Once complete, your API will be live!
|
| 71 |
+
|
| 72 |
+
### 6. Test Your Deployment
|
| 73 |
+
|
| 74 |
+
```bash
|
| 75 |
+
# Check health
|
| 76 |
+
curl https://YOUR_USERNAME-pdf-redaction-api.hf.space/health
|
| 77 |
+
|
| 78 |
+
# Test with a PDF
|
| 79 |
+
curl -X POST "https://YOUR_USERNAME-pdf-redaction-api.hf.space/redact" \
|
| 80 |
+
-F "file=@test.pdf" \
|
| 81 |
+
-F "dpi=300"
|
| 82 |
+
```
|
| 83 |
+
|
| 84 |
+
## Configuration Options
|
| 85 |
+
|
| 86 |
+
### Hardware Upgrades
|
| 87 |
+
|
| 88 |
+
For better performance, consider upgrading your Space hardware:
|
| 89 |
+
|
| 90 |
+
1. Go to Space Settings
|
| 91 |
+
2. Click on "Hardware"
|
| 92 |
+
3. Choose:
|
| 93 |
+
- **CPU Basic** (Free): Good for testing, slower processing
|
| 94 |
+
- **CPU Upgrade** (~$0.50/hour): Faster processing
|
| 95 |
+
- **GPU** (~$0.60-3/hour): Best for large documents
|
| 96 |
+
|
| 97 |
+
### Environment Variables
|
| 98 |
+
|
| 99 |
+
Add environment variables in Space Settings if needed:
|
| 100 |
+
|
| 101 |
+
```bash
|
| 102 |
+
HF_HOME=/app/cache
|
| 103 |
+
PYTHONUNBUFFERED=1
|
| 104 |
+
```
|
| 105 |
+
|
| 106 |
+
### Persistent Storage
|
| 107 |
+
|
| 108 |
+
For persistent file storage:
|
| 109 |
+
|
| 110 |
+
1. Go to Space Settings
|
| 111 |
+
2. Enable "Persistent Storage"
|
| 112 |
+
3. This keeps uploaded/processed files between restarts
|
| 113 |
+
|
| 114 |
+
## Custom Domain (Optional)
|
| 115 |
+
|
| 116 |
+
To use a custom domain:
|
| 117 |
+
|
| 118 |
+
1. Go to Space Settings
|
| 119 |
+
2. Click "Domains"
|
| 120 |
+
3. Add your custom domain
|
| 121 |
+
4. Follow DNS configuration instructions
|
| 122 |
+
|
| 123 |
+
## Monitoring and Logs
|
| 124 |
+
|
| 125 |
+
### View Logs
|
| 126 |
+
|
| 127 |
+
1. Go to your Space page
|
| 128 |
+
2. Click on "Logs" tab
|
| 129 |
+
3. Monitor real-time logs
|
| 130 |
+
|
| 131 |
+
### Check Resource Usage
|
| 132 |
+
|
| 133 |
+
1. Click on "Insights" tab
|
| 134 |
+
2. View CPU/Memory usage
|
| 135 |
+
3. Monitor request patterns
|
| 136 |
+
|
| 137 |
+
## Security Considerations
|
| 138 |
+
|
| 139 |
+
### For Production Use
|
| 140 |
+
|
| 141 |
+
1. **Add Authentication**:
|
| 142 |
+
- Implement API key authentication
|
| 143 |
+
- Use OAuth2 for user management
|
| 144 |
+
|
| 145 |
+
2. **Rate Limiting**:
|
| 146 |
+
- Add rate limiting to prevent abuse
|
| 147 |
+
- Use slowapi or similar libraries
|
| 148 |
+
|
| 149 |
+
3. **File Size Limits**:
|
| 150 |
+
- Restrict upload file sizes
|
| 151 |
+
- Implement timeout for long-running requests
|
| 152 |
+
|
| 153 |
+
4. **HTTPS Only**:
|
| 154 |
+
- HuggingFace Spaces provides HTTPS by default
|
| 155 |
+
- Ensure all requests use HTTPS
|
| 156 |
+
|
| 157 |
+
Example with API key authentication:
|
| 158 |
+
|
| 159 |
+
```python
|
| 160 |
+
from fastapi import Security, HTTPException, status
|
| 161 |
+
from fastapi.security import APIKeyHeader
|
| 162 |
+
|
| 163 |
+
API_KEY = "your-secret-key"
|
| 164 |
+
api_key_header = APIKeyHeader(name="X-API-Key")
|
| 165 |
+
|
| 166 |
+
def verify_api_key(api_key: str = Security(api_key_header)):
|
| 167 |
+
if api_key != API_KEY:
|
| 168 |
+
raise HTTPException(
|
| 169 |
+
status_code=status.HTTP_401_UNAUTHORIZED,
|
| 170 |
+
detail="Invalid API Key"
|
| 171 |
+
)
|
| 172 |
+
return api_key
|
| 173 |
+
|
| 174 |
+
# Add to endpoints
|
| 175 |
+
@app.post("/redact")
|
| 176 |
+
async def redact_pdf(
|
| 177 |
+
file: UploadFile = File(...),
|
| 178 |
+
api_key: str = Security(verify_api_key)
|
| 179 |
+
):
|
| 180 |
+
...  # Your code here
|
| 181 |
+
```
|
| 182 |
+
|
| 183 |
+
## Troubleshooting
|
| 184 |
+
|
| 185 |
+
### Build Fails
|
| 186 |
+
|
| 187 |
+
**Problem**: Docker build fails
|
| 188 |
+
|
| 189 |
+
**Solution**:
|
| 190 |
+
- Check Dockerfile syntax
|
| 191 |
+
- Ensure all dependencies are in requirements.txt
|
| 192 |
+
- Review build logs for specific errors
|
| 193 |
+
|
| 194 |
+
### Out of Memory
|
| 195 |
+
|
| 196 |
+
**Problem**: API crashes with OOM errors
|
| 197 |
+
|
| 198 |
+
**Solution**:
|
| 199 |
+
- Reduce default DPI to 200
|
| 200 |
+
- Upgrade to larger hardware
|
| 201 |
+
- Implement request queuing
|
| 202 |
+
|
| 203 |
+
### Slow Processing
|
| 204 |
+
|
| 205 |
+
**Problem**: Redaction takes too long
|
| 206 |
+
|
| 207 |
+
**Solution**:
|
| 208 |
+
- Lower DPI (150-200 for faster processing)
|
| 209 |
+
- Upgrade to GPU hardware
|
| 210 |
+
- Optimize batch processing
|
| 211 |
+
|
| 212 |
+
### Model Download Issues
|
| 213 |
+
|
| 214 |
+
**Problem**: Model fails to download
|
| 215 |
+
|
| 216 |
+
**Solution**:
|
| 217 |
+
- Check HuggingFace model availability
|
| 218 |
+
- Verify internet access in Space
|
| 219 |
+
- Pre-download model and include in Docker image
|
| 220 |
+
|
| 221 |
+
## Updating Your Space
|
| 222 |
+
|
| 223 |
+
To update your deployed API:
|
| 224 |
+
|
| 225 |
+
```bash
|
| 226 |
+
# Make changes locally
|
| 227 |
+
# Test changes
|
| 228 |
+
|
| 229 |
+
# Commit and push
|
| 230 |
+
git add .
|
| 231 |
+
git commit -m "Update: description of changes"
|
| 232 |
+
git push
|
| 233 |
+
|
| 234 |
+
# HuggingFace will automatically rebuild
|
| 235 |
+
```
|
| 236 |
+
|
| 237 |
+
## Cost Estimation
|
| 238 |
+
|
| 239 |
+
### Free Tier
|
| 240 |
+
- CPU Basic
|
| 241 |
+
- Limited to 2 CPU cores
|
| 242 |
+
- 16GB RAM
|
| 243 |
+
- Good for: Testing, low-traffic demos
|
| 244 |
+
|
| 245 |
+
### Paid Tiers
|
| 246 |
+
- CPU Upgrade: ~$0.50/hour (~$360/month if always on)
|
| 247 |
+
- GPU T4: ~$0.60/hour (~$432/month)
|
| 248 |
+
- GPU A10G: ~$1.50/hour (~$1,080/month)
|
| 249 |
+
|
| 250 |
+
**Recommendation**: Start with free tier, upgrade based on usage
|
| 251 |
+
|
| 252 |
+
## Alternative Deployment Options
|
| 253 |
+
|
| 254 |
+
### 1. Deploy on Your Own Server
|
| 255 |
+
|
| 256 |
+
```bash
|
| 257 |
+
# Build Docker image
|
| 258 |
+
docker build -t pdf-redaction-api .
|
| 259 |
+
|
| 260 |
+
# Run container
|
| 261 |
+
docker run -p 7860:7860 pdf-redaction-api
|
| 262 |
+
```
|
| 263 |
+
|
| 264 |
+
### 2. Deploy on Cloud Platforms
|
| 265 |
+
|
| 266 |
+
- **AWS ECS/Fargate**: For scalable production
|
| 267 |
+
- **Google Cloud Run**: Serverless container deployment
|
| 268 |
+
- **Azure Container Instances**: Easy container deployment
|
| 269 |
+
- **DigitalOcean App Platform**: Simple PaaS deployment
|
| 270 |
+
|
| 271 |
+
### 3. Deploy on Render.com
|
| 272 |
+
|
| 273 |
+
1. Connect your GitHub repo
|
| 274 |
+
2. Select "Docker" as environment
|
| 275 |
+
3. Deploy automatically
|
| 276 |
+
|
| 277 |
+
## Support
|
| 278 |
+
|
| 279 |
+
For issues:
|
| 280 |
+
1. Check HuggingFace Spaces documentation
|
| 281 |
+
2. Review logs in Space dashboard
|
| 282 |
+
3. Test locally with Docker first
|
| 283 |
+
4. Open issue on your repository
|
| 284 |
+
|
| 285 |
+
## Next Steps
|
| 286 |
+
|
| 287 |
+
After successful deployment:
|
| 288 |
+
|
| 289 |
+
1. ✅ Test all API endpoints
|
| 290 |
+
2. ✅ Set up monitoring
|
| 291 |
+
3. ✅ Configure custom domain (optional)
|
| 292 |
+
4. ✅ Add authentication for production
|
| 293 |
+
5. ✅ Implement rate limiting
|
| 294 |
+
6. ✅ Set up error tracking (e.g., Sentry)
|
| 295 |
+
7. ✅ Create API documentation with examples
|
| 296 |
+
8. ✅ Add usage analytics
|
| 297 |
+
|
| 298 |
+
Your API is now live and ready to use! 🚀
|
Dockerfile
ADDED
|
@@ -0,0 +1,36 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
FROM python:3.12-slim
|
| 2 |
+
|
| 3 |
+
# Set working directory
|
| 4 |
+
WORKDIR /app
|
| 5 |
+
|
| 6 |
+
# Install system dependencies
|
| 7 |
+
RUN apt-get update && apt-get install -y \
|
| 8 |
+
tesseract-ocr \
|
| 9 |
+
tesseract-ocr-eng \
|
| 10 |
+
poppler-utils \
|
| 11 |
+
libgl1 \
|
| 12 |
+
libglib2.0-0 \
|
| 13 |
+
&& rm -rf /var/lib/apt/lists/*
|
| 14 |
+
|
| 15 |
+
# Copy requirements first for better caching
|
| 16 |
+
COPY requirements.txt .
|
| 17 |
+
|
| 18 |
+
# Install Python dependencies
|
| 19 |
+
RUN pip install --no-cache-dir -r requirements.txt
|
| 20 |
+
|
| 21 |
+
# Copy application code
|
| 22 |
+
COPY . .
|
| 23 |
+
|
| 24 |
+
# Create necessary directories
|
| 25 |
+
RUN mkdir -p uploads outputs
|
| 26 |
+
|
| 27 |
+
# Expose port (HuggingFace Spaces uses 7860)
|
| 28 |
+
EXPOSE 7860
|
| 29 |
+
|
| 30 |
+
# Set environment variables
|
| 31 |
+
ENV PYTHONUNBUFFERED=1
|
| 32 |
+
ENV HF_HOME=/app/cache
|
| 33 |
+
|
| 34 |
+
# Run the application
|
| 35 |
+
CMD ["uvicorn", "main:app", "--host", "0.0.0.0", "--port", "7860"]
|
| 36 |
+
|
LICENSE
ADDED
|
@@ -0,0 +1,21 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
MIT License
|
| 2 |
+
|
| 3 |
+
Copyright (c) 2025 PDF Redaction API
|
| 4 |
+
|
| 5 |
+
Permission is hereby granted, free of charge, to any person obtaining a copy
|
| 6 |
+
of this software and associated documentation files (the "Software"), to deal
|
| 7 |
+
in the Software without restriction, including without limitation the rights
|
| 8 |
+
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
| 9 |
+
copies of the Software, and to permit persons to whom the Software is
|
| 10 |
+
furnished to do so, subject to the following conditions:
|
| 11 |
+
|
| 12 |
+
The above copyright notice and this permission notice shall be included in all
|
| 13 |
+
copies or substantial portions of the Software.
|
| 14 |
+
|
| 15 |
+
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
| 16 |
+
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
| 17 |
+
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
| 18 |
+
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
| 19 |
+
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
| 20 |
+
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
|
| 21 |
+
SOFTWARE.
|
QUICKSTART.md
ADDED
|
@@ -0,0 +1,271 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# Quick Start Guide 🚀
|
| 2 |
+
|
| 3 |
+
## Local Development (5 minutes)
|
| 4 |
+
|
| 5 |
+
### 1. Install System Dependencies
|
| 6 |
+
|
| 7 |
+
**Ubuntu/Debian:**
|
| 8 |
+
```bash
|
| 9 |
+
sudo apt-get update
|
| 10 |
+
sudo apt-get install -y tesseract-ocr poppler-utils
|
| 11 |
+
```
|
| 12 |
+
|
| 13 |
+
**macOS:**
|
| 14 |
+
```bash
|
| 15 |
+
brew install tesseract poppler
|
| 16 |
+
```
|
| 17 |
+
|
| 18 |
+
**Windows:**
|
| 19 |
+
- Download Tesseract: https://github.com/UB-Mannheim/tesseract/wiki
|
| 20 |
+
- Download Poppler: https://github.com/oschwartz10612/poppler-windows/releases
|
| 21 |
+
|
| 22 |
+
### 2. Install Python Dependencies
|
| 23 |
+
|
| 24 |
+
```bash
|
| 25 |
+
pip install -r requirements.txt
|
| 26 |
+
```
|
| 27 |
+
|
| 28 |
+
### 3. Run the Server
|
| 29 |
+
|
| 30 |
+
```bash
|
| 31 |
+
python main.py
|
| 32 |
+
```
|
| 33 |
+
|
| 34 |
+
The API will be available at: `http://localhost:7860`
|
| 35 |
+
|
| 36 |
+
### 4. Test with cURL
|
| 37 |
+
|
| 38 |
+
```bash
|
| 39 |
+
# Health check
|
| 40 |
+
curl http://localhost:7860/health
|
| 41 |
+
|
| 42 |
+
# Redact a PDF
|
| 43 |
+
curl -X POST "http://localhost:7860/redact" \
|
| 44 |
+
-F "file=@your_document.pdf" \
|
| 45 |
+
-F "dpi=300"
|
| 46 |
+
```
|
| 47 |
+
|
| 48 |
+
### 5. Access API Documentation
|
| 49 |
+
|
| 50 |
+
Open in browser: `http://localhost:7860/docs`
|
| 51 |
+
|
| 52 |
+
## Using Docker (3 minutes)
|
| 53 |
+
|
| 54 |
+
### 1. Build Image
|
| 55 |
+
|
| 56 |
+
```bash
|
| 57 |
+
docker build -t pdf-redaction-api .
|
| 58 |
+
```
|
| 59 |
+
|
| 60 |
+
### 2. Run Container
|
| 61 |
+
|
| 62 |
+
```bash
|
| 63 |
+
docker run -p 7860:7860 pdf-redaction-api
|
| 64 |
+
```
|
| 65 |
+
|
| 66 |
+
### 3. Test
|
| 67 |
+
|
| 68 |
+
```bash
|
| 69 |
+
curl http://localhost:7860/health
|
| 70 |
+
```
|
| 71 |
+
|
| 72 |
+
## Deploy to HuggingFace Spaces (10 minutes)
|
| 73 |
+
|
| 74 |
+
### 1. Create Space
|
| 75 |
+
|
| 76 |
+
1. Go to https://huggingface.co/spaces
|
| 77 |
+
2. Click "Create new Space"
|
| 78 |
+
3. Name: `pdf-redaction-api`
|
| 79 |
+
4. SDK: **Docker**
|
| 80 |
+
5. Click "Create Space"
|
| 81 |
+
|
| 82 |
+
### 2. Push Code
|
| 83 |
+
|
| 84 |
+
```bash
|
| 85 |
+
# Clone your space
|
| 86 |
+
git clone https://huggingface.co/spaces/YOUR_USERNAME/pdf-redaction-api
|
| 87 |
+
cd pdf-redaction-api
|
| 88 |
+
|
| 89 |
+
# Copy all project files
|
| 90 |
+
cp -r /path/to/project/* .
|
| 91 |
+
|
| 92 |
+
# Commit and push
|
| 93 |
+
git add .
|
| 94 |
+
git commit -m "Initial deployment"
|
| 95 |
+
git push
|
| 96 |
+
```
|
| 97 |
+
|
| 98 |
+
### 3. Wait for Build
|
| 99 |
+
|
| 100 |
+
Monitor at: `https://huggingface.co/spaces/YOUR_USERNAME/pdf-redaction-api`
|
| 101 |
+
|
| 102 |
+
### 4. Test Your Deployed API
|
| 103 |
+
|
| 104 |
+
```bash
|
| 105 |
+
curl https://YOUR_USERNAME-pdf-redaction-api.hf.space/health
|
| 106 |
+
```
|
| 107 |
+
|
| 108 |
+
## Example Usage
|
| 109 |
+
|
| 110 |
+
### Python Client
|
| 111 |
+
|
| 112 |
+
```python
|
| 113 |
+
import requests
|
| 114 |
+
|
| 115 |
+
# Upload and redact
|
| 116 |
+
files = {"file": open("document.pdf", "rb")}
|
| 117 |
+
response = requests.post(
|
| 118 |
+
"http://localhost:7860/redact",
|
| 119 |
+
files=files,
|
| 120 |
+
params={"dpi": 300}
|
| 121 |
+
)
|
| 122 |
+
|
| 123 |
+
result = response.json()
|
| 124 |
+
job_id = result["job_id"]
|
| 125 |
+
|
| 126 |
+
# Download redacted PDF
|
| 127 |
+
redacted = requests.get(f"http://localhost:7860/download/{job_id}")
|
| 128 |
+
with open("redacted.pdf", "wb") as f:
|
| 129 |
+
f.write(redacted.content)
|
| 130 |
+
|
| 131 |
+
print(f"Redacted {len(result['entities'])} entities")
|
| 132 |
+
```
|
| 133 |
+
|
| 134 |
+
### JavaScript/Node.js
|
| 135 |
+
|
| 136 |
+
```javascript
|
| 137 |
+
const FormData = require('form-data');
|
| 138 |
+
const fs = require('fs');
|
| 139 |
+
const axios = require('axios');
|
| 140 |
+
|
| 141 |
+
async function redactPDF() {
|
| 142 |
+
const form = new FormData();
|
| 143 |
+
form.append('file', fs.createReadStream('document.pdf'));
|
| 144 |
+
|
| 145 |
+
// Upload and redact
|
| 146 |
+
const response = await axios.post(
|
| 147 |
+
'http://localhost:7860/redact',
|
| 148 |
+
form,
|
| 149 |
+
{
|
| 150 |
+
headers: form.getHeaders(),
|
| 151 |
+
params: { dpi: 300 }
|
| 152 |
+
}
|
| 153 |
+
);
|
| 154 |
+
|
| 155 |
+
const { job_id } = response.data;
|
| 156 |
+
|
| 157 |
+
// Download redacted PDF
|
| 158 |
+
const redacted = await axios.get(
|
| 159 |
+
`http://localhost:7860/download/${job_id}`,
|
| 160 |
+
{ responseType: 'arraybuffer' }
|
| 161 |
+
);
|
| 162 |
+
|
| 163 |
+
fs.writeFileSync('redacted.pdf', redacted.data);
|
| 164 |
+
console.log('Redaction complete!');
|
| 165 |
+
}
|
| 166 |
+
|
| 167 |
+
redactPDF();
|
| 168 |
+
```
|
| 169 |
+
|
| 170 |
+
### cURL Advanced
|
| 171 |
+
|
| 172 |
+
```bash
|
| 173 |
+
# Redact only specific entity types
|
| 174 |
+
curl -X POST "http://localhost:7860/redact" \
|
| 175 |
+
-F "file=@document.pdf" \
|
| 176 |
+
-F "dpi=300" \
|
| 177 |
+
-F "entity_types=PER,ORG"
|
| 178 |
+
|
| 179 |
+
# Get statistics
|
| 180 |
+
curl http://localhost:7860/stats
|
| 181 |
+
|
| 182 |
+
# Download specific file
|
| 183 |
+
curl -O -J http://localhost:7860/download/JOB_ID_HERE
|
| 184 |
+
```
|
| 185 |
+
|
| 186 |
+
## Common Use Cases
|
| 187 |
+
|
| 188 |
+
### 1. Redact All Personal Information
|
| 189 |
+
|
| 190 |
+
```python
|
| 191 |
+
response = requests.post(
|
| 192 |
+
"http://localhost:7860/redact",
|
| 193 |
+
files={"file": open("resume.pdf", "rb")},
|
| 194 |
+
params={"dpi": 300}
|
| 195 |
+
)
|
| 196 |
+
```
|
| 197 |
+
|
| 198 |
+
### 2. Redact Only Names and Organizations
|
| 199 |
+
|
| 200 |
+
```python
|
| 201 |
+
response = requests.post(
|
| 202 |
+
"http://localhost:7860/redact",
|
| 203 |
+
files={"file": open("contract.pdf", "rb")},
|
| 204 |
+
params={
|
| 205 |
+
"dpi": 300,
|
| 206 |
+
"entity_types": "PER,ORG"
|
| 207 |
+
}
|
| 208 |
+
)
|
| 209 |
+
```
|
| 210 |
+
|
| 211 |
+
### 3. Fast Processing (Lower Quality)
|
| 212 |
+
|
| 213 |
+
```python
|
| 214 |
+
response = requests.post(
|
| 215 |
+
"http://localhost:7860/redact",
|
| 216 |
+
files={"file": open("large_doc.pdf", "rb")},
|
| 217 |
+
params={"dpi": 150} # Faster but less accurate
|
| 218 |
+
)
|
| 219 |
+
```
|
| 220 |
+
|
| 221 |
+
### 4. High Quality (Slower)
|
| 222 |
+
|
| 223 |
+
```python
|
| 224 |
+
response = requests.post(
|
| 225 |
+
"http://localhost:7860/redact",
|
| 226 |
+
files={"file": open("important.pdf", "rb")},
|
| 227 |
+
params={"dpi": 600} # Best quality, slowest
|
| 228 |
+
)
|
| 229 |
+
```
|
| 230 |
+
|
| 231 |
+
## Troubleshooting
|
| 232 |
+
|
| 233 |
+
### "Model not loaded"
|
| 234 |
+
**Problem**: NER model failed to load
|
| 235 |
+
**Solution**: Check internet connection, wait for model download
|
| 236 |
+
|
| 237 |
+
### "Tesseract not found"
|
| 238 |
+
**Problem**: OCR engine not installed
|
| 239 |
+
**Solution**: Install tesseract-ocr system package
|
| 240 |
+
|
| 241 |
+
### "Poppler not found"
|
| 242 |
+
**Problem**: PDF converter not installed
|
| 243 |
+
**Solution**: Install poppler-utils system package
|
| 244 |
+
|
| 245 |
+
### Slow processing
|
| 246 |
+
**Problem**: Redaction takes too long
|
| 247 |
+
**Solution**: Lower DPI to 150-200
|
| 248 |
+
|
| 249 |
+
### Out of memory
|
| 250 |
+
**Problem**: Large PDF crashes the API
|
| 251 |
+
**Solution**:
|
| 252 |
+
- Process one page at a time
|
| 253 |
+
- Increase container memory
|
| 254 |
+
- Lower DPI
|
| 255 |
+
|
| 256 |
+
## Next Steps
|
| 257 |
+
|
| 258 |
+
- ✅ Read full [README.md](README.md) for API details
|
| 259 |
+
- ✅ Check [DEPLOYMENT.md](DEPLOYMENT.md) for production setup
|
| 260 |
+
- ✅ Review [STRUCTURE.md](STRUCTURE.md) for code organization
|
| 261 |
+
- ✅ Run tests: `pytest tests/`
|
| 262 |
+
- ✅ Add authentication for production use
|
| 263 |
+
- ✅ Set up monitoring and logging
|
| 264 |
+
|
| 265 |
+
## Support
|
| 266 |
+
|
| 267 |
+
- 📖 API Docs: `http://localhost:7860/docs`
|
| 268 |
+
- 🐛 Issues: Create on your repository
|
| 269 |
+
- 💬 HuggingFace: Community forums
|
| 270 |
+
|
| 271 |
+
Happy redacting! 🔒
|
README.md
CHANGED
|
@@ -1,10 +1,167 @@
|
|
| 1 |
---
|
| 2 |
-
title:
|
| 3 |
-
emoji:
|
| 4 |
-
colorFrom:
|
| 5 |
-
colorTo:
|
| 6 |
sdk: docker
|
| 7 |
pinned: false
|
|
|
|
| 8 |
---
|
| 9 |
|
| 10 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
---
|
| 2 |
+
title: PDF Redaction API
|
| 3 |
+
emoji: 🔒
|
| 4 |
+
colorFrom: blue
|
| 5 |
+
colorTo: green
|
| 6 |
sdk: docker
|
| 7 |
pinned: false
|
| 8 |
+
license: mit
|
| 9 |
---
|
| 10 |
|
| 11 |
+
# PDF Redaction API 🔒
|
| 12 |
+
|
| 13 |
+
Automatically redact sensitive information from PDF documents using Named Entity Recognition (NER).
|
| 14 |
+
|
| 15 |
+
## Features
|
| 16 |
+
|
| 17 |
+
- 🤖 **Powered by NER**: Uses state-of-the-art Named Entity Recognition
|
| 18 |
+
- 📄 **PDF Support**: Upload and process PDF documents
|
| 19 |
+
- 🎯 **Accurate Redaction**: Correctly positioned black rectangles over sensitive text
|
| 20 |
+
- 🚀 **Fast Processing**: Optimized OCR and NER pipeline
|
| 21 |
+
- 🔧 **Configurable**: Adjust DPI and filter entity types
|
| 22 |
+
|
| 23 |
+
## API Endpoints
|
| 24 |
+
|
| 25 |
+
### `POST /redact`
|
| 26 |
+
|
| 27 |
+
Upload a PDF file and get it redacted.
|
| 28 |
+
|
| 29 |
+
**Parameters:**
|
| 30 |
+
- `file`: PDF file (required)
|
| 31 |
+
- `dpi`: OCR quality (default: 300)
|
| 32 |
+
- `entity_types`: Comma-separated entity types to redact (optional)
|
| 33 |
+
|
| 34 |
+
**Example using cURL:**
|
| 35 |
+
|
| 36 |
+
```bash
|
| 37 |
+
curl -X POST "https://your-space.hf.space/redact" \
|
| 38 |
+
-F "file=@document.pdf" \
|
| 39 |
+
-F "dpi=300"
|
| 40 |
+
```
|
| 41 |
+
|
| 42 |
+
**Example using Python:**
|
| 43 |
+
|
| 44 |
+
```python
|
| 45 |
+
import requests
|
| 46 |
+
|
| 47 |
+
url = "https://your-space.hf.space/redact"
|
| 48 |
+
files = {"file": open("document.pdf", "rb")}
|
| 49 |
+
params = {"dpi": 300}
|
| 50 |
+
|
| 51 |
+
response = requests.post(url, files=files, params=params)
|
| 52 |
+
result = response.json()
|
| 53 |
+
|
| 54 |
+
# Download redacted file
|
| 55 |
+
job_id = result["job_id"]
|
| 56 |
+
download_url = f"https://your-space.hf.space/download/{job_id}"
|
| 57 |
+
redacted_pdf = requests.get(download_url)
|
| 58 |
+
|
| 59 |
+
with open("redacted.pdf", "wb") as f:
|
| 60 |
+
f.write(redacted_pdf.content)
|
| 61 |
+
```
|
| 62 |
+
|
| 63 |
+
### `GET /download/{job_id}`
|
| 64 |
+
|
| 65 |
+
Download the redacted PDF file.
|
| 66 |
+
|
| 67 |
+
### `GET /health`
|
| 68 |
+
|
| 69 |
+
Check API health and model status.
|
| 70 |
+
|
| 71 |
+
### `GET /stats`
|
| 72 |
+
|
| 73 |
+
Get API statistics.
|
| 74 |
+
|
| 75 |
+
## Response Format
|
| 76 |
+
|
| 77 |
+
```json
|
| 78 |
+
{
|
| 79 |
+
"job_id": "uuid-here",
|
| 80 |
+
"status": "completed",
|
| 81 |
+
"message": "Successfully redacted 5 entities",
|
| 82 |
+
"entities": [
|
| 83 |
+
{
|
| 84 |
+
"entity_type": "PER",
|
| 85 |
+
"entity_text": "John Doe",
|
| 86 |
+
"page": 1,
|
| 87 |
+
"word_count": 2
|
| 88 |
+
}
|
| 89 |
+
],
|
| 90 |
+
"redacted_file_url": "/download/uuid-here"
|
| 91 |
+
}
|
| 92 |
+
```
|
| 93 |
+
|
| 94 |
+
## Entity Types
|
| 95 |
+
|
| 96 |
+
Common entity types detected:
|
| 97 |
+
- `PER`: Person names
|
| 98 |
+
- `ORG`: Organizations
|
| 99 |
+
- `LOC`: Locations
|
| 100 |
+
- `DATE`: Dates
|
| 101 |
+
- `EMAIL`: Email addresses
|
| 102 |
+
- `PHONE`: Phone numbers
|
| 103 |
+
- And more...
|
| 104 |
+
|
| 105 |
+
## Local Development
|
| 106 |
+
|
| 107 |
+
### Prerequisites
|
| 108 |
+
|
| 109 |
+
- Python 3.10+
|
| 110 |
+
- Tesseract OCR
|
| 111 |
+
- Poppler utils
|
| 112 |
+
|
| 113 |
+
### Installation
|
| 114 |
+
|
| 115 |
+
```bash
|
| 116 |
+
# Install system dependencies (Ubuntu/Debian)
|
| 117 |
+
sudo apt-get install tesseract-ocr poppler-utils
|
| 118 |
+
|
| 119 |
+
# Install Python dependencies
|
| 120 |
+
pip install -r requirements.txt
|
| 121 |
+
|
| 122 |
+
# Run the server
|
| 123 |
+
python main.py
|
| 124 |
+
```
|
| 125 |
+
|
| 126 |
+
The API will be available at `http://localhost:7860`
|
| 127 |
+
|
| 128 |
+
### Using Docker
|
| 129 |
+
|
| 130 |
+
```bash
|
| 131 |
+
# Build the image
|
| 132 |
+
docker build -t pdf-redaction-api .
|
| 133 |
+
|
| 134 |
+
# Run the container
|
| 135 |
+
docker run -p 7860:7860 pdf-redaction-api
|
| 136 |
+
```
|
| 137 |
+
|
| 138 |
+
## Configuration
|
| 139 |
+
|
| 140 |
+
Adjust the DPI parameter based on your needs:
|
| 141 |
+
- `150`: Fast processing, lower quality
|
| 142 |
+
- `300`: Recommended balance (default)
|
| 143 |
+
- `600`: High quality, slower processing
|
| 144 |
+
|
| 145 |
+
## Limitations
|
| 146 |
+
|
| 147 |
+
- Maximum file size: Dependent on Space resources
|
| 148 |
+
- Processing time increases with page count and DPI
|
| 149 |
+
- Files are automatically cleaned up after processing
|
| 150 |
+
|
| 151 |
+
## Privacy
|
| 152 |
+
|
| 153 |
+
- Uploaded files are saved to temporary storage (`uploads/`) for processing and deleted after redaction
|
| 154 |
+
- No data is stored permanently
|
| 155 |
+
- Use your own deployment for sensitive documents
|
| 156 |
+
|
| 157 |
+
## Credits
|
| 158 |
+
|
| 159 |
+
Built with:
|
| 160 |
+
- [FastAPI](https://fastapi.tiangolo.com/)
|
| 161 |
+
- [Transformers](https://huggingface.co/docs/transformers)
|
| 162 |
+
- [PyPDF](https://github.com/py-pdf/pypdf)
|
| 163 |
+
- [Tesseract OCR](https://github.com/tesseract-ocr/tesseract)
|
| 164 |
+
|
| 165 |
+
## License
|
| 166 |
+
|
| 167 |
+
MIT License - See LICENSE file for details
|
STRUCTURE.md
ADDED
|
@@ -0,0 +1,269 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# Project Structure
|
| 2 |
+
|
| 3 |
+
```
|
| 4 |
+
pdf-redaction-api/
|
| 5 |
+
│
|
| 6 |
+
├── main.py # FastAPI application entry point
|
| 7 |
+
├── Dockerfile # Docker configuration for deployment
|
| 8 |
+
├── requirements.txt # Python dependencies
|
| 9 |
+
├── README.md # Project documentation (for HuggingFace)
|
| 10 |
+
├── DEPLOYMENT.md # Deployment guide
|
| 11 |
+
├── .gitignore # Git ignore rules
|
| 12 |
+
├── .dockerignore # Docker ignore rules
|
| 13 |
+
│
|
| 14 |
+
├── app/ # Application modules
|
| 15 |
+
│ ├── __init__.py # Package initialization
|
| 16 |
+
│ └── redaction.py # Core redaction logic (PDFRedactor class)
|
| 17 |
+
│
|
| 18 |
+
├── uploads/ # Temporary upload directory
|
| 19 |
+
│ └── .gitkeep # Keep directory in git
|
| 20 |
+
│
|
| 21 |
+
├── outputs/ # Redacted PDF output directory
|
| 22 |
+
│ └── .gitkeep # Keep directory in git
|
| 23 |
+
│
|
| 24 |
+
├── tests/ # Test suite
|
| 25 |
+
│ ├── __init__.py
|
| 26 |
+
│ └── test_api.py # API endpoint tests
|
| 27 |
+
│
|
| 28 |
+
└── client_example.py # Example client for API usage
|
| 29 |
+
```
|
| 30 |
+
|
| 31 |
+
## File Descriptions
|
| 32 |
+
|
| 33 |
+
### Core Files
|
| 34 |
+
|
| 35 |
+
#### `main.py`
|
| 36 |
+
FastAPI application with endpoints:
|
| 37 |
+
- `POST /redact` - Upload and redact PDF
|
| 38 |
+
- `GET /download/{job_id}` - Download redacted PDF
|
| 39 |
+
- `GET /health` - Health check
|
| 40 |
+
- `GET /stats` - API statistics
|
| 41 |
+
- `DELETE /cleanup/{job_id}` - Manual cleanup
|
| 42 |
+
|
| 43 |
+
#### `app/redaction.py`
|
| 44 |
+
Core redaction logic:
|
| 45 |
+
- `PDFRedactor` class
|
| 46 |
+
- OCR processing with pytesseract
|
| 47 |
+
- NER using HuggingFace transformers
|
| 48 |
+
- Entity-to-box mapping
|
| 49 |
+
- PDF redaction with coordinate scaling
|
| 50 |
+
|
| 51 |
+
### Configuration Files
|
| 52 |
+
|
| 53 |
+
#### `requirements.txt`
|
| 54 |
+
Python dependencies:
|
| 55 |
+
- FastAPI & Uvicorn (API framework)
|
| 56 |
+
- Transformers & Torch (NER model)
|
| 57 |
+
- PyPDF (PDF manipulation)
|
| 58 |
+
- pdf2image (PDF to image conversion)
|
| 59 |
+
- pytesseract (OCR)
|
| 60 |
+
- Pillow (Image processing)
|
| 61 |
+
|
| 62 |
+
#### `Dockerfile`
|
| 63 |
+
Single-stage build:
|
| 64 |
+
1. Install system dependencies (tesseract, poppler)
|
| 65 |
+
2. Install Python dependencies
|
| 66 |
+
3. Copy application code
|
| 67 |
+
4. Configure for port 7860 (HuggingFace default)
|
| 68 |
+
|
| 69 |
+
### Documentation
|
| 70 |
+
|
| 71 |
+
#### `README.md`
|
| 72 |
+
HuggingFace Space documentation:
|
| 73 |
+
- Features overview
|
| 74 |
+
- API endpoint documentation
|
| 75 |
+
- Usage examples (cURL, Python)
|
| 76 |
+
- Response format
|
| 77 |
+
- Local development setup
|
| 78 |
+
|
| 79 |
+
#### `DEPLOYMENT.md`
|
| 80 |
+
Step-by-step deployment guide:
|
| 81 |
+
- HuggingFace Spaces setup
|
| 82 |
+
- Git workflow
|
| 83 |
+
- Configuration options
|
| 84 |
+
- Security considerations
|
| 85 |
+
- Troubleshooting
|
| 86 |
+
- Cost estimation
|
| 87 |
+
|
| 88 |
+
### Testing & Examples
|
| 89 |
+
|
| 90 |
+
#### `tests/test_api.py`
|
| 91 |
+
Unit tests for API endpoints:
|
| 92 |
+
- Health check tests
|
| 93 |
+
- Upload validation tests
|
| 94 |
+
- Error handling tests
|
| 95 |
+
|
| 96 |
+
#### `client_example.py`
|
| 97 |
+
Example client implementation:
|
| 98 |
+
- Upload PDF
|
| 99 |
+
- Download redacted file
|
| 100 |
+
- Health check
|
| 101 |
+
- Statistics
|
| 102 |
+
|
| 103 |
+
## Data Flow
|
| 104 |
+
|
| 105 |
+
```
|
| 106 |
+
┌─────────────────────────────────────────────────────────┐
|
| 107 |
+
│ 1. Client uploads PDF │
|
| 108 |
+
│ POST /redact with file │
|
| 109 |
+
└─────────────────────────────────────────────────────────┘
|
| 110 |
+
↓
|
| 111 |
+
┌─────────────────────────────────────────────────────────┐
|
| 112 |
+
│ 2. FastAPI (main.py) │
|
| 113 |
+
│ - Validates file │
|
| 114 |
+
│ - Generates job_id │
|
| 115 |
+
│ - Saves to uploads/ │
|
| 116 |
+
└─────────────────────────────────────────────────────────┘
|
| 117 |
+
↓
|
| 118 |
+
┌─────────────────────────────────────────────────────────┐
|
| 119 |
+
│ 3. PDFRedactor (app/redaction.py) │
|
| 120 |
+
│ - perform_ocr() → Extract text + boxes │
|
| 121 |
+
│ - run_ner() → Identify entities │
|
| 122 |
+
│ - map_entities_to_boxes() → Link entities to coords │
|
| 123 |
+
│ - create_redacted_pdf() → Generate output │
|
| 124 |
+
└─────────────────────────────────────────────────────────┘
|
| 125 |
+
↓
|
| 126 |
+
┌─────────────────────────────────────────────────────────┐
|
| 127 |
+
│ 4. Response │
|
| 128 |
+
│ - Return job_id and entity list │
|
| 129 |
+
│ - Save redacted PDF to outputs/ │
|
| 130 |
+
└─────────────────────────────────────────────────────────┘
|
| 131 |
+
↓
|
| 132 |
+
┌─────────────────────────────────────────────────────────┐
|
| 133 |
+
│ 5. Client downloads │
|
| 134 |
+
│ GET /download/{job_id} │
|
| 135 |
+
└─────────────────────────────────────────────────────────┘
|
| 136 |
+
```
|
| 137 |
+
|
| 138 |
+
## Key Components
|
| 139 |
+
|
| 140 |
+
### 1. FastAPI Application (`main.py`)
|
| 141 |
+
|
| 142 |
+
**Endpoints:**
|
| 143 |
+
- RESTful API design
|
| 144 |
+
- File upload handling
|
| 145 |
+
- Background task cleanup
|
| 146 |
+
- CORS middleware for web access
|
| 147 |
+
|
| 148 |
+
**Features:**
|
| 149 |
+
- Automatic OpenAPI documentation at `/docs`
|
| 150 |
+
- JSON response models with Pydantic
|
| 151 |
+
- Error handling with HTTP exceptions
|
| 152 |
+
- Request validation
|
| 153 |
+
|
| 154 |
+
### 2. Redaction Engine (`app/redaction.py`)
|
| 155 |
+
|
| 156 |
+
**Pipeline Steps:**
|
| 157 |
+
|
| 158 |
+
1. **OCR Processing**
|
| 159 |
+
- Convert PDF pages to images (pdf2image)
|
| 160 |
+
- Extract text and bounding boxes (pytesseract)
|
| 161 |
+
- Store image dimensions for coordinate scaling
|
| 162 |
+
|
| 163 |
+
2. **NER Processing**
|
| 164 |
+
- Load HuggingFace model
|
| 165 |
+
- Identify entities in text
|
| 166 |
+
- Return entity types and character positions
|
| 167 |
+
|
| 168 |
+
3. **Mapping**
|
| 169 |
+
- Create character span index for OCR words
|
| 170 |
+
- Match NER entities to OCR bounding boxes
|
| 171 |
+
- Handle partial word matches
|
| 172 |
+
|
| 173 |
+
4. **Redaction**
|
| 174 |
+
- Scale OCR image coordinates to PDF points
|
| 175 |
+
- Create black rectangle annotations
|
| 176 |
+
- Write redacted PDF with pypdf
|
| 177 |
+
|
| 178 |
+
### 3. Docker Container
|
| 179 |
+
|
| 180 |
+
**Layers:**
|
| 181 |
+
- Base: Python 3.12 slim (`python:3.12-slim`)
|
| 182 |
+
- System packages: tesseract-ocr, poppler-utils
|
| 183 |
+
- Python packages: From requirements.txt
|
| 184 |
+
- Application code: Copied last for better caching
|
| 185 |
+
|
| 186 |
+
**Optimizations:**
|
| 187 |
+
- Multi-stage build (not used here, but possible)
|
| 188 |
+
- Minimal base image
|
| 189 |
+
- Cached dependency layers
|
| 190 |
+
- .dockerignore to reduce context size
|
| 191 |
+
|
| 192 |
+
## Environment Variables
|
| 193 |
+
|
| 194 |
+
Default configuration (can be overridden):
|
| 195 |
+
|
| 196 |
+
```bash
|
| 197 |
+
PYTHONUNBUFFERED=1 # Immediate log output
|
| 198 |
+
HF_HOME=/app/cache # HuggingFace cache directory
|
| 199 |
+
```
|
| 200 |
+
|
| 201 |
+
## Port Configuration
|
| 202 |
+
|
| 203 |
+
- **Development**: 7860 (configurable in main.py)
|
| 204 |
+
- **Production (HF Spaces)**: 7860 (required)
|
| 205 |
+
|
| 206 |
+
## Directory Permissions
|
| 207 |
+
|
| 208 |
+
Ensure write permissions for:
|
| 209 |
+
- `uploads/` - Temporary PDF storage
|
| 210 |
+
- `outputs/` - Redacted PDF storage
|
| 211 |
+
- `cache/` - Model cache (created automatically)
|
| 212 |
+
|
| 213 |
+
## Adding New Features
|
| 214 |
+
|
| 215 |
+
### Add New Endpoint
|
| 216 |
+
|
| 217 |
+
1. Define in `main.py`:
|
| 218 |
+
```python
|
| 219 |
+
@app.get("/new-endpoint")
|
| 220 |
+
async def new_endpoint():
|
| 221 |
+
return {"message": "Hello"}
|
| 222 |
+
```
|
| 223 |
+
|
| 224 |
+
2. Add response model if needed
|
| 225 |
+
3. Update README.md documentation
|
| 226 |
+
4. Add tests in `tests/test_api.py`
|
| 227 |
+
|
| 228 |
+
### Add New Redaction Option
|
| 229 |
+
|
| 230 |
+
1. Modify `PDFRedactor` class in `app/redaction.py`
|
| 231 |
+
2. Add parameter to `redact_document()` method
|
| 232 |
+
3. Update API endpoint in `main.py`
|
| 233 |
+
4. Document in README.md
|
| 234 |
+
|
| 235 |
+
### Add Authentication
|
| 236 |
+
|
| 237 |
+
1. Install: `pip install python-jose passlib`
|
| 238 |
+
2. Create `app/auth.py` with JWT logic
|
| 239 |
+
3. Add middleware to `main.py`
|
| 240 |
+
4. Protect endpoints with dependencies
|
| 241 |
+
|
| 242 |
+
## Best Practices
|
| 243 |
+
|
| 244 |
+
1. **Logging**: Use `logger` for all important events
|
| 245 |
+
2. **Error Handling**: Catch exceptions and return meaningful errors
|
| 246 |
+
3. **Validation**: Use Pydantic models for request/response validation
|
| 247 |
+
4. **Cleanup**: Always clean up temporary files
|
| 248 |
+
5. **Documentation**: Keep README.md and code comments updated
|
| 249 |
+
6. **Testing**: Add tests for new features
|
| 250 |
+
|
| 251 |
+
## Performance Considerations
|
| 252 |
+
|
| 253 |
+
### Bottlenecks
|
| 254 |
+
1. OCR processing (most time-consuming)
|
| 255 |
+
2. Model inference (NER)
|
| 256 |
+
3. File I/O
|
| 257 |
+
|
| 258 |
+
### Optimizations
|
| 259 |
+
- Lower DPI for faster OCR (trade-off with accuracy)
|
| 260 |
+
- Cache loaded models in memory
|
| 261 |
+
- Use async file operations
|
| 262 |
+
- Implement request queuing for high load
|
| 263 |
+
- Consider GPU for NER model
|
| 264 |
+
|
| 265 |
+
### Scaling
|
| 266 |
+
- Horizontal: Multiple container instances
|
| 267 |
+
- Vertical: Larger CPU/RAM allocation
|
| 268 |
+
- Caching: Redis for temporary results
|
| 269 |
+
- Queue: Celery for background processing
|
app/__init__.py
ADDED
|
@@ -0,0 +1,6 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""
|
| 2 |
+
App module for PDF redaction API
|
| 3 |
+
"""
|
| 4 |
+
from .redaction import PDFRedactor
|
| 5 |
+
|
| 6 |
+
__all__ = ['PDFRedactor']
|
app/redaction.py
ADDED
|
@@ -0,0 +1,327 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""
|
| 2 |
+
PDF Redaction module using NER
|
| 3 |
+
"""
|
| 4 |
+
from pdf2image import convert_from_path
|
| 5 |
+
import pytesseract
|
| 6 |
+
from pypdf import PdfReader, PdfWriter
|
| 7 |
+
from pypdf.generic import DictionaryObject, ArrayObject, NameObject, NumberObject
|
| 8 |
+
from transformers import pipeline, AutoTokenizer, AutoModelForTokenClassification
|
| 9 |
+
from typing import List, Dict, Optional
|
| 10 |
+
import logging
|
| 11 |
+
|
| 12 |
+
logger = logging.getLogger(__name__)


class PDFRedactor:
    """Cover sensitive entities in a PDF using OCR plus a token-classification model.

    Pipeline (see ``redact_document``): rasterize and OCR every page, run NER
    over the space-joined OCR words, map entity character spans back to OCR
    word boxes, then draw filled black square annotations over those boxes.

    NOTE(security): pages are copied into the output unchanged and the black
    squares are added as annotations on top of them. The original page
    content is therefore still present in the output file; this class hides
    text visually, it does not remove it.
    """

    def __init__(self, model_name: str = "openai/privacy-filter"):
        """
        Initialize the PDF Redactor and eagerly load the NER model.

        Args:
            model_name: HuggingFace model ID for NER
        """
        self.model_name = model_name
        self.ner_pipeline = None
        self._load_model()

    def _load_model(self):
        """Load tokenizer and model and build the token-classification pipeline.

        Raises:
            Exception: whatever the model loading raised; it is logged and
                re-raised unchanged.
        """
        try:
            logger.info(f"Loading NER model: {self.model_name}")
            # NOTE(security): trust_remote_code=True allows code shipped with
            # the model repository to run locally; only use trusted model IDs.
            tokenizer = AutoTokenizer.from_pretrained(
                self.model_name, trust_remote_code=True
            )
            model = AutoModelForTokenClassification.from_pretrained(
                self.model_name, trust_remote_code=True, device_map="auto"
            )

            self.ner_pipeline = pipeline(
                "token-classification",
                model=model,
                tokenizer=tokenizer,
                aggregation_strategy="simple",
            )
            logger.info("NER model loaded successfully")
        except Exception as e:
            logger.error(f"Error loading NER model: {str(e)}")
            raise

    def is_model_loaded(self) -> bool:
        """Return True once the NER pipeline has been created."""
        return self.ner_pipeline is not None

    def perform_ocr(self, pdf_path: str, dpi: int = 300) -> List[Dict]:
        """
        Perform OCR on PDF and extract word bounding boxes

        Args:
            pdf_path: Path to the PDF file
            dpi: DPI for PDF to image conversion

        Returns:
            List of word dicts with keys ``text``, ``box`` (left, top, width,
            height in image pixels), ``page`` (1-based), ``confidence``,
            ``image_width`` and ``image_height``.
        """
        logger.info(f"Starting OCR on {pdf_path} at {dpi} DPI")
        all_words_data = []

        try:
            images = convert_from_path(pdf_path, dpi=dpi)
            logger.info(f"Converted PDF to {len(images)} images")

            for page_num, image in enumerate(images, start=1):
                image_width, image_height = image.size

                data = pytesseract.image_to_data(image, output_type=pytesseract.Output.DICT)
                # Deliberately not logging data['text']: it is the very
                # content this tool exists to redact.

                words_before = len(all_words_data)
                for i, raw_text in enumerate(data['text']):
                    word_text = raw_text.strip()
                    # Confidence may arrive as an int, a float or a numeric
                    # string such as '96.5'; go through float() so none of
                    # those forms raises.
                    confidence = int(float(data['conf'][i]))

                    # Filter out empty or low-confidence words
                    if word_text and confidence > 0:
                        all_words_data.append({
                            'text': word_text,
                            'box': (data['left'][i], data['top'][i],
                                    data['width'][i], data['height'][i]),
                            'page': page_num,
                            'confidence': confidence,
                            'image_width': image_width,
                            'image_height': image_height
                        })

                logger.info(f"Processed page {page_num}: {len(all_words_data) - words_before} words")

            logger.info(f"OCR complete: {len(all_words_data)} total words extracted")
            return all_words_data

        except Exception as e:
            logger.error(f"Error during OCR: {str(e)}")
            raise

    def run_ner(self, text: str) -> List[Dict]:
        """
        Run NER on text

        Args:
            text: Input text

        Returns:
            List of identified entities (empty for blank input)

        Raises:
            RuntimeError: if the NER pipeline has not been loaded.
        """
        if not self.ner_pipeline:
            raise RuntimeError("NER model not loaded")

        logger.info(f"Running NER on text of length {len(text)}")

        # Nothing to classify (e.g. OCR found no words): skip the model call.
        if not text.strip():
            return []

        try:
            results = self.ner_pipeline(text)
            logger.info(f"NER identified {len(results)} entities")
            return results
        except Exception as e:
            logger.error(f"Error during NER: {str(e)}")
            raise

    def map_entities_to_boxes(self, ner_results: List[Dict],
                              ocr_data: List[Dict]) -> List[Dict]:
        """
        Map NER entities to OCR bounding boxes

        Character offsets are reconstructed on the assumption that the NER
        input was the OCR words joined by single spaces, in ``ocr_data`` order
        (which is how ``redact_document`` builds it).

        Args:
            ner_results: List of NER entities (``entity_group``, ``start``,
                ``end``, ``word``)
            ocr_data: List of OCR word data

        Returns:
            List of dicts with ``entity_type``, ``entity_text`` and ``words``
            (the OCR word dicts overlapping the entity). Entities that overlap
            no word are dropped.
        """
        logger.info("Mapping NER entities to OCR bounding boxes")
        mapped_entities = []

        # (start, end, word) character spans, in increasing start order.
        word_spans = []
        cursor = 0
        for word_info in ocr_data:
            length = len(word_info['text'])
            word_spans.append((cursor, cursor + length, word_info))
            cursor += length + 1  # +1 for the joining space

        for ner_entity in ner_results:
            ner_start = ner_entity['start']
            ner_end = ner_entity['end']

            matching_ocr_words = []
            for ocr_start, ocr_end, word_info in word_spans:
                if ocr_start >= ner_end:
                    # Spans are sorted, so no later word can overlap either.
                    break
                if max(ocr_start, ner_start) < min(ocr_end, ner_end):
                    matching_ocr_words.append(word_info)

            if matching_ocr_words:
                mapped_entities.append({
                    'entity_type': ner_entity['entity_group'],
                    'entity_text': ner_entity['word'],
                    'words': matching_ocr_words
                })

        logger.info(f"Mapped {len(mapped_entities)} entities to bounding boxes")
        return mapped_entities

    @staticmethod
    def _image_box_to_pdf_rect(box, image_width, image_height,
                               page_left, page_bottom, page_width, page_height):
        """Convert an OCR pixel box (left, top, width, height; origin top-left)
        into a PDF rectangle (llx, lly, urx, ury; origin bottom-left)."""
        x, y, w, h = box
        scale_x = page_width / image_width
        scale_y = page_height / image_height

        x_scaled = x * scale_x
        y_scaled = y * scale_y
        w_scaled = w * scale_x
        h_scaled = h * scale_y

        # Flip the vertical axis and shift by the page box origin.
        llx = page_left + x_scaled
        lly = page_bottom + page_height - (y_scaled + h_scaled)
        urx = page_left + x_scaled + w_scaled
        ury = page_bottom + page_height - y_scaled
        return (llx, lly, urx, ury)

    def create_redacted_pdf(self, original_pdf_path: str,
                            mapped_entities: List[Dict],
                            output_path: str) -> str:
        """
        Create redacted PDF with black rectangles over entities

        Args:
            original_pdf_path: Path to original PDF
            mapped_entities: List of entities with bounding boxes
            output_path: Path for output PDF

        Returns:
            Path to redacted PDF
        """
        # Imported here because it is not among this module's top-level
        # imports. NumberObject is an integer type and would truncate the
        # fractional coordinates computed below.
        from pypdf.generic import FloatObject

        logger.info(f"Creating redacted PDF: {output_path}")

        try:
            # Bucket the words to cover by 1-based page number in one pass.
            words_by_page = {}
            for entity_info in mapped_entities:
                for word_info in entity_info['words']:
                    words_by_page.setdefault(word_info['page'], []).append(word_info)

            reader = PdfReader(original_pdf_path)
            writer = PdfWriter()

            for page_num, page in enumerate(reader.pages):
                media_box = page.mediabox
                page_left = float(media_box.left)
                page_bottom = float(media_box.bottom)
                page_width = float(media_box.width)
                page_height = float(media_box.height)

                writer.add_page(page)

                page_words = words_by_page.get(page_num + 1, [])
                for word_info in page_words:
                    # NOTE(review): page /Rotate is not taken into account.
                    llx, lly, urx, ury = self._image_box_to_pdf_rect(
                        word_info['box'],
                        word_info['image_width'],
                        word_info['image_height'],
                        page_left, page_bottom, page_width, page_height,
                    )

                    # Borderless square, black stroke (/C) and fill (/IC).
                    redaction_annotation = DictionaryObject()
                    redaction_annotation.update({
                        NameObject("/Type"): NameObject("/Annot"),
                        NameObject("/Subtype"): NameObject("/Square"),
                        NameObject("/Rect"): ArrayObject([
                            FloatObject(llx),
                            FloatObject(lly),
                            FloatObject(urx),
                            FloatObject(ury),
                        ]),
                        NameObject("/C"): ArrayObject([
                            NumberObject(0), NumberObject(0), NumberObject(0)
                        ]),
                        NameObject("/IC"): ArrayObject([
                            NumberObject(0), NumberObject(0), NumberObject(0)
                        ]),
                        NameObject("/BS"): DictionaryObject({
                            NameObject("/W"): NumberObject(0)
                        })
                    })

                    writer.add_annotation(page_number=page_num,
                                          annotation=redaction_annotation)

                logger.info(f"Page {page_num + 1}: Added {len(page_words)} redactions")

            # Write output
            with open(output_path, "wb") as output_file:
                writer.write(output_file)

            logger.info(f"Redacted PDF created successfully: {output_path}")
            return output_path

        except Exception as e:
            logger.error(f"Error creating redacted PDF: {str(e)}")
            raise

    def redact_document(self, pdf_path: str, output_path: str,
                        dpi: int = 300,
                        entity_filter: Optional[List[str]] = None) -> Dict:
        """
        Complete redaction pipeline

        Args:
            pdf_path: Path to input PDF
            output_path: Path for output PDF
            dpi: DPI for OCR
            entity_filter: List of entity types to redact (None or empty =
                all). Valid values: account_number, private_address,
                private_email, private_person, private_phone, private_url,
                private_date, secret

        Returns:
            Dictionary with ``output_path``, ``total_words`` (OCR words),
            ``total_entities`` (raw NER results), ``redacted_entities``
            (entities actually covered) and ``entities`` (their details).
        """
        logger.info(f"Starting redaction pipeline for {pdf_path}")

        # Step 1: OCR
        ocr_data = self.perform_ocr(pdf_path, dpi)

        # Step 2: Extract text (single-space join; map_entities_to_boxes
        # relies on this exact layout to rebuild character offsets)
        full_text = " ".join(word['text'] for word in ocr_data)

        # Step 3: NER
        ner_results = self.run_ner(full_text)

        # Step 4: Map entities to boxes
        mapped_entities = self.map_entities_to_boxes(ner_results, ocr_data)

        # Step 5: Filter entities if requested
        if entity_filter:
            wanted = set(entity_filter)
            mapped_entities = [
                e for e in mapped_entities
                if e['entity_type'] in wanted
            ]
            logger.info(f"Filtered to {len(mapped_entities)} entities of types: {entity_filter}")

        # Step 6: Create redacted PDF
        self.create_redacted_pdf(pdf_path, mapped_entities, output_path)

        return {
            'output_path': output_path,
            'total_words': len(ocr_data),
            'total_entities': len(ner_results),
            'redacted_entities': len(mapped_entities),
            'entities': mapped_entities
        }
|
client_example.py
ADDED
|
@@ -0,0 +1,142 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""
|
| 2 |
+
Example client for PDF Redaction API
|
| 3 |
+
"""
|
| 4 |
+
import requests
|
| 5 |
+
from pathlib import Path
|
| 6 |
+
import sys
|
| 7 |
+
|
| 8 |
+
|
| 9 |
+
def redact_pdf(api_url: str, pdf_path: str, output_path: str = "redacted.pdf",
               dpi: int = 300, entity_types: str = None, timeout: float = 600.0):
    """
    Redact a PDF file using the API

    Uploads the file to ``{api_url}/redact``, prints the reported entities,
    then downloads the result from ``{api_url}/download/{job_id}``.

    Args:
        api_url: Base URL of the API
        pdf_path: Path to the PDF file to redact
        output_path: Path to save the redacted PDF
        dpi: DPI for OCR processing
        entity_types: Comma-separated list of entity types to redact
        timeout: Seconds to wait for each HTTP request before giving up

    Returns:
        True on success, False if the file is missing or a request fails.
    """
    # Check if file exists
    if not Path(pdf_path).exists():
        print(f"Error: File {pdf_path} not found")
        return False

    print(f"Uploading {pdf_path}...")

    params = {"dpi": dpi}
    if entity_types:
        params["entity_types"] = entity_types

    try:
        # Upload and redact; the context manager closes the file as soon as
        # the upload request has completed.
        with open(pdf_path, "rb") as pdf_file:
            response = requests.post(
                f"{api_url}/redact",
                files={"file": pdf_file},
                params=params,
                timeout=timeout,
            )
        response.raise_for_status()

        result = response.json()
        print(f"\nStatus: {result['status']}")
        print(f"Message: {result['message']}")

        # Display found entities
        if result.get('entities'):
            print("\nEntities redacted:")
            for i, entity in enumerate(result['entities'], 1):
                print(f"  {i}. {entity['entity_type']}: {entity['entity_text']} "
                      f"(Page {entity['page']}, {entity['word_count']} words)")

        # Download redacted file
        job_id = result['job_id']
        print("\nDownloading redacted PDF...")

        download_response = requests.get(f"{api_url}/download/{job_id}", timeout=timeout)
        download_response.raise_for_status()

        # Save file
        with open(output_path, "wb") as f:
            f.write(download_response.content)

        print(f"✓ Redacted PDF saved to: {output_path}")

        # Cleanup (optional)
        # requests.delete(f"{api_url}/cleanup/{job_id}")

        return True

    except requests.exceptions.RequestException as e:
        print(f"Error: {e}")
        return False
|
| 74 |
+
|
| 75 |
+
|
| 76 |
+
def check_health(api_url: str, timeout: float = 10.0):
    """Query ``{api_url}/health`` and print status, version and model state.

    Args:
        api_url: Base URL of the API
        timeout: Seconds to wait for the response, so an unreachable server
            cannot hang the client indefinitely

    Returns:
        True if the request succeeded, False on any requests error.
    """
    try:
        response = requests.get(f"{api_url}/health", timeout=timeout)
        response.raise_for_status()
        data = response.json()

        print(f"API Status: {data['status']}")
        print(f"Version: {data['version']}")
        print(f"Model Loaded: {data['model_loaded']}")

        return True
    except requests.exceptions.RequestException as e:
        print(f"Error checking health: {e}")
        return False
|
| 91 |
+
|
| 92 |
+
|
| 93 |
+
def get_stats(api_url: str, timeout: float = 10.0):
    """Query ``{api_url}/stats`` and print the reported counters.

    Args:
        api_url: Base URL of the API
        timeout: Seconds to wait for the response, so an unreachable server
            cannot hang the client indefinitely

    Returns:
        True if the request succeeded, False on any requests error.
    """
    try:
        response = requests.get(f"{api_url}/stats", timeout=timeout)
        response.raise_for_status()
        data = response.json()

        print("API Statistics:")
        print(f"  Pending uploads: {data['pending_uploads']}")
        print(f"  Processed files: {data['processed_files']}")
        print(f"  Model loaded: {data['model_loaded']}")

        return True
    except requests.exceptions.RequestException as e:
        print(f"Error getting stats: {e}")
        return False
|
| 109 |
+
|
| 110 |
+
|
| 111 |
+
if __name__ == "__main__":
    # Example usage

    # For local development
    API_URL = "http://localhost:7860"

    # For HuggingFace Spaces (replace with your space URL)
    # API_URL = "https://your-username-pdf-redaction-api.hf.space"

    if len(sys.argv) < 2:
        print("Usage:")
        print("  python client_example.py <pdf_file> [output_file] [dpi]")
        print("\nOr check health:")
        print("  python client_example.py --health")
        print("\nOr get stats:")
        print("  python client_example.py --stats")
        sys.exit(1)

    if sys.argv[1] == "--health":
        ok = check_health(API_URL)
    elif sys.argv[1] == "--stats":
        ok = get_stats(API_URL)
    else:
        pdf_path = sys.argv[1]
        output_path = sys.argv[2] if len(sys.argv) > 2 else "redacted.pdf"
        try:
            dpi = int(sys.argv[3]) if len(sys.argv) > 3 else 300
        except ValueError:
            print(f"Error: dpi must be an integer, got {sys.argv[3]!r}")
            sys.exit(1)

        # Optional: filter specific entity types (comma-separated), e.g.
        #   entity_types = "private_person,private_email"
        entity_types = None  # Redact all entity types

        ok = redact_pdf(API_URL, pdf_path, output_path, dpi, entity_types)

    # Report failure through the exit status so shell scripts can detect it.
    sys.exit(0 if ok else 1)
|
client_supabase.py
ADDED
|
@@ -0,0 +1,9 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
from supabase import create_client, Client
|
| 2 |
+
import os
|
| 3 |
+
from dotenv import load_dotenv
|
| 4 |
+
load_dotenv()

SUPABASE_URL = os.getenv("SUPABASE_URL")
SUPABASE_KEY = os.getenv("SERVICE_ROLE_KEY")  # server-side key

# Fail fast and name the environment variable that is missing, instead of
# surfacing a less specific error from inside create_client().
_missing_env = [
    name
    for name, value in (("SUPABASE_URL", SUPABASE_URL), ("SERVICE_ROLE_KEY", SUPABASE_KEY))
    if not value
]
if _missing_env:
    raise RuntimeError(
        f"Missing required environment variable(s): {', '.join(_missing_env)}"
    )

# Shared client instance imported by the API module.
supabase: Client = create_client(SUPABASE_URL, SUPABASE_KEY)
|
docker-compose.yml
ADDED
|
@@ -0,0 +1,48 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version: '3.8'
|
| 2 |
+
|
| 3 |
+
services:
|
| 4 |
+
api:
|
| 5 |
+
build:
|
| 6 |
+
context: .
|
| 7 |
+
dockerfile: Dockerfile
|
| 8 |
+
ports:
|
| 9 |
+
- "7860:7860"
|
| 10 |
+
volumes:
|
| 11 |
+
# Mount code for development (hot reload)
|
| 12 |
+
- .:/app
|
| 13 |
+
# Persistent storage for uploads/outputs
|
| 14 |
+
- ./uploads:/app/uploads
|
| 15 |
+
- ./outputs:/app/outputs
|
| 16 |
+
environment:
|
| 17 |
+
- PYTHONUNBUFFERED=1
|
| 18 |
+
- HF_HOME=/app/cache
|
| 19 |
+
- LOG_LEVEL=DEBUG
|
| 20 |
+
command: uvicorn main:app --host 0.0.0.0 --port 7860 --reload
|
| 21 |
+
restart: unless-stopped
|
| 22 |
+
healthcheck:
|
| 23 |
+
test: ["CMD", "curl", "-f", "http://localhost:7860/health"]
|
| 24 |
+
interval: 30s
|
| 25 |
+
timeout: 10s
|
| 26 |
+
retries: 3
|
| 27 |
+
start_period: 40s
|
| 28 |
+
|
| 29 |
+
# Optional: Add nginx for production
|
| 30 |
+
# nginx:
|
| 31 |
+
# image: nginx:alpine
|
| 32 |
+
# ports:
|
| 33 |
+
# - "80:80"
|
| 34 |
+
# volumes:
|
| 35 |
+
# - ./nginx.conf:/etc/nginx/nginx.conf
|
| 36 |
+
# depends_on:
|
| 37 |
+
# - api
|
| 38 |
+
|
| 39 |
+
# Optional: Add Redis for caching
|
| 40 |
+
# redis:
|
| 41 |
+
# image: redis:alpine
|
| 42 |
+
# ports:
|
| 43 |
+
# - "6379:6379"
|
| 44 |
+
# volumes:
|
| 45 |
+
# - redis-data:/data
|
| 46 |
+
|
| 47 |
+
# volumes:
|
| 48 |
+
# redis-data:
|
main.py
ADDED
|
@@ -0,0 +1,344 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""
|
| 2 |
+
FastAPI application for PDF redaction using NER
|
| 3 |
+
"""
|
| 4 |
+
from fastapi import FastAPI, File, UploadFile, HTTPException, BackgroundTasks
|
| 5 |
+
from fastapi.responses import FileResponse
|
| 6 |
+
from fastapi.middleware.cors import CORSMiddleware
|
| 7 |
+
from pydantic import BaseModel
|
| 8 |
+
from typing import List, Optional, Dict
|
| 9 |
+
import uvicorn
|
| 10 |
+
import os
|
| 11 |
+
import uuid
|
| 12 |
+
import shutil
|
| 13 |
+
from pathlib import Path
|
| 14 |
+
import logging
|
| 15 |
+
import sys
|
| 16 |
+
from app.redaction import PDFRedactor
|
| 17 |
+
from client_supabase import supabase # Supabase client in separate file
|
| 18 |
+
|
| 19 |
+
# Configure logging
|
| 20 |
+
logging.basicConfig(
|
| 21 |
+
level=logging.INFO,
|
| 22 |
+
stream=sys.stdout,
|
| 23 |
+
force=True,
|
| 24 |
+
format="%(asctime)s - %(name)s - %(levelname)s - %(message)s"
|
| 25 |
+
)
|
| 26 |
+
logger = logging.getLogger(__name__)
|
| 27 |
+
|
| 28 |
+
# Initialize FastAPI app
|
| 29 |
+
app = FastAPI(
|
| 30 |
+
title="PDF Redaction API",
|
| 31 |
+
description="Redact sensitive information from PDFs using Named Entity Recognition",
|
| 32 |
+
version="1.0.0"
|
| 33 |
+
)
|
| 34 |
+
|
| 35 |
+
# CORS middleware
|
| 36 |
+
app.add_middleware(
|
| 37 |
+
CORSMiddleware,
|
| 38 |
+
allow_origins=["*"],
|
| 39 |
+
allow_credentials=True,
|
| 40 |
+
allow_methods=["*"],
|
| 41 |
+
allow_headers=["*"],
|
| 42 |
+
)
|
| 43 |
+
|
| 44 |
+
# Create directories
|
| 45 |
+
UPLOAD_DIR = Path("uploads")
|
| 46 |
+
OUTPUT_DIR = Path("outputs")
|
| 47 |
+
UPLOAD_DIR.mkdir(exist_ok=True)
|
| 48 |
+
OUTPUT_DIR.mkdir(exist_ok=True)
|
| 49 |
+
|
| 50 |
+
# Initialize redactor
|
| 51 |
+
redactor = PDFRedactor()
|
| 52 |
+
|
| 53 |
+
# ---------------- Response Models ----------------
|
| 54 |
+
class RedactionEntity(BaseModel):
    # One redacted entity as reported to API clients.
    entity_type: str   # label assigned by the NER model
    entity_text: str   # text the model matched
    page: int          # page number the entity is reported on
    word_count: int    # number of OCR words covered for this entity
|
| 59 |
+
|
| 60 |
+
class RedactionResponse(BaseModel):
    # Response body describing the outcome of a redaction job.
    job_id: str
    status: str
    message: str
    # Both fields below are optional and default to None when not provided.
    entities: Optional[List[RedactionEntity]] = None
    redacted_file_url: Optional[str] = None
|
| 66 |
+
|
| 67 |
+
class RedactionStatusResponse(BaseModel):
    # Response body reporting the status of a document request.
    request_id: str
    status: str
    files: List[str]
    message: str
|
| 72 |
+
|
| 73 |
+
class HealthResponse(BaseModel):
    # Response body for the health check.
    status: str
    version: str
    model_loaded: bool  # whether the NER model is loaded
|
| 77 |
+
|
| 78 |
+
# ---------------- DB Status Helpers ----------------
|
| 79 |
+
def set_request_status(request_id: str, status: str):
    """Write ``status`` to the ``document_requests`` row whose id is ``request_id``."""
    table = supabase.from_("document_requests")
    update_query = table.update({"status": status}).eq("id", request_id)
    update_query.execute()
    logger.info(f"Request {request_id} status -> {status}")
|
| 83 |
+
|
| 84 |
+
def get_request_status(request_id: str) -> str:
    """Fetch current status from document_requests.

    Args:
        request_id: Value of the ``id`` column to look up.

    Returns:
        The row's ``status`` value, or ``"not_found"`` when no row matches.
    """
    response = (
        supabase
        .from_("document_requests")
        .select("status")
        .eq("id", request_id)
        .maybe_single()
        .execute()
    )
    # maybe_single() can yield None instead of a response object when no row
    # matches; guard so that case returns "not_found" rather than raising
    # AttributeError on `.data`.
    if response is not None and response.data:
        return response.data["status"]
    return "not_found"
|
| 97 |
+
|
| 98 |
+
# ---------------- Helper Functions ----------------
|
| 99 |
+
def get_public_url(bucket: str, storage_path: str) -> str:
    """Build the public object URL for ``storage_path`` inside ``bucket``.

    Args:
        bucket: Storage bucket name.
        storage_path: Object path within the bucket.

    Returns:
        ``<SUPABASE_URL>/storage/v1/object/public/<bucket>/<storage_path>``

    Raises:
        RuntimeError: if the SUPABASE_URL environment variable is not set
            (previously this produced a URL starting with the text "None").
    """
    base_url = os.getenv('SUPABASE_URL')
    if not base_url:
        raise RuntimeError("SUPABASE_URL environment variable is not set")
    # Drop trailing slashes so the result never contains '//storage'.
    return f"{base_url.rstrip('/')}/storage/v1/object/public/{bucket}/{storage_path}"
|
| 101 |
+
|
| 102 |
+
def cleanup_files(job_id: str):
    """Delete the uploaded PDF stored for ``job_id``, if it exists.

    Runs immediately; any delay has to be arranged by the caller. Errors are
    logged and swallowed so cleanup never breaks the calling request.

    Args:
        job_id: Job identifier; the file removed is ``UPLOAD_DIR/<job_id>.pdf``.
    """
    try:
        upload_path = UPLOAD_DIR / f"{job_id}.pdf"
        # missing_ok avoids the check-then-delete race of exists() + unlink().
        upload_path.unlink(missing_ok=True)
        logger.info(f"Cleaned up files for job {job_id}")
    except Exception as e:
        logger.error(f"Error cleaning up files for job {job_id}: {str(e)}")
|
| 111 |
+
|
| 112 |
+
def cleanup_temp_files(paths: List[Path]):
    """Delete each file in ``paths``; paths that do not exist are ignored.

    Args:
        paths: Files to remove.
    """
    for path in paths:
        # missing_ok avoids the check-then-delete race of exists() + unlink().
        path.unlink(missing_ok=True)
|
| 116 |
+
|
| 117 |
+
def download_file_from_supabase(bucket: str, storage_path: str, local_path: Path):
    """Download ``storage_path`` from ``bucket`` and write it to ``local_path``.

    Raises:
        Exception: if the storage client returns no data.
    """
    logger.info(f"Downloading {storage_path} to {local_path}")
    payload = supabase.storage.from_(bucket).download(storage_path)
    if not payload:
        raise Exception(f"Failed to download {storage_path}")
    local_path.write_bytes(payload)
|
| 124 |
+
|
| 125 |
+
def upload_file_to_supabase(bucket: str, storage_path: str, local_path: Path):
    """Upload the PDF at ``local_path`` to ``storage_path`` in ``bucket``.

    The upload is sent with ``upsert`` enabled and a PDF content type.
    """
    logger.info(f"Uploading {local_path} to {storage_path}")
    content = local_path.read_bytes()
    upload_options = {
        "upsert": "true",
        "content-type": "application/pdf"
    }
    supabase.storage.from_(bucket).upload(
        path=storage_path,
        file=content,
        file_options=upload_options,
    )
|
| 137 |
+
|
| 138 |
+
def redact_request(request_id: str, bucket: str = "doc_storage"):
    """Background task: redact every seed file attached to a request.

    Looks up the ``request_files`` rows with ``file_role == "seed"`` for
    *request_id*, downloads each file, runs the redactor over it, and uploads
    the redacted result back to the same ``storage_path``.

    DB writes: one at start (``redacting``) and one at end (``redacted`` on
    success, ``failed`` on any error). The ``pending`` write is done by the
    endpoint before this task is dispatched.

    Errors are logged and recorded as the ``failed`` status rather than
    propagated to the caller.

    Args:
        request_id: Identifier of the document request to process.
        bucket: Storage bucket that holds the request's files.
    """
    try:
        logger.info(f"Redaction started for request {request_id}")
        # Write 1: mark as redacting
        set_request_status(request_id, "redacting")

        response = (
            supabase
            .from_("request_files")
            .select("id, storage_path")
            .eq("request_id", request_id)
            .eq("file_role", "seed")
            .execute()
        )

        files = response.data
        if not files:
            # NOTE(review): this "approved" write is immediately overwritten
            # by the "failed" write in the except handler below — confirm
            # which final status is intended when a request has no seed files.
            set_request_status(request_id, "approved")
            raise Exception(f"No files found for request {request_id}")

        for file in files:
            storage_path = file["storage_path"]
            local_upload = UPLOAD_DIR / f"{uuid.uuid4()}.pdf"
            local_output = OUTPUT_DIR / f"{uuid.uuid4()}_redacted.pdf"

            try:
                download_file_from_supabase(bucket, storage_path, local_upload)
                redactor.redact_document(pdf_path=str(local_upload), output_path=str(local_output))
                upload_file_to_supabase(bucket, storage_path, local_output)
            finally:
                # Always remove the local copies, even when a step fails, so
                # un-redacted documents do not linger on disk.
                cleanup_temp_files([local_upload, local_output])

        # Write 2: mark as redacted
        set_request_status(request_id, "redacted")

    except Exception as e:
        logger.error(f"Redaction failed for {request_id}: {str(e)}")
        # Write 2 (error path): mark as failed
        set_request_status(request_id, "failed")
|
| 181 |
+
|
| 182 |
+
# ----------------- Existing Endpoints -----------------
|
| 183 |
+
@app.get("/", response_model=HealthResponse)
async def root():
    """Report service health, version, and whether the redaction model is loaded."""
    model_ready = redactor.is_model_loaded()
    return HealthResponse(status="healthy", version="1.0.0", model_loaded=model_ready)
|
| 190 |
+
|
| 191 |
+
@app.get("/health", response_model=HealthResponse)
async def health_check():
    """Health probe: returns the same payload as the ``/`` endpoint."""
    payload = {
        "status": "healthy",
        "version": "1.0.0",
        "model_loaded": redactor.is_model_loaded(),
    }
    return HealthResponse(**payload)
|
| 198 |
+
|
| 199 |
+
@app.post("/redact", response_model=RedactionResponse)
async def redact_pdf(
    background_tasks: BackgroundTasks,
    file: UploadFile = File(...),
    dpi: int = 300,
    entity_types: Optional[str] = None
):
    """Redact an uploaded PDF and return the detected entities.

    The upload is saved under ``UPLOAD_DIR``, passed to
    ``redactor.redact_document``, and the redacted file is written under
    ``OUTPUT_DIR`` where ``/download/{job_id}`` can serve it. The uploaded
    copy is removed afterwards via a background task.

    Args:
        background_tasks: FastAPI task queue used to schedule upload cleanup.
        file: The uploaded PDF.
        dpi: Forwarded to ``redactor.redact_document``.
        entity_types: Optional comma-separated entity type names; each item is
            stripped and the list is forwarded as ``entity_filter``.

    Returns:
        A ``RedactionResponse`` with the job id, the entities found, and the
        download URL of the redacted file.

    Raises:
        HTTPException: 400 if the upload is not named ``*.pdf``; 500 if
            saving or redacting the file fails.
    """
    # filename can be None (no name supplied in the multipart part); treat that
    # as an unsupported upload instead of crashing, and accept ".PDF" as well.
    if not (file.filename or "").lower().endswith('.pdf'):
        raise HTTPException(status_code=400, detail="Only PDF files are supported")
    job_id = str(uuid.uuid4())
    upload_path = UPLOAD_DIR / f"{job_id}.pdf"
    output_path = OUTPUT_DIR / f"{job_id}_redacted.pdf"
    try:
        with upload_path.open("wb") as buffer:
            shutil.copyfileobj(file.file, buffer)

        entity_filter = None
        if entity_types:
            entity_filter = [et.strip() for et in entity_types.split(',')]

        # NOTE(review): redact_document is called synchronously inside this
        # async handler; if it is long-running it will block the event loop —
        # confirm whether it should be offloaded to a worker thread.
        result = redactor.redact_document(
            pdf_path=str(upload_path),
            output_path=str(output_path),
            dpi=dpi,
            entity_filter=entity_filter
        )

        response_entities = [
            RedactionEntity(
                entity_type=e['entity_type'],
                entity_text=e['entity_text'],
                # Entities with no words fall back to page 0.
                page=e['words'][0]['page'] if e['words'] else 0,
                word_count=len(e['words'])
            ) for e in result['entities']
        ]

        # Only the upload is cleaned up here; the output must stay available
        # for the download endpoint.
        background_tasks.add_task(cleanup_files, job_id)

        return RedactionResponse(
            job_id=job_id,
            status="completed",
            message=f"Successfully redacted {len(result['entities'])} entities",
            entities=response_entities,
            redacted_file_url=f"/download/{job_id}"
        )

    except Exception as e:
        logger.error(f"Error processing job {job_id}: {str(e)}")
        # Remove any partial artifacts so failed jobs leave nothing on disk.
        if upload_path.exists():
            upload_path.unlink()
        if output_path.exists():
            output_path.unlink()
        raise HTTPException(status_code=500, detail=f"Error processing PDF: {str(e)}")
|
| 252 |
+
|
| 253 |
+
@app.get("/download/{job_id}")
async def download_redacted_pdf(job_id: str):
    """Serve the redacted PDF for *job_id*, or respond 404 if it is absent.

    Raises:
        HTTPException: 404 when ``OUTPUT_DIR/<job_id>_redacted.pdf`` does not exist.
    """
    redacted_pdf = OUTPUT_DIR / f"{job_id}_redacted.pdf"
    if redacted_pdf.exists():
        return FileResponse(
            path=redacted_pdf,
            media_type="application/pdf",
            filename=f"redacted_{job_id}.pdf",
        )
    raise HTTPException(status_code=404, detail="Redacted file not found")
|
| 263 |
+
|
| 264 |
+
@app.delete("/cleanup/{job_id}")
async def cleanup_job(job_id: str):
    """Remove both the uploaded and the redacted file for *job_id*.

    Raises:
        HTTPException: 500 if deleting the redacted file fails.
    """
    try:
        # Upload side is handled by the shared helper; the output is removed here.
        cleanup_files(job_id)
        redacted_pdf = OUTPUT_DIR / f"{job_id}_redacted.pdf"
        if redacted_pdf.exists():
            redacted_pdf.unlink()
    except Exception as e:
        raise HTTPException(status_code=500, detail=f"Error cleaning up: {str(e)}")
    else:
        return {"message": f"Successfully cleaned up files for job {job_id}"}
|
| 274 |
+
|
| 275 |
+
@app.get("/stats")
async def get_stats():
    """Return counts of PDFs in the upload and output directories plus model state."""
    uploaded_pdfs = list(UPLOAD_DIR.glob("*.pdf"))
    redacted_pdfs = list(OUTPUT_DIR.glob("*.pdf"))
    stats = {
        "pending_uploads": len(uploaded_pdfs),
        "processed_files": len(redacted_pdfs),
    }
    stats["model_loaded"] = redactor.is_model_loaded()
    return stats
|
| 284 |
+
|
| 285 |
+
# ----------------- NEW Endpoints -----------------
|
| 286 |
+
@app.post("/redact_by_request/{request_id}", response_model=RedactionStatusResponse)
async def redact_by_request(request_id: str, background_tasks: BackgroundTasks):
    """Start background redaction of all files attached to a request.

    Args:
        request_id: Identifier of the document request to redact.
        background_tasks: FastAPI task queue used to dispatch ``redact_request``.

    Returns:
        A ``RedactionStatusResponse`` whose status is ``not_found`` when the
        request does not exist, ``redacting`` when a job is already running,
        or ``pending`` when a new job was dispatched.
    """
    # Check current DB status to avoid re-triggering an in-progress job
    current_status = get_request_status(request_id)

    # Do not dispatch work (or claim it started) for a request that does not
    # exist; mirror the message used by the status endpoint.
    if current_status == "not_found":
        return RedactionStatusResponse(
            request_id=request_id,
            status="not_found",
            files=[],
            message="Request not found"
        )

    if current_status == "redacting":
        return RedactionStatusResponse(
            request_id=request_id,
            status="redacting",
            files=[],
            message="Redaction already in progress"
        )

    # Write 1: set pending before dispatching background task
    set_request_status(request_id, "pending")
    background_tasks.add_task(redact_request, request_id)

    return RedactionStatusResponse(
        request_id=request_id,
        status="pending",
        files=[],
        message="Redaction started in background"
    )
|
| 309 |
+
|
| 310 |
+
@app.get("/redaction_status/{request_id}", response_model=RedactionStatusResponse)
async def get_redaction_status(request_id: str):
    """Report the redaction status of a request.

    When the status is ``redacted``, the response also lists the public URLs
    of the request's seed files; otherwise the file list is empty.

    Args:
        request_id: Identifier of the document request to look up.
    """
    current_status = get_request_status(request_id)
    file_urls: List[str] = []

    if current_status == "redacted":
        query = (
            supabase
            .from_("request_files")
            .select("storage_path")
            .eq("file_role","seed")
            .eq("request_id", request_id)
        )
        rows = query.execute().data
        if rows:
            file_urls = [get_public_url("doc_storage", row["storage_path"]) for row in rows]

    # Unknown statuses fall back to the raw status string as the message.
    status_messages = {
        "redacted": "Redaction completed",
        "pending": "Redaction pending",
        "redacting": "Redaction in progress",
        "failed": "Redaction failed",
        "not_found": "Request not found",
    }
    status_message = status_messages.get(current_status, current_status)

    return RedactionStatusResponse(
        request_id=request_id,
        status=current_status,
        files=file_urls,
        message=status_message,
    )
|
outputs/.gitkeep
ADDED
|
File without changes
|
requirements.txt
ADDED
|
@@ -0,0 +1,14 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
fastapi==0.109.0
|
| 2 |
+
uvicorn[standard]==0.27.0
|
| 3 |
+
python-multipart==0.0.6
|
| 4 |
+
transformers>=4.45,<5.0
|
| 5 |
+
accelerate>=0.30
|
| 6 |
+
torch==2.2.2
|
| 7 |
+
pypdf==4.0.1
|
| 8 |
+
pdf2image==1.17.0
|
| 9 |
+
pytesseract==0.3.10
|
| 10 |
+
Pillow==10.2.0
|
| 11 |
+
pydantic==2.5.3
|
| 12 |
+
python-dotenv==1.0.0
|
| 13 |
+
supabase
|
| 14 |
+
numpy==1.26.4
|
uploads/.gitkeep
ADDED
|
File without changes
|