Sammi1211 committed on
Commit
955cf91
·
1 Parent(s): d5d3492

Resolving conflicts

Browse files
.dockerignore ADDED
@@ -0,0 +1,28 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ __pycache__
2
+ *.pyc
3
+ *.pyo
4
+ *.pyd
5
+ .Python
6
+ *.so
7
+ *.egg
8
+ *.egg-info
9
+ dist
10
+ build
11
+ .git
12
+ .gitignore
13
+ .env
14
+ .venv
15
+ venv/
16
+ env/
17
+ *.log
18
+ .DS_Store
19
+ .pytest_cache
20
+ .coverage
21
+ htmlcov/
22
+ uploads/*
23
+ outputs/*
24
+ !uploads/.gitkeep
25
+ !outputs/.gitkeep
26
+ *.pdf
27
+ README.md
28
+ .github/
.github/workflows/ci-cd.yml ADDED
@@ -0,0 +1,82 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ name: CI/CD Pipeline
2
+
3
+ on:
4
+ push:
5
+ branches: [ main, develop ]
6
+ pull_request:
7
+ branches: [ main ]
8
+
9
+ jobs:
10
+ test:
11
+ runs-on: ubuntu-latest
12
+
13
+ steps:
14
+ - uses: actions/checkout@v3
15
+
16
+ - name: Set up Python
17
+ uses: actions/setup-python@v4
18
+ with:
19
+ python-version: '3.10'
20
+
21
+ - name: Install system dependencies
22
+ run: |
23
+ sudo apt-get update
24
+ sudo apt-get install -y tesseract-ocr poppler-utils
25
+
26
+ - name: Install Python dependencies
27
+ run: |
28
+ python -m pip install --upgrade pip
29
+ pip install -r requirements.txt
30
+ pip install pytest pytest-cov httpx
31
+
32
+ - name: Run tests
33
+ run: |
34
+ pytest tests/ -v --cov=app --cov-report=xml
35
+
36
+ - name: Upload coverage
37
+ uses: codecov/codecov-action@v3
38
+ with:
39
+ file: ./coverage.xml
40
+ fail_ci_if_error: false
41
+
42
+ docker-build:
43
+ runs-on: ubuntu-latest
44
+ needs: test
45
+
46
+ steps:
47
+ - uses: actions/checkout@v3
48
+
49
+ - name: Set up Docker Buildx
50
+ uses: docker/setup-buildx-action@v2
51
+
52
+ - name: Build Docker image
53
+ run: |
54
+ docker build -t pdf-redaction-api:test .
55
+
56
+ - name: Test Docker image
57
+ run: |
58
+ docker run -d -p 7860:7860 --name test-api pdf-redaction-api:test
59
+ sleep 10
60
+ curl -f http://localhost:7860/health || exit 1
61
+ docker stop test-api
62
+
63
+ deploy-huggingface:
64
+ runs-on: ubuntu-latest
65
+ needs: [test, docker-build]
66
+ if: github.ref == 'refs/heads/main'
67
+
68
+ steps:
69
+ - uses: actions/checkout@v3
70
+
71
+ - name: Deploy to HuggingFace Spaces
72
+ env:
73
+ HF_TOKEN: ${{ secrets.HF_TOKEN }}
74
+ run: |
75
+ git config --global user.email "github-actions@github.com"
76
+ git config --global user.name "GitHub Actions"
77
+
78
+ # Add HuggingFace remote if it doesn't exist
79
+ git remote add hf https://user:$HF_TOKEN@huggingface.co/spaces/${{ secrets.HF_SPACE }} || true
80
+
81
+ # Push to HuggingFace
82
+ git push hf main:main
.gitignore ADDED
@@ -0,0 +1,59 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Python
2
+ __pycache__/
3
+ *.py[cod]
4
+ *$py.class
5
+ *.so
6
+ .Python
7
+ build/
8
+ develop-eggs/
9
+ dist/
10
+ downloads/
11
+ eggs/
12
+ .eggs/
13
+ lib/
14
+ lib64/
15
+ parts/
16
+ sdist/
17
+ var/
18
+ wheels/
19
+ *.egg-info/
20
+ .installed.cfg
21
+ *.egg
22
+
23
+ # Virtual environments
24
+ redact/
25
+ venv/
26
+ env/
27
+ ENV/
28
+ .venv
29
+
30
+ # IDE
31
+ .vscode/
32
+ .idea/
33
+ *.swp
34
+ *.swo
35
+ *~
36
+
37
+ # OS
38
+ .DS_Store
39
+ Thumbs.db
40
+
41
+ # Project specific
42
+ uploads/*.pdf
43
+ outputs/*.pdf
44
+ *.log
45
+
46
+ # Environment
47
+ .env
48
+ .env.local
49
+
50
+ # Testing
51
+ .pytest_cache/
52
+ .coverage
53
+ htmlcov/
54
+
55
+ # Model cache
56
+ cache/
57
+ models/
58
+
59
+ tests
COMPLETE_GUIDE.md ADDED
@@ -0,0 +1,488 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # 🚀 Complete FastAPI Deployment Package
2
+
3
+ ## 📦 What You've Got
4
+
5
+ A production-ready FastAPI application for PDF redaction with Named Entity Recognition, ready to deploy on HuggingFace Spaces or any cloud platform.
6
+
7
+ ---
8
+
9
+ ## 📁 Directory Structure
10
+
11
+ ```
12
+ pdf-redaction-api/
13
+
14
+ ├── 📄 main.py # FastAPI application
15
+ ├── 🐳 Dockerfile # Production container
16
+ ├── 🐳 docker-compose.yml # Local development
17
+ ├── 📋 requirements.txt # Python dependencies
18
+
19
+ ├── 📱 app/
20
+ │ ├── __init__.py
21
+ │ └── redaction.py # Core redaction engine
22
+
23
+ ├── 📂 uploads/ # Temporary uploads
24
+ │ └── .gitkeep
25
+
26
+ ├── 📂 outputs/ # Redacted PDFs
27
+ │ └── .gitkeep
28
+
29
+ ├── 🧪 tests/
30
+ │ └── test_api.py # API tests
31
+
32
+ ├── 📚 Documentation/
33
+ │ ├── README.md # Main docs (for HF Spaces)
34
+ │ ├── DEPLOYMENT.md # Deployment guide
35
+ │ ├── QUICKSTART.md # Quick start guide
36
+ │ └── STRUCTURE.md # Project structure
37
+
38
+ ├── 🔧 Configuration/
39
+ │ ├── .env.example # Environment variables
40
+ │ ├── .gitignore # Git ignore
41
+ │ └── .dockerignore # Docker ignore
42
+
43
+ ├── 🤖 .github/
44
+ │ └── workflows/
45
+ │ └── ci-cd.yml # GitHub Actions CI/CD
46
+
47
+ ├── 📝 client_example.py # Example API client
48
+ └── 📜 LICENSE # MIT License
49
+ ```
50
+
51
+ ---
52
+
53
+ ## ✨ Features
54
+
55
+ ### Core Functionality
56
+ ✅ PDF upload and processing
57
+ ✅ OCR with pytesseract (configurable DPI)
58
+ ✅ Named Entity Recognition (NER)
59
+ ✅ Accurate coordinate-based redaction
60
+ ✅ Multiple entity type support
61
+ ✅ Downloadable redacted PDFs
62
+
63
+ ### API Features
64
+ ✅ RESTful API with FastAPI
65
+ ✅ Automatic OpenAPI documentation
66
+ ✅ File upload handling
67
+ ✅ Background task cleanup
68
+ ✅ Health checks
69
+ ✅ Statistics endpoint
70
+ ✅ CORS support
71
+
72
+ ### DevOps
73
+ ✅ Docker containerization
74
+ ✅ Docker Compose for local dev
75
+ ✅ GitHub Actions CI/CD
76
+ ✅ HuggingFace Spaces ready
77
+ ✅ Comprehensive testing
78
+ ✅ Logging and monitoring
79
+
80
+ ---
81
+
82
+ ## 🎯 Quick Deployment Paths
83
+
84
+ ### Option 1: HuggingFace Spaces (Recommended for Demo)
85
+
86
+ **Time: 10 minutes**
87
+
88
+ ```bash
89
+ # 1. Create Space on HuggingFace (select Docker SDK)
90
+ # 2. Clone your space
91
+ git clone https://huggingface.co/spaces/YOUR_USERNAME/pdf-redaction-api
92
+ cd pdf-redaction-api
93
+
94
+ # 3. Copy all files (note: `*` skips dotfiles such as .gitignore, .dockerignore and .github/ - copy those separately)
95
+ cp -r /path/to/pdf-redaction-api/* .
96
+
97
+ # 4. Deploy
98
+ git add .
99
+ git commit -m "Initial deployment"
100
+ git push
101
+ ```
102
+
103
+ **Your API will be at:** `https://YOUR_USERNAME-pdf-redaction-api.hf.space`
104
+
105
+ **Cost:** FREE (with CPU Basic tier)
106
+
107
+ ---
108
+
109
+ ### Option 2: Docker Locally
110
+
111
+ **Time: 5 minutes**
112
+
113
+ ```bash
114
+ # Build
115
+ docker build -t pdf-redaction-api .
116
+
117
+ # Run
118
+ docker run -p 7860:7860 pdf-redaction-api
119
+
120
+ # Test
121
+ curl http://localhost:7860/health
122
+ ```
123
+
124
+ ---
125
+
126
+ ### Option 3: Direct Python
127
+
128
+ **Time: 3 minutes**
129
+
130
+ ```bash
131
+ # Install dependencies
132
+ sudo apt-get install tesseract-ocr poppler-utils
133
+ pip install -r requirements.txt
134
+
135
+ # Run
136
+ python main.py
137
+
138
+ # Access at http://localhost:7860
139
+ ```
140
+
141
+ ---
142
+
143
+ ## 🔌 API Endpoints
144
+
145
+ ### Core Endpoints
146
+
147
+ | Method | Endpoint | Description |
148
+ |--------|----------|-------------|
149
+ | POST | `/redact` | Upload and redact PDF |
150
+ | GET | `/download/{job_id}` | Download redacted PDF |
151
+ | GET | `/health` | Health check |
152
+ | GET | `/stats` | API statistics |
153
+ | DELETE | `/cleanup/{job_id}` | Manual cleanup |
154
+ | GET | `/docs` | Interactive API docs |
155
+
156
+ ### Example Usage
157
+
158
+ **cURL:**
159
+ ```bash
160
+ curl -X POST "http://localhost:7860/redact" \
161
+ -F "file=@document.pdf" \
162
+ -F "dpi=300"
163
+ ```
164
+
165
+ **Python:**
166
+ ```python
167
+ import requests
168
+
169
+ response = requests.post(
170
+ "http://localhost:7860/redact",
171
+ files={"file": open("document.pdf", "rb")},
172
+ params={"dpi": 300}
173
+ )
174
+
175
+ job_id = response.json()["job_id"]
176
+ redacted = requests.get(f"http://localhost:7860/download/{job_id}")
177
+ ```
178
+
179
+ ---
180
+
181
+ ## 🎨 Architecture
182
+
183
+ ```
184
+ ┌─────────────────────────────────────────────────────────┐
185
+ │ CLIENT REQUEST │
186
+ │ (Upload PDF via POST /redact) │
187
+ └─────────────────────────────────────────────────────────┘
188
+
189
+ ┌─────────────────────────────────────────────────────────┐
190
+ │ FASTAPI (main.py) │
191
+ │ • Validate file │
192
+ │ • Generate job_id │
193
+ │ • Save to uploads/ │
194
+ └─────────────────────────────────────────────────────────┘
195
+
196
+ ┌─────────────────────────────────────────────────────────┐
197
+ │ PDFRedactor (app/redaction.py) │
198
+ │ │
199
+ │ ┌─────────────────────────────────────────┐ │
200
+ │ │ 1. OCR (pytesseract) │ │
201
+ │ │ • Convert PDF → Images (pdf2image) │ │
202
+ │ │ • Extract text + bounding boxes │ │
203
+ │ │ • Store image dimensions │ │
204
+ │ └─────────────────────────────────────────┘ │
205
+ │ ↓ │
206
+ │ ┌─────────────────────────────────────────┐ │
207
+ │ │ 2. NER (HuggingFace Transformers) │ │
208
+ │ │ • Load model │ │
209
+ │ │ • Identify entities in text │ │
210
+ │ │ • Return entity types + positions │ │
211
+ │ └─────────────────────────────────────────┘ │
212
+ │ ↓ │
213
+ │ ┌─────────────────────────────────────────┐ │
214
+ │ │ 3. Mapping │ │
215
+ │ │ • Create character span index │ │
216
+ │ │ • Match NER entities to OCR boxes │ │
217
+ │ └─────────────────────────────────────────┘ │
218
+ │ ↓ │
219
+ │ ┌─────────────────────────────────────────┐ │
220
+ │ │ 4. Redaction (pypdf) │ │
221
+ │ │ • Scale image coords → PDF coords │ │
222
+ │ │ • Create black rectangle annotations │ │
223
+ │ │ • Write redacted PDF │ │
224
+ │ └─────────────────────────────────────────┘ │
225
+ └─────────────────────────────────────────────────────────┘
226
+
227
+ ┌─────────────────────────────────────────────────────────┐
228
+ │ RESPONSE │
229
+ │ • job_id │
230
+ │ • List of entities │
231
+ │ • Download URL │
232
+ └─────────────────────────────────────────────────────────┘
233
+ ```
234
+
235
+ ---
236
+
237
+ ## 🔐 Security Considerations
238
+
239
+ ### Current Implementation
240
+ - ✅ File validation (PDF only)
241
+ - ✅ Temporary file cleanup
242
+ - ✅ CORS middleware
243
+ - ✅ Error handling
244
+
245
+ ### For Production (TODO)
246
+ - ⚠️ Add API key authentication
247
+ - ⚠️ Implement rate limiting
248
+ - ⚠️ Add file size limits
249
+ - ⚠️ Use HTTPS only
250
+ - ⚠️ Implement user quotas
251
+ - ⚠️ Add input sanitization
252
+
253
+ **Example API Key Auth:**
254
+ ```python
255
+ # Add to main.py
256
+ from fastapi import Security, HTTPException
257
+ from fastapi.security import APIKeyHeader
258
+
259
+ API_KEY = "your-secret-key"
260
+ api_key_header = APIKeyHeader(name="X-API-Key")
261
+
262
+ def verify_api_key(key: str = Security(api_key_header)):
263
+ if key != API_KEY:
264
+ raise HTTPException(401, "Invalid API Key")
265
+ ```
266
+
267
+ ---
268
+
269
+ ## 📊 Performance Tuning
270
+
271
+ ### DPI Settings
272
+
273
+ | DPI | Quality | Speed | Use Case |
274
+ |-----|---------|-------|----------|
275
+ | 150 | Low | Fast | Quick previews |
276
+ | 200 | Medium | Medium | General use |
277
+ | 300 | High | Slow | **Recommended** |
278
+ | 600 | Very High | Very Slow | Critical documents |
279
+
280
+ ### Hardware Requirements
281
+
282
+ **Minimum (Free Tier):**
283
+ - CPU: 2 cores
284
+ - RAM: 2GB
285
+ - Storage: 1GB
286
+
287
+ **Recommended (Production):**
288
+ - CPU: 4+ cores
289
+ - RAM: 8GB
290
+ - Storage: 10GB
291
+ - GPU: Optional (speeds up NER)
292
+
293
+ ---
294
+
295
+ ## 🧪 Testing
296
+
297
+ ```bash
298
+ # Install test dependencies
299
+ pip install pytest pytest-cov httpx
300
+
301
+ # Run tests
302
+ pytest tests/ -v
303
+
304
+ # With coverage
305
+ pytest tests/ --cov=app --cov-report=html
306
+
307
+ # View coverage report
308
+ open htmlcov/index.html
309
+ ```
310
+
311
+ ---
312
+
313
+ ## 📈 Monitoring
314
+
315
+ ### Built-in Endpoints
316
+
317
+ **Health Check:**
318
+ ```bash
319
+ curl http://localhost:7860/health
320
+ ```
321
+
322
+ **Statistics:**
323
+ ```bash
324
+ curl http://localhost:7860/stats
325
+ ```
326
+
327
+ ### Logs
328
+
329
+ **Development:**
330
+ ```bash
331
+ python main.py
332
+ # Logs appear in console
333
+ ```
334
+
335
+ **Docker:**
336
+ ```bash
337
+ docker logs -f container_name
338
+ ```
339
+
340
+ **HuggingFace Spaces:**
341
+ - View in Space dashboard → Logs tab
342
+
343
+ ---
344
+
345
+ ## 💰 Cost Estimation
346
+
347
+ ### HuggingFace Spaces
348
+
349
+ | Tier | CPU | RAM | Price | Use Case |
350
+ |------|-----|-----|-------|----------|
351
+ | Basic | 2 | 16GB | **FREE** | Demo, testing |
352
+ | CPU Upgrade | 4 | 32GB | $0.50/hr | Production |
353
+ | GPU T4 | - | - | $0.60/hr | Heavy load |
354
+ | GPU A10G | - | - | $1.50/hr | Enterprise |
355
+
356
+ **Monthly Costs (if always on):**
357
+ - Free: $0
358
+ - CPU Upgrade: ~$360/month
359
+ - GPU T4: ~$432/month
360
+
361
+ **Recommendation:** Start free, upgrade based on usage
362
+
363
+ ### Alternatives
364
+
365
+ **AWS ECS Fargate:** ~$30-100/month
366
+ **Google Cloud Run:** Pay per request (~$10-50/month)
367
+ **DigitalOcean App:** $12-24/month
368
+ **Self-hosted VPS:** $5-20/month
369
+
370
+ ---
371
+
372
+ ## 🔄 CI/CD Pipeline
373
+
374
+ ### Automated with GitHub Actions
375
+
376
+ ```
377
+ Push to GitHub
378
+
379
+ [Run Tests]
380
+
381
+ [Build Docker]
382
+
383
+ [Test Container]
384
+
385
+ [Deploy to HuggingFace]
386
+ ```
387
+
388
+ **Setup:**
389
+ 1. Add secrets in GitHub repo settings:
390
+ - `HF_TOKEN`: HuggingFace access token
391
+ - `HF_SPACE`: Your space name (username/space-name)
392
+
393
+ 2. Push to main branch → Auto-deploy! ✨
394
+
395
+ ---
396
+
397
+ ## 📚 Documentation Access
398
+
399
+ | Document | Purpose |
400
+ |----------|---------|
401
+ | `README.md` | Overview, API docs, usage examples |
402
+ | `QUICKSTART.md` | 5-minute setup guide |
403
+ | `DEPLOYMENT.md` | Production deployment |
404
+ | `STRUCTURE.md` | Code organization |
405
+ | `/docs` endpoint | Interactive API documentation |
406
+
407
+ ---
408
+
409
+ ## 🎓 Learning Resources
410
+
411
+ ### FastAPI
412
+ - Docs: https://fastapi.tiangolo.com
413
+ - Tutorial: https://fastapi.tiangolo.com/tutorial
414
+
415
+ ### HuggingFace
416
+ - Spaces: https://huggingface.co/docs/hub/spaces
417
+ - Transformers: https://huggingface.co/docs/transformers
418
+
419
+ ### Docker
420
+ - Getting Started: https://docs.docker.com/get-started
421
+
422
+ ---
423
+
424
+ ## 🐛 Troubleshooting
425
+
426
+ ### Common Issues
427
+
428
+ **Problem:** "Tesseract not found"
429
+ **Solution:** `apt-get install tesseract-ocr`
430
+
431
+ **Problem:** "Poppler not found"
432
+ **Solution:** `apt-get install poppler-utils`
433
+
434
+ **Problem:** Slow processing
435
+ **Solution:** Lower DPI to 150-200
436
+
437
+ **Problem:** Out of memory
438
+ **Solution:** Upgrade hardware or reduce DPI
439
+
440
+ **Problem:** Model not loading
441
+ **Solution:** Check internet, wait for download
442
+
443
+ ### Debug Mode
444
+
445
+ ```python
446
+ # In main.py, add debug mode
447
+ if __name__ == "__main__":
448
+ uvicorn.run("main:app", host="0.0.0.0", port=7860, reload=True, log_level="debug")
449
+ ```
450
+
451
+ ---
452
+
453
+ ## ✅ Checklist for Production
454
+
455
+ - [ ] Test all endpoints thoroughly
456
+ - [ ] Add API key authentication
457
+ - [ ] Implement rate limiting
458
+ - [ ] Set up monitoring (Sentry, DataDog, etc.)
459
+ - [ ] Configure auto-scaling
460
+ - [ ] Set up backups
461
+ - [ ] Add usage analytics
462
+ - [ ] Create user documentation
463
+ - [ ] Set up SSL/TLS (HF provides by default)
464
+ - [ ] Test with large files
465
+ - [ ] Load testing
466
+ - [ ] Security audit
467
+ - [ ] Legal compliance (GDPR, etc.)
468
+
469
+ ---
470
+
471
+ ## 🎉 You're Ready!
472
+
473
+ Your FastAPI PDF Redaction application is complete and ready to deploy!
474
+
475
+ ### Next Steps:
476
+ 1. ✨ Deploy to HuggingFace Spaces (easiest)
477
+ 2. 🧪 Test with real PDFs
478
+ 3. 📊 Monitor usage
479
+ 4. 🔒 Add security for production
480
+ 5. 🚀 Scale as needed
481
+
482
+ ### Support:
483
+ - 📖 Read the documentation
484
+ - 🐛 Check troubleshooting guide
485
+ - 💬 HuggingFace community forums
486
+ - 📧 Create issues on your repo
487
+
488
+ **Happy Deploying! 🚀**
DEPLOYMENT.md ADDED
@@ -0,0 +1,298 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Deployment Guide for HuggingFace Spaces
2
+
3
+ ## Prerequisites
4
+
5
+ 1. **HuggingFace Account**: Sign up at https://huggingface.co/
6
+ 2. **Git**: Installed on your local machine
7
+ 3. **Git LFS**: For large file storage (optional)
8
+
9
+ ## Step-by-Step Deployment
10
+
11
+ ### 1. Create a New Space
12
+
13
+ 1. Go to https://huggingface.co/spaces
14
+ 2. Click "Create new Space"
15
+ 3. Fill in the details:
16
+ - **Space name**: `pdf-redaction-api` (or your preferred name)
17
+ - **License**: MIT
18
+ - **SDK**: Docker
19
+ - **Hardware**: CPU Basic (free tier) or upgrade if needed
20
+ 4. Click "Create Space"
21
+
22
+ ### 2. Clone Your Space Repository
23
+
24
+ ```bash
25
+ git clone https://huggingface.co/spaces/YOUR_USERNAME/pdf-redaction-api
26
+ cd pdf-redaction-api
27
+ ```
28
+
29
+ ### 3. Copy All Files to the Repository
30
+
31
+ Copy all files from this project to your cloned space:
32
+
33
+ ```bash
34
+ # Copy all files (note: `*` skips dotfiles such as .gitignore and .dockerignore - copy those separately)
35
+ cp -r /path/to/pdf-redaction-api/* .
36
+
37
+ # Check the files
38
+ ls -la
39
+ ```
40
+
41
+ You should see:
42
+ - `main.py`
43
+ - `app/`
44
+ - `Dockerfile`
45
+ - `requirements.txt`
46
+ - `README.md`
47
+ - `.gitignore`
48
+ - `.dockerignore`
49
+ - `uploads/` (with .gitkeep)
50
+ - `outputs/` (with .gitkeep)
51
+
52
+ ### 4. Commit and Push
53
+
54
+ ```bash
55
+ # Add all files
56
+ git add .
57
+
58
+ # Commit
59
+ git commit -m "Initial deployment of PDF Redaction API"
60
+
61
+ # Push to HuggingFace
62
+ git push
63
+ ```
64
+
65
+ ### 5. Monitor Deployment
66
+
67
+ 1. Go to your Space URL: `https://huggingface.co/spaces/YOUR_USERNAME/pdf-redaction-api`
68
+ 2. You'll see the build logs
69
+ 3. Wait for the build to complete (usually 5-10 minutes)
70
+ 4. Once complete, your API will be live!
71
+
72
+ ### 6. Test Your Deployment
73
+
74
+ ```bash
75
+ # Check health
76
+ curl https://YOUR_USERNAME-pdf-redaction-api.hf.space/health
77
+
78
+ # Test with a PDF
79
+ curl -X POST "https://YOUR_USERNAME-pdf-redaction-api.hf.space/redact" \
80
+ -F "file=@test.pdf" \
81
+ -F "dpi=300"
82
+ ```
83
+
84
+ ## Configuration Options
85
+
86
+ ### Hardware Upgrades
87
+
88
+ For better performance, consider upgrading your Space hardware:
89
+
90
+ 1. Go to Space Settings
91
+ 2. Click on "Hardware"
92
+ 3. Choose:
93
+ - **CPU Basic** (Free): Good for testing, slower processing
94
+ - **CPU Upgrade** (~$0.50/hour): Faster processing
95
+ - **GPU** (~$0.60-3/hour): Best for large documents
96
+
97
+ ### Environment Variables
98
+
99
+ Add environment variables in Space Settings if needed:
100
+
101
+ ```bash
102
+ HF_HOME=/app/cache
103
+ PYTHONUNBUFFERED=1
104
+ ```
105
+
106
+ ### Persistent Storage
107
+
108
+ For persistent file storage:
109
+
110
+ 1. Go to Space Settings
111
+ 2. Enable "Persistent Storage"
112
+ 3. This keeps uploaded/processed files between restarts
113
+
114
+ ## Custom Domain (Optional)
115
+
116
+ To use a custom domain:
117
+
118
+ 1. Go to Space Settings
119
+ 2. Click "Domains"
120
+ 3. Add your custom domain
121
+ 4. Follow DNS configuration instructions
122
+
123
+ ## Monitoring and Logs
124
+
125
+ ### View Logs
126
+
127
+ 1. Go to your Space page
128
+ 2. Click on "Logs" tab
129
+ 3. Monitor real-time logs
130
+
131
+ ### Check Resource Usage
132
+
133
+ 1. Click on "Insights" tab
134
+ 2. View CPU/Memory usage
135
+ 3. Monitor request patterns
136
+
137
+ ## Security Considerations
138
+
139
+ ### For Production Use
140
+
141
+ 1. **Add Authentication**:
142
+ - Implement API key authentication
143
+ - Use OAuth2 for user management
144
+
145
+ 2. **Rate Limiting**:
146
+ - Add rate limiting to prevent abuse
147
+ - Use slowapi or similar libraries
148
+
149
+ 3. **File Size Limits**:
150
+ - Restrict upload file sizes
151
+ - Implement timeout for long-running requests
152
+
153
+ 4. **HTTPS Only**:
154
+ - HuggingFace Spaces provides HTTPS by default
155
+ - Ensure all requests use HTTPS
156
+
157
+ Example with API key authentication:
158
+
159
+ ```python
160
+ from fastapi import Security, HTTPException, status
161
+ from fastapi.security import APIKeyHeader
162
+
163
+ API_KEY = "your-secret-key"
164
+ api_key_header = APIKeyHeader(name="X-API-Key")
165
+
166
+ def verify_api_key(api_key: str = Security(api_key_header)):
167
+ if api_key != API_KEY:
168
+ raise HTTPException(
169
+ status_code=status.HTTP_401_UNAUTHORIZED,
170
+ detail="Invalid API Key"
171
+ )
172
+ return api_key
173
+
174
+ # Add to endpoints
175
+ @app.post("/redact")
176
+ async def redact_pdf(
177
+ file: UploadFile = File(...),
178
+ api_key: str = Security(verify_api_key)
179
+ ):
180
+ # Your code here
181
+ ```
182
+
183
+ ## Troubleshooting
184
+
185
+ ### Build Fails
186
+
187
+ **Problem**: Docker build fails
188
+
189
+ **Solution**:
190
+ - Check Dockerfile syntax
191
+ - Ensure all dependencies are in requirements.txt
192
+ - Review build logs for specific errors
193
+
194
+ ### Out of Memory
195
+
196
+ **Problem**: API crashes with OOM errors
197
+
198
+ **Solution**:
199
+ - Reduce default DPI to 200
200
+ - Upgrade to larger hardware
201
+ - Implement request queuing
202
+
203
+ ### Slow Processing
204
+
205
+ **Problem**: Redaction takes too long
206
+
207
+ **Solution**:
208
+ - Lower DPI (150-200 for faster processing)
209
+ - Upgrade to GPU hardware
210
+ - Optimize batch processing
211
+
212
+ ### Model Download Issues
213
+
214
+ **Problem**: Model fails to download
215
+
216
+ **Solution**:
217
+ - Check HuggingFace model availability
218
+ - Verify internet access in Space
219
+ - Pre-download model and include in Docker image
220
+
221
+ ## Updating Your Space
222
+
223
+ To update your deployed API:
224
+
225
+ ```bash
226
+ # Make changes locally
227
+ # Test changes
228
+
229
+ # Commit and push
230
+ git add .
231
+ git commit -m "Update: description of changes"
232
+ git push
233
+
234
+ # HuggingFace will automatically rebuild
235
+ ```
236
+
237
+ ## Cost Estimation
238
+
239
+ ### Free Tier
240
+ - CPU Basic
241
+ - Limited to 2 CPU cores
242
+ - 16GB RAM
243
+ - Good for: Testing, low-traffic demos
244
+
245
+ ### Paid Tiers
246
+ - CPU Upgrade: ~$0.50/hour (~$360/month if always on)
247
+ - GPU T4: ~$0.60/hour (~$432/month)
248
+ - GPU A10G: ~$1.50/hour (~$1,080/month)
249
+
250
+ **Recommendation**: Start with free tier, upgrade based on usage
251
+
252
+ ## Alternative Deployment Options
253
+
254
+ ### 1. Deploy on Your Own Server
255
+
256
+ ```bash
257
+ # Build Docker image
258
+ docker build -t pdf-redaction-api .
259
+
260
+ # Run container
261
+ docker run -p 7860:7860 pdf-redaction-api
262
+ ```
263
+
264
+ ### 2. Deploy on Cloud Platforms
265
+
266
+ - **AWS ECS/Fargate**: For scalable production
267
+ - **Google Cloud Run**: Serverless container deployment
268
+ - **Azure Container Instances**: Easy container deployment
269
+ - **DigitalOcean App Platform**: Simple PaaS deployment
270
+
271
+ ### 3. Deploy on Render.com
272
+
273
+ 1. Connect your GitHub repo
274
+ 2. Select "Docker" as environment
275
+ 3. Deploy automatically
276
+
277
+ ## Support
278
+
279
+ For issues:
280
+ 1. Check HuggingFace Spaces documentation
281
+ 2. Review logs in Space dashboard
282
+ 3. Test locally with Docker first
283
+ 4. Open issue on your repository
284
+
285
+ ## Next Steps
286
+
287
+ After successful deployment:
288
+
289
+ 1. ✅ Test all API endpoints
290
+ 2. ✅ Set up monitoring
291
+ 3. ✅ Configure custom domain (optional)
292
+ 4. ✅ Add authentication for production
293
+ 5. ✅ Implement rate limiting
294
+ 6. ✅ Set up error tracking (e.g., Sentry)
295
+ 7. ✅ Create API documentation with examples
296
+ 8. ✅ Add usage analytics
297
+
298
+ Your API is now live and ready to use! 🚀
Dockerfile ADDED
@@ -0,0 +1,36 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ FROM python:3.12-slim
2
+
3
+ # Set working directory
4
+ WORKDIR /app
5
+
6
+ # Install system dependencies
7
+ RUN apt-get update && apt-get install -y \
8
+ tesseract-ocr \
9
+ tesseract-ocr-eng \
10
+ poppler-utils \
11
+ libgl1 \
12
+ libglib2.0-0 \
13
+ && rm -rf /var/lib/apt/lists/*
14
+
15
+ # Copy requirements first for better caching
16
+ COPY requirements.txt .
17
+
18
+ # Install Python dependencies
19
+ RUN pip install --no-cache-dir -r requirements.txt
20
+
21
+ # Copy application code
22
+ COPY . .
23
+
24
+ # Create necessary directories
25
+ RUN mkdir -p uploads outputs
26
+
27
+ # Expose port (HuggingFace Spaces uses 7860)
28
+ EXPOSE 7860
29
+
30
+ # Set environment variables
31
+ ENV PYTHONUNBUFFERED=1
32
+ ENV HF_HOME=/app/cache
33
+
34
+ # Run the application
35
+ CMD ["uvicorn", "main:app", "--host", "0.0.0.0", "--port", "7860"]
36
+
LICENSE ADDED
@@ -0,0 +1,21 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ MIT License
2
+
3
+ Copyright (c) 2025 PDF Redaction API
4
+
5
+ Permission is hereby granted, free of charge, to any person obtaining a copy
6
+ of this software and associated documentation files (the "Software"), to deal
7
+ in the Software without restriction, including without limitation the rights
8
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9
+ copies of the Software, and to permit persons to whom the Software is
10
+ furnished to do so, subject to the following conditions:
11
+
12
+ The above copyright notice and this permission notice shall be included in all
13
+ copies or substantial portions of the Software.
14
+
15
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21
+ SOFTWARE.
QUICKSTART.md ADDED
@@ -0,0 +1,271 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Quick Start Guide 🚀
2
+
3
+ ## Local Development (5 minutes)
4
+
5
+ ### 1. Install System Dependencies
6
+
7
+ **Ubuntu/Debian:**
8
+ ```bash
9
+ sudo apt-get update
10
+ sudo apt-get install -y tesseract-ocr poppler-utils
11
+ ```
12
+
13
+ **macOS:**
14
+ ```bash
15
+ brew install tesseract poppler
16
+ ```
17
+
18
+ **Windows:**
19
+ - Download Tesseract: https://github.com/UB-Mannheim/tesseract/wiki
20
+ - Download Poppler: https://github.com/oschwartz10612/poppler-windows/releases
21
+
22
+ ### 2. Install Python Dependencies
23
+
24
+ ```bash
25
+ pip install -r requirements.txt
26
+ ```
27
+
28
+ ### 3. Run the Server
29
+
30
+ ```bash
31
+ python main.py
32
+ ```
33
+
34
+ The API will be available at: `http://localhost:7860`
35
+
36
+ ### 4. Test with cURL
37
+
38
+ ```bash
39
+ # Health check
40
+ curl http://localhost:7860/health
41
+
42
+ # Redact a PDF
43
+ curl -X POST "http://localhost:7860/redact" \
44
+ -F "file=@your_document.pdf" \
45
+ -F "dpi=300"
46
+ ```
47
+
48
+ ### 5. Access API Documentation
49
+
50
+ Open in browser: `http://localhost:7860/docs`
51
+
52
+ ## Using Docker (3 minutes)
53
+
54
+ ### 1. Build Image
55
+
56
+ ```bash
57
+ docker build -t pdf-redaction-api .
58
+ ```
59
+
60
+ ### 2. Run Container
61
+
62
+ ```bash
63
+ docker run -p 7860:7860 pdf-redaction-api
64
+ ```
65
+
66
+ ### 3. Test
67
+
68
+ ```bash
69
+ curl http://localhost:7860/health
70
+ ```
71
+
72
+ ## Deploy to HuggingFace Spaces (10 minutes)
73
+
74
+ ### 1. Create Space
75
+
76
+ 1. Go to https://huggingface.co/spaces
77
+ 2. Click "Create new Space"
78
+ 3. Name: `pdf-redaction-api`
79
+ 4. SDK: **Docker**
80
+ 5. Click "Create Space"
81
+
82
+ ### 2. Push Code
83
+
84
+ ```bash
85
+ # Clone your space
86
+ git clone https://huggingface.co/spaces/YOUR_USERNAME/pdf-redaction-api
87
+ cd pdf-redaction-api
88
+
89
+ # Copy all project files (note: `*` skips dotfiles such as .gitignore and .dockerignore - copy those separately)
90
+ cp -r /path/to/project/* .
91
+
92
+ # Commit and push
93
+ git add .
94
+ git commit -m "Initial deployment"
95
+ git push
96
+ ```
97
+
98
+ ### 3. Wait for Build
99
+
100
+ Monitor at: `https://huggingface.co/spaces/YOUR_USERNAME/pdf-redaction-api`
101
+
102
+ ### 4. Test Your Deployed API
103
+
104
+ ```bash
105
+ curl https://YOUR_USERNAME-pdf-redaction-api.hf.space/health
106
+ ```
107
+
108
+ ## Example Usage
109
+
110
+ ### Python Client
111
+
112
+ ```python
113
+ import requests
114
+
115
+ # Upload and redact
116
+ files = {"file": open("document.pdf", "rb")}
117
+ response = requests.post(
118
+ "http://localhost:7860/redact",
119
+ files=files,
120
+ params={"dpi": 300}
121
+ )
122
+
123
+ result = response.json()
124
+ job_id = result["job_id"]
125
+
126
+ # Download redacted PDF
127
+ redacted = requests.get(f"http://localhost:7860/download/{job_id}")
128
+ with open("redacted.pdf", "wb") as f:
129
+ f.write(redacted.content)
130
+
131
+ print(f"Redacted {len(result['entities'])} entities")
132
+ ```
133
+
134
+ ### JavaScript/Node.js
135
+
136
+ ```javascript
137
+ const FormData = require('form-data');
138
+ const fs = require('fs');
139
+ const axios = require('axios');
140
+
141
+ async function redactPDF() {
142
+ const form = new FormData();
143
+ form.append('file', fs.createReadStream('document.pdf'));
144
+
145
+ // Upload and redact
146
+ const response = await axios.post(
147
+ 'http://localhost:7860/redact',
148
+ form,
149
+ {
150
+ headers: form.getHeaders(),
151
+ params: { dpi: 300 }
152
+ }
153
+ );
154
+
155
+ const { job_id } = response.data;
156
+
157
+ // Download redacted PDF
158
+ const redacted = await axios.get(
159
+ `http://localhost:7860/download/${job_id}`,
160
+ { responseType: 'arraybuffer' }
161
+ );
162
+
163
+ fs.writeFileSync('redacted.pdf', redacted.data);
164
+ console.log('Redaction complete!');
165
+ }
166
+
167
+ redactPDF();
168
+ ```
169
+
170
+ ### cURL Advanced
171
+
172
+ ```bash
173
+ # Redact only specific entity types
174
+ curl -X POST "http://localhost:7860/redact" \
175
+ -F "file=@document.pdf" \
176
+ -F "dpi=300" \
177
+ -F "entity_types=PER,ORG"
178
+
179
+ # Get statistics
180
+ curl http://localhost:7860/stats
181
+
182
+ # Download specific file
183
+ curl -O -J http://localhost:7860/download/JOB_ID_HERE
184
+ ```
185
+
186
+ ## Common Use Cases
187
+
188
+ ### 1. Redact All Personal Information
189
+
190
+ ```python
191
+ response = requests.post(
192
+ "http://localhost:7860/redact",
193
+ files={"file": open("resume.pdf", "rb")},
194
+ params={"dpi": 300}
195
+ )
196
+ ```
197
+
198
+ ### 2. Redact Only Names and Organizations
199
+
200
+ ```python
201
+ response = requests.post(
202
+ "http://localhost:7860/redact",
203
+ files={"file": open("contract.pdf", "rb")},
204
+ params={
205
+ "dpi": 300,
206
+ "entity_types": "PER,ORG"
207
+ }
208
+ )
209
+ ```
210
+
211
+ ### 3. Fast Processing (Lower Quality)
212
+
213
+ ```python
214
+ response = requests.post(
215
+ "http://localhost:7860/redact",
216
+ files={"file": open("large_doc.pdf", "rb")},
217
+ params={"dpi": 150} # Faster but less accurate
218
+ )
219
+ ```
220
+
221
+ ### 4. High Quality (Slower)
222
+
223
+ ```python
224
+ response = requests.post(
225
+ "http://localhost:7860/redact",
226
+ files={"file": open("important.pdf", "rb")},
227
+ params={"dpi": 600} # Best quality, slowest
228
+ )
229
+ ```
230
+
231
+ ## Troubleshooting
232
+
233
+ ### "Model not loaded"
234
+ **Problem**: NER model failed to load
235
+ **Solution**: Check internet connection, wait for model download
236
+
237
+ ### "Tesseract not found"
238
+ **Problem**: OCR engine not installed
239
+ **Solution**: Install tesseract-ocr system package
240
+
241
+ ### "Poppler not found"
242
+ **Problem**: PDF converter not installed
243
+ **Solution**: Install poppler-utils system package
244
+
245
+ ### Slow processing
246
+ **Problem**: Redaction takes too long
247
+ **Solution**: Lower DPI to 150-200
248
+
249
+ ### Out of memory
250
+ **Problem**: Large PDF crashes the API
251
+ **Solution**:
252
+ - Process one page at a time
253
+ - Increase container memory
254
+ - Lower DPI
255
+
256
+ ## Next Steps
257
+
258
+ - ✅ Read full [README.md](README.md) for API details
259
+ - ✅ Check [DEPLOYMENT.md](DEPLOYMENT.md) for production setup
260
+ - ✅ Review [STRUCTURE.md](STRUCTURE.md) for code organization
261
+ - ✅ Run tests: `pytest tests/`
262
+ - ✅ Add authentication for production use
263
+ - ✅ Set up monitoring and logging
264
+
265
+ ## Support
266
+
267
+ - 📖 API Docs: `http://localhost:7860/docs`
268
+ - 🐛 Issues: Create on your repository
269
+ - 💬 HuggingFace: Community forums
270
+
271
+ Happy redacting! 🔒
README.md CHANGED
@@ -1,10 +1,167 @@
1
  ---
2
- title: Redact With Openai
3
- emoji: 📉
4
- colorFrom: green
5
- colorTo: gray
6
  sdk: docker
7
  pinned: false
 
8
  ---
9
 
10
- Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
  ---
2
+ title: PDF Redaction API
3
+ emoji: 🔒
4
+ colorFrom: blue
5
+ colorTo: green
6
  sdk: docker
7
  pinned: false
8
+ license: mit
9
  ---
10
 
11
+ # PDF Redaction API 🔒
12
+
13
+ Automatically redact sensitive information from PDF documents using Named Entity Recognition (NER).
14
+
15
+ ## Features
16
+
17
+ - 🤖 **Powered by NER**: Uses state-of-the-art Named Entity Recognition
18
+ - 📄 **PDF Support**: Upload and process PDF documents
19
+ - 🎯 **Accurate Redaction**: Correctly positioned black rectangles over sensitive text
20
+ - 🚀 **Fast Processing**: Optimized OCR and NER pipeline
21
+ - 🔧 **Configurable**: Adjust DPI and filter entity types
22
+
23
+ ## API Endpoints
24
+
25
+ ### `POST /redact`
26
+
27
+ Upload a PDF file and get it redacted.
28
+
29
+ **Parameters:**
30
+ - `file`: PDF file (required)
31
+ - `dpi`: OCR quality (default: 300)
32
+ - `entity_types`: Comma-separated entity types to redact (optional)
33
+
34
+ **Example using cURL:**
35
+
36
+ ```bash
37
+ curl -X POST "https://your-space.hf.space/redact" \
38
+ -F "file=@document.pdf" \
39
+ -F "dpi=300"
40
+ ```
41
+
42
+ **Example using Python:**
43
+
44
+ ```python
45
+ import requests
46
+
47
+ url = "https://your-space.hf.space/redact"
48
+ files = {"file": open("document.pdf", "rb")}
49
+ params = {"dpi": 300}
50
+
51
+ response = requests.post(url, files=files, params=params)
52
+ result = response.json()
53
+
54
+ # Download redacted file
55
+ job_id = result["job_id"]
56
+ download_url = f"https://your-space.hf.space/download/{job_id}"
57
+ redacted_pdf = requests.get(download_url)
58
+
59
+ with open("redacted.pdf", "wb") as f:
60
+ f.write(redacted_pdf.content)
61
+ ```
62
+
63
+ ### `GET /download/{job_id}`
64
+
65
+ Download the redacted PDF file.
66
+
67
+ ### `GET /health`
68
+
69
+ Check API health and model status.
70
+
71
+ ### `GET /stats`
72
+
73
+ Get API statistics.
74
+
75
+ ## Response Format
76
+
77
+ ```json
78
+ {
79
+ "job_id": "uuid-here",
80
+ "status": "completed",
81
+ "message": "Successfully redacted 5 entities",
82
+ "entities": [
83
+ {
84
+ "entity_type": "PER",
85
+ "entity_text": "John Doe",
86
+ "page": 1,
87
+ "word_count": 2
88
+ }
89
+ ],
90
+ "redacted_file_url": "/download/uuid-here"
91
+ }
92
+ ```
93
+
94
+ ## Entity Types
95
+
96
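+ Entity labels depend on the loaded NER model. The default `openai/privacy-filter` model emits `account_number`, `private_address`, `private_email`, `private_person`, `private_phone`, `private_url`, `private_date` and `secret`; generic NER models commonly use labels such as: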
97
+ - `PER`: Person names
98
+ - `ORG`: Organizations
99
+ - `LOC`: Locations
100
+ - `DATE`: Dates
101
+ - `EMAIL`: Email addresses
102
+ - `PHONE`: Phone numbers
103
+ - And more...
104
+
105
+ ## Local Development
106
+
107
+ ### Prerequisites
108
+
109
+ - Python 3.10+
110
+ - Tesseract OCR
111
+ - Poppler utils
112
+
113
+ ### Installation
114
+
115
+ ```bash
116
+ # Install system dependencies (Ubuntu/Debian)
117
+ sudo apt-get install tesseract-ocr poppler-utils
118
+
119
+ # Install Python dependencies
120
+ pip install -r requirements.txt
121
+
122
+ # Run the server
123
+ python main.py
124
+ ```
125
+
126
+ The API will be available at `http://localhost:7860`
127
+
128
+ ### Using Docker
129
+
130
+ ```bash
131
+ # Build the image
132
+ docker build -t pdf-redaction-api .
133
+
134
+ # Run the container
135
+ docker run -p 7860:7860 pdf-redaction-api
136
+ ```
137
+
138
+ ## Configuration
139
+
140
+ Adjust the DPI parameter based on your needs:
141
+ - `150`: Fast processing, lower quality
142
+ - `300`: Recommended balance (default)
143
+ - `600`: High quality, slower processing
144
+
145
+ ## Limitations
146
+
147
+ - Maximum file size: Dependent on Space resources
148
+ - Processing time increases with page count and DPI
149
+ - Files are automatically cleaned up after processing
150
+
151
+ ## Privacy
152
+
153
+ - Uploaded files are written to temporary storage (`uploads/`, `outputs/`) for processing and cleaned up after redaction
154
+ - No data is stored permanently
155
+ - Use your own deployment for sensitive documents
156
+
157
+ ## Credits
158
+
159
+ Built with:
160
+ - [FastAPI](https://fastapi.tiangolo.com/)
161
+ - [Transformers](https://huggingface.co/transformers/)
162
+ - [PyPDF](https://github.com/py-pdf/pypdf)
163
+ - [Tesseract OCR](https://github.com/tesseract-ocr/tesseract)
164
+
165
+ ## License
166
+
167
+ MIT License - See LICENSE file for details
STRUCTURE.md ADDED
@@ -0,0 +1,269 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Project Structure
2
+
3
+ ```
4
+ pdf-redaction-api/
5
+
6
+ ├── main.py # FastAPI application entry point
7
+ ├── Dockerfile # Docker configuration for deployment
8
+ ├── requirements.txt # Python dependencies
9
+ ├── README.md # Project documentation (for HuggingFace)
10
+ ├── DEPLOYMENT.md # Deployment guide
11
+ ├── .gitignore # Git ignore rules
12
+ ├── .dockerignore # Docker ignore rules
13
+
14
+ ├── app/ # Application modules
15
+ │ ├── __init__.py # Package initialization
16
+ │ └── redaction.py # Core redaction logic (PDFRedactor class)
17
+
18
+ ├── uploads/ # Temporary upload directory
19
+ │ └── .gitkeep # Keep directory in git
20
+
21
+ ├── outputs/ # Redacted PDF output directory
22
+ │ └── .gitkeep # Keep directory in git
23
+
24
+ ├── tests/ # Test suite
25
+ │ ├── __init__.py
26
+ │ └── test_api.py # API endpoint tests
27
+
28
+ └── client_example.py # Example client for API usage
29
+ ```
30
+
31
+ ## File Descriptions
32
+
33
+ ### Core Files
34
+
35
+ #### `main.py`
36
+ FastAPI application with endpoints:
37
+ - `POST /redact` - Upload and redact PDF
38
+ - `GET /download/{job_id}` - Download redacted PDF
39
+ - `GET /health` - Health check
40
+ - `GET /stats` - API statistics
41
+ - `DELETE /cleanup/{job_id}` - Manual cleanup
42
+
43
+ #### `app/redaction.py`
44
+ Core redaction logic:
45
+ - `PDFRedactor` class
46
+ - OCR processing with pytesseract
47
+ - NER using HuggingFace transformers
48
+ - Entity-to-box mapping
49
+ - PDF redaction with coordinate scaling
50
+
51
+ ### Configuration Files
52
+
53
+ #### `requirements.txt`
54
+ Python dependencies:
55
+ - FastAPI & Uvicorn (API framework)
56
+ - Transformers & Torch (NER model)
57
+ - PyPDF (PDF manipulation)
58
+ - pdf2image (PDF to image conversion)
59
+ - pytesseract (OCR)
60
+ - Pillow (Image processing)
61
+
62
+ #### `Dockerfile`
63
+ Build steps (single-stage image):
64
+ 1. Install system dependencies (tesseract, poppler)
65
+ 2. Install Python dependencies
66
+ 3. Copy application code
67
+ 4. Configure for port 7860 (HuggingFace default)
68
+
69
+ ### Documentation
70
+
71
+ #### `README.md`
72
+ HuggingFace Space documentation:
73
+ - Features overview
74
+ - API endpoint documentation
75
+ - Usage examples (cURL, Python)
76
+ - Response format
77
+ - Local development setup
78
+
79
+ #### `DEPLOYMENT.md`
80
+ Step-by-step deployment guide:
81
+ - HuggingFace Spaces setup
82
+ - Git workflow
83
+ - Configuration options
84
+ - Security considerations
85
+ - Troubleshooting
86
+ - Cost estimation
87
+
88
+ ### Testing & Examples
89
+
90
+ #### `tests/test_api.py`
91
+ Unit tests for API endpoints:
92
+ - Health check tests
93
+ - Upload validation tests
94
+ - Error handling tests
95
+
96
+ #### `client_example.py`
97
+ Example client implementation:
98
+ - Upload PDF
99
+ - Download redacted file
100
+ - Health check
101
+ - Statistics
102
+
103
+ ## Data Flow
104
+
105
+ ```
106
+ ┌─────────────────────────────────────────────────────────┐
107
+ │ 1. Client uploads PDF │
108
+ │ POST /redact with file │
109
+ └─────────────────────────────────────────────────────────┘
110
+
111
+ ┌─────────────────────────────────────────────────────────┐
112
+ │ 2. FastAPI (main.py) │
113
+ │ - Validates file │
114
+ │ - Generates job_id │
115
+ │ - Saves to uploads/ │
116
+ └─────────────────────────────────────────────────────────┘
117
+
118
+ ┌─────────────────────────────────────────────────────────┐
119
+ │ 3. PDFRedactor (app/redaction.py) │
120
+ │ - perform_ocr() → Extract text + boxes │
121
+ │ - run_ner() → Identify entities │
122
+ │ - map_entities_to_boxes() → Link entities to coords │
123
+ │ - create_redacted_pdf() → Generate output │
124
+ └─────────────────────────────────────────────────────────┘
125
+
126
+ ┌─────────────────────────────────────────────────────────┐
127
+ │ 4. Response │
128
+ │ - Return job_id and entity list │
129
+ │ - Save redacted PDF to outputs/ │
130
+ └─────────────────────────────────────────────────────────┘
131
+
132
+ ┌─────────────────────────────────────────────────────────┐
133
+ │ 5. Client downloads │
134
+ │ GET /download/{job_id} │
135
+ └─────────────────────────────────────────────────────────┘
136
+ ```
137
+
138
+ ## Key Components
139
+
140
+ ### 1. FastAPI Application (`main.py`)
141
+
142
+ **Endpoints:**
143
+ - RESTful API design
144
+ - File upload handling
145
+ - Background task cleanup
146
+ - CORS middleware for web access
147
+
148
+ **Features:**
149
+ - Automatic OpenAPI documentation at `/docs`
150
+ - JSON response models with Pydantic
151
+ - Error handling with HTTP exceptions
152
+ - Request validation
153
+
154
+ ### 2. Redaction Engine (`app/redaction.py`)
155
+
156
+ **Pipeline Steps:**
157
+
158
+ 1. **OCR Processing**
159
+ - Convert PDF pages to images (pdf2image)
160
+ - Extract text and bounding boxes (pytesseract)
161
+ - Store image dimensions for coordinate scaling
162
+
163
+ 2. **NER Processing**
164
+ - Load HuggingFace model
165
+ - Identify entities in text
166
+ - Return entity types and character positions
167
+
168
+ 3. **Mapping**
169
+ - Create character span index for OCR words
170
+ - Match NER entities to OCR bounding boxes
171
+ - Handle partial word matches
172
+
173
+ 4. **Redaction**
174
+ - Scale OCR image coordinates to PDF points
175
+ - Create black rectangle annotations
176
+ - Write redacted PDF with pypdf
177
+
178
+ ### 3. Docker Container
179
+
180
+ **Layers:**
181
+ - Base: Python 3.10 slim
182
+ - System packages: tesseract-ocr, poppler-utils
183
+ - Python packages: From requirements.txt
184
+ - Application code: Copied last for better caching
185
+
186
+ **Optimizations:**
187
+ - Multi-stage build (not used here, but possible)
188
+ - Minimal base image
189
+ - Cached dependency layers
190
+ - .dockerignore to reduce context size
191
+
192
+ ## Environment Variables
193
+
194
+ Default configuration (can be overridden):
195
+
196
+ ```bash
197
+ PYTHONUNBUFFERED=1 # Immediate log output
198
+ HF_HOME=/app/cache # HuggingFace cache directory
199
+ ```
200
+
201
+ ## Port Configuration
202
+
203
+ - **Development**: 7860 (configurable in main.py)
204
+ - **Production (HF Spaces)**: 7860 (required)
205
+
206
+ ## Directory Permissions
207
+
208
+ Ensure write permissions for:
209
+ - `uploads/` - Temporary PDF storage
210
+ - `outputs/` - Redacted PDF storage
211
+ - `cache/` - Model cache (created automatically)
212
+
213
+ ## Adding New Features
214
+
215
+ ### Add New Endpoint
216
+
217
+ 1. Define in `main.py`:
218
+ ```python
219
+ @app.get("/new-endpoint")
220
+ async def new_endpoint():
221
+ return {"message": "Hello"}
222
+ ```
223
+
224
+ 2. Add response model if needed
225
+ 3. Update README.md documentation
226
+ 4. Add tests in `tests/test_api.py`
227
+
228
+ ### Add New Redaction Option
229
+
230
+ 1. Modify `PDFRedactor` class in `app/redaction.py`
231
+ 2. Add parameter to `redact_document()` method
232
+ 3. Update API endpoint in `main.py`
233
+ 4. Document in README.md
234
+
235
+ ### Add Authentication
236
+
237
+ 1. Install: `pip install python-jose passlib`
238
+ 2. Create `app/auth.py` with JWT logic
239
+ 3. Add middleware to `main.py`
240
+ 4. Protect endpoints with dependencies
241
+
242
+ ## Best Practices
243
+
244
+ 1. **Logging**: Use `logger` for all important events
245
+ 2. **Error Handling**: Catch exceptions and return meaningful errors
246
+ 3. **Validation**: Use Pydantic models for request/response validation
247
+ 4. **Cleanup**: Always clean up temporary files
248
+ 5. **Documentation**: Keep README.md and code comments updated
249
+ 6. **Testing**: Add tests for new features
250
+
251
+ ## Performance Considerations
252
+
253
+ ### Bottlenecks
254
+ 1. OCR processing (most time-consuming)
255
+ 2. Model inference (NER)
256
+ 3. File I/O
257
+
258
+ ### Optimizations
259
+ - Lower DPI for faster OCR (trade-off with accuracy)
260
+ - Cache loaded models in memory
261
+ - Use async file operations
262
+ - Implement request queuing for high load
263
+ - Consider GPU for NER model
264
+
265
+ ### Scaling
266
+ - Horizontal: Multiple container instances
267
+ - Vertical: Larger CPU/RAM allocation
268
+ - Caching: Redis for temporary results
269
+ - Queue: Celery for background processing
app/__init__.py ADDED
@@ -0,0 +1,6 @@
 
 
 
 
 
 
 
1
+ """
2
+ App module for PDF redaction API
3
+ """
4
+ from .redaction import PDFRedactor
5
+
6
+ __all__ = ['PDFRedactor']
app/redaction.py ADDED
@@ -0,0 +1,327 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ PDF Redaction module using NER
3
+ """
4
+ from pdf2image import convert_from_path
5
+ import pytesseract
6
+ from pypdf import PdfReader, PdfWriter
7
+ from pypdf.generic import DictionaryObject, ArrayObject, NameObject, NumberObject
8
+ from transformers import pipeline, AutoTokenizer, AutoModelForTokenClassification
9
+ from typing import List, Dict, Optional
10
+ import logging
11
+
12
+ logger = logging.getLogger(__name__)
13
+
14
+
15
class PDFRedactor:
    """Redact PDFs by covering entities found by an NER model with black boxes.

    Pipeline: render each page to an image, OCR it into words with bounding
    boxes, run token classification over the joined OCR text, map entity
    character spans back to OCR words, then add filled square annotations
    over those words.

    NOTE(review): ``create_redacted_pdf`` copies every page unchanged and only
    adds an annotation on top of it, so the covered text is presumably still
    present in the page content and extractable. Confirm whether true content
    removal (e.g. rasterising the page) is required for your threat model.
    """

    def __init__(self, model_name: str = "openai/privacy-filter"):
        """
        Initialize the PDF Redactor and load the model immediately.

        Args:
            model_name: HuggingFace model ID for NER
        """
        self.model_name = model_name
        self.ner_pipeline = None
        self._load_model()

    def _load_model(self):
        """Load tokenizer and model and build the token-classification pipeline.

        Raises:
            Exception: whatever the underlying loaders raise (logged, re-raised).
        """
        try:
            logger.info(f"Loading NER model: {self.model_name}")
            # NOTE(security): trust_remote_code=True lets the model repository
            # run its own Python code at load time; only use trusted model IDs.
            tokenizer = AutoTokenizer.from_pretrained(
                self.model_name, trust_remote_code=True
            )
            model = AutoModelForTokenClassification.from_pretrained(
                self.model_name, trust_remote_code=True, device_map="auto"
            )

            self.ner_pipeline = pipeline(
                "token-classification",
                model=model,
                tokenizer=tokenizer,
                aggregation_strategy="simple",
            )
            logger.info("NER model loaded successfully")
        except Exception as e:
            logger.error(f"Error loading NER model: {str(e)}")
            raise

    def is_model_loaded(self) -> bool:
        """Return True once the NER pipeline has been created."""
        return self.ner_pipeline is not None

    def perform_ocr(self, pdf_path: str, dpi: int = 300) -> List[Dict]:
        """
        Perform OCR on PDF and extract word bounding boxes

        Args:
            pdf_path: Path to the PDF file
            dpi: DPI for PDF to image conversion

        Returns:
            List of word dicts with keys ``text``, ``box`` (left, top, width,
            height in image pixels), ``page`` (1-based), ``confidence``,
            ``image_width`` and ``image_height``.
        """
        logger.info(f"Starting OCR on {pdf_path} at {dpi} DPI")
        all_words_data = []

        try:
            images = convert_from_path(pdf_path, dpi=dpi)
            logger.info(f"Converted PDF to {len(images)} images")

            for page_num, image in enumerate(images, start=1):
                image_width, image_height = image.size

                data = pytesseract.image_to_data(
                    image, output_type=pytesseract.Output.DICT
                )
                # Log only the token count: the OCR text is exactly the
                # sensitive content this tool exists to hide, so it must not
                # end up in application logs.
                logger.debug(
                    f"OCR returned {len(data['text'])} tokens for page {page_num}"
                )

                words_before = len(all_words_data)
                for i, raw_text in enumerate(data['text']):
                    word_text = raw_text.strip()
                    # Go through float() so decimal confidences ("96.5") do
                    # not raise; integer values are unaffected.
                    confidence = int(float(data['conf'][i]))

                    # Filter out empty or low-confidence words
                    if word_text and confidence > 0:
                        all_words_data.append({
                            'text': word_text,
                            'box': (data['left'][i], data['top'][i],
                                    data['width'][i], data['height'][i]),
                            'page': page_num,
                            'confidence': confidence,
                            'image_width': image_width,
                            'image_height': image_height
                        })

                # Count by difference instead of re-scanning every word so far.
                logger.info(
                    f"Processed page {page_num}: "
                    f"{len(all_words_data) - words_before} words"
                )

            logger.info(f"OCR complete: {len(all_words_data)} total words extracted")
            return all_words_data

        except Exception as e:
            logger.error(f"Error during OCR: {str(e)}")
            raise

    def run_ner(self, text: str) -> List[Dict]:
        """
        Run NER on text

        Args:
            text: Input text

        Returns:
            List of identified entities (empty for blank input)

        Raises:
            RuntimeError: if the NER pipeline has not been loaded.
        """
        if not self.ner_pipeline:
            raise RuntimeError("NER model not loaded")

        # Nothing to classify (e.g. OCR found no words): skip the model call.
        if not text or not text.strip():
            return []

        logger.info(f"Running NER on text of length {len(text)}")

        try:
            results = self.ner_pipeline(text)
            logger.info(f"NER identified {len(results)} entities")
            return results
        except Exception as e:
            logger.error(f"Error during NER: {str(e)}")
            raise

    @staticmethod
    def _index_word_spans(ocr_data: List[Dict]) -> List[Dict]:
        """Return each OCR word's character span in the space-joined text."""
        spans = []
        cursor = 0
        for idx, word_info in enumerate(ocr_data):
            length = len(word_info['text'])
            spans.append({
                'ocr_data_idx': idx,
                'start_char': cursor,
                'end_char': cursor + length
            })
            # +1 for the single space that separates words in the joined text.
            cursor += length + 1
        return spans

    def map_entities_to_boxes(self, ner_results: List[Dict],
                              ocr_data: List[Dict]) -> List[Dict]:
        """
        Map NER entities to OCR bounding boxes

        Character offsets in ``ner_results`` are interpreted against the OCR
        words joined by single spaces, which is how ``redact_document`` builds
        the text it passes to ``run_ner``.

        Args:
            ner_results: List of NER entities (``entity_group``, ``start``,
                ``end``, ``word``)
            ocr_data: List of OCR word data

        Returns:
            List of dicts with ``entity_type``, ``entity_text`` and ``words``
            (the overlapping OCR word dicts); entities that overlap no word
            are dropped.
        """
        logger.info("Mapping NER entities to OCR bounding boxes")
        mapped_entities = []

        word_spans = self._index_word_spans(ocr_data)

        for ner_entity in ner_results:
            ner_start = ner_entity['start']
            ner_end = ner_entity['end']

            matching_ocr_words = []
            for span in word_spans:
                ocr_start = span['start_char']
                # Spans are in ascending order, so no later word can overlap.
                if ocr_start >= ner_end:
                    break
                # Half-open interval overlap test.
                if max(ocr_start, ner_start) < min(span['end_char'], ner_end):
                    matching_ocr_words.append(ocr_data[span['ocr_data_idx']])

            if matching_ocr_words:
                mapped_entities.append({
                    'entity_type': ner_entity['entity_group'],
                    'entity_text': ner_entity['word'],
                    'words': matching_ocr_words
                })

        logger.info(f"Mapped {len(mapped_entities)} entities to bounding boxes")
        return mapped_entities

    @staticmethod
    def _box_to_pdf_rect(box, image_width, image_height,
                         page_width, page_height,
                         origin_x=0.0, origin_y=0.0):
        """Convert an image-pixel box (left, top, w, h) to a PDF rectangle.

        Image coordinates have the origin at the top-left with y growing
        downwards; PDF coordinates have it at the bottom-left with y growing
        upwards, so the y axis is flipped after scaling.

        Returns:
            Tuple ``(llx, lly, urx, ury)`` in PDF points.
        """
        x, y, w, h = box
        scale_x = page_width / image_width
        scale_y = page_height / image_height

        llx = origin_x + x * scale_x
        urx = origin_x + (x + w) * scale_x
        ury = origin_y + page_height - y * scale_y
        lly = origin_y + page_height - (y + h) * scale_y
        return (llx, lly, urx, ury)

    def create_redacted_pdf(self, original_pdf_path: str,
                            mapped_entities: List[Dict],
                            output_path: str) -> str:
        """
        Create redacted PDF with black rectangles over entities

        Args:
            original_pdf_path: Path to original PDF
            mapped_entities: List of entities with bounding boxes
            output_path: Path for output PDF

        Returns:
            Path to redacted PDF
        """
        # NumberObject is integer-valued, so fractional coordinates need
        # FloatObject; imported locally to leave the module imports untouched.
        from pypdf.generic import FloatObject

        logger.info(f"Creating redacted PDF: {output_path}")

        try:
            reader = PdfReader(original_pdf_path)
            writer = PdfWriter()

            # Group words by page once instead of scanning every entity for
            # every page.
            words_by_page = {}
            for entity_info in mapped_entities:
                for word_info in entity_info['words']:
                    words_by_page.setdefault(word_info['page'], []).append(word_info)

            for page_num, page in enumerate(reader.pages):
                media_box = page.mediabox
                page_width = float(media_box.width)
                page_height = float(media_box.height)
                # The media box does not have to start at (0, 0).
                origin_x = float(media_box.left)
                origin_y = float(media_box.bottom)
                # NOTE(review): page rotation (/Rotate) is not accounted for.

                writer.add_page(page)

                page_words = words_by_page.get(page_num + 1, [])
                for word_info in page_words:
                    llx, lly, urx, ury = self._box_to_pdf_rect(
                        word_info['box'],
                        word_info['image_width'],
                        word_info['image_height'],
                        page_width,
                        page_height,
                        origin_x,
                        origin_y,
                    )

                    # Filled black square annotation with zero border width.
                    redaction_annotation = DictionaryObject()
                    redaction_annotation.update({
                        NameObject("/Type"): NameObject("/Annot"),
                        NameObject("/Subtype"): NameObject("/Square"),
                        NameObject("/Rect"): ArrayObject([
                            FloatObject(llx),
                            FloatObject(lly),
                            FloatObject(urx),
                            FloatObject(ury),
                        ]),
                        NameObject("/C"): ArrayObject([
                            NumberObject(0), NumberObject(0), NumberObject(0)
                        ]),
                        NameObject("/IC"): ArrayObject([
                            NumberObject(0), NumberObject(0), NumberObject(0)
                        ]),
                        NameObject("/BS"): DictionaryObject({
                            NameObject("/W"): NumberObject(0)
                        })
                    })

                    writer.add_annotation(page_number=page_num,
                                          annotation=redaction_annotation)

                logger.info(f"Page {page_num + 1}: Added {len(page_words)} redactions")

            # Write output
            with open(output_path, "wb") as output_file:
                writer.write(output_file)

            logger.info(f"Redacted PDF created successfully: {output_path}")
            return output_path

        except Exception as e:
            logger.error(f"Error creating redacted PDF: {str(e)}")
            raise

    def redact_document(self, pdf_path: str, output_path: str,
                        dpi: int = 300,
                        entity_filter: Optional[List[str]] = None) -> Dict:
        """
        Complete redaction pipeline

        Args:
            pdf_path: Path to input PDF
            output_path: Path for output PDF
            dpi: DPI for OCR
            entity_filter: List of entity types to redact (None = all). Valid
                values: account_number, private_address, private_email,
                private_person, private_phone, private_url, private_date, secret

        Returns:
            Dictionary with ``output_path``, ``total_words``,
            ``total_entities`` (before mapping and filtering),
            ``redacted_entities`` and ``entities``.
        """
        logger.info(f"Starting redaction pipeline for {pdf_path}")

        # Step 1: OCR
        ocr_data = self.perform_ocr(pdf_path, dpi)

        # Step 2: Extract text. The single-space join must stay in sync with
        # the offsets computed in map_entities_to_boxes.
        full_text = " ".join(word['text'] for word in ocr_data)

        # Step 3: NER
        ner_results = self.run_ner(full_text)

        # Step 4: Map entities to boxes
        mapped_entities = self.map_entities_to_boxes(ner_results, ocr_data)

        # Step 5: Filter entities if requested
        if entity_filter:
            allowed_types = set(entity_filter)
            mapped_entities = [
                e for e in mapped_entities
                if e['entity_type'] in allowed_types
            ]
            logger.info(f"Filtered to {len(mapped_entities)} entities of types: {entity_filter}")

        # Step 6: Create redacted PDF
        self.create_redacted_pdf(pdf_path, mapped_entities, output_path)

        return {
            'output_path': output_path,
            'total_words': len(ocr_data),
            'total_entities': len(ner_results),
            'redacted_entities': len(mapped_entities),
            'entities': mapped_entities
        }
client_example.py ADDED
@@ -0,0 +1,142 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ Example client for PDF Redaction API
3
+ """
4
+ import requests
5
+ from pathlib import Path
6
+ import sys
7
+
8
+
9
def redact_pdf(api_url: str, pdf_path: str, output_path: str = "redacted.pdf",
               dpi: int = 300, entity_types: str = None):
    """
    Redact a PDF file using the API

    Uploads the file to ``{api_url}/redact``, prints the reported entities,
    then downloads the result from ``{api_url}/download/{job_id}``.

    Args:
        api_url: Base URL of the API
        pdf_path: Path to the PDF file to redact
        output_path: Path to save the redacted PDF
        dpi: DPI for OCR processing
        entity_types: Comma-separated list of entity types to redact
            (None or empty = do not send the parameter)

    Returns:
        True when the redacted PDF was saved, False on any failure (missing
        input file, HTTP/network error, unexpected response, file I/O error).
    """
    # Check if file exists
    if not Path(pdf_path).exists():
        print(f"Error: File {pdf_path} not found")
        return False

    print(f"Uploading {pdf_path}...")

    params = {"dpi": dpi}
    if entity_types:
        params["entity_types"] = entity_types

    try:
        # The context manager closes the upload handle even if the request
        # raises; the response body is fully read by the time post() returns.
        with open(pdf_path, "rb") as pdf_file:
            response = requests.post(
                f"{api_url}/redact", files={"file": pdf_file}, params=params
            )
        response.raise_for_status()

        result = response.json()
        print(f"\nStatus: {result['status']}")
        print(f"Message: {result['message']}")

        # Display found entities
        if result.get('entities'):
            print("\nEntities redacted:")
            for i, entity in enumerate(result['entities'], 1):
                print(f"  {i}. {entity['entity_type']}: {entity['entity_text']} "
                      f"(Page {entity['page']}, {entity['word_count']} words)")

        # Download redacted file
        job_id = result['job_id']
        print("\nDownloading redacted PDF...")

        download_response = requests.get(f"{api_url}/download/{job_id}")
        download_response.raise_for_status()

        # Save file
        with open(output_path, "wb") as f:
            f.write(download_response.content)

        print(f"✓ Redacted PDF saved to: {output_path}")

        return True

    except requests.exceptions.RequestException as e:
        print(f"Error: {e}")
        return False
    except (KeyError, ValueError) as e:
        # The server answered, but not with the JSON shape expected here.
        print(f"Error: unexpected response from API: {e!r}")
        return False
    except OSError as e:
        # Reading the input or writing the output file failed.
        print(f"Error: {e}")
        return False
74
+
75
+
76
def check_health(api_url: str):
    """Query ``{api_url}/health`` and print status, version and model state.

    Returns:
        True when the request succeeded and the fields were printed, False
        when the request failed with a ``requests`` exception.
    """
    try:
        reply = requests.get(f"{api_url}/health")
        reply.raise_for_status()
        payload = reply.json()
    except requests.exceptions.RequestException as e:
        print(f"Error checking health: {e}")
        return False

    # Printed in a fixed order; a missing key raises KeyError to the caller.
    report = (
        ("API Status", "status"),
        ("Version", "version"),
        ("Model Loaded", "model_loaded"),
    )
    for label, key in report:
        print(f"{label}: {payload[key]}")

    return True
91
+
92
+
93
def get_stats(api_url: str):
    """Query ``{api_url}/stats`` and print the reported counters.

    Returns:
        True when the request succeeded and the fields were printed, False
        when the request failed with a ``requests`` exception.
    """
    try:
        reply = requests.get(f"{api_url}/stats")
        reply.raise_for_status()
        payload = reply.json()
    except requests.exceptions.RequestException as e:
        print(f"Error getting stats: {e}")
        return False

    print("API Statistics:")
    # Printed in a fixed order; a missing key raises KeyError to the caller.
    report = (
        ("Pending uploads", "pending_uploads"),
        ("Processed files", "processed_files"),
        ("Model loaded", "model_loaded"),
    )
    for label, key in report:
        print(f"  {label}: {payload[key]}")

    return True
109
+
110
+
111
if __name__ == "__main__":
    # Command-line entry point: redact a file, or query --health / --stats.

    # Local development server. For HuggingFace Spaces, point this at your
    # Space URL instead (e.g. https://your-username-pdf-redaction-api.hf.space).
    API_URL = "http://localhost:7860"

    cli_args = sys.argv[1:]

    if not cli_args:
        print("Usage:")
        print("  python client_example.py <pdf_file> [output_file] [dpi]")
        print("\nOr check health:")
        print("  python client_example.py --health")
        print("\nOr get stats:")
        print("  python client_example.py --stats")
        sys.exit(1)

    command = cli_args[0]

    if command == "--health":
        check_health(API_URL)
    elif command == "--stats":
        get_stats(API_URL)
    else:
        # Positional form: <pdf_file> [output_file] [dpi]
        pdf_path = command
        output_path = cli_args[1] if len(cli_args) > 1 else "redacted.pdf"
        dpi = int(cli_args[2]) if len(cli_args) > 2 else 300

        # None redacts every entity type; set a comma-separated string of
        # labels to restrict redaction to those types.
        entity_types = None

        redact_pdf(API_URL, pdf_path, output_path, dpi, entity_types)
client_supabase.py ADDED
@@ -0,0 +1,9 @@
 
 
 
 
 
 
 
 
 
 
1
+ from supabase import create_client, Client
2
+ import os
3
+ from dotenv import load_dotenv
4
# Load variables from a local .env file (if any) into the process environment
# before they are read below.
load_dotenv()

SUPABASE_URL = os.getenv("SUPABASE_URL")
SUPABASE_KEY = os.getenv("SERVICE_ROLE_KEY")  # server-side key

# Fail at import time with a message that names the missing variable instead
# of passing None into create_client.
_missing_settings = [
    name
    for name, value in (
        ("SUPABASE_URL", SUPABASE_URL),
        ("SERVICE_ROLE_KEY", SUPABASE_KEY),
    )
    if not value
]
if _missing_settings:
    raise RuntimeError(
        "Missing required environment variable(s): "
        + ", ".join(_missing_settings)
    )

# Shared client instance imported by the rest of the application.
supabase: Client = create_client(SUPABASE_URL, SUPABASE_KEY)
docker-compose.yml ADDED
@@ -0,0 +1,48 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ version: '3.8'
2
+
3
+ services:
4
+ api:
5
+ build:
6
+ context: .
7
+ dockerfile: Dockerfile
8
+ ports:
9
+ - "7860:7860"
10
+ volumes:
11
+ # Mount code for development (hot reload)
12
+ - .:/app
13
+ # Persistent storage for uploads/outputs
14
+ - ./uploads:/app/uploads
15
+ - ./outputs:/app/outputs
16
+ environment:
17
+ - PYTHONUNBUFFERED=1
18
+ - HF_HOME=/app/cache
19
+ - LOG_LEVEL=DEBUG
20
+ command: uvicorn main:app --host 0.0.0.0 --port 7860 --reload
21
+ restart: unless-stopped
22
+ healthcheck:
23
+ test: ["CMD", "curl", "-f", "http://localhost:7860/health"]
24
+ interval: 30s
25
+ timeout: 10s
26
+ retries: 3
27
+ start_period: 40s
28
+
29
+ # Optional: Add nginx for production
30
+ # nginx:
31
+ # image: nginx:alpine
32
+ # ports:
33
+ # - "80:80"
34
+ # volumes:
35
+ # - ./nginx.conf:/etc/nginx/nginx.conf
36
+ # depends_on:
37
+ # - api
38
+
39
+ # Optional: Add Redis for caching
40
+ # redis:
41
+ # image: redis:alpine
42
+ # ports:
43
+ # - "6379:6379"
44
+ # volumes:
45
+ # - redis-data:/data
46
+
47
+ # volumes:
48
+ # redis-data:
main.py ADDED
@@ -0,0 +1,344 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ FastAPI application for PDF redaction using NER
3
+ """
4
+ from fastapi import FastAPI, File, UploadFile, HTTPException, BackgroundTasks
5
+ from fastapi.responses import FileResponse
6
+ from fastapi.middleware.cors import CORSMiddleware
7
+ from pydantic import BaseModel
8
+ from typing import List, Optional, Dict
9
+ import uvicorn
10
+ import os
11
+ import uuid
12
+ import shutil
13
+ from pathlib import Path
14
+ import logging
15
+ import sys
16
+ from app.redaction import PDFRedactor
17
+ from client_supabase import supabase # Supabase client in separate file
18
+
19
# Configure logging.
# The level is taken from the LOG_LEVEL environment variable (docker-compose.yml
# sets it) and falls back to INFO when the variable is unset or not a valid
# logging level name.
_requested_level = getattr(logging, os.getenv("LOG_LEVEL", "INFO").upper(), None)
logging.basicConfig(
    level=_requested_level if isinstance(_requested_level, int) else logging.INFO,
    stream=sys.stdout,
    force=True,  # replace handlers installed on the root logger before this point
    format="%(asctime)s - %(name)s - %(levelname)s - %(message)s",
)
logger = logging.getLogger(__name__)
27
+
28
# Initialize FastAPI app
app = FastAPI(
    title="PDF Redaction API",
    description="Redact sensitive information from PDFs using Named Entity Recognition",
    version="1.0.0",
)

# CORS middleware: every origin, method and header is accepted.
# NOTE(review): wildcard origins combined with allow_credentials=True is very
# permissive - confirm this is intended before exposing the API publicly.
app.add_middleware(
    CORSMiddleware,
    allow_origins=["*"],
    allow_credentials=True,
    allow_methods=["*"],
    allow_headers=["*"],
)
43
+
44
# Working directories for incoming PDFs and redacted results; both are
# created at import time when they do not exist yet.
UPLOAD_DIR = Path("uploads")
OUTPUT_DIR = Path("outputs")
UPLOAD_DIR.mkdir(exist_ok=True)
OUTPUT_DIR.mkdir(exist_ok=True)

# Single redactor instance shared by the endpoints and the background task.
redactor = PDFRedactor()
52
+
53
+ # ---------------- Response Models ----------------
54
class RedactionEntity(BaseModel):
    # One redacted entity as reported by POST /redact.
    entity_type: str  # entity label reported by the redactor
    entity_text: str  # text of the entity
    page: int  # page of the entity's first word; 0 when the entity has no words
    word_count: int  # number of words that make up the entity
59
+
60
class RedactionResponse(BaseModel):
    # Body returned by POST /redact.
    job_id: str  # identifier generated for the upload
    status: str
    message: str
    entities: Optional[List[RedactionEntity]] = None
    redacted_file_url: Optional[str] = None  # relative URL of the download endpoint
66
+
67
class RedactionStatusResponse(BaseModel):
    # Body returned by the request-based redaction endpoints.
    request_id: str
    status: str  # e.g. "pending", "redacting", "redacted", "failed", "not_found"
    files: List[str]  # public file URLs; only filled in when status is "redacted"
    message: str
72
+
73
class HealthResponse(BaseModel):
    # Body returned by GET / and GET /health.
    #
    # Pydantic v2 reserves the "model_" prefix for its own attributes and
    # warns about fields such as ``model_loaded``. Clearing
    # ``protected_namespaces`` keeps the field name (and therefore the JSON
    # payload) unchanged while removing that warning.
    model_config = {"protected_namespaces": ()}

    status: str
    version: str
    model_loaded: bool
77
+
78
+ # ---------------- DB Status Helpers ----------------
79
def set_request_status(request_id: str, status: str):
    """Write ``status`` to the document_requests row whose id is ``request_id``."""
    update_query = (
        supabase
        .from_("document_requests")
        .update({"status": status})
        .eq("id", request_id)
    )
    update_query.execute()
    logger.info(f"Request {request_id} status -> {status}")
83
+
84
def get_request_status(request_id: str) -> str:
    """Return the current status of a document request.

    Args:
        request_id: Value matched against the ``id`` column of
            document_requests.

    Returns:
        The row's ``status`` value, or "not_found" when no row matches.
    """
    response = (
        supabase
        .from_("document_requests")
        .select("status")
        .eq("id", request_id)
        .maybe_single()
        .execute()
    )
    # Depending on the client version, maybe_single().execute() yields None
    # instead of a response object when no row matches, so guard before
    # touching .data.
    if response is None or not response.data:
        return "not_found"
    return response.data["status"]
97
+
98
+ # ---------------- Helper Functions ----------------
99
def get_public_url(bucket: str, storage_path: str) -> str:
    """Build the public object URL for ``storage_path`` inside ``bucket``.

    The base URL is read from the SUPABASE_URL environment variable. A
    trailing slash on the base and a leading slash on ``storage_path`` are
    dropped so the result never contains a doubled separator. When the
    variable is unset the URL is returned without a host part.

    Args:
        bucket: Storage bucket name.
        storage_path: Object path inside the bucket.

    Returns:
        ``<SUPABASE_URL>/storage/v1/object/public/<bucket>/<storage_path>``.
    """
    base_url = (os.getenv("SUPABASE_URL") or "").rstrip("/")
    object_path = storage_path.lstrip("/")
    return f"{base_url}/storage/v1/object/public/{bucket}/{object_path}"
101
+
102
def cleanup_files(job_id: str):
    """Delete the uploaded PDF stored for ``job_id``, if it is still present.

    Only the file under UPLOAD_DIR is removed; the redacted output is left
    in place. Failures are logged and never raised.

    Args:
        job_id: Job identifier used as the upload's file stem.
    """
    upload_path = UPLOAD_DIR / f"{job_id}.pdf"
    try:
        # missing_ok avoids the race between an exists() check and unlink().
        upload_path.unlink(missing_ok=True)
        logger.info(f"Cleaned up files for job {job_id}")
    except Exception as e:
        logger.error(f"Error cleaning up files for job {job_id}: {str(e)}")
111
+
112
def cleanup_temp_files(paths: List[Path]):
    """Delete every file in ``paths``; paths that do not exist are skipped.

    Args:
        paths: Files to remove.
    """
    for path in paths:
        # missing_ok replaces the exists()/unlink() pair, which could raise
        # FileNotFoundError if the file vanished between the two calls.
        path.unlink(missing_ok=True)
116
+
117
def download_file_from_supabase(bucket: str, storage_path: str, local_path: Path):
    """Fetch ``storage_path`` from ``bucket`` and write its bytes to ``local_path``.

    Raises:
        Exception: If the storage client returns no data.
    """
    logger.info(f"Downloading {storage_path} to {local_path}")
    payload = supabase.storage.from_(bucket).download(storage_path)
    if payload:
        local_path.write_bytes(payload)
        return
    raise Exception(f"Failed to download {storage_path}")
124
+
125
def upload_file_to_supabase(bucket: str, storage_path: str, local_path: Path):
    """Upload the file at ``local_path`` to ``storage_path`` in ``bucket``.

    The object is sent as application/pdf with the upsert option enabled.
    """
    logger.info(f"Uploading {local_path} to {storage_path}")
    pdf_bytes = local_path.read_bytes()
    upload_options = {
        "upsert": "true",
        "content-type": "application/pdf",
    }
    supabase.storage.from_(bucket).upload(
        path=storage_path,
        file=pdf_bytes,
        file_options=upload_options,
    )
137
+
138
def redact_request(request_id: str, bucket: str = "doc_storage"):
    """Background task: redact every seed file attached to ``request_id``.

    Each file is downloaded from ``bucket``, redacted locally and uploaded
    back to the same storage path. The request status is set to "redacting"
    at the start, then to "redacted" on success or "failed" if anything
    raises. Errors are logged and not propagated to the caller.

    Args:
        request_id: Value matched against ``request_files.request_id`` and
            ``document_requests.id``.
        bucket: Storage bucket holding the files.
    """
    try:
        logger.info("Request arrived at redact_request function")
        set_request_status(request_id, "redacting")

        response = (
            supabase
            .from_("request_files")
            .select("id, storage_path")
            .eq("request_id", request_id)
            .eq("file_role", "seed")
            .execute()
        )

        files = response.data
        if not files:
            # NOTE(review): this "approved" write is immediately superseded by
            # the "failed" write in the handler below - confirm which final
            # status is intended for requests without files.
            set_request_status(request_id, "approved")
            raise Exception(f"No files found for request {request_id}")

        for file in files:
            storage_path = file["storage_path"]
            local_upload = UPLOAD_DIR / f"{uuid.uuid4()}.pdf"
            local_output = OUTPUT_DIR / f"{uuid.uuid4()}_redacted.pdf"
            try:
                download_file_from_supabase(bucket, storage_path, local_upload)
                redactor.redact_document(pdf_path=str(local_upload), output_path=str(local_output))
                upload_file_to_supabase(bucket, storage_path, local_output)
            finally:
                # Always remove the local copies - including the unredacted
                # download - even when one of the steps above fails.
                cleanup_temp_files([local_upload, local_output])

        set_request_status(request_id, "redacted")

    except Exception as e:
        # logger.exception records the traceback alongside the message.
        logger.exception(f"Redaction failed for {request_id}: {str(e)}")
        set_request_status(request_id, "failed")
181
+
182
+ # ----------------- Existing Endpoints -----------------
183
@app.get("/", response_model=HealthResponse)
async def root():
    # Health payload served at the API root.
    model_ready = redactor.is_model_loaded()
    return HealthResponse(status="healthy", version="1.0.0", model_loaded=model_ready)
190
+
191
@app.get("/health", response_model=HealthResponse)
async def health_check():
    # Health payload: fixed status/version plus whether the model is loaded.
    model_ready = redactor.is_model_loaded()
    return HealthResponse(status="healthy", version="1.0.0", model_loaded=model_ready)
198
+
199
@app.post("/redact", response_model=RedactionResponse)
async def redact_pdf(
    background_tasks: BackgroundTasks,
    file: UploadFile = File(...),
    dpi: int = 300,
    entity_types: Optional[str] = None
):
    # Redact an uploaded PDF and report the entities that were found.
    #
    # Parameters:
    #   file         - uploaded PDF; rejected with 400 unless its name ends in
    #                  ".pdf" (case-insensitive).
    #   dpi          - forwarded to the redactor.
    #   entity_types - optional comma-separated entity labels to restrict
    #                  redaction to; blank entries are ignored.
    #
    # On success the upload is scheduled for deletion and the redacted file
    # stays available through /download/{job_id}. Any processing error removes
    # both local files and is reported as a 500.
    filename = file.filename or ""  # filename may be absent on the upload
    if not filename.lower().endswith(".pdf"):
        raise HTTPException(status_code=400, detail="Only PDF files are supported")

    job_id = str(uuid.uuid4())
    upload_path = UPLOAD_DIR / f"{job_id}.pdf"
    output_path = OUTPUT_DIR / f"{job_id}_redacted.pdf"
    try:
        with upload_path.open("wb") as buffer:
            shutil.copyfileobj(file.file, buffer)

        entity_filter = None
        if entity_types:
            # Drop blanks so "PER,,ORG" or a trailing comma does not add an
            # empty label; an all-blank value means "no filter".
            labels = [et.strip() for et in entity_types.split(",")]
            entity_filter = [label for label in labels if label] or None

        result = redactor.redact_document(
            pdf_path=str(upload_path),
            output_path=str(output_path),
            dpi=dpi,
            entity_filter=entity_filter
        )

        response_entities = [
            RedactionEntity(
                entity_type=e['entity_type'],
                entity_text=e['entity_text'],
                page=e['words'][0]['page'] if e['words'] else 0,
                word_count=len(e['words'])
            ) for e in result['entities']
        ]

        background_tasks.add_task(cleanup_files, job_id)

        return RedactionResponse(
            job_id=job_id,
            status="completed",
            message=f"Successfully redacted {len(result['entities'])} entities",
            entities=response_entities,
            redacted_file_url=f"/download/{job_id}"
        )

    except Exception as e:
        logger.exception(f"Error processing job {job_id}: {str(e)}")
        # Remove whatever was written for this job before reporting the error.
        upload_path.unlink(missing_ok=True)
        output_path.unlink(missing_ok=True)
        raise HTTPException(status_code=500, detail=f"Error processing PDF: {str(e)}") from e
252
+
253
@app.get("/download/{job_id}")
async def download_redacted_pdf(job_id: str):
    # Serve the redacted PDF produced for job_id, or 404 when there is none.
    #
    # job_id comes straight from the URL and is interpolated into a filesystem
    # path, so only canonical UUID strings (the form generated by /redact) are
    # accepted; anything else is reported as not found.
    try:
        is_valid_id = str(uuid.UUID(job_id)) == job_id.lower()
    except ValueError:
        is_valid_id = False

    output_path = OUTPUT_DIR / f"{job_id}_redacted.pdf"
    if not is_valid_id or not output_path.exists():
        raise HTTPException(status_code=404, detail="Redacted file not found")
    return FileResponse(
        path=output_path,
        media_type="application/pdf",
        filename=f"redacted_{job_id}.pdf"
    )
263
+
264
@app.delete("/cleanup/{job_id}")
async def cleanup_job(job_id: str):
    # Delete the upload and the redacted output stored for job_id.
    #
    # job_id is taken from the URL and used to build the paths that get
    # deleted, so it must be a canonical UUID string (the form generated by
    # /redact); other values are rejected with 400 before touching the disk.
    try:
        is_valid_id = str(uuid.UUID(job_id)) == job_id.lower()
    except ValueError:
        is_valid_id = False
    if not is_valid_id:
        raise HTTPException(status_code=400, detail="Invalid job id")

    try:
        cleanup_files(job_id)
        # missing_ok avoids the race between an exists() check and unlink().
        (OUTPUT_DIR / f"{job_id}_redacted.pdf").unlink(missing_ok=True)
        return {"message": f"Successfully cleaned up files for job {job_id}"}
    except Exception as e:
        raise HTTPException(status_code=500, detail=f"Error cleaning up: {str(e)}") from e
274
+
275
@app.get("/stats")
async def get_stats():
    # Count the PDFs currently in the upload and output directories and
    # report whether the model is loaded.
    pdf_counts = {
        "pending_uploads": sum(1 for _ in UPLOAD_DIR.glob("*.pdf")),
        "processed_files": sum(1 for _ in OUTPUT_DIR.glob("*.pdf")),
    }
    return {**pdf_counts, "model_loaded": redactor.is_model_loaded()}
284
+
285
+ # ----------------- NEW Endpoints -----------------
286
@app.post("/redact_by_request/{request_id}", response_model=RedactionStatusResponse)
async def redact_by_request(request_id: str, background_tasks: BackgroundTasks):
    # Start background redaction of every file attached to request_id.
    #
    # Responses:
    #   404 - no document_requests row exists for request_id.
    #   200, status "redacting" - a job is already running; nothing new is
    #        dispatched.
    #   200, status "pending"   - the status was set to "pending" and the
    #        background task was queued.
    current_status = get_request_status(request_id)

    if current_status == "not_found":
        # Without this guard an unknown id would be queued and only fail later
        # inside the background task.
        raise HTTPException(status_code=404, detail=f"Request {request_id} not found")

    if current_status == "redacting":
        return RedactionStatusResponse(
            request_id=request_id,
            status="redacting",
            files=[],
            message="Redaction already in progress"
        )

    # Mark the request as pending before handing it to the background task.
    set_request_status(request_id, "pending")
    background_tasks.add_task(redact_request, request_id)

    return RedactionStatusResponse(
        request_id=request_id,
        status="pending",
        files=[],
        message="Redaction started in background"
    )
309
+
310
+ @app.get("/redaction_status/{request_id}", response_model=RedactionStatusResponse)
311
+ async def get_redaction_status(request_id: str):
312
+ status = get_request_status(request_id)
313
+
314
+ files: List[str] = []
315
+
316
+ if status == "redacted":
317
+ response = (
318
+ supabase
319
+ .from_("request_files")
320
+ .select("storage_path")
321
+ .eq("file_role","seed")
322
+ .eq("request_id", request_id)
323
+ .execute()
324
+ )
325
+ if response.data:
326
+ files = [
327
+ get_public_url("doc_storage", row["storage_path"])
328
+ for row in response.data
329
+ ]
330
+
331
+ message = {
332
+ "redacted": "Redaction completed",
333
+ "pending": "Redaction pending",
334
+ "redacting": "Redaction in progress",
335
+ "failed": "Redaction failed",
336
+ "not_found": "Request not found",
337
+ }.get(status, status)
338
+
339
+ return RedactionStatusResponse(
340
+ request_id=request_id,
341
+ status=status,
342
+ files=files,
343
+ message=message
344
+ )
outputs/.gitkeep ADDED
File without changes
requirements.txt ADDED
@@ -0,0 +1,14 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ fastapi==0.109.0
2
+ uvicorn[standard]==0.27.0
3
+ python-multipart==0.0.6
4
+ transformers>=4.45,<5.0
5
+ accelerate>=0.30
6
+ torch==2.2.2
7
+ pypdf==4.0.1
8
+ pdf2image==1.17.0
9
+ pytesseract==0.3.10
10
+ Pillow==10.2.0
11
+ pydantic==2.5.3
12
+ python-dotenv==1.0.0
13
+ supabase
14
+ numpy==1.26.4
uploads/.gitkeep ADDED
File without changes