Sammi1211 committed on
Commit
af107f1
·
1 Parent(s): e262fe2

adding url support

Browse files
.dockerignore ADDED
@@ -0,0 +1,28 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ __pycache__
2
+ *.pyc
3
+ *.pyo
4
+ *.pyd
5
+ .Python
6
+ *.so
7
+ *.egg
8
+ *.egg-info
9
+ dist
10
+ build
11
+ .git
12
+ .gitignore
13
+ .env
14
+ .venv
15
+ venv/
16
+ env/
17
+ *.log
18
+ .DS_Store
19
+ .pytest_cache
20
+ .coverage
21
+ htmlcov/
22
+ uploads/*
23
+ outputs/*
24
+ !uploads/.gitkeep
25
+ !outputs/.gitkeep
26
+ *.pdf
27
+ README.md
28
+ .github/
.github/workflows/ci-cd.yml ADDED
@@ -0,0 +1,82 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ name: CI/CD Pipeline
2
+
3
+ on:
4
+ push:
5
+ branches: [ main, develop ]
6
+ pull_request:
7
+ branches: [ main ]
8
+
9
+ jobs:
10
+ test:
11
+ runs-on: ubuntu-latest
12
+
13
+ steps:
14
+ - uses: actions/checkout@v3
15
+
16
+ - name: Set up Python
17
+ uses: actions/setup-python@v4
18
+ with:
19
+ python-version: '3.10'
20
+
21
+ - name: Install system dependencies
22
+ run: |
23
+ sudo apt-get update
24
+ sudo apt-get install -y tesseract-ocr poppler-utils
25
+
26
+ - name: Install Python dependencies
27
+ run: |
28
+ python -m pip install --upgrade pip
29
+ pip install -r requirements.txt
30
+ pip install pytest pytest-cov httpx
31
+
32
+ - name: Run tests
33
+ run: |
34
+ pytest tests/ -v --cov=app --cov-report=xml
35
+
36
+ - name: Upload coverage
37
+ uses: codecov/codecov-action@v3
38
+ with:
39
+ file: ./coverage.xml
40
+ fail_ci_if_error: false
41
+
42
+ docker-build:
43
+ runs-on: ubuntu-latest
44
+ needs: test
45
+
46
+ steps:
47
+ - uses: actions/checkout@v3
48
+
49
+ - name: Set up Docker Buildx
50
+ uses: docker/setup-buildx-action@v2
51
+
52
+ - name: Build Docker image
53
+ run: |
54
+ docker build -t pdf-redaction-api:test .
55
+
56
+ - name: Test Docker image
57
+ run: |
58
+ docker run -d -p 7860:7860 --name test-api pdf-redaction-api:test
59
+ sleep 10
60
+ curl -f http://localhost:7860/health || exit 1
61
+ docker stop test-api
62
+
63
+ deploy-huggingface:
64
+ runs-on: ubuntu-latest
65
+ needs: [test, docker-build]
66
+ if: github.ref == 'refs/heads/main'
67
+
68
+ steps:
69
+ - uses: actions/checkout@v3
70
+
71
+ - name: Deploy to HuggingFace Spaces
72
+ env:
73
+ HF_TOKEN: ${{ secrets.HF_TOKEN }}
74
+ run: |
75
+ git config --global user.email "github-actions@github.com"
76
+ git config --global user.name "GitHub Actions"
77
+
78
+ # Add HuggingFace remote if it doesn't exist
79
+ git remote add hf https://user:$HF_TOKEN@huggingface.co/spaces/${{ secrets.HF_SPACE }} || true
80
+
81
+ # Push to HuggingFace
82
+ git push hf main:main
.gitignore ADDED
@@ -0,0 +1,57 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Python
2
+ __pycache__/
3
+ *.py[cod]
4
+ *$py.class
5
+ *.so
6
+ .Python
7
+ build/
8
+ develop-eggs/
9
+ dist/
10
+ downloads/
11
+ eggs/
12
+ .eggs/
13
+ lib/
14
+ lib64/
15
+ parts/
16
+ sdist/
17
+ var/
18
+ wheels/
19
+ *.egg-info/
20
+ .installed.cfg
21
+ *.egg
22
+
23
+ # Virtual environments
24
+ redact/
25
+ venv/
26
+ env/
27
+ ENV/
28
+ .venv
29
+
30
+ # IDE
31
+ .vscode/
32
+ .idea/
33
+ *.swp
34
+ *.swo
35
+ *~
36
+
37
+ # OS
38
+ .DS_Store
39
+ Thumbs.db
40
+
41
+ # Project specific
42
+ uploads/*.pdf
43
+ outputs/*.pdf
44
+ *.log
45
+
46
+ # Environment
47
+ .env
48
+ .env.local
49
+
50
+ # Testing
51
+ .pytest_cache/
52
+ .coverage
53
+ htmlcov/
54
+
55
+ # Model cache
56
+ cache/
57
+ models/
COMPLETE_GUIDE.md ADDED
@@ -0,0 +1,488 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # 🚀 Complete FastAPI Deployment Package
2
+
3
+ ## 📦 What You've Got
4
+
5
+ A production-ready FastAPI application for PDF redaction with Named Entity Recognition, ready to deploy on HuggingFace Spaces or any cloud platform.
6
+
7
+ ---
8
+
9
+ ## 📁 Directory Structure
10
+
11
+ ```
12
+ pdf-redaction-api/
13
+
14
+ ├── 📄 main.py # FastAPI application
15
+ ├── 🐳 Dockerfile # Production container
16
+ ├── 🐳 docker-compose.yml # Local development
17
+ ├── 📋 requirements.txt # Python dependencies
18
+
19
+ ├── 📱 app/
20
+ │ ├── __init__.py
21
+ │ └── redaction.py # Core redaction engine
22
+
23
+ ├── 📂 uploads/ # Temporary uploads
24
+ │ └── .gitkeep
25
+
26
+ ├── 📂 outputs/ # Redacted PDFs
27
+ │ └── .gitkeep
28
+
29
+ ├── 🧪 tests/
30
+ │ └── test_api.py # API tests
31
+
32
+ ├── 📚 Documentation/
33
+ │ ├── README.md # Main docs (for HF Spaces)
34
+ │ ├── DEPLOYMENT.md # Deployment guide
35
+ │ ├── QUICKSTART.md # Quick start guide
36
+ │ └── STRUCTURE.md # Project structure
37
+
38
+ ├── 🔧 Configuration/
39
+ │ ├── .env.example # Environment variables
40
+ │ ├── .gitignore # Git ignore
41
+ │ └── .dockerignore # Docker ignore
42
+
43
+ ├── 🤖 .github/
44
+ │ └── workflows/
45
+ │ └── ci-cd.yml # GitHub Actions CI/CD
46
+
47
+ ├── 📝 client_example.py # Example API client
48
+ └── 📜 LICENSE # MIT License
49
+ ```
50
+
51
+ ---
52
+
53
+ ## ✨ Features
54
+
55
+ ### Core Functionality
56
+ ✅ PDF upload and processing
57
+ ✅ OCR with pytesseract (configurable DPI)
58
+ ✅ Named Entity Recognition (NER)
59
+ ✅ Accurate coordinate-based redaction
60
+ ✅ Multiple entity type support
61
+ ✅ Downloadable redacted PDFs
62
+
63
+ ### API Features
64
+ ✅ RESTful API with FastAPI
65
+ ✅ Automatic OpenAPI documentation
66
+ ✅ File upload handling
67
+ ✅ Background task cleanup
68
+ ✅ Health checks
69
+ ✅ Statistics endpoint
70
+ ✅ CORS support
71
+
72
+ ### DevOps
73
+ ✅ Docker containerization
74
+ ✅ Docker Compose for local dev
75
+ ✅ GitHub Actions CI/CD
76
+ ✅ HuggingFace Spaces ready
77
+ ✅ Comprehensive testing
78
+ ✅ Logging and monitoring
79
+
80
+ ---
81
+
82
+ ## 🎯 Quick Deployment Paths
83
+
84
+ ### Option 1: HuggingFace Spaces (Recommended for Demo)
85
+
86
+ **Time: 10 minutes**
87
+
88
+ ```bash
89
+ # 1. Create Space on HuggingFace (select Docker SDK)
90
+ # 2. Clone your space
91
+ git clone https://huggingface.co/spaces/YOUR_USERNAME/pdf-redaction-api
92
+ cd pdf-redaction-api
93
+
94
+ # 3. Copy all files (the `*` glob skips dotfiles; copy .gitignore, .dockerignore and .github/ separately)
95
+ cp -r /path/to/pdf-redaction-api/* .
96
+
97
+ # 4. Deploy
98
+ git add .
99
+ git commit -m "Initial deployment"
100
+ git push
101
+ ```
102
+
103
+ **Your API will be at:** `https://YOUR_USERNAME-pdf-redaction-api.hf.space`
104
+
105
+ **Cost:** FREE (with CPU Basic tier)
106
+
107
+ ---
108
+
109
+ ### Option 2: Docker Locally
110
+
111
+ **Time: 5 minutes**
112
+
113
+ ```bash
114
+ # Build
115
+ docker build -t pdf-redaction-api .
116
+
117
+ # Run
118
+ docker run -p 7860:7860 pdf-redaction-api
119
+
120
+ # Test
121
+ curl http://localhost:7860/health
122
+ ```
123
+
124
+ ---
125
+
126
+ ### Option 3: Direct Python
127
+
128
+ **Time: 3 minutes**
129
+
130
+ ```bash
131
+ # Install dependencies
132
+ sudo apt-get install tesseract-ocr poppler-utils
133
+ pip install -r requirements.txt
134
+
135
+ # Run
136
+ python main.py
137
+
138
+ # Access at http://localhost:7860
139
+ ```
140
+
141
+ ---
142
+
143
+ ## 🔌 API Endpoints
144
+
145
+ ### Core Endpoints
146
+
147
+ | Method | Endpoint | Description |
148
+ |--------|----------|-------------|
149
+ | POST | `/redact` | Upload and redact PDF |
150
+ | GET | `/download/{job_id}` | Download redacted PDF |
151
+ | GET | `/health` | Health check |
152
+ | GET | `/stats` | API statistics |
153
+ | DELETE | `/cleanup/{job_id}` | Manual cleanup |
154
+ | GET | `/docs` | Interactive API docs |
155
+
156
+ ### Example Usage
157
+
158
+ **cURL:**
159
+ ```bash
160
+ curl -X POST "http://localhost:7860/redact" \
161
+ -F "file=@document.pdf" \
162
+ -F "dpi=300"
163
+ ```
164
+
165
+ **Python:**
166
+ ```python
167
+ import requests
168
+
169
+ response = requests.post(
170
+ "http://localhost:7860/redact",
171
+ files={"file": open("document.pdf", "rb")},
172
+ params={"dpi": 300}
173
+ )
174
+
175
+ job_id = response.json()["job_id"]
176
+ redacted = requests.get(f"http://localhost:7860/download/{job_id}")
177
+ ```
178
+
179
+ ---
180
+
181
+ ## 🎨 Architecture
182
+
183
+ ```
184
+ ┌─────────────────────────────────────────────────────────┐
185
+ │ CLIENT REQUEST │
186
+ │ (Upload PDF via POST /redact) │
187
+ └─────────────────────────────────────────────────────────┘
188
+
189
+ ┌─────────────────────────────────────────────────────────┐
190
+ │ FASTAPI (main.py) │
191
+ │ • Validate file │
192
+ │ • Generate job_id │
193
+ │ • Save to uploads/ │
194
+ └─────────────────────────────────────────────────────────┘
195
+
196
+ ┌─────────────────────────────────────────────────────────┐
197
+ │ PDFRedactor (app/redaction.py) │
198
+ │ │
199
+ │ ┌─────────────────────────────────────────┐ │
200
+ │ │ 1. OCR (pytesseract) │ │
201
+ │ │ • Convert PDF → Images (pdf2image) │ │
202
+ │ │ • Extract text + bounding boxes │ │
203
+ │ │ • Store image dimensions │ │
204
+ │ └─────────────────────────────────────────┘ │
205
+ │ ↓ │
206
+ │ ┌─────────────────────────────────────────┐ │
207
+ │ │ 2. NER (HuggingFace Transformers) │ │
208
+ │ │ • Load model │ │
209
+ │ │ • Identify entities in text │ │
210
+ │ │ • Return entity types + positions │ │
211
+ │ └─────────────────────────────────────────┘ │
212
+ │ ↓ │
213
+ │ ┌─────────────────────────────────────────┐ │
214
+ │ │ 3. Mapping │ │
215
+ │ │ • Create character span index │ │
216
+ │ │ • Match NER entities to OCR boxes │ │
217
+ │ └─────────────────────────────────────────┘ │
218
+ │ ↓ │
219
+ │ ┌─────────────────────────────────────────┐ │
220
+ │ │ 4. Redaction (pypdf) │ │
221
+ │ │ • Scale image coords → PDF coords │ │
222
+ │ │ • Create black rectangle annotations │ │
223
+ │ │ • Write redacted PDF │ │
224
+ │ └─────────────────────────────────────────┘ │
225
+ └─────────────────────────────────────────────────────────┘
226
+
227
+ ┌─────────────────────────────────────────────────────────┐
228
+ │ RESPONSE │
229
+ │ • job_id │
230
+ │ • List of entities │
231
+ │ • Download URL │
232
+ └─────────────────────────────────────────────────────────┘
233
+ ```
234
+
235
+ ---
236
+
237
+ ## 🔐 Security Considerations
238
+
239
+ ### Current Implementation
240
+ - ✅ File validation (PDF only)
241
+ - ✅ Temporary file cleanup
242
+ - ✅ CORS middleware
243
+ - ✅ Error handling
244
+
245
+ ### For Production (TODO)
246
+ - ⚠️ Add API key authentication
247
+ - ⚠️ Implement rate limiting
248
+ - ⚠️ Add file size limits
249
+ - ⚠️ Use HTTPS only
250
+ - ⚠️ Implement user quotas
251
+ - ⚠️ Add input sanitization
252
+
253
+ **Example API Key Auth:**
254
+ ```python
255
+ # Add to main.py
256
+ from fastapi import Security, HTTPException
257
+ from fastapi.security import APIKeyHeader
258
+
259
+ API_KEY = "your-secret-key"
260
+ api_key_header = APIKeyHeader(name="X-API-Key")
261
+
262
+ def verify_api_key(key: str = Security(api_key_header)):
263
+ if key != API_KEY:
264
+ raise HTTPException(401, "Invalid API Key")
265
+ ```
266
+
267
+ ---
268
+
269
+ ## 📊 Performance Tuning
270
+
271
+ ### DPI Settings
272
+
273
+ | DPI | Quality | Speed | Use Case |
274
+ |-----|---------|-------|----------|
275
+ | 150 | Low | Fast | Quick previews |
276
+ | 200 | Medium | Medium | General use |
277
+ | 300 | High | Slow | **Recommended** |
278
+ | 600 | Very High | Very Slow | Critical documents |
279
+
280
+ ### Hardware Requirements
281
+
282
+ **Minimum (Free Tier):**
283
+ - CPU: 2 cores
284
+ - RAM: 2GB
285
+ - Storage: 1GB
286
+
287
+ **Recommended (Production):**
288
+ - CPU: 4+ cores
289
+ - RAM: 8GB
290
+ - Storage: 10GB
291
+ - GPU: Optional (speeds up NER)
292
+
293
+ ---
294
+
295
+ ## 🧪 Testing
296
+
297
+ ```bash
298
+ # Install test dependencies
299
+ pip install pytest pytest-cov httpx
300
+
301
+ # Run tests
302
+ pytest tests/ -v
303
+
304
+ # With coverage
305
+ pytest tests/ --cov=app --cov-report=html
306
+
307
+ # View coverage report (`open` is macOS-only; use `xdg-open` on Linux)
308
+ open htmlcov/index.html
309
+ ```
310
+
311
+ ---
312
+
313
+ ## 📈 Monitoring
314
+
315
+ ### Built-in Endpoints
316
+
317
+ **Health Check:**
318
+ ```bash
319
+ curl http://localhost:7860/health
320
+ ```
321
+
322
+ **Statistics:**
323
+ ```bash
324
+ curl http://localhost:7860/stats
325
+ ```
326
+
327
+ ### Logs
328
+
329
+ **Development:**
330
+ ```bash
331
+ python main.py
332
+ # Logs appear in console
333
+ ```
334
+
335
+ **Docker:**
336
+ ```bash
337
+ docker logs -f container_name
338
+ ```
339
+
340
+ **HuggingFace Spaces:**
341
+ - View in Space dashboard → Logs tab
342
+
343
+ ---
344
+
345
+ ## 💰 Cost Estimation
346
+
347
+ ### HuggingFace Spaces
348
+
349
+ | Tier | CPU | RAM | Price | Use Case |
350
+ |------|-----|-----|-------|----------|
351
+ | Basic | 2 | 16GB | **FREE** | Demo, testing |
352
+ | CPU Upgrade | 4 | 32GB | $0.50/hr | Production |
353
+ | GPU T4 | - | - | $0.60/hr | Heavy load |
354
+ | GPU A10G | - | - | $1.50/hr | Enterprise |
355
+
356
+ **Monthly Costs (if always on):**
357
+ - Free: $0
358
+ - CPU Upgrade: ~$360/month
359
+ - GPU T4: ~$432/month
360
+
361
+ **Recommendation:** Start free, upgrade based on usage
362
+
363
+ ### Alternatives
364
+
365
+ **AWS ECS Fargate:** ~$30-100/month
366
+ **Google Cloud Run:** Pay per request (~$10-50/month)
367
+ **DigitalOcean App:** $12-24/month
368
+ **Self-hosted VPS:** $5-20/month
369
+
370
+ ---
371
+
372
+ ## 🔄 CI/CD Pipeline
373
+
374
+ ### Automated with GitHub Actions
375
+
376
+ ```
377
+ Push to GitHub
378
+
379
+ [Run Tests]
380
+
381
+ [Build Docker]
382
+
383
+ [Test Container]
384
+
385
+ [Deploy to HuggingFace]
386
+ ```
387
+
388
+ **Setup:**
389
+ 1. Add secrets in GitHub repo settings:
390
+ - `HF_TOKEN`: HuggingFace access token
391
+ - `HF_SPACE`: Your space name (username/space-name)
392
+
393
+ 2. Push to main branch → Auto-deploy! ✨
394
+
395
+ ---
396
+
397
+ ## 📚 Documentation Access
398
+
399
+ | Document | Purpose |
400
+ |----------|---------|
401
+ | `README.md` | Overview, API docs, usage examples |
402
+ | `QUICKSTART.md` | 5-minute setup guide |
403
+ | `DEPLOYMENT.md` | Production deployment |
404
+ | `STRUCTURE.md` | Code organization |
405
+ | `/docs` endpoint | Interactive API documentation |
406
+
407
+ ---
408
+
409
+ ## 🎓 Learning Resources
410
+
411
+ ### FastAPI
412
+ - Docs: https://fastapi.tiangolo.com
413
+ - Tutorial: https://fastapi.tiangolo.com/tutorial
414
+
415
+ ### HuggingFace
416
+ - Spaces: https://huggingface.co/docs/hub/spaces
417
+ - Transformers: https://huggingface.co/docs/transformers
418
+
419
+ ### Docker
420
+ - Getting Started: https://docs.docker.com/get-started
421
+
422
+ ---
423
+
424
+ ## 🐛 Troubleshooting
425
+
426
+ ### Common Issues
427
+
428
+ **Problem:** "Tesseract not found"
429
+ **Solution:** `apt-get install tesseract-ocr`
430
+
431
+ **Problem:** "Poppler not found"
432
+ **Solution:** `apt-get install poppler-utils`
433
+
434
+ **Problem:** Slow processing
435
+ **Solution:** Lower DPI to 150-200
436
+
437
+ **Problem:** Out of memory
438
+ **Solution:** Upgrade hardware or reduce DPI
439
+
440
+ **Problem:** Model not loading
441
+ **Solution:** Check internet, wait for download
442
+
443
+ ### Debug Mode
444
+
445
+ ```python
446
+ # In main.py, add debug mode
447
+ if __name__ == "__main__":
448
+ uvicorn.run("main:app", host="0.0.0.0", port=7860, reload=True, log_level="debug")
449
+ ```
450
+
451
+ ---
452
+
453
+ ## ✅ Checklist for Production
454
+
455
+ - [ ] Test all endpoints thoroughly
456
+ - [ ] Add API key authentication
457
+ - [ ] Implement rate limiting
458
+ - [ ] Set up monitoring (Sentry, DataDog, etc.)
459
+ - [ ] Configure auto-scaling
460
+ - [ ] Set up backups
461
+ - [ ] Add usage analytics
462
+ - [ ] Create user documentation
463
+ - [ ] Set up SSL/TLS (HF provides by default)
464
+ - [ ] Test with large files
465
+ - [ ] Load testing
466
+ - [ ] Security audit
467
+ - [ ] Legal compliance (GDPR, etc.)
468
+
469
+ ---
470
+
471
+ ## 🎉 You're Ready!
472
+
473
+ Your FastAPI PDF Redaction application is complete and ready to deploy!
474
+
475
+ ### Next Steps:
476
+ 1. ✨ Deploy to HuggingFace Spaces (easiest)
477
+ 2. 🧪 Test with real PDFs
478
+ 3. 📊 Monitor usage
479
+ 4. 🔒 Add security for production
480
+ 5. 🚀 Scale as needed
481
+
482
+ ### Support:
483
+ - 📖 Read the documentation
484
+ - 🐛 Check troubleshooting guide
485
+ - 💬 HuggingFace community forums
486
+ - 📧 Create issues on your repo
487
+
488
+ **Happy Deploying! 🚀**
DEPLOYMENT.md ADDED
@@ -0,0 +1,298 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Deployment Guide for HuggingFace Spaces
2
+
3
+ ## Prerequisites
4
+
5
+ 1. **HuggingFace Account**: Sign up at https://huggingface.co/
6
+ 2. **Git**: Installed on your local machine
7
+ 3. **Git LFS**: For large file storage (optional)
8
+
9
+ ## Step-by-Step Deployment
10
+
11
+ ### 1. Create a New Space
12
+
13
+ 1. Go to https://huggingface.co/spaces
14
+ 2. Click "Create new Space"
15
+ 3. Fill in the details:
16
+ - **Space name**: `pdf-redaction-api` (or your preferred name)
17
+ - **License**: MIT
18
+ - **SDK**: Docker
19
+ - **Hardware**: CPU Basic (free tier) or upgrade if needed
20
+ 4. Click "Create Space"
21
+
22
+ ### 2. Clone Your Space Repository
23
+
24
+ ```bash
25
+ git clone https://huggingface.co/spaces/YOUR_USERNAME/pdf-redaction-api
26
+ cd pdf-redaction-api
27
+ ```
28
+
29
+ ### 3. Copy All Files to the Repository
30
+
31
+ Copy all files from this project to your cloned space:
32
+
33
+ ```bash
34
+ # Copy all files (the `*` glob skips dotfiles, so copy .gitignore and .dockerignore separately)
35
+ cp -r /path/to/pdf-redaction-api/* .
36
+
37
+ # Check the files
38
+ ls -la
39
+ ```
40
+
41
+ You should see:
42
+ - `main.py`
43
+ - `app/`
44
+ - `Dockerfile`
45
+ - `requirements.txt`
46
+ - `README.md`
47
+ - `.gitignore`
48
+ - `.dockerignore`
49
+ - `uploads/` (with .gitkeep)
50
+ - `outputs/` (with .gitkeep)
51
+
52
+ ### 4. Commit and Push
53
+
54
+ ```bash
55
+ # Add all files
56
+ git add .
57
+
58
+ # Commit
59
+ git commit -m "Initial deployment of PDF Redaction API"
60
+
61
+ # Push to HuggingFace
62
+ git push
63
+ ```
64
+
65
+ ### 5. Monitor Deployment
66
+
67
+ 1. Go to your Space URL: `https://huggingface.co/spaces/YOUR_USERNAME/pdf-redaction-api`
68
+ 2. You'll see the build logs
69
+ 3. Wait for the build to complete (usually 5-10 minutes)
70
+ 4. Once complete, your API will be live!
71
+
72
+ ### 6. Test Your Deployment
73
+
74
+ ```bash
75
+ # Check health
76
+ curl https://YOUR_USERNAME-pdf-redaction-api.hf.space/health
77
+
78
+ # Test with a PDF
79
+ curl -X POST "https://YOUR_USERNAME-pdf-redaction-api.hf.space/redact" \
80
+ -F "file=@test.pdf" \
81
+ -F "dpi=300"
82
+ ```
83
+
84
+ ## Configuration Options
85
+
86
+ ### Hardware Upgrades
87
+
88
+ For better performance, consider upgrading your Space hardware:
89
+
90
+ 1. Go to Space Settings
91
+ 2. Click on "Hardware"
92
+ 3. Choose:
93
+ - **CPU Basic** (Free): Good for testing, slower processing
94
+ - **CPU Upgrade** (~$0.50/hour): Faster processing
95
+ - **GPU** (~$0.60-3/hour): Best for large documents
96
+
97
+ ### Environment Variables
98
+
99
+ Add environment variables in Space Settings if needed:
100
+
101
+ ```bash
102
+ HF_HOME=/app/cache
103
+ PYTHONUNBUFFERED=1
104
+ ```
105
+
106
+ ### Persistent Storage
107
+
108
+ For persistent file storage:
109
+
110
+ 1. Go to Space Settings
111
+ 2. Enable "Persistent Storage"
112
+ 3. This keeps uploaded/processed files between restarts
113
+
114
+ ## Custom Domain (Optional)
115
+
116
+ To use a custom domain:
117
+
118
+ 1. Go to Space Settings
119
+ 2. Click "Domains"
120
+ 3. Add your custom domain
121
+ 4. Follow DNS configuration instructions
122
+
123
+ ## Monitoring and Logs
124
+
125
+ ### View Logs
126
+
127
+ 1. Go to your Space page
128
+ 2. Click on "Logs" tab
129
+ 3. Monitor real-time logs
130
+
131
+ ### Check Resource Usage
132
+
133
+ 1. Click on "Insights" tab
134
+ 2. View CPU/Memory usage
135
+ 3. Monitor request patterns
136
+
137
+ ## Security Considerations
138
+
139
+ ### For Production Use
140
+
141
+ 1. **Add Authentication**:
142
+ - Implement API key authentication
143
+ - Use OAuth2 for user management
144
+
145
+ 2. **Rate Limiting**:
146
+ - Add rate limiting to prevent abuse
147
+ - Use slowapi or similar libraries
148
+
149
+ 3. **File Size Limits**:
150
+ - Restrict upload file sizes
151
+ - Implement timeout for long-running requests
152
+
153
+ 4. **HTTPS Only**:
154
+ - HuggingFace Spaces provides HTTPS by default
155
+ - Ensure all requests use HTTPS
156
+
157
+ Example with API key authentication:
158
+
159
+ ```python
160
+ from fastapi import Security, HTTPException, status
161
+ from fastapi.security import APIKeyHeader
162
+
163
+ API_KEY = "your-secret-key"
164
+ api_key_header = APIKeyHeader(name="X-API-Key")
165
+
166
+ def verify_api_key(api_key: str = Security(api_key_header)):
167
+ if api_key != API_KEY:
168
+ raise HTTPException(
169
+ status_code=status.HTTP_401_UNAUTHORIZED,
170
+ detail="Invalid API Key"
171
+ )
172
+ return api_key
173
+
174
+ # Add to endpoints
175
+ @app.post("/redact")
176
+ async def redact_pdf(
177
+ file: UploadFile = File(...),
178
+ api_key: str = Security(verify_api_key)
179
+ ):
180
+ # Your code here
181
+ ```
182
+
183
+ ## Troubleshooting
184
+
185
+ ### Build Fails
186
+
187
+ **Problem**: Docker build fails
188
+
189
+ **Solution**:
190
+ - Check Dockerfile syntax
191
+ - Ensure all dependencies are in requirements.txt
192
+ - Review build logs for specific errors
193
+
194
+ ### Out of Memory
195
+
196
+ **Problem**: API crashes with OOM errors
197
+
198
+ **Solution**:
199
+ - Reduce default DPI to 200
200
+ - Upgrade to larger hardware
201
+ - Implement request queuing
202
+
203
+ ### Slow Processing
204
+
205
+ **Problem**: Redaction takes too long
206
+
207
+ **Solution**:
208
+ - Lower DPI (150-200 for faster processing)
209
+ - Upgrade to GPU hardware
210
+ - Optimize batch processing
211
+
212
+ ### Model Download Issues
213
+
214
+ **Problem**: Model fails to download
215
+
216
+ **Solution**:
217
+ - Check HuggingFace model availability
218
+ - Verify internet access in Space
219
+ - Pre-download model and include in Docker image
220
+
221
+ ## Updating Your Space
222
+
223
+ To update your deployed API:
224
+
225
+ ```bash
226
+ # Make changes locally
227
+ # Test changes
228
+
229
+ # Commit and push
230
+ git add .
231
+ git commit -m "Update: description of changes"
232
+ git push
233
+
234
+ # HuggingFace will automatically rebuild
235
+ ```
236
+
237
+ ## Cost Estimation
238
+
239
+ ### Free Tier
240
+ - CPU Basic
241
+ - Limited to 2 CPU cores
242
+ - 16GB RAM
243
+ - Good for: Testing, low-traffic demos
244
+
245
+ ### Paid Tiers
246
+ - CPU Upgrade: ~$0.50/hour (~$360/month if always on)
247
+ - GPU T4: ~$0.60/hour (~$432/month)
248
+ - GPU A10G: ~$1.50/hour (~$1,080/month)
249
+
250
+ **Recommendation**: Start with free tier, upgrade based on usage
251
+
252
+ ## Alternative Deployment Options
253
+
254
+ ### 1. Deploy on Your Own Server
255
+
256
+ ```bash
257
+ # Build Docker image
258
+ docker build -t pdf-redaction-api .
259
+
260
+ # Run container
261
+ docker run -p 7860:7860 pdf-redaction-api
262
+ ```
263
+
264
+ ### 2. Deploy on Cloud Platforms
265
+
266
+ - **AWS ECS/Fargate**: For scalable production
267
+ - **Google Cloud Run**: Serverless container deployment
268
+ - **Azure Container Instances**: Easy container deployment
269
+ - **DigitalOcean App Platform**: Simple PaaS deployment
270
+
271
+ ### 3. Deploy on Render.com
272
+
273
+ 1. Connect your GitHub repo
274
+ 2. Select "Docker" as environment
275
+ 3. Deploy automatically
276
+
277
+ ## Support
278
+
279
+ For issues:
280
+ 1. Check HuggingFace Spaces documentation
281
+ 2. Review logs in Space dashboard
282
+ 3. Test locally with Docker first
283
+ 4. Open issue on your repository
284
+
285
+ ## Next Steps
286
+
287
+ After successful deployment:
288
+
289
+ 1. ✅ Test all API endpoints
290
+ 2. ✅ Set up monitoring
291
+ 3. ✅ Configure custom domain (optional)
292
+ 4. ✅ Add authentication for production
293
+ 5. ✅ Implement rate limiting
294
+ 6. ✅ Set up error tracking (e.g., Sentry)
295
+ 7. ✅ Create API documentation with examples
296
+ 8. ✅ Add usage analytics
297
+
298
+ Your API is now live and ready to use! 🚀
Dockerfile ADDED
@@ -0,0 +1,36 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ FROM python:3.12-slim
2
+
3
+ # Set working directory
4
+ WORKDIR /app
5
+
6
+ # Install system dependencies
7
+ RUN apt-get update && apt-get install -y \
8
+ tesseract-ocr \
9
+ tesseract-ocr-eng \
10
+ poppler-utils \
11
+ libgl1 \
12
+ libglib2.0-0 \
13
+ && rm -rf /var/lib/apt/lists/*
14
+
15
+ # Copy requirements first for better caching
16
+ COPY requirements.txt .
17
+
18
+ # Install Python dependencies
19
+ RUN pip install --no-cache-dir -r requirements.txt
20
+
21
+ # Copy application code
22
+ COPY . .
23
+
24
+ # Create necessary directories
25
+ RUN mkdir -p uploads outputs
26
+
27
+ # Expose port (HuggingFace Spaces uses 7860)
28
+ EXPOSE 7860
29
+
30
+ # Set environment variables
31
+ ENV PYTHONUNBUFFERED=1
32
+ ENV HF_HOME=/app/cache
33
+
34
+ # Run the application
35
+ CMD ["uvicorn", "main:app", "--host", "0.0.0.0", "--port", "7860"]
36
+
LICENSE ADDED
@@ -0,0 +1,21 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ MIT License
2
+
3
+ Copyright (c) 2025 PDF Redaction API
4
+
5
+ Permission is hereby granted, free of charge, to any person obtaining a copy
6
+ of this software and associated documentation files (the "Software"), to deal
7
+ in the Software without restriction, including without limitation the rights
8
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9
+ copies of the Software, and to permit persons to whom the Software is
10
+ furnished to do so, subject to the following conditions:
11
+
12
+ The above copyright notice and this permission notice shall be included in all
13
+ copies or substantial portions of the Software.
14
+
15
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21
+ SOFTWARE.
QUICKSTART.md ADDED
@@ -0,0 +1,271 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Quick Start Guide 🚀
2
+
3
+ ## Local Development (5 minutes)
4
+
5
+ ### 1. Install System Dependencies
6
+
7
+ **Ubuntu/Debian:**
8
+ ```bash
9
+ sudo apt-get update
10
+ sudo apt-get install -y tesseract-ocr poppler-utils
11
+ ```
12
+
13
+ **macOS:**
14
+ ```bash
15
+ brew install tesseract poppler
16
+ ```
17
+
18
+ **Windows:**
19
+ - Download Tesseract: https://github.com/UB-Mannheim/tesseract/wiki
20
+ - Download Poppler: https://github.com/oschwartz10612/poppler-windows/releases
21
+
22
+ ### 2. Install Python Dependencies
23
+
24
+ ```bash
25
+ pip install -r requirements.txt
26
+ ```
27
+
28
+ ### 3. Run the Server
29
+
30
+ ```bash
31
+ python main.py
32
+ ```
33
+
34
+ The API will be available at: `http://localhost:7860`
35
+
36
+ ### 4. Test with cURL
37
+
38
+ ```bash
39
+ # Health check
40
+ curl http://localhost:7860/health
41
+
42
+ # Redact a PDF
43
+ curl -X POST "http://localhost:7860/redact" \
44
+ -F "file=@your_document.pdf" \
45
+ -F "dpi=300"
46
+ ```
47
+
48
+ ### 5. Access API Documentation
49
+
50
+ Open in browser: `http://localhost:7860/docs`
51
+
52
+ ## Using Docker (3 minutes)
53
+
54
+ ### 1. Build Image
55
+
56
+ ```bash
57
+ docker build -t pdf-redaction-api .
58
+ ```
59
+
60
+ ### 2. Run Container
61
+
62
+ ```bash
63
+ docker run -p 7860:7860 pdf-redaction-api
64
+ ```
65
+
66
+ ### 3. Test
67
+
68
+ ```bash
69
+ curl http://localhost:7860/health
70
+ ```
71
+
72
+ ## Deploy to HuggingFace Spaces (10 minutes)
73
+
74
+ ### 1. Create Space
75
+
76
+ 1. Go to https://huggingface.co/spaces
77
+ 2. Click "Create new Space"
78
+ 3. Name: `pdf-redaction-api`
79
+ 4. SDK: **Docker**
80
+ 5. Click "Create Space"
81
+
82
+ ### 2. Push Code
83
+
84
+ ```bash
85
+ # Clone your space
86
+ git clone https://huggingface.co/spaces/YOUR_USERNAME/pdf-redaction-api
87
+ cd pdf-redaction-api
88
+
89
+ # Copy all project files (the `*` glob skips dotfiles, so copy .gitignore and .dockerignore separately)
90
+ cp -r /path/to/project/* .
91
+
92
+ # Commit and push
93
+ git add .
94
+ git commit -m "Initial deployment"
95
+ git push
96
+ ```
97
+
98
+ ### 3. Wait for Build
99
+
100
+ Monitor at: `https://huggingface.co/spaces/YOUR_USERNAME/pdf-redaction-api`
101
+
102
+ ### 4. Test Your Deployed API
103
+
104
+ ```bash
105
+ curl https://YOUR_USERNAME-pdf-redaction-api.hf.space/health
106
+ ```
107
+
108
+ ## Example Usage
109
+
110
+ ### Python Client
111
+
112
+ ```python
113
+ import requests
114
+
115
+ # Upload and redact
116
+ files = {"file": open("document.pdf", "rb")}
117
+ response = requests.post(
118
+ "http://localhost:7860/redact",
119
+ files=files,
120
+ params={"dpi": 300}
121
+ )
122
+
123
+ result = response.json()
124
+ job_id = result["job_id"]
125
+
126
+ # Download redacted PDF
127
+ redacted = requests.get(f"http://localhost:7860/download/{job_id}")
128
+ with open("redacted.pdf", "wb") as f:
129
+ f.write(redacted.content)
130
+
131
+ print(f"Redacted {len(result['entities'])} entities")
132
+ ```
133
+
134
+ ### JavaScript/Node.js
135
+
136
+ ```javascript
137
+ const FormData = require('form-data');
138
+ const fs = require('fs');
139
+ const axios = require('axios');
140
+
141
+ async function redactPDF() {
142
+ const form = new FormData();
143
+ form.append('file', fs.createReadStream('document.pdf'));
144
+
145
+ // Upload and redact
146
+ const response = await axios.post(
147
+ 'http://localhost:7860/redact',
148
+ form,
149
+ {
150
+ headers: form.getHeaders(),
151
+ params: { dpi: 300 }
152
+ }
153
+ );
154
+
155
+ const { job_id } = response.data;
156
+
157
+ // Download redacted PDF
158
+ const redacted = await axios.get(
159
+ `http://localhost:7860/download/${job_id}`,
160
+ { responseType: 'arraybuffer' }
161
+ );
162
+
163
+ fs.writeFileSync('redacted.pdf', redacted.data);
164
+ console.log('Redaction complete!');
165
+ }
166
+
167
+ redactPDF();
168
+ ```
169
+
170
+ ### cURL Advanced
171
+
172
+ ```bash
173
+ # Redact only specific entity types
174
+ curl -X POST "http://localhost:7860/redact" \
175
+ -F "file=@document.pdf" \
176
+ -F "dpi=300" \
177
+ -F "entity_types=PER,ORG"
178
+
179
+ # Get statistics
180
+ curl http://localhost:7860/stats
181
+
182
+ # Download specific file
183
+ curl -O -J http://localhost:7860/download/JOB_ID_HERE
184
+ ```
185
+
186
+ ## Common Use Cases
187
+
188
+ ### 1. Redact All Personal Information
189
+
190
+ ```python
191
+ response = requests.post(
192
+ "http://localhost:7860/redact",
193
+ files={"file": open("resume.pdf", "rb")},
194
+ params={"dpi": 300}
195
+ )
196
+ ```
197
+
198
+ ### 2. Redact Only Names and Organizations
199
+
200
+ ```python
201
+ response = requests.post(
202
+ "http://localhost:7860/redact",
203
+ files={"file": open("contract.pdf", "rb")},
204
+ params={
205
+ "dpi": 300,
206
+ "entity_types": "PER,ORG"
207
+ }
208
+ )
209
+ ```
210
+
211
+ ### 3. Fast Processing (Lower Quality)
212
+
213
+ ```python
214
+ response = requests.post(
215
+ "http://localhost:7860/redact",
216
+ files={"file": open("large_doc.pdf", "rb")},
217
+ params={"dpi": 150} # Faster but less accurate
218
+ )
219
+ ```
220
+
221
+ ### 4. High Quality (Slower)
222
+
223
+ ```python
224
+ response = requests.post(
225
+ "http://localhost:7860/redact",
226
+ files={"file": open("important.pdf", "rb")},
227
+ params={"dpi": 600} # Best quality, slowest
228
+ )
229
+ ```
230
+
231
+ ## Troubleshooting
232
+
233
+ ### "Model not loaded"
234
+ **Problem**: NER model failed to load
235
+ **Solution**: Check internet connection, wait for model download
236
+
237
+ ### "Tesseract not found"
238
+ **Problem**: OCR engine not installed
239
+ **Solution**: Install tesseract-ocr system package
240
+
241
+ ### "Poppler not found"
242
+ **Problem**: PDF converter not installed
243
+ **Solution**: Install poppler-utils system package
244
+
245
+ ### Slow processing
246
+ **Problem**: Redaction takes too long
247
+ **Solution**: Lower DPI to 150-200
248
+
249
+ ### Out of memory
250
+ **Problem**: Large PDF crashes the API
251
+ **Solution**:
252
+ - Process one page at a time
253
+ - Increase container memory
254
+ - Lower DPI
255
+
256
+ ## Next Steps
257
+
258
+ - ✅ Read full [README.md](README.md) for API details
259
+ - ✅ Check [DEPLOYMENT.md](DEPLOYMENT.md) for production setup
260
+ - ✅ Review [STRUCTURE.md](STRUCTURE.md) for code organization
261
+ - ✅ Run tests: `pytest tests/`
262
+ - ✅ Add authentication for production use
263
+ - ✅ Set up monitoring and logging
264
+
265
+ ## Support
266
+
267
+ - 📖 API Docs: `http://localhost:7860/docs`
268
+ - 🐛 Issues: Create on your repository
269
+ - 💬 HuggingFace: Community forums
270
+
271
+ Happy redacting! 🔒
README.md CHANGED
@@ -1,10 +1,167 @@
1
  ---
2
  title: PDF Redaction API
3
- emoji: 👀
4
  colorFrom: blue
5
- colorTo: blue
6
  sdk: docker
7
  pinned: false
 
8
  ---
9
 
10
- Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
  ---
2
  title: PDF Redaction API
3
+ emoji: 🔒
4
  colorFrom: blue
5
+ colorTo: green
6
  sdk: docker
7
  pinned: false
8
+ license: mit
9
  ---
10
 
11
+ # PDF Redaction API 🔒
12
+
13
+ Automatically redact sensitive information from PDF documents using Named Entity Recognition (NER).
14
+
15
+ ## Features
16
+
17
+ - 🤖 **Powered by NER**: Uses state-of-the-art Named Entity Recognition
18
+ - 📄 **PDF Support**: Upload and process PDF documents
19
+ - 🎯 **Accurate Redaction**: Correctly positioned black rectangles over sensitive text
20
+ - 🚀 **Fast Processing**: Optimized OCR and NER pipeline
21
+ - 🔧 **Configurable**: Adjust DPI and filter entity types
22
+
23
+ ## API Endpoints
24
+
25
+ ### `POST /redact`
26
+
27
+ Upload a PDF file and get it redacted.
28
+
29
+ **Parameters:**
30
+ - `file`: PDF file (required)
31
+ - `dpi`: OCR quality (default: 300)
32
+ - `entity_types`: Comma-separated entity types to redact (optional)
33
+
34
+ **Example using cURL:**
35
+
36
+ ```bash
37
+ curl -X POST "https://your-space.hf.space/redact?dpi=300" \
38
+ -F "file=@document.pdf"
40
+ ```
41
+
42
+ **Example using Python:**
43
+
44
+ ```python
45
+ import requests
46
+
47
+ url = "https://your-space.hf.space/redact"
48
+ files = {"file": open("document.pdf", "rb")}
49
+ params = {"dpi": 300}
50
+
51
+ response = requests.post(url, files=files, params=params)
52
+ result = response.json()
53
+
54
+ # Download redacted file
55
+ job_id = result["job_id"]
56
+ download_url = f"https://your-space.hf.space/download/{job_id}"
57
+ redacted_pdf = requests.get(download_url)
58
+
59
+ with open("redacted.pdf", "wb") as f:
60
+ f.write(redacted_pdf.content)
61
+ ```
62
+
63
+ ### `GET /download/{job_id}`
64
+
65
+ Download the redacted PDF file.
66
+
67
+ ### `GET /health`
68
+
69
+ Check API health and model status.
70
+
71
+ ### `GET /stats`
72
+
73
+ Get API statistics.
74
+
75
+ ## Response Format
76
+
77
+ ```json
78
+ {
79
+ "job_id": "uuid-here",
80
+ "status": "completed",
81
+ "message": "Successfully redacted 5 entities",
82
+ "entities": [
83
+ {
84
+ "entity_type": "PER",
85
+ "entity_text": "John Doe",
86
+ "page": 1,
87
+ "word_count": 2
88
+ }
89
+ ],
90
+ "redacted_file_url": "/download/uuid-here"
91
+ }
92
+ ```
93
+
94
+ ## Entity Types
95
+
96
+ Common entity types detected:
97
+ - `PER`: Person names
98
+ - `ORG`: Organizations
99
+ - `LOC`: Locations
100
+ - `DATE`: Dates
101
+ - `EMAIL`: Email addresses
102
+ - `PHONE`: Phone numbers
103
+ - And more...
104
+
105
+ ## Local Development
106
+
107
+ ### Prerequisites
108
+
109
+ - Python 3.10+
110
+ - Tesseract OCR
111
+ - Poppler utils
112
+
113
+ ### Installation
114
+
115
+ ```bash
116
+ # Install system dependencies (Ubuntu/Debian)
117
+ sudo apt-get install tesseract-ocr poppler-utils
118
+
119
+ # Install Python dependencies
120
+ pip install -r requirements.txt
121
+
122
+ # Run the server
123
+ python main.py
124
+ ```
125
+
126
+ The API will be available at `http://localhost:7860`
127
+
128
+ ### Using Docker
129
+
130
+ ```bash
131
+ # Build the image
132
+ docker build -t pdf-redaction-api .
133
+
134
+ # Run the container
135
+ docker run -p 7860:7860 pdf-redaction-api
136
+ ```
137
+
138
+ ## Configuration
139
+
140
+ Adjust the DPI parameter based on your needs:
141
+ - `150`: Fast processing, lower quality
142
+ - `300`: Recommended balance (default)
143
+ - `600`: High quality, slower processing
144
+
145
+ ## Limitations
146
+
147
+ - Maximum file size: Dependent on Space resources
148
+ - Processing time increases with page count and DPI
149
+ - Files are automatically cleaned up after processing
150
+
151
+ ## Privacy
152
+
153
+ - Uploaded files are stored temporarily on disk (`uploads/`, `outputs/`) during processing and cleaned up afterwards
154
+ - No data is stored permanently
155
+ - Use your own deployment for sensitive documents
156
+
157
+ ## Credits
158
+
159
+ Built with:
160
+ - [FastAPI](https://fastapi.tiangolo.com/)
161
+ - [Transformers](https://huggingface.co/transformers/)
162
+ - [PyPDF](https://github.com/py-pdf/pypdf)
163
+ - [Tesseract OCR](https://github.com/tesseract-ocr/tesseract)
164
+
165
+ ## License
166
+
167
+ MIT License - See LICENSE file for details
STRUCTURE.md ADDED
@@ -0,0 +1,269 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Project Structure
2
+
3
+ ```
4
+ pdf-redaction-api/
5
+
6
+ ├── main.py # FastAPI application entry point
7
+ ├── Dockerfile # Docker configuration for deployment
8
+ ├── requirements.txt # Python dependencies
9
+ ├── README.md # Project documentation (for HuggingFace)
10
+ ├── DEPLOYMENT.md # Deployment guide
11
+ ├── .gitignore # Git ignore rules
12
+ ├── .dockerignore # Docker ignore rules
13
+
14
+ ├── app/ # Application modules
15
+ │ ├── __init__.py # Package initialization
16
+ │ └── redaction.py # Core redaction logic (PDFRedactor class)
17
+
18
+ ├── uploads/ # Temporary upload directory
19
+ │ └── .gitkeep # Keep directory in git
20
+
21
+ ├── outputs/ # Redacted PDF output directory
22
+ │ └── .gitkeep # Keep directory in git
23
+
24
+ ├── tests/ # Test suite
25
+ │ ├── __init__.py
26
+ │ └── test_api.py # API endpoint tests
27
+
28
+ └── client_example.py # Example client for API usage
29
+ ```
30
+
31
+ ## File Descriptions
32
+
33
+ ### Core Files
34
+
35
+ #### `main.py`
36
+ FastAPI application with endpoints:
37
+ - `POST /redact` - Upload and redact PDF
38
+ - `GET /download/{job_id}` - Download redacted PDF
39
+ - `GET /health` - Health check
40
+ - `GET /stats` - API statistics
41
+ - `DELETE /cleanup/{job_id}` - Manual cleanup
42
+
43
+ #### `app/redaction.py`
44
+ Core redaction logic:
45
+ - `PDFRedactor` class
46
+ - OCR processing with pytesseract
47
+ - NER using HuggingFace transformers
48
+ - Entity-to-box mapping
49
+ - PDF redaction with coordinate scaling
50
+
51
+ ### Configuration Files
52
+
53
+ #### `requirements.txt`
54
+ Python dependencies:
55
+ - FastAPI & Uvicorn (API framework)
56
+ - Transformers & Torch (NER model)
57
+ - PyPDF (PDF manipulation)
58
+ - pdf2image (PDF to image conversion)
59
+ - pytesseract (OCR)
60
+ - Pillow (Image processing)
61
+
62
+ #### `Dockerfile`
63
+ Build steps (single-stage):
64
+ 1. Install system dependencies (tesseract, poppler)
65
+ 2. Install Python dependencies
66
+ 3. Copy application code
67
+ 4. Configure for port 7860 (HuggingFace default)
68
+
69
+ ### Documentation
70
+
71
+ #### `README.md`
72
+ HuggingFace Space documentation:
73
+ - Features overview
74
+ - API endpoint documentation
75
+ - Usage examples (cURL, Python)
76
+ - Response format
77
+ - Local development setup
78
+
79
+ #### `DEPLOYMENT.md`
80
+ Step-by-step deployment guide:
81
+ - HuggingFace Spaces setup
82
+ - Git workflow
83
+ - Configuration options
84
+ - Security considerations
85
+ - Troubleshooting
86
+ - Cost estimation
87
+
88
+ ### Testing & Examples
89
+
90
+ #### `tests/test_api.py`
91
+ Unit tests for API endpoints:
92
+ - Health check tests
93
+ - Upload validation tests
94
+ - Error handling tests
95
+
96
+ #### `client_example.py`
97
+ Example client implementation:
98
+ - Upload PDF
99
+ - Download redacted file
100
+ - Health check
101
+ - Statistics
102
+
103
+ ## Data Flow
104
+
105
+ ```
106
+ ┌─────────────────────────────────────────────────────────┐
107
+ │ 1. Client uploads PDF │
108
+ │ POST /redact with file │
109
+ └─────────────────────────────────────────────────────────┘
110
+
111
+ ┌─────────────────────────────────────────────────────────┐
112
+ │ 2. FastAPI (main.py) │
113
+ │ - Validates file │
114
+ │ - Generates job_id │
115
+ │ - Saves to uploads/ │
116
+ └─────────────────────────────────────────────────────────┘
117
+
118
+ ┌─────────────────────────────────────────────────────────┐
119
+ │ 3. PDFRedactor (app/redaction.py) │
120
+ │ - perform_ocr() → Extract text + boxes │
121
+ │ - run_ner() → Identify entities │
122
+ │ - map_entities_to_boxes() → Link entities to coords │
123
+ │ - create_redacted_pdf() → Generate output │
124
+ └─────────────────────────────────────────────────────────┘
125
+
126
+ ┌─────────────────────────────────────────────────────────┐
127
+ │ 4. Response │
128
+ │ - Return job_id and entity list │
129
+ │ - Save redacted PDF to outputs/ │
130
+ └─────────────────────────────────────────────────────────┘
131
+
132
+ ┌─────────────────────────────────────────────────────────┐
133
+ │ 5. Client downloads │
134
+ │ GET /download/{job_id} │
135
+ └─────────────────────────────────────────────────────────┘
136
+ ```
137
+
138
+ ## Key Components
139
+
140
+ ### 1. FastAPI Application (`main.py`)
141
+
142
+ **Endpoints:**
143
+ - RESTful API design
144
+ - File upload handling
145
+ - Background task cleanup
146
+ - CORS middleware for web access
147
+
148
+ **Features:**
149
+ - Automatic OpenAPI documentation at `/docs`
150
+ - JSON response models with Pydantic
151
+ - Error handling with HTTP exceptions
152
+ - Request validation
153
+
154
+ ### 2. Redaction Engine (`app/redaction.py`)
155
+
156
+ **Pipeline Steps:**
157
+
158
+ 1. **OCR Processing**
159
+ - Convert PDF pages to images (pdf2image)
160
+ - Extract text and bounding boxes (pytesseract)
161
+ - Store image dimensions for coordinate scaling
162
+
163
+ 2. **NER Processing**
164
+ - Load HuggingFace model
165
+ - Identify entities in text
166
+ - Return entity types and character positions
167
+
168
+ 3. **Mapping**
169
+ - Create character span index for OCR words
170
+ - Match NER entities to OCR bounding boxes
171
+ - Handle partial word matches
172
+
173
+ 4. **Redaction**
174
+ - Scale OCR image coordinates to PDF points
175
+ - Create black rectangle annotations
176
+ - Write redacted PDF with pypdf
177
+
178
+ ### 3. Docker Container
179
+
180
+ **Layers:**
181
+ - Base: Python 3.10 slim
182
+ - System packages: tesseract-ocr, poppler-utils
183
+ - Python packages: From requirements.txt
184
+ - Application code: Copied last for better caching
185
+
186
+ **Optimizations:**
187
+ - Multi-stage build (not used here, but possible)
188
+ - Minimal base image
189
+ - Cached dependency layers
190
+ - .dockerignore to reduce context size
191
+
192
+ ## Environment Variables
193
+
194
+ Default configuration (can be overridden):
195
+
196
+ ```bash
197
+ PYTHONUNBUFFERED=1 # Immediate log output
198
+ HF_HOME=/app/cache # HuggingFace cache directory
199
+ ```
200
+
201
+ ## Port Configuration
202
+
203
+ - **Development**: 7860 (configurable in main.py)
204
+ - **Production (HF Spaces)**: 7860 (required)
205
+
206
+ ## Directory Permissions
207
+
208
+ Ensure write permissions for:
209
+ - `uploads/` - Temporary PDF storage
210
+ - `outputs/` - Redacted PDF storage
211
+ - `cache/` - Model cache (created automatically)
212
+
213
+ ## Adding New Features
214
+
215
+ ### Add New Endpoint
216
+
217
+ 1. Define in `main.py`:
218
+ ```python
219
+ @app.get("/new-endpoint")
220
+ async def new_endpoint():
221
+ return {"message": "Hello"}
222
+ ```
223
+
224
+ 2. Add response model if needed
225
+ 3. Update README.md documentation
226
+ 4. Add tests in `tests/test_api.py`
227
+
228
+ ### Add New Redaction Option
229
+
230
+ 1. Modify `PDFRedactor` class in `app/redaction.py`
231
+ 2. Add parameter to `redact_document()` method
232
+ 3. Update API endpoint in `main.py`
233
+ 4. Document in README.md
234
+
235
+ ### Add Authentication
236
+
237
+ 1. Install: `pip install python-jose passlib`
238
+ 2. Create `app/auth.py` with JWT logic
239
+ 3. Add middleware to `main.py`
240
+ 4. Protect endpoints with dependencies
241
+
242
+ ## Best Practices
243
+
244
+ 1. **Logging**: Use `logger` for all important events
245
+ 2. **Error Handling**: Catch exceptions and return meaningful errors
246
+ 3. **Validation**: Use Pydantic models for request/response validation
247
+ 4. **Cleanup**: Always clean up temporary files
248
+ 5. **Documentation**: Keep README.md and code comments updated
249
+ 6. **Testing**: Add tests for new features
250
+
251
+ ## Performance Considerations
252
+
253
+ ### Bottlenecks
254
+ 1. OCR processing (most time-consuming)
255
+ 2. Model inference (NER)
256
+ 3. File I/O
257
+
258
+ ### Optimizations
259
+ - Lower DPI for faster OCR (trade-off with accuracy)
260
+ - Cache loaded models in memory
261
+ - Use async file operations
262
+ - Implement request queuing for high load
263
+ - Consider GPU for NER model
264
+
265
+ ### Scaling
266
+ - Horizontal: Multiple container instances
267
+ - Vertical: Larger CPU/RAM allocation
268
+ - Caching: Redis for temporary results
269
+ - Queue: Celery for background processing
app/__init__.py ADDED
@@ -0,0 +1,6 @@
 
 
 
 
 
 
 
1
+ """
2
+ App module for PDF redaction API
3
+ """
4
+ from .redaction import PDFRedactor
5
+
6
+ __all__ = ['PDFRedactor']
app/redaction.py ADDED
@@ -0,0 +1,316 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ PDF Redaction module using NER
3
+ """
4
+ from pdf2image import convert_from_path
5
+ import pytesseract
6
+ from pypdf import PdfReader, PdfWriter
7
+ from pypdf.generic import DictionaryObject, ArrayObject, NameObject, NumberObject
8
+ from transformers import pipeline, AutoTokenizer, AutoModelForTokenClassification
9
+ from typing import List, Dict, Optional
10
+ import logging
11
+
12
logger = logging.getLogger(__name__)


class PDFRedactor:
    """Redact named entities from a PDF using OCR and a token-classification model.

    Pipeline (see ``redact_document``): render each page to an image, OCR the
    image into words with bounding boxes, run NER over the space-joined words,
    map entity character spans back to OCR words, and draw filled black
    squares over the matching words.

    NOTE(review): the rectangles are ``/Square`` *annotations* added on top of
    the original pages. Page content is copied unchanged, so a text layer
    already present in the source PDF is presumably still extractable --
    confirm this is acceptable before relying on the output for real
    redaction.
    """

    # Prefixes used by BIO / BILOU tagging schemes ("B-PER", "I-ORG", ...).
    _TAG_PREFIXES = ("B-", "I-", "E-", "S-", "L-", "U-")

    def __init__(self, model_name: str = "./model"):
        """
        Initialize the PDF Redactor and eagerly load the NER model.

        Args:
            model_name: HuggingFace model name or local path for NER

        Raises:
            Exception: whatever the model/tokenizer loading raises
        """
        self.model_name = model_name
        self.ner_pipeline = None
        self._load_model()

    def _load_model(self):
        """Load tokenizer and model and build the token-classification pipeline."""
        try:
            logger.info(f"Loading NER model: {self.model_name}")
            tokenizer = AutoTokenizer.from_pretrained(self.model_name)
            model = AutoModelForTokenClassification.from_pretrained(self.model_name)

            self.ner_pipeline = pipeline("token-classification", model=model,
                                         tokenizer=tokenizer)
            logger.info("NER model loaded successfully")
        except Exception as e:
            logger.error(f"Error loading NER model: {str(e)}")
            raise

    def is_model_loaded(self) -> bool:
        """Return True once the NER pipeline has been created."""
        return self.ner_pipeline is not None

    @staticmethod
    def _parse_confidence(raw) -> int:
        """Convert a Tesseract confidence cell to an int; unparsable values become -1.

        Depending on the Tesseract / pytesseract versions the cell may be an
        int, an int-like string ("-1") or a float-like string ("96.06");
        ``int("96.06")`` would raise, so the value is parsed via ``float``.
        """
        try:
            return int(float(raw))
        except (TypeError, ValueError, OverflowError):
            return -1

    @classmethod
    def _normalize_label(cls, label: str) -> str:
        """Strip a BIO/BILOU prefix from a tag, e.g. ``"B-PER"`` -> ``"PER"``."""
        if isinstance(label, str) and label[:2] in cls._TAG_PREFIXES:
            return label[2:]
        return label

    @classmethod
    def _filter_entities(cls, mapped_entities: List[Dict],
                         entity_filter: List[str]) -> List[Dict]:
        """Keep entities whose type, raw or with its tag prefix stripped, is requested.

        An exact match keeps working as before; additionally a filter value
        such as ``"PER"`` now matches token-level tags like ``"B-PER"`` and
        ``"I-PER"``, which is what an un-aggregated pipeline emits for
        BIO-labelled models.
        """
        wanted = set(entity_filter)
        return [
            entity for entity in mapped_entities
            if entity['entity_type'] in wanted
            or cls._normalize_label(entity['entity_type']) in wanted
        ]

    def perform_ocr(self, pdf_path: str, dpi: int = 300) -> List[Dict]:
        """
        Perform OCR on PDF and extract word bounding boxes

        Args:
            pdf_path: Path to the PDF file
            dpi: DPI for PDF to image conversion

        Returns:
            List of word dicts with keys ``text``, ``box`` (left, top, width,
            height in image pixels), ``page`` (1-based), ``confidence``,
            ``image_width`` and ``image_height``

        Raises:
            Exception: any error from PDF rendering or OCR is logged and re-raised
        """
        logger.info(f"Starting OCR on {pdf_path} at {dpi} DPI")
        all_words_data = []

        try:
            images = convert_from_path(pdf_path, dpi=dpi)
            logger.info(f"Converted PDF to {len(images)} images")

            for page_number, image in enumerate(images, start=1):
                # Image size is stored per word so boxes can later be scaled
                # from pixels to PDF points.
                image_width, image_height = image.size

                data = pytesseract.image_to_data(image, output_type=pytesseract.Output.DICT)

                page_word_count = 0
                for i, raw_text in enumerate(data['text']):
                    word_text = raw_text.strip()
                    confidence = self._parse_confidence(data['conf'][i])

                    # Filter out empty or low-confidence words
                    if word_text and confidence > 0:
                        all_words_data.append({
                            'text': word_text,
                            'box': (data['left'][i], data['top'][i],
                                    data['width'][i], data['height'][i]),
                            'page': page_number,
                            'confidence': confidence,
                            'image_width': image_width,
                            'image_height': image_height
                        })
                        page_word_count += 1

                logger.info(f"Processed page {page_number}: {page_word_count} words")

            logger.info(f"OCR complete: {len(all_words_data)} total words extracted")
            return all_words_data

        except Exception as e:
            logger.error(f"Error during OCR: {str(e)}")
            raise

    def run_ner(self, text: str) -> List[Dict]:
        """
        Run NER on text

        Args:
            text: Input text

        Returns:
            List of identified entities as returned by the pipeline

        Raises:
            RuntimeError: if the NER pipeline has not been loaded
        """
        if not self.ner_pipeline:
            raise RuntimeError("NER model not loaded")

        logger.info(f"Running NER on text of length {len(text)}")

        try:
            results = self.ner_pipeline(text)
            logger.info(f"NER identified {len(results)} entities")
            return results
        except Exception as e:
            logger.error(f"Error during NER: {str(e)}")
            raise

    def map_entities_to_boxes(self, ner_results: List[Dict],
                              ocr_data: List[Dict]) -> List[Dict]:
        """
        Map NER entities to OCR bounding boxes

        Character offsets are reconstructed on the assumption that the NER
        input was the OCR words joined by single spaces (as done in
        ``redact_document``).

        Args:
            ner_results: List of NER entities with ``start``, ``end``, ``word``
                and either ``entity`` or ``entity_group``
            ocr_data: List of OCR word data

        Returns:
            List of dicts with ``entity_type``, ``entity_text`` and ``words``
            (the OCR word dicts overlapping the entity span); entities that
            overlap no word are dropped
        """
        logger.info("Mapping NER entities to OCR bounding boxes")
        mapped_entities = []

        # (start, end, word) spans in the joined text; the "+ 1" accounts for
        # the single space separator between consecutive words.
        word_spans = []
        cursor = 0
        for word_info in ocr_data:
            end = cursor + len(word_info['text'])
            word_spans.append((cursor, end, word_info))
            cursor = end + 1

        for ner_entity in ner_results:
            # Un-aggregated pipelines use "entity"; aggregated ones use
            # "entity_group".
            if 'entity' in ner_entity:
                ner_entity_type = ner_entity['entity']
            else:
                ner_entity_type = ner_entity['entity_group']
            ner_start = ner_entity['start']
            ner_end = ner_entity['end']

            matching_ocr_words = []
            for ocr_start, ocr_end, word_info in word_spans:
                # Spans are in increasing order, so nothing later can overlap.
                if ocr_start >= ner_end:
                    break
                if max(ocr_start, ner_start) < min(ocr_end, ner_end):
                    matching_ocr_words.append(word_info)

            if matching_ocr_words:
                mapped_entities.append({
                    'entity_type': ner_entity_type,
                    'entity_text': ner_entity['word'],
                    'words': matching_ocr_words
                })

        logger.info(f"Mapped {len(mapped_entities)} entities to bounding boxes")
        return mapped_entities

    def create_redacted_pdf(self, original_pdf_path: str,
                            mapped_entities: List[Dict],
                            output_path: str) -> str:
        """
        Create redacted PDF with black rectangles over entities

        Args:
            original_pdf_path: Path to original PDF
            mapped_entities: List of entities with bounding boxes
            output_path: Path for output PDF

        Returns:
            Path to redacted PDF

        Raises:
            Exception: any read/annotate/write error is logged and re-raised
        """
        # Local import: NumberObject is an int type and would truncate the
        # scaled coordinates, shrinking the top/right edge of every box.
        from pypdf.generic import FloatObject

        logger.info(f"Creating redacted PDF: {output_path}")

        try:
            # Group word boxes by page once instead of rescanning every entity
            # for every page. Token-level NER often hits the same word several
            # times; one rectangle per word box is enough.
            words_by_page: Dict[int, List[Dict]] = {}
            seen_boxes = set()
            for entity_info in mapped_entities:
                for word_info in entity_info['words']:
                    key = (word_info['page'], tuple(word_info['box']))
                    if key in seen_boxes:
                        continue
                    seen_boxes.add(key)
                    words_by_page.setdefault(word_info['page'], []).append(word_info)

            reader = PdfReader(original_pdf_path)
            writer = PdfWriter()

            for page_num, page in enumerate(reader.pages):
                media_box = page.mediabox
                page_width = float(media_box.width)
                page_height = float(media_box.height)

                writer.add_page(page)

                page_words = words_by_page.get(page_num + 1, [])
                for word_info in page_words:
                    x, y, w, h = word_info['box']

                    # Scale from OCR image pixels to PDF points
                    scale_x = page_width / word_info['image_width']
                    scale_y = page_height / word_info['image_height']

                    x_scaled = x * scale_x
                    y_scaled = y * scale_y
                    w_scaled = w * scale_x
                    h_scaled = h * scale_y

                    # Image origin is top-left; PDF origin is bottom-left.
                    # NOTE(review): assumes an unrotated page whose mediabox
                    # starts at (0, 0) -- confirm for unusual inputs.
                    llx = x_scaled
                    lly = page_height - (y_scaled + h_scaled)
                    urx = x_scaled + w_scaled
                    ury = page_height - y_scaled

                    redaction_annotation = DictionaryObject()
                    redaction_annotation.update({
                        NameObject("/Type"): NameObject("/Annot"),
                        NameObject("/Subtype"): NameObject("/Square"),
                        NameObject("/Rect"): ArrayObject([
                            FloatObject(llx),
                            FloatObject(lly),
                            FloatObject(urx),
                            FloatObject(ury),
                        ]),
                        # Black border and black interior colour
                        NameObject("/C"): ArrayObject([
                            NumberObject(0), NumberObject(0), NumberObject(0)
                        ]),
                        NameObject("/IC"): ArrayObject([
                            NumberObject(0), NumberObject(0), NumberObject(0)
                        ]),
                        NameObject("/BS"): DictionaryObject({
                            NameObject("/W"): NumberObject(0)
                        })
                    })

                    writer.add_annotation(page_number=page_num,
                                          annotation=redaction_annotation)

                logger.info(f"Page {page_num + 1}: Added {len(page_words)} redactions")

            # Write output
            with open(output_path, "wb") as output_file:
                writer.write(output_file)

            logger.info(f"Redacted PDF created successfully: {output_path}")
            return output_path

        except Exception as e:
            logger.error(f"Error creating redacted PDF: {str(e)}")
            raise

    def redact_document(self, pdf_path: str, output_path: str,
                        dpi: int = 300,
                        entity_filter: Optional[List[str]] = None) -> Dict:
        """
        Complete redaction pipeline

        Args:
            pdf_path: Path to input PDF
            output_path: Path for output PDF
            dpi: DPI for OCR
            entity_filter: List of entity types to redact (None or empty = all).
                A value matches a tag either exactly or after stripping a
                BIO/BILOU prefix, so "PER" also selects "B-PER"/"I-PER".

        Returns:
            Dictionary with ``output_path``, ``total_words``,
            ``total_entities`` (raw NER hits), ``redacted_entities`` and
            ``entities`` (the mapped, filtered entities)
        """
        logger.info(f"Starting redaction pipeline for {pdf_path}")

        # Step 1: OCR
        ocr_data = self.perform_ocr(pdf_path, dpi)

        # Step 2: Extract text (single-space join; map_entities_to_boxes
        # relies on this exact layout to rebuild character offsets)
        full_text = " ".join([word['text'] for word in ocr_data])

        # Step 3: NER
        ner_results = self.run_ner(full_text)

        # Step 4: Map entities to boxes
        mapped_entities = self.map_entities_to_boxes(ner_results, ocr_data)

        # Step 5: Filter entities if requested
        if entity_filter:
            mapped_entities = self._filter_entities(mapped_entities, entity_filter)
            logger.info(f"Filtered to {len(mapped_entities)} entities of types: {entity_filter}")

        # Step 6: Create redacted PDF
        self.create_redacted_pdf(pdf_path, mapped_entities, output_path)

        return {
            'output_path': output_path,
            'total_words': len(ocr_data),
            'total_entities': len(ner_results),
            'redacted_entities': len(mapped_entities),
            'entities': mapped_entities
        }
client_example.py ADDED
@@ -0,0 +1,142 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ Example client for PDF Redaction API
3
+ """
4
+ import requests
5
+ from pathlib import Path
6
+ import sys
7
+
8
+
9
def redact_pdf(api_url: str, pdf_path: str, output_path: str = "redacted.pdf",
               dpi: int = 300, entity_types: str = None, timeout: float = None):
    """
    Redact a PDF file using the API

    Uploads the file to ``{api_url}/redact``, prints the reported entities,
    then downloads the result from ``{api_url}/download/{job_id}``.

    Args:
        api_url: Base URL of the API
        pdf_path: Path to the PDF file to redact
        output_path: Path to save the redacted PDF
        dpi: DPI for OCR processing
        entity_types: Comma-separated list of entity types to redact
        timeout: Seconds to wait for each HTTP request; None (default) waits
            indefinitely, which matches the previous behaviour

    Returns:
        True if the redacted PDF was saved, False on any reported error
    """
    # Check if file exists
    if not Path(pdf_path).exists():
        print(f"Error: File {pdf_path} not found")
        return False

    print(f"Uploading {pdf_path}...")

    params = {"dpi": dpi}
    if entity_types:
        params["entity_types"] = entity_types

    try:
        # Upload and redact; the handle is closed as soon as the upload
        # request finishes instead of staying open during the download.
        with open(pdf_path, "rb") as pdf_file:
            response = requests.post(f"{api_url}/redact", files={"file": pdf_file},
                                     params=params, timeout=timeout)
        response.raise_for_status()

        result = response.json()
        print(f"\nStatus: {result['status']}")
        print(f"Message: {result['message']}")

        # Display found entities
        if result.get('entities'):
            print("\nEntities redacted:")
            for i, entity in enumerate(result['entities'], 1):
                print(f"  {i}. {entity['entity_type']}: {entity['entity_text']} "
                      f"(Page {entity['page']}, {entity['word_count']} words)")

        # Download redacted file
        job_id = result['job_id']
        print("\nDownloading redacted PDF...")

        download_response = requests.get(f"{api_url}/download/{job_id}", timeout=timeout)
        download_response.raise_for_status()

        # Save file
        with open(output_path, "wb") as f:
            f.write(download_response.content)

        print(f"✓ Redacted PDF saved to: {output_path}")

        # Cleanup (optional)
        # requests.delete(f"{api_url}/cleanup/{job_id}")

        return True

    except requests.exceptions.RequestException as e:
        print(f"Error: {e}")
        return False
    except (KeyError, ValueError) as e:
        # Response was not JSON or lacked an expected field
        print(f"Error: unexpected response from API: {e!r}")
        return False
74
+
75
+
76
def check_health(api_url: str, timeout: float = 10.0):
    """Check API health

    Requests ``{api_url}/health`` and prints the reported status, version and
    model state.

    Args:
        api_url: Base URL of the API
        timeout: Seconds to wait for the response, so an unresponsive server
            cannot block the probe forever

    Returns:
        True if the endpoint answered with the expected fields, else False
    """
    try:
        response = requests.get(f"{api_url}/health", timeout=timeout)
        response.raise_for_status()
        data = response.json()

        print(f"API Status: {data['status']}")
        print(f"Version: {data['version']}")
        print(f"Model Loaded: {data['model_loaded']}")

        return True
    except requests.exceptions.RequestException as e:
        print(f"Error checking health: {e}")
        return False
    except (KeyError, ValueError) as e:
        # Response was not JSON or lacked an expected field
        print(f"Error checking health: unexpected response: {e!r}")
        return False
91
+
92
+
93
def get_stats(api_url: str, timeout: float = 10.0):
    """Get API statistics

    Requests ``{api_url}/stats`` and prints the pending uploads, processed
    files and model state it reports.

    Args:
        api_url: Base URL of the API
        timeout: Seconds to wait for the response, so an unresponsive server
            cannot block the call forever

    Returns:
        True if the endpoint answered with the expected fields, else False
    """
    try:
        response = requests.get(f"{api_url}/stats", timeout=timeout)
        response.raise_for_status()
        data = response.json()

        print("API Statistics:")
        print(f"  Pending uploads: {data['pending_uploads']}")
        print(f"  Processed files: {data['processed_files']}")
        print(f"  Model loaded: {data['model_loaded']}")

        return True
    except requests.exceptions.RequestException as e:
        print(f"Error getting stats: {e}")
        return False
    except (KeyError, ValueError) as e:
        # Response was not JSON or lacked an expected field
        print(f"Error getting stats: unexpected response: {e!r}")
        return False
109
+
110
+
111
if __name__ == "__main__":
    # Command-line entry point:
    #   python client_example.py <pdf_file> [output_file] [dpi]
    #   python client_example.py --health
    #   python client_example.py --stats
    # The process exit status is 0 on success and 1 on any failure so the
    # script can be used from shell scripts and CI.

    # For local development
    API_URL = "http://localhost:7860"

    # For HuggingFace Spaces (replace with your space URL)
    # API_URL = "https://your-username-pdf-redaction-api.hf.space"

    if len(sys.argv) < 2:
        print("Usage:")
        print("  python client_example.py <pdf_file> [output_file] [dpi]")
        print("\nOr check health:")
        print("  python client_example.py --health")
        print("\nOr get stats:")
        print("  python client_example.py --stats")
        sys.exit(1)

    if sys.argv[1] == "--health":
        succeeded = check_health(API_URL)
    elif sys.argv[1] == "--stats":
        succeeded = get_stats(API_URL)
    else:
        pdf_path = sys.argv[1]
        output_path = sys.argv[2] if len(sys.argv) > 2 else "redacted.pdf"
        try:
            dpi = int(sys.argv[3]) if len(sys.argv) > 3 else 300
        except ValueError:
            # Report a bad dpi argument instead of dying with a traceback
            print(f"Error: dpi must be an integer, got {sys.argv[3]!r}")
            sys.exit(1)

        # Optional: Filter specific entity types
        # entity_types = "PER,ORG"  # Only redact persons and organizations
        entity_types = None  # Redact all entity types

        succeeded = redact_pdf(API_URL, pdf_path, output_path, dpi, entity_types)

    sys.exit(0 if succeeded else 1)
client_supabase.py ADDED
@@ -0,0 +1,9 @@
 
 
 
 
 
 
 
 
 
 
1
+ from supabase import create_client, Client
2
+ import os
3
+ from dotenv import load_dotenv
4
+ load_dotenv()
5
+
6
# Connection settings are read from the environment (populated from a .env
# file by load_dotenv() above when one exists).
SUPABASE_URL = os.getenv("SUPABASE_URL")
SUPABASE_KEY = os.getenv("SERVICE_ROLE_KEY")  # server-side key

# Fail fast with a message that names the *environment variables*; the key's
# variable name (SERVICE_ROLE_KEY) differs from the create_client parameter
# name, so a library-level error would not point at the right setting.
_missing_settings = [
    name
    for name, value in (("SUPABASE_URL", SUPABASE_URL), ("SERVICE_ROLE_KEY", SUPABASE_KEY))
    if not value
]
if _missing_settings:
    raise RuntimeError(
        "Missing required environment variable(s): " + ", ".join(_missing_settings)
    )

# Shared module-level client imported by the API.
supabase: Client = create_client(SUPABASE_URL, SUPABASE_KEY)
docker-compose.yml ADDED
@@ -0,0 +1,48 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ version: '3.8'
2
+
3
+ services:
4
+ api:
5
+ build:
6
+ context: .
7
+ dockerfile: Dockerfile
8
+ ports:
9
+ - "7860:7860"
10
+ volumes:
11
+ # Mount code for development (hot reload)
12
+ - .:/app
13
+ # Persistent storage for uploads/outputs
14
+ - ./uploads:/app/uploads
15
+ - ./outputs:/app/outputs
16
+ environment:
17
+ - PYTHONUNBUFFERED=1
18
+ - HF_HOME=/app/cache
19
+ - LOG_LEVEL=DEBUG
20
+ command: uvicorn main:app --host 0.0.0.0 --port 7860 --reload
21
+ restart: unless-stopped
22
+ healthcheck:
23
+ test: ["CMD", "curl", "-f", "http://localhost:7860/health"]
24
+ interval: 30s
25
+ timeout: 10s
26
+ retries: 3
27
+ start_period: 40s
28
+
29
+ # Optional: Add nginx for production
30
+ # nginx:
31
+ # image: nginx:alpine
32
+ # ports:
33
+ # - "80:80"
34
+ # volumes:
35
+ # - ./nginx.conf:/etc/nginx/nginx.conf
36
+ # depends_on:
37
+ # - api
38
+
39
+ # Optional: Add Redis for caching
40
+ # redis:
41
+ # image: redis:alpine
42
+ # ports:
43
+ # - "6379:6379"
44
+ # volumes:
45
+ # - redis-data:/data
46
+
47
+ # volumes:
48
+ # redis-data:
main.py ADDED
@@ -0,0 +1,332 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ FastAPI application for PDF redaction using NER
3
+ """
4
+ from fastapi import FastAPI, File, UploadFile, HTTPException, BackgroundTasks
5
+ from fastapi.responses import FileResponse
6
+ from fastapi.middleware.cors import CORSMiddleware
7
+ from pydantic import BaseModel
8
+ from typing import List, Optional, Dict
9
+ import uvicorn
10
+ import os
11
+ import uuid
12
+ import shutil
13
+ from pathlib import Path
14
+ import logging
15
+
16
+ from app.redaction import PDFRedactor
17
+ from client_supabase import supabase # Supabase client in separate file
18
+
19
+ # Configure logging
20
+ logging.basicConfig(level=logging.INFO)
21
+ logger = logging.getLogger(__name__)
22
+
23
+ # Initialize FastAPI app
24
+ app = FastAPI(
25
+ title="PDF Redaction API",
26
+ description="Redact sensitive information from PDFs using Named Entity Recognition",
27
+ version="1.0.0"
28
+ )
29
+
30
+ # CORS middleware
31
+ app.add_middleware(
32
+ CORSMiddleware,
33
+ allow_origins=["*"],
34
+ allow_credentials=True,
35
+ allow_methods=["*"],
36
+ allow_headers=["*"],
37
+ )
38
+
39
+ # Create directories
40
+ UPLOAD_DIR = Path("uploads")
41
+ OUTPUT_DIR = Path("outputs")
42
+ UPLOAD_DIR.mkdir(exist_ok=True)
43
+ OUTPUT_DIR.mkdir(exist_ok=True)
44
+
45
+ # Initialize redactor
46
+ redactor = PDFRedactor()
47
+
48
+ # ---------------- In-Memory Redaction Status Tracker ----------------
49
+ # request_id -> status (pending | processing | completed | failed)
50
+ redaction_status: Dict[str, str] = {}
51
+
52
+ # ---------------- Response Models ----------------
53
class RedactionEntity(BaseModel):
    # One redacted entity as reported in the POST /redact response.
    entity_type: str  # taken from the redactor result's 'entity_type'
    entity_text: str  # taken from the redactor result's 'entity_text'
    page: int  # 'page' of the entity's first word; 0 when the entity has no words
    word_count: int  # number of words that make up the entity
58
+
59
class RedactionResponse(BaseModel):
    # Response body of POST /redact.
    job_id: str  # UUID generated for this upload
    status: str
    message: str
    entities: Optional[List[RedactionEntity]] = None  # entities found by the redactor
    redacted_file_url: Optional[str] = None  # relative URL of the form /download/<job_id>
65
+
66
class RedactionStatusResponse(BaseModel):
    # Response body of /redact_by_request/{request_id} and /redaction_status/{request_id}.
    request_id: str
    status: str  # value tracked in redaction_status, or "not_found"
    files: List[str]  # public storage URLs once the status is "completed"; otherwise empty
    message: str
71
+
72
+
73
class HealthResponse(BaseModel):
    # Response body of the "/" and "/health" endpoints.
    status: str
    version: str
    model_loaded: bool  # result of redactor.is_model_loaded()
77
+
78
+ # ---------------- Helper Functions ----------------
79
def get_public_url(bucket: str, storage_path: str) -> str:
    """Build the public Supabase Storage URL for an object.

    The base URL is read from the ``SUPABASE_URL`` environment variable at
    call time and joined with the public-object route, the bucket name and
    the object's path inside the bucket.
    """
    base_url = os.getenv('SUPABASE_URL')
    object_route = f"storage/v1/object/public/{bucket}/{storage_path}"
    return f"{base_url}/{object_route}"
81
def cleanup_files(job_id: str):
    """Delete the uploaded source PDF that belongs to ``job_id``.

    Only ``UPLOAD_DIR/<job_id>.pdf`` is removed, and it is removed as soon as
    this function runs; the redacted output in ``OUTPUT_DIR`` is not touched.
    This is a best-effort operation: any failure is logged and swallowed so
    that callers are never interrupted by it.
    """
    upload_path = UPLOAD_DIR / f"{job_id}.pdf"
    try:
        # missing_ok avoids the race between an exists() check and unlink().
        upload_path.unlink(missing_ok=True)
        logger.info(f"Cleaned up files for job {job_id}")
    except Exception as e:
        logger.error(f"Error cleaning up files for job {job_id}: {str(e)}")
90
+
91
def cleanup_temp_files(paths: List[Path]):
    """Delete every file in ``paths``, silently skipping ones that are already gone.

    Args:
        paths: Files to remove. An empty list is a no-op.
    """
    for path in paths:
        # missing_ok replaces the racy exists()-then-unlink() pair.
        path.unlink(missing_ok=True)
95
+
96
def download_file_from_supabase(bucket: str, storage_path: str, local_path: Path):
    """Fetch an object from Supabase Storage and write it to ``local_path``.

    Args:
        bucket: Storage bucket to read from.
        storage_path: Path of the object inside the bucket.
        local_path: Destination file on the local filesystem.

    Raises:
        Exception: If the storage client returns an empty/falsy payload.
    """
    logger.info(f"Downloading {storage_path} to {local_path}")
    payload = supabase.storage.from_(bucket).download(storage_path)
    if payload:
        with local_path.open("wb") as out:
            out.write(payload)
        return
    raise Exception(f"Failed to download {storage_path}")
103
+
104
def upload_file_to_supabase(bucket: str, storage_path: str, local_path: Path):
    """Upload a local PDF to Supabase Storage, overwriting any existing object.

    Args:
        bucket: Storage bucket to write to.
        storage_path: Destination path of the object inside the bucket.
        local_path: Local file whose bytes are uploaded.
    """
    logger.info(f"Uploading {local_path} to {storage_path}")

    pdf_bytes = local_path.read_bytes()
    upload_options = {
        "upsert": "true",
        "content-type": "application/pdf"
    }

    target_bucket = supabase.storage.from_(bucket)
    target_bucket.upload(
        path=storage_path,
        file=pdf_bytes,
        file_options=upload_options
    )
118
+
119
def redact_request(request_id: str, bucket: str = "doc_storage"):
    """Background task: redact every stored file that belongs to ``request_id``.

    Each file listed in the ``request_files`` table for the request is
    downloaded, redacted, and uploaded back to the same storage path. Progress
    is recorded in the in-memory ``redaction_status`` map ("processing", then
    "completed" or "failed"). Errors are logged and never propagate.

    Args:
        request_id: Value matched against ``request_files.request_id``.
        bucket: Supabase Storage bucket holding the files.
    """
    try:
        redaction_status[request_id] = "processing"

        # Fetch all files for this request_id
        response = (
            supabase
            .from_("request_files")
            .select("id, storage_path")
            .eq("request_id", request_id)
            .execute()
        )

        files = response.data
        if not files:
            raise Exception(f"No files found for request {request_id}")

        for file in files:
            storage_path = file["storage_path"]
            local_upload = UPLOAD_DIR / f"{uuid.uuid4()}.pdf"
            local_output = OUTPUT_DIR / f"{uuid.uuid4()}_redacted.pdf"

            try:
                # Download from Supabase
                download_file_from_supabase(bucket, storage_path, local_upload)

                # Redact
                redactor.redact_document(pdf_path=str(local_upload), output_path=str(local_output))

                # Upload redacted back to same path
                upload_file_to_supabase(bucket, storage_path, local_output)
            finally:
                # Always remove the temporary copies, even when a step above fails,
                # so unredacted documents are not left behind on disk.
                cleanup_temp_files([local_upload, local_output])

        redaction_status[request_id] = "completed"

    except Exception as e:
        # exc_info keeps the traceback in the log for diagnosis.
        logger.error(f"Redaction failed for {request_id}: {str(e)}", exc_info=True)
        redaction_status[request_id] = "failed"
162
+
163
+ # ----------------- Existing Endpoints -----------------
164
@app.get("/", response_model=HealthResponse)
async def root():
    """Report service health at the API root."""
    model_ready = redactor.is_model_loaded()
    return HealthResponse(status="healthy", version="1.0.0", model_loaded=model_ready)
171
+
172
@app.get("/health", response_model=HealthResponse)
async def health_check():
    """Health probe: fixed status/version plus whether the redactor's model is loaded."""
    payload = {
        "status": "healthy",
        "version": "1.0.0",
        "model_loaded": redactor.is_model_loaded(),
    }
    return HealthResponse(**payload)
179
+
180
@app.post("/redact", response_model=RedactionResponse)
async def redact_pdf(
    background_tasks: BackgroundTasks,
    file: UploadFile = File(...),
    dpi: int = 300,
    entity_types: Optional[str] = None
):
    """Redact an uploaded PDF and report the entities that were redacted.

    Args:
        background_tasks: FastAPI task queue; used to delete the stored upload
            once the response has been sent.
        file: The PDF to redact. The filename must end in ``.pdf``
            (case-insensitive).
        dpi: Forwarded unchanged to ``redactor.redact_document``.
        entity_types: Optional comma-separated list of entity types to redact.
            Blank entries are ignored; when nothing remains, no filter is applied.

    Returns:
        A ``RedactionResponse`` with the job id, the redacted entities and the
        relative download URL of the redacted file.

    Raises:
        HTTPException: 400 when the upload is not a PDF by name, 500 when
            saving or redacting fails.
    """
    # UploadFile.filename may be None; treat that the same as a non-PDF name.
    filename = file.filename or ""
    if not filename.lower().endswith('.pdf'):
        raise HTTPException(status_code=400, detail="Only PDF files are supported")

    job_id = str(uuid.uuid4())
    upload_path = UPLOAD_DIR / f"{job_id}.pdf"
    output_path = OUTPUT_DIR / f"{job_id}_redacted.pdf"
    try:
        with upload_path.open("wb") as buffer:
            shutil.copyfileobj(file.file, buffer)

        entity_filter = None
        if entity_types:
            # Drop blanks such as the trailing item in "PER,ORG," so they do not
            # turn into empty-string filters.
            entity_filter = [
                et.strip() for et in entity_types.split(',') if et.strip()
            ] or None

        result = redactor.redact_document(
            pdf_path=str(upload_path),
            output_path=str(output_path),
            dpi=dpi,
            entity_filter=entity_filter
        )

        response_entities = [
            RedactionEntity(
                entity_type=e['entity_type'],
                entity_text=e['entity_text'],
                page=e['words'][0]['page'] if e['words'] else 0,
                word_count=len(e['words'])
            ) for e in result['entities']
        ]

        # The source upload is no longer needed once the output exists.
        background_tasks.add_task(cleanup_files, job_id)

        return RedactionResponse(
            job_id=job_id,
            status="completed",
            message=f"Successfully redacted {len(result['entities'])} entities",
            entities=response_entities,
            redacted_file_url=f"/download/{job_id}"
        )

    except Exception as e:
        logger.error(f"Error processing job {job_id}: {str(e)}", exc_info=True)
        # Remove partial artifacts so a failed job leaves nothing on disk.
        upload_path.unlink(missing_ok=True)
        output_path.unlink(missing_ok=True)
        raise HTTPException(status_code=500, detail=f"Error processing PDF: {str(e)}") from e
233
+
234
@app.get("/download/{job_id}")
async def download_redacted_pdf(job_id: str):
    """Return the redacted PDF produced for ``job_id``.

    Raises:
        HTTPException: 404 when no redacted file exists for the job, or when
            ``job_id`` would not map to a plain file name inside ``OUTPUT_DIR``.
    """
    file_name = f"{job_id}_redacted.pdf"
    # job_id comes straight from the URL: reject anything carrying a directory
    # component so the lookup cannot leave OUTPUT_DIR.
    if Path(file_name).name != file_name:
        raise HTTPException(status_code=404, detail="Redacted file not found")

    output_path = OUTPUT_DIR / file_name
    if not output_path.exists():
        raise HTTPException(status_code=404, detail="Redacted file not found")
    return FileResponse(
        path=output_path,
        media_type="application/pdf",
        filename=f"redacted_{job_id}.pdf"
    )
244
+
245
@app.delete("/cleanup/{job_id}")
async def cleanup_job(job_id: str):
    """Delete the upload and the redacted output stored for ``job_id``.

    Raises:
        HTTPException: 400 when ``job_id`` is not a plain file-name component,
            500 when deleting the files fails.
    """
    # job_id comes straight from the URL and is used to build paths that get
    # deleted: refuse anything carrying a directory component.
    if Path(job_id).name != job_id:
        raise HTTPException(status_code=400, detail="Invalid job id")

    try:
        cleanup_files(job_id)
        output_path = OUTPUT_DIR / f"{job_id}_redacted.pdf"
        # missing_ok avoids the race between an exists() check and unlink().
        output_path.unlink(missing_ok=True)
        return {"message": f"Successfully cleaned up files for job {job_id}"}
    except Exception as e:
        raise HTTPException(status_code=500, detail=f"Error cleaning up: {str(e)}")
255
+
256
@app.get("/stats")
async def get_stats():
    """Count the PDFs in the upload and output directories and report model state."""
    pending = sum(1 for _ in UPLOAD_DIR.glob("*.pdf"))
    processed = sum(1 for _ in OUTPUT_DIR.glob("*.pdf"))
    stats = {
        "pending_uploads": pending,
        "processed_files": processed,
        "model_loaded": redactor.is_model_loaded(),
    }
    return stats
265
+
266
+ # ----------------- NEW Endpoints -----------------
267
@app.post("/redact_by_request/{request_id}", response_model=RedactionStatusResponse)
async def redact_by_request(request_id: str, background_tasks: BackgroundTasks):
    """Schedule background redaction of all files belonging to ``request_id``.

    When a redaction for the same request is already queued ("pending") or
    running ("processing"), no new task is scheduled and the current status
    is returned instead.
    """
    current_status = redaction_status.get(request_id)
    # Guard "pending" as well as "processing": the task is queued before it
    # flips the status, so a second call in that window would otherwise
    # schedule a duplicate run over the same files.
    if current_status in ("pending", "processing"):
        return RedactionStatusResponse(
            request_id=request_id,
            status=current_status,
            files=[],
            message="Redaction already in progress"
        )

    redaction_status[request_id] = "pending"
    background_tasks.add_task(redact_request, request_id)
    return RedactionStatusResponse(
        request_id=request_id,
        status="pending",
        files=[],
        message="Redaction started in background"
    )
284
+
285
@app.get("/redaction_status/{request_id}", response_model=RedactionStatusResponse)
async def get_redaction_status(request_id: str):
    """Report the tracked redaction status for ``request_id``.

    The status comes from the in-memory ``redaction_status`` map and is
    "not_found" for unknown ids. When the status is "completed", ``files``
    holds the public storage URL of every file recorded for the request;
    otherwise it is empty.
    """
    status = redaction_status.get(request_id, "not_found")

    # Default empty response
    files: List[str] = []

    if status == "completed":
        # Fetch file paths from DB
        response = (
            supabase
            .from_("request_files")
            .select("storage_path")
            .eq("request_id", request_id)
            .execute()
        )

        if response.data:
            files = [
                get_public_url("doc_storage", row["storage_path"])
                for row in response.data
            ]

    # One message per tracked state; "processing" previously fell through to
    # "Request not found" although the request is known and running.
    status_messages = {
        "completed": "Redaction completed",
        "pending": "Redaction pending",
        "processing": "Redaction in progress",
        "failed": "Redaction failed",
    }
    message = status_messages.get(status, "Request not found")

    return RedactionStatusResponse(
        request_id=request_id,
        status=status,
        files=files,
        message=message
    )
324
+
325
+ # ----------------- Run Server -----------------
326
+ if __name__ == "__main__":
327
+ uvicorn.run(
328
+ "main:app",
329
+ host="localhost",
330
+ port=2700,
331
+ reload=False
332
+ )
model/.gitattributes ADDED
@@ -0,0 +1,35 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ *.7z filter=lfs diff=lfs merge=lfs -text
2
+ *.arrow filter=lfs diff=lfs merge=lfs -text
3
+ *.bin filter=lfs diff=lfs merge=lfs -text
4
+ *.bz2 filter=lfs diff=lfs merge=lfs -text
5
+ *.ckpt filter=lfs diff=lfs merge=lfs -text
6
+ *.ftz filter=lfs diff=lfs merge=lfs -text
7
+ *.gz filter=lfs diff=lfs merge=lfs -text
8
+ *.h5 filter=lfs diff=lfs merge=lfs -text
9
+ *.joblib filter=lfs diff=lfs merge=lfs -text
10
+ *.lfs.* filter=lfs diff=lfs merge=lfs -text
11
+ *.mlmodel filter=lfs diff=lfs merge=lfs -text
12
+ *.model filter=lfs diff=lfs merge=lfs -text
13
+ *.msgpack filter=lfs diff=lfs merge=lfs -text
14
+ *.npy filter=lfs diff=lfs merge=lfs -text
15
+ *.npz filter=lfs diff=lfs merge=lfs -text
16
+ *.onnx filter=lfs diff=lfs merge=lfs -text
17
+ *.ot filter=lfs diff=lfs merge=lfs -text
18
+ *.parquet filter=lfs diff=lfs merge=lfs -text
19
+ *.pb filter=lfs diff=lfs merge=lfs -text
20
+ *.pickle filter=lfs diff=lfs merge=lfs -text
21
+ *.pkl filter=lfs diff=lfs merge=lfs -text
22
+ *.pt filter=lfs diff=lfs merge=lfs -text
23
+ *.pth filter=lfs diff=lfs merge=lfs -text
24
+ *.rar filter=lfs diff=lfs merge=lfs -text
25
+ *.safetensors filter=lfs diff=lfs merge=lfs -text
26
+ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
27
+ *.tar.* filter=lfs diff=lfs merge=lfs -text
28
+ *.tar filter=lfs diff=lfs merge=lfs -text
29
+ *.tflite filter=lfs diff=lfs merge=lfs -text
30
+ *.tgz filter=lfs diff=lfs merge=lfs -text
31
+ *.wasm filter=lfs diff=lfs merge=lfs -text
32
+ *.xz filter=lfs diff=lfs merge=lfs -text
33
+ *.zip filter=lfs diff=lfs merge=lfs -text
34
+ *.zst filter=lfs diff=lfs merge=lfs -text
35
+ *tfevents* filter=lfs diff=lfs merge=lfs -text
model/README.md ADDED
@@ -0,0 +1,54 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ ---
2
+ pipeline_tag: token-classification
3
+ ---
4
+ # Model Card for Model ID
5
+
6
+ <!-- Provide a quick summary of what the model is/does. -->
7
+
8
+ This BERT-based NER model recognizes named entities in text in multiple languages, including Arabic, French, and English. It is adaptable to new labels, allowing users to extend its capabilities beyond the initial set of 11 predefined labels: 'Person_Name', 'Brand_vehicule', 'Model_vehicule', 'Organization_Name', 'location', 'phone_number', 'IBAN', 'credit_card', 'date_time', 'email', 'Identification_Number'.
9
+ ## Model Details
10
+
11
+ ### Model Description
12
+
13
+ <!-- Provide a longer summary of what this model is. -->
14
+
15
+
16
+
17
+ - **Developed by:** yahya mdarhri
18
+ - **Model type:** TOKEN CLASSIFICATION
19
+ - **Finetuned from model :** bert-base-multilingual-cased
20
+ - **License:** OPEN SOURCE
21
+
22
+
23
+ ## Uses
24
+
25
+ <!-- Address questions around how the model is intended to be used, including the foreseeable users of the model and those affected by the model. -->
26
+ Named Entity Recognition (NER): The primary purpose of this model is to perform Named Entity Recognition (NER) in text data. It identifies and categorizes entities such as names of people, organizations, locations, dates, and more.
27
+ Multilingual Support: The model is designed to support multiple languages, including Arabic, French, and English. It can be used by NLP practitioners, researchers, and developers working with text data in these languages.
28
+ Adaptability: Users can adapt the model to recognize new entity labels by providing labeled training data for the desired categories. This feature makes it versatile for various NER tasks.
29
+
30
+
31
+
32
+
33
+ ## Bias, Risks, and Limitations
34
+
35
+ <!-- This section is meant to convey both technical and sociotechnical limitations. -->
36
+
37
+ Bias and Fairness: Users and affected parties should be aware of potential biases in entity recognition, especially when it comes to personal names or other sensitive categories. Efforts should be made to minimize bias and ensure fairness in entity recognition.
38
+
39
+ Privacy: The model should be used responsibly to protect the privacy of individuals and organizations. When handling personally identifiable information (PII), data protection laws and privacy guidelines should be followed.
40
+
41
+ Transparency: Transparency in how the model operates, including its training data and evaluation metrics, is crucial to build trust with users and affected parties.
42
+
43
+ User Consent: If the model is used in applications where user data is processed, obtaining informed consent from users for data processing is essential.
44
+
45
+
46
+ We value your feedback! Please share your thoughts on this model. Thank you!
47
+
48
+
49
+ ## Model Card Contact
50
+ I build custom AI models and solutions. If you're interested in collaboration or have specific requirements, feel free to reach out.
51
+ yahyamdarhri00@gmail.com
52
+
53
+
54
+
model/config.json ADDED
@@ -0,0 +1,59 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "_name_or_path": "C:\\Users\\pc\\OneDrive\\Documents\\GitHub\\apprentissage_actif\\./results",
3
+ "architectures": [
4
+ "BertForTokenClassification"
5
+ ],
6
+ "attention_probs_dropout_prob": 0.1,
7
+ "classifier_dropout": null,
8
+ "directionality": "bidi",
9
+ "hidden_act": "gelu",
10
+ "hidden_dropout_prob": 0.1,
11
+ "hidden_size": 768,
12
+ "id2label": {
13
+ "0": "O",
14
+ "1": "Person_Name",
15
+ "2": "Brand_vehicule",
16
+ "3": "Model_vehicule",
17
+ "4": "Organization_Name",
18
+ "5": "location",
19
+ "6": "phone_number",
20
+ "7": "IBAN",
21
+ "8": "credit_card",
22
+ "9": "date_time",
23
+ "10": "email",
24
+ "11": "Identification_Number"
25
+ },
26
+ "initializer_range": 0.02,
27
+ "intermediate_size": 3072,
28
+ "label2id": {
29
+ "Brand_vehicule": 2,
30
+ "IBAN": 7,
31
+ "Identification_Number": 11,
32
+ "Model_vehicule": 3,
33
+ "O": 0,
34
+ "Organization_Name": 4,
35
+ "Person_Name": 1,
36
+ "credit_card": 8,
37
+ "date_time": 9,
38
+ "email": 10,
39
+ "location": 5,
40
+ "phone_number": 6
41
+ },
42
+ "layer_norm_eps": 1e-12,
43
+ "max_position_embeddings": 512,
44
+ "model_type": "bert",
45
+ "num_attention_heads": 12,
46
+ "num_hidden_layers": 12,
47
+ "pad_token_id": 0,
48
+ "pooler_fc_size": 768,
49
+ "pooler_num_attention_heads": 12,
50
+ "pooler_num_fc_layers": 3,
51
+ "pooler_size_per_head": 128,
52
+ "pooler_type": "first_token_transform",
53
+ "position_embedding_type": "absolute",
54
+ "torch_dtype": "float32",
55
+ "transformers_version": "4.30.2",
56
+ "type_vocab_size": 2,
57
+ "use_cache": true,
58
+ "vocab_size": 119547
59
+ }
model/pytorch_model.bin ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:985fcceb62b1be6e40a5dcca2789694fe6f933ca310591b852f6a074479f4b5a
3
+ size 709160429
model/special_tokens_map.json ADDED
@@ -0,0 +1,7 @@
 
 
 
 
 
 
 
 
1
+ {
2
+ "cls_token": "[CLS]",
3
+ "mask_token": "[MASK]",
4
+ "pad_token": "[PAD]",
5
+ "sep_token": "[SEP]",
6
+ "unk_token": "[UNK]"
7
+ }
model/tokenizer.json ADDED
The diff for this file is too large to render. See raw diff
 
model/tokenizer_config.json ADDED
@@ -0,0 +1,13 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "clean_up_tokenization_spaces": true,
3
+ "cls_token": "[CLS]",
4
+ "do_lower_case": false,
5
+ "mask_token": "[MASK]",
6
+ "model_max_length": 512,
7
+ "pad_token": "[PAD]",
8
+ "sep_token": "[SEP]",
9
+ "strip_accents": null,
10
+ "tokenize_chinese_chars": true,
11
+ "tokenizer_class": "BertTokenizer",
12
+ "unk_token": "[UNK]"
13
+ }
model/vocab.txt ADDED
The diff for this file is too large to render. See raw diff
 
outputs/.gitkeep ADDED
File without changes
requirements.txt ADDED
@@ -0,0 +1,13 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ fastapi==0.109.0
2
+ uvicorn[standard]==0.27.0
3
+ python-multipart==0.0.6
4
+ transformers==4.38
5
+ torch==2.2.2
6
+ pypdf==4.0.1
7
+ pdf2image==1.17.0
8
+ pytesseract==0.3.10
9
+ Pillow==10.2.0
10
+ pydantic==2.5.3
11
+ python-dotenv==1.0.0
12
+ supabase
13
+ numpy==1.26.4
tests/test_api.py ADDED
@@ -0,0 +1,67 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ Test cases for PDF Redaction API
3
+ """
4
+ import pytest
5
+ from fastapi.testclient import TestClient
6
+ from pathlib import Path
7
+ import sys
8
+
9
+ # Add parent directory to path
10
+ sys.path.insert(0, str(Path(__file__).parent.parent))
11
+
12
+ from main import app
13
+
14
+ client = TestClient(app)
15
+
16
+
17
def test_health_check():
    """GET /health answers 200 with a healthy status and a model flag."""
    resp = client.get("/health")
    assert resp.status_code == 200
    body = resp.json()
    assert body["status"] == "healthy"
    assert "model_loaded" in body
24
+
25
+
26
def test_root():
    """GET / answers 200 and reports a healthy status."""
    resp = client.get("/")
    assert resp.status_code == 200
    body = resp.json()
    assert body["status"] == "healthy"
32
+
33
+
34
def test_stats():
    """GET /stats answers 200 and exposes all three counters/flags."""
    resp = client.get("/stats")
    assert resp.status_code == 200
    body = resp.json()
    for expected_key in ("pending_uploads", "processed_files", "model_loaded"):
        assert expected_key in body
42
+
43
+
44
def test_redact_no_file():
    """POST /redact without a file part is rejected by request validation."""
    resp = client.post("/redact")
    # 422 = Unprocessable Entity
    assert resp.status_code == 422
48
+
49
+
50
def test_redact_wrong_file_type():
    """POST /redact with a non-PDF upload is rejected with 400."""
    text_upload = ("test.txt", b"test content", "text/plain")
    resp = client.post("/redact", files={"file": text_upload})
    assert resp.status_code == 400
55
+
56
+
57
def test_download_nonexistent():
    """GET /download/<id> for an unknown job id answers 404."""
    unknown_job_url = "/download/nonexistent-id"
    resp = client.get(unknown_job_url)
    assert resp.status_code == 404
61
+
62
+
63
+ # Add more tests as needed
64
+ # - Test with actual PDF file
65
+ # - Test with different DPI values
66
+ # - Test with entity type filtering
67
+ # - Test cleanup functionality
uploads/.gitkeep ADDED
File without changes