"""
Example client for PDF Redaction API
"""
import requests
from pathlib import Path
import sys
def redact_pdf(api_url: str, pdf_path: str, output_path: str = "redacted.pdf",
               dpi: int = 300, entity_types: str = None,
               timeout: float = 300.0):
    """
    Redact a PDF file using the API

    Uploads the PDF to ``{api_url}/redact``, prints the status, message and
    any entities listed in the JSON response, then downloads the result from
    ``{api_url}/download/{job_id}`` and writes it to ``output_path``.

    Args:
        api_url: Base URL of the API
        pdf_path: Path to the PDF file to redact
        output_path: Path to save the redacted PDF
        dpi: DPI for OCR processing
        entity_types: Comma-separated list of entity types to redact;
            left out of the request when empty or None
        timeout: Seconds to wait on each HTTP request before giving up;
            None waits indefinitely

    Returns:
        True if the redacted PDF was saved. False if ``pdf_path`` is missing
        or not a regular file, if a request failed, or if the response did
        not have the expected fields.
    """
    source = Path(pdf_path)
    if not source.exists():
        print(f"Error: File {pdf_path} not found")
        return False
    # A directory passes exists() but cannot be opened for upload.
    if not source.is_file():
        print(f"Error: {pdf_path} is not a file")
        return False

    print(f"Uploading {pdf_path}...")

    # Prepare request
    params = {"dpi": dpi}
    if entity_types:
        params["entity_types"] = entity_types

    try:
        # Upload and redact. The built-in open() is used (not Path.open) so
        # the handle's name stays a plain string for the multipart filename.
        with open(pdf_path, "rb") as pdf_file:
            response = requests.post(
                f"{api_url}/redact",
                files={"file": pdf_file},
                params=params,
                timeout=timeout,
            )
        response.raise_for_status()
        result = response.json()

        print(f"\nStatus: {result['status']}")
        print(f"Message: {result['message']}")

        # Display found entities
        if result.get('entities'):
            print("\nEntities redacted:")
            for i, entity in enumerate(result['entities'], 1):
                print(f" {i}. {entity['entity_type']}: {entity['entity_text']} "
                      f"(Page {entity['page']}, {entity['word_count']} words)")

        # Download redacted file
        job_id = result['job_id']
        print("\nDownloading redacted PDF...")
        download_response = requests.get(
            f"{api_url}/download/{job_id}", timeout=timeout
        )
        download_response.raise_for_status()

        # Save file
        with open(output_path, "wb") as f:
            f.write(download_response.content)
        print(f"✓ Redacted PDF saved to: {output_path}")

        # Cleanup (optional)
        # requests.delete(f"{api_url}/cleanup/{job_id}")
        return True
    except requests.exceptions.RequestException as e:
        print(f"Error: {e}")
        return False
    except (KeyError, TypeError, ValueError) as e:
        # The body was not JSON, or lacked a field this client reads.
        print(f"Error: unexpected response format ({e!r})")
        return False
def check_health(api_url: str, timeout: float = 10.0):
    """Check API health

    Requests ``{api_url}/health`` and prints the ``status``, ``version`` and
    ``model_loaded`` fields of the JSON response.

    Args:
        api_url: Base URL of the API
        timeout: Seconds to wait for the request before giving up;
            None waits indefinitely

    Returns:
        True if the request succeeded and all fields were printed,
        False if the request failed or the response lacked a field.
    """
    try:
        response = requests.get(f"{api_url}/health", timeout=timeout)
        response.raise_for_status()
        data = response.json()
        print(f"API Status: {data['status']}")
        print(f"Version: {data['version']}")
        print(f"Model Loaded: {data['model_loaded']}")
        return True
    except requests.exceptions.RequestException as e:
        print(f"Error checking health: {e}")
        return False
    except (KeyError, TypeError, ValueError) as e:
        # The body was not JSON, or lacked a field this client reads.
        print(f"Error checking health: unexpected response format ({e!r})")
        return False
def get_stats(api_url: str, timeout: float = 10.0):
    """Get API statistics

    Requests ``{api_url}/stats`` and prints the ``pending_uploads``,
    ``processed_files`` and ``model_loaded`` fields of the JSON response.

    Args:
        api_url: Base URL of the API
        timeout: Seconds to wait for the request before giving up;
            None waits indefinitely

    Returns:
        True if the request succeeded and all fields were printed,
        False if the request failed or the response lacked a field.
    """
    try:
        response = requests.get(f"{api_url}/stats", timeout=timeout)
        response.raise_for_status()
        data = response.json()
        print("API Statistics:")
        print(f" Pending uploads: {data['pending_uploads']}")
        print(f" Processed files: {data['processed_files']}")
        print(f" Model loaded: {data['model_loaded']}")
        return True
    except requests.exceptions.RequestException as e:
        print(f"Error getting stats: {e}")
        return False
    except (KeyError, TypeError, ValueError) as e:
        # The body was not JSON, or lacked a field this client reads.
        print(f"Error getting stats: unexpected response format ({e!r})")
        return False
if __name__ == "__main__":
    # Command-line entry point. The first argument selects the action:
    # --health, --stats, or a PDF path to redact. The process exit status is
    # 0 when the chosen action returns True and 1 otherwise.

    # For local development
    API_URL = "http://localhost:7860"
    # For HuggingFace Spaces (replace with your space URL)
    # API_URL = "https://your-username-pdf-redaction-api.hf.space"

    if len(sys.argv) < 2:
        print("Usage:")
        print(" python client_example.py <pdf_file> [output_file] [dpi]")
        print("\nOr check health:")
        print(" python client_example.py --health")
        print("\nOr get stats:")
        print(" python client_example.py --stats")
        sys.exit(1)

    if sys.argv[1] == "--health":
        succeeded = check_health(API_URL)
    elif sys.argv[1] == "--stats":
        succeeded = get_stats(API_URL)
    else:
        pdf_path = sys.argv[1]
        output_path = sys.argv[2] if len(sys.argv) > 2 else "redacted.pdf"
        try:
            dpi = int(sys.argv[3]) if len(sys.argv) > 3 else 300
        except ValueError:
            # Report a bad dpi argument as a usage error, not a traceback.
            print(f"Error: dpi must be an integer, got {sys.argv[3]!r}")
            sys.exit(1)
        # Optional: Filter specific entity types
        # entity_types = "PER,ORG" # Only redact persons and organizations
        entity_types = None  # Redact all entity types
        succeeded = redact_pdf(API_URL, pdf_path, output_path, dpi, entity_types)

    # Propagate failure to the shell so callers can detect it.
    sys.exit(0 if succeeded else 1)
|