Spaces:

Text-to-Document-Generation
/

PDF-Redaction-API

Sleeping

File size: 4,452 Bytes

af107f1

"""
Example client for PDF Redaction API
"""
import requests
from pathlib import Path
import sys


def redact_pdf(api_url: str, pdf_path: str, output_path: str = "redacted.pdf", 
               dpi: int = 300, entity_types: str = None) -> bool:
    """
    Redact a PDF file using the API

    Uploads the PDF to ``<api_url>/redact``, prints the status, message and
    entities reported by the API, then downloads the result from
    ``<api_url>/download/<job_id>`` and writes it to ``output_path``.

    Args:
        api_url: Base URL of the API (a trailing slash is tolerated)
        pdf_path: Path to the PDF file to redact
        output_path: Path to save the redacted PDF
        dpi: DPI for OCR processing
        entity_types: Comma-separated list of entity types to redact;
            omitted from the request when None or empty

    Returns:
        True if the redacted PDF was saved; False if the input is missing or
        not a regular file, a request failed, the API response did not have
        the expected shape, or a local file could not be read or written.
        Errors are printed, never raised.
    """
    source = Path(pdf_path)
    if not source.exists():
        print(f"Error: File {pdf_path} not found")
        return False
    # exists() is also true for directories, which would make open() raise.
    if not source.is_file():
        print(f"Error: {pdf_path} is not a file")
        return False

    print(f"Uploading {pdf_path}...")

    # Avoid "//redact" when the caller passes a base URL ending in "/".
    base_url = api_url.rstrip("/")

    # Prepare request
    params = {"dpi": dpi}
    if entity_types:
        params["entity_types"] = entity_types

    try:
        # Upload and redact; the handle is only needed for the upload itself,
        # so it is closed before the response is processed.
        with open(pdf_path, "rb") as pdf_file:
            response = requests.post(f"{base_url}/redact",
                                     files={"file": pdf_file}, params=params)
        response.raise_for_status()

        result = response.json()
        print(f"\nStatus: {result['status']}")
        print(f"Message: {result['message']}")

        # Display found entities
        if result.get('entities'):
            print("\nEntities redacted:")
            for i, entity in enumerate(result['entities'], 1):
                print(f"  {i}. {entity['entity_type']}: {entity['entity_text']} "
                      f"(Page {entity['page']}, {entity['word_count']} words)")

        # Download redacted file
        job_id = result['job_id']
        print(f"\nDownloading redacted PDF...")

        download_response = requests.get(f"{base_url}/download/{job_id}")
        download_response.raise_for_status()

        # Save file
        with open(output_path, "wb") as f:
            f.write(download_response.content)

        print(f"✓ Redacted PDF saved to: {output_path}")

        # Cleanup (optional)
        # requests.delete(f"{base_url}/cleanup/{job_id}")

        return True

    except (requests.exceptions.RequestException, OSError) as e:
        # Network/HTTP failures and local file read/write failures.
        print(f"Error: {e}")
        return False
    except (KeyError, TypeError, ValueError) as e:
        # Response was not JSON or lacked the fields this client reads.
        print(f"Error: unexpected API response: {e!r}")
        return False


def check_health(api_url: str):
    """Query the /health endpoint and print the reported state.

    Prints the ``status``, ``version`` and ``model_loaded`` fields of the
    JSON response. Returns True on success, or prints the error and returns
    False when the request raises a ``requests`` exception.
    """
    try:
        resp = requests.get(f"{api_url}/health")
        resp.raise_for_status()
        health = resp.json()
    except requests.exceptions.RequestException as err:
        print(f"Error checking health: {err}")
        return False

    # Reporting happens outside the try: only request errors are handled above.
    print(f"API Status: {health['status']}")
    print(f"Version: {health['version']}")
    print(f"Model Loaded: {health['model_loaded']}")
    return True


def get_stats(api_url: str):
    """Query the /stats endpoint and print the reported counters.

    Prints the ``pending_uploads``, ``processed_files`` and ``model_loaded``
    fields of the JSON response. Returns True on success, or prints the error
    and returns False when the request raises a ``requests`` exception.
    """
    try:
        resp = requests.get(f"{api_url}/stats")
        resp.raise_for_status()
        stats = resp.json()
    except requests.exceptions.RequestException as err:
        print(f"Error getting stats: {err}")
        return False

    # Reporting happens outside the try: only request errors are handled above.
    print("API Statistics:")
    print(f"  Pending uploads: {stats['pending_uploads']}")
    print(f"  Processed files: {stats['processed_files']}")
    print(f"  Model loaded: {stats['model_loaded']}")
    return True


if __name__ == "__main__":
    # Example usage: dispatch on the first command-line argument and exit
    # with status 0 on success, 1 on any failure.
    
    # For local development
    API_URL = "http://localhost:7860"
    
    # For HuggingFace Spaces (replace with your space URL)
    # API_URL = "https://your-username-pdf-redaction-api.hf.space"
    
    if len(sys.argv) < 2:
        print("Usage:")
        print("  python client_example.py <pdf_file> [output_file] [dpi]")
        print("\nOr check health:")
        print("  python client_example.py --health")
        print("\nOr get stats:")
        print("  python client_example.py --stats")
        sys.exit(1)
    
    if sys.argv[1] == "--health":
        ok = check_health(API_URL)
    elif sys.argv[1] == "--stats":
        ok = get_stats(API_URL)
    else:
        pdf_path = sys.argv[1]
        output_path = sys.argv[2] if len(sys.argv) > 2 else "redacted.pdf"
        
        dpi = 300
        if len(sys.argv) > 3:
            # Report a bad DPI argument cleanly instead of crashing with a traceback.
            try:
                dpi = int(sys.argv[3])
            except ValueError:
                print(f"Error: dpi must be an integer, got {sys.argv[3]!r}")
                sys.exit(1)
        
        # Optional: Filter specific entity types
        # entity_types = "PER,ORG"  # Only redact persons and organizations
        entity_types = None  # Redact all entity types
        
        ok = redact_pdf(API_URL, pdf_path, output_path, dpi, entity_types)
    
    # Each helper returns True/False; surface failure through the exit status
    # so shell scripts and CI can detect it.
    sys.exit(0 if ok else 1)