# File size: 4,452 Bytes
# af107f1
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
"""
Example client for PDF Redaction API
"""
import requests
from pathlib import Path
import sys


def redact_pdf(api_url: str, pdf_path: str, output_path: str = "redacted.pdf",
               dpi: int = 300, entity_types: str = None,
               timeout: float = 300) -> bool:
    """
    Redact a PDF file using the API

    Uploads the PDF to ``{api_url}/redact``, prints the status, message and
    any entities reported in the JSON response, then downloads the result
    from ``{api_url}/download/{job_id}`` and writes it to ``output_path``.

    Args:
        api_url: Base URL of the API (a trailing slash is tolerated)
        pdf_path: Path to the PDF file to redact
        output_path: Path to save the redacted PDF
        dpi: DPI for OCR processing
        entity_types: Comma-separated list of entity types to redact;
            omitted from the request when empty or None
        timeout: Seconds to wait for the server on each HTTP request
            (upload and download). Pass None to wait indefinitely.

    Returns:
        True if the redacted PDF was downloaded and saved, False if the
        input file is missing, a request failed, a file could not be read
        or written, or the API response did not have the expected shape.
    """
    # is_file() rather than exists(): a directory "exists" but cannot be
    # opened for reading, so reject it here with the same message.
    if not Path(pdf_path).is_file():
        print(f"Error: File {pdf_path} not found")
        return False

    print(f"Uploading {pdf_path}...")

    # Avoid "//redact" when the caller passes a URL ending in "/"
    base_url = api_url.rstrip("/")

    # Prepare request
    params = {"dpi": dpi}
    if entity_types:
        params["entity_types"] = entity_types

    try:
        # Upload and redact; the context manager closes the file as soon as
        # the upload request has completed (or failed).
        with open(pdf_path, "rb") as pdf_file:
            response = requests.post(
                f"{base_url}/redact",
                files={"file": pdf_file},
                params=params,
                timeout=timeout,
            )
        response.raise_for_status()

        result = response.json()
        print(f"\nStatus: {result['status']}")
        print(f"Message: {result['message']}")

        # Display found entities
        if result.get('entities'):
            print("\nEntities redacted:")
            for i, entity in enumerate(result['entities'], 1):
                print(f"  {i}. {entity['entity_type']}: {entity['entity_text']} "
                      f"(Page {entity['page']}, {entity['word_count']} words)")

        # Download redacted file
        job_id = result['job_id']
        print("\nDownloading redacted PDF...")

        download_response = requests.get(
            f"{base_url}/download/{job_id}", timeout=timeout
        )
        download_response.raise_for_status()

        # Save file
        with open(output_path, "wb") as f:
            f.write(download_response.content)

        print(f"✓ Redacted PDF saved to: {output_path}")

        # Cleanup (optional)
        # requests.delete(f"{base_url}/cleanup/{job_id}", timeout=timeout)

        return True

    except (requests.exceptions.RequestException, OSError) as e:
        # Network/HTTP failures, plus local read/write errors on either file
        print(f"Error: {e}")
        return False
    except (KeyError, TypeError, ValueError) as e:
        # Response was not JSON, or lacked the fields this client reads
        print(f"Error: Unexpected API response: {e!r}")
        return False


def check_health(api_url: str, timeout: float = 10) -> bool:
    """
    Check API health

    Requests ``{api_url}/health`` and prints the ``status``, ``version`` and
    ``model_loaded`` fields of the JSON response.

    Args:
        api_url: Base URL of the API (a trailing slash is tolerated)
        timeout: Seconds to wait for the server before giving up.
            Pass None to wait indefinitely.

    Returns:
        True if the endpoint answered with the expected fields, False if
        the request failed or the response could not be interpreted.
    """
    try:
        response = requests.get(f"{api_url.rstrip('/')}/health", timeout=timeout)
        response.raise_for_status()
        data = response.json()

        # Read every field before printing so a malformed response does not
        # leave a half-printed report behind.
        status = data['status']
        version = data['version']
        model_loaded = data['model_loaded']
    except requests.exceptions.RequestException as e:
        print(f"Error checking health: {e}")
        return False
    except (KeyError, TypeError, ValueError) as e:
        # Response was not JSON, or lacked the fields this client reads
        print(f"Error checking health: unexpected response ({e!r})")
        return False

    print(f"API Status: {status}")
    print(f"Version: {version}")
    print(f"Model Loaded: {model_loaded}")

    return True


def get_stats(api_url: str, timeout: float = 10) -> bool:
    """
    Get API statistics

    Requests ``{api_url}/stats`` and prints the ``pending_uploads``,
    ``processed_files`` and ``model_loaded`` fields of the JSON response.

    Args:
        api_url: Base URL of the API (a trailing slash is tolerated)
        timeout: Seconds to wait for the server before giving up.
            Pass None to wait indefinitely.

    Returns:
        True if the endpoint answered with the expected fields, False if
        the request failed or the response could not be interpreted.
    """
    try:
        response = requests.get(f"{api_url.rstrip('/')}/stats", timeout=timeout)
        response.raise_for_status()
        data = response.json()

        # Read every field before printing so a malformed response does not
        # leave a half-printed report behind.
        pending_uploads = data['pending_uploads']
        processed_files = data['processed_files']
        model_loaded = data['model_loaded']
    except requests.exceptions.RequestException as e:
        print(f"Error getting stats: {e}")
        return False
    except (KeyError, TypeError, ValueError) as e:
        # Response was not JSON, or lacked the fields this client reads
        print(f"Error getting stats: unexpected response ({e!r})")
        return False

    print("API Statistics:")
    print(f"  Pending uploads: {pending_uploads}")
    print(f"  Processed files: {processed_files}")
    print(f"  Model loaded: {model_loaded}")

    return True


if __name__ == "__main__":
    # Example usage: command-line entry point.
    # Exits with status 0 when the requested operation succeeds and 1 when
    # it fails or the arguments are invalid, so shell scripts can detect
    # failures.

    # For local development
    API_URL = "http://localhost:7860"

    # For HuggingFace Spaces (replace with your space URL)
    # API_URL = "https://your-username-pdf-redaction-api.hf.space"

    if len(sys.argv) < 2:
        print("Usage:")
        print("  python client_example.py <pdf_file> [output_file] [dpi]")
        print("\nOr check health:")
        print("  python client_example.py --health")
        print("\nOr get stats:")
        print("  python client_example.py --stats")
        sys.exit(1)

    if sys.argv[1] == "--health":
        succeeded = check_health(API_URL)
    elif sys.argv[1] == "--stats":
        succeeded = get_stats(API_URL)
    else:
        pdf_path = sys.argv[1]
        output_path = sys.argv[2] if len(sys.argv) > 2 else "redacted.pdf"

        # Report a bad dpi argument as a usage error instead of a traceback
        try:
            dpi = int(sys.argv[3]) if len(sys.argv) > 3 else 300
        except ValueError:
            print(f"Error: dpi must be an integer, got {sys.argv[3]!r}")
            sys.exit(1)

        # Optional: Filter specific entity types
        # entity_types = "PER,ORG"  # Only redact persons and organizations
        entity_types = None  # Redact all entity types

        succeeded = redact_pdf(API_URL, pdf_path, output_path, dpi, entity_types)

    # Propagate the helper's boolean result as the process exit status
    sys.exit(0 if succeeded else 1)