| import gradio as gr |
| import os |
| import fitz |
| import pytesseract |
| from pdf2image import convert_from_path |
| from google.cloud import documentai_v1 as documentai |
| from presidio_analyzer import AnalyzerEngine |
| from presidio_anonymizer import AnonymizerEngine |
|
|
| |
# Point the Google client libraries at the service-account key file. This is
# set before the Document AI client is created so the client below can pick
# it up.
os.environ.update({"GOOGLE_APPLICATION_CREDENTIALS": "document-ai-anonymizer.json"})

# One Document AI client, shared by every PDF extraction call in this module.
client = documentai.DocumentProcessorServiceClient()

# Presidio engines: `analyzer` finds PII spans, `anonymizer` rewrites them.
analyzer, anonymizer = AnalyzerEngine(), AnonymizerEngine()
|
|
def extract_text_from_pdf(
    pdf_path,
    processor_name="projects/presidio-450223/locations/us/processors/5cbc64853974c755",
):
    """Extract text from a PDF using Google Cloud Document AI.

    Args:
        pdf_path: Path of the PDF file to read from disk.
        processor_name: Full resource name of the Document AI processor that
            should handle the request. Defaults to the processor this
            function previously had hard-coded, so existing single-argument
            callers behave exactly as before.

    Returns:
        The text reported by Document AI for the document, or the literal
        string "No text detected." when that text is empty.
    """
    # The whole file is sent inline as raw bytes, so read it in binary mode.
    with open(pdf_path, "rb") as pdf_file:
        pdf_bytes = pdf_file.read()

    request = documentai.ProcessRequest(
        name=processor_name,
        raw_document=documentai.RawDocument(
            content=pdf_bytes,
            mime_type="application/pdf",
        ),
    )

    # Uses the module-level Document AI client.
    result = client.process_document(request=request)

    return result.document.text or "No text detected."
|
|
def extract_text_from_image(image_path):
    """Run Tesseract OCR on the image at `image_path` and return its text."""
    ocr_text = pytesseract.image_to_string(image_path)
    return ocr_text
|
|
def redact_document(uploaded_file):
    """Extract the text of an uploaded file and redact PII with Presidio.

    The extraction route is chosen from the file extension:

    * ``.pdf`` is sent to Google Cloud Document AI.
    * ``.png`` / ``.jpg`` / ``.jpeg`` are run through Tesseract OCR.
    * Anything else is read as UTF-8 plain text. Binary formats such as
      Word ``.docx`` are NOT parsed here; reading them as UTF-8 will
      typically raise ``UnicodeDecodeError``.

    Args:
        uploaded_file: Filesystem path of the uploaded file, or ``None`` /
            an empty string when nothing was uploaded.

    Returns:
        A ``(extracted_text, anonymized_text)`` tuple of strings. When no
        file was supplied, ``("No file uploaded.", "")`` is returned instead
        of failing on the missing path.
    """
    # The UI passes None when the user submits without choosing a file.
    if not uploaded_file:
        return "No file uploaded.", ""

    # splitext only looks at the final path component, so dots in directory
    # names or a dot-less file name are not mistaken for an extension.
    file_ext = os.path.splitext(uploaded_file)[1].lstrip(".").lower()

    if file_ext == "pdf":
        extracted_text = extract_text_from_pdf(uploaded_file)
    elif file_ext in ("png", "jpg", "jpeg"):
        extracted_text = extract_text_from_image(uploaded_file)
    else:
        with open(uploaded_file, "r", encoding="utf-8") as text_file:
            extracted_text = text_file.read()

    # NOTE(review): "ID_NUMBER" does not look like one of Presidio's
    # predefined entity names -- confirm a custom recognizer is registered
    # for it, otherwise this entry probably matches nothing.
    results = analyzer.analyze(
        text=extracted_text,
        entities=["PERSON", "PHONE_NUMBER", "EMAIL_ADDRESS", "ID_NUMBER", "LOCATION"],
        language="en",
    )
    anonymized_text = anonymizer.anonymize(
        text=extracted_text,
        analyzer_results=results,
    )

    return extracted_text, anonymized_text.text
|
|
| |
# Gradio front end: one file upload in, two text panes out. The panes receive
# the two values returned by `redact_document`, in order.
iface = gr.Interface(
    title="Legal & Business Document Redaction",
    description="Upload a contract, business report, or legal document (PDF, Word, Image) and Presidio will anonymize sensitive data.",
    fn=redact_document,
    # type="filepath" hands the callback a path string rather than a file object.
    inputs=gr.File(type="filepath"),
    outputs=["text", "text"],
    examples=[],
)


# Start the web UI only when this file is executed as a script.
if __name__ == "__main__":
    iface.launch()
|
|