File size: 5,422 Bytes
2002d0a
 
 
 
 
 
 
 
 
 
 
 
 
b7f402c
9365935
 
 
 
 
2002d0a
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
29f4707
2002d0a
 
 
 
 
 
 
9365935
 
 
 
 
 
 
2002d0a
 
29f4707
 
b7f402c
 
29f4707
2002d0a
 
 
 
 
 
 
29f4707
 
 
 
2002d0a
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
29f4707
2002d0a
29f4707
2002d0a
 
29f4707
2002d0a
29f4707
b7f402c
29f4707
 
 
 
 
 
2002d0a
 
29f4707
2002d0a
 
29f4707
2002d0a
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
29f4707
 
 
 
9365935
29f4707
2002d0a
 
 
 
 
9365935
2002d0a
 
b7f402c
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
import os
import cv2
import pytesseract
import re
import numpy as np
from PIL import Image, ImageDraw
from pdf2image import convert_from_path
from docx import Document
import gradio as gr
import traceback
import shutil
from datetime import datetime

# Auto-detect the system Tesseract binary by searching PATH, and point
# pytesseract at it when found.
tess_path = shutil.which("tesseract")
if tess_path:
    pytesseract.pytesseract.tesseract_cmd = tess_path
else:
    # Nothing is configured in this case; emit a startup warning so the
    # operator knows the OCR binary still has to be installed.
    print("⚠️ Tesseract not found. Install tesseract-ocr.")

def convert_to_images(filepath):
    """Load a PDF, DOCX, or image file as a list of RGB PIL images.

    The file type is chosen from the extension of *filepath*:

    * ``.pdf``  - every page is rasterised with ``convert_from_path``.
    * ``.docx`` - paragraph text is joined with newlines and the first
      4000 characters are drawn onto a single white 1200x1600 canvas.
    * anything else - opened as a single image with PIL.

    Args:
        filepath: Path of the uploaded file.

    Returns:
        A list of ``PIL.Image`` objects in RGB mode. If loading fails, the
        list ends with an 800x200 placeholder image carrying the error
        message in red, so the result is never empty.
    """
    images = []
    ext = os.path.splitext(filepath)[1].lower()
    try:
        if ext == ".pdf":
            pages = convert_from_path(filepath)
            # Build the full list first so a failure on one page does not
            # leave a partial set of pages in `images`.
            images.extend([page.convert("RGB") for page in pages])
        elif ext == ".docx":
            doc = Document(filepath)
            text = "\n".join(para.text for para in doc.paragraphs).strip()
            img = Image.new("RGB", (1200, 1600), color="white")
            draw = ImageDraw.Draw(img)
            # NOTE(review): the text is drawn with the default font and no
            # wrapping, so long paragraphs presumably run past the right
            # edge of the canvas - confirm whether wrapping is wanted.
            draw.text((10, 10), text[:4000], fill="black")
            images.append(img)
        else:
            # Close the underlying file as soon as the pixel data has been
            # copied into the converted image instead of leaking the handle.
            with Image.open(filepath) as source:
                images.append(source.convert("RGB"))
    except Exception as e:
        # Best-effort: report the failure as an image so the caller can
        # still show something to the user.
        print(f"❌ Conversion error: {e}")
        img = Image.new("RGB", (800, 200), "white")
        draw = ImageDraw.Draw(img)
        draw.text((10, 90), f"File error: {e}", fill="red")
        images.append(img)
    return images

def blur_sensitive_text(pil_img, custom_words=None):
    """Black out OCR-detected words that look like sensitive data.

    The image is OCR'd with ``pytesseract.image_to_data``; every recognised
    token with a confidence of at least 60 is tested against a set of
    regular expressions (e-mail addresses, long digit runs, card/ID-like
    groupings, PAN-like codes, transaction-reference prefixes) plus any
    caller-supplied keywords. Matching tokens are covered with a filled
    black rectangle (despite the function name, nothing is blurred).

    Args:
        pil_img: RGB image (anything ``np.array`` can turn into an RGB array).
        custom_words: Optional iterable of extra keywords to redact. Blank
            entries are ignored; matching is case-insensitive.

    Returns:
        ``(image, altered)`` where *image* is the processed RGB array and
        *altered* is True if at least one rectangle was drawn.
    """
    np_img = np.array(pil_img)
    img = cv2.cvtColor(np_img, cv2.COLOR_RGB2BGR)
    gray = cv2.cvtColor(img, cv2.COLOR_BGR2GRAY)
    data = pytesseract.image_to_data(gray, output_type=pytesseract.Output.DICT)
    altered = False

    patterns = [
        r"[a-zA-Z0-9_.+-]+@[a-zA-Z0-9-]+\.[a-zA-Z0-9-.]+",
        r"\b\d{10}\b",
        r"\b\d{4}[-\s]?\d{4}[-\s]?\d{4,6}\b",
        r"\b\d{5,}\b",
        r"\b\d{4}\s\d{4}\s\d{4}\b",
        r"\b[A-Z]{5}\d{4}[A-Z]\b",
        r"(?i)(rcpt|txn|order|ref|payment|utr)[^\s]{3,}",
    ]

    for entry in custom_words or ():
        keyword = entry.strip()
        if keyword:
            # Tokens are tested with fullmatch, so \b anchors add nothing for
            # ordinary words and make keywords that start or end with
            # punctuation (e.g. "#123") impossible to match; escape only.
            patterns.append(re.escape(keyword))

    # Compile once, outside the per-word loop. Everything is matched
    # case-insensitively so both spellings of a token are treated alike.
    compiled = [re.compile(p, re.IGNORECASE) for p in patterns]

    # NOTE(review): each entry of data['text'] appears to be a single OCR
    # token, so custom phrases containing spaces likely never match - confirm.
    for i, word in enumerate(data['text']):
        try:
            # Confidence may arrive as an int, a float, or a decimal string
            # such as "96.5"; int() would reject the latter and silently
            # skip a word that should have been checked.
            confidence = float(data['conf'][i])
        except (TypeError, ValueError, IndexError):
            continue
        if confidence < 60:
            continue

        word_clean = (word or "").strip()
        if not word_clean:
            continue

        # Also test the token with separators removed, so "1234-5678-9012"
        # is recognised by the plain digit-run patterns.
        normalized = word_clean.replace(" ", "").replace("-", "")
        candidates = (normalized, word_clean)
        if any(rx.fullmatch(c) for rx in compiled for c in candidates):
            x, y, w, h = data['left'][i], data['top'][i], data['width'][i], data['height'][i]
            cv2.rectangle(img, (x, y), (x + w, y + h), (0, 0, 0), -1)
            altered = True

    return cv2.cvtColor(img, cv2.COLOR_BGR2RGB), altered

def blur_faces(np_img):
    """Gaussian-blur every frontal face detected in an RGB image array.

    Detection uses OpenCV's bundled ``haarcascade_frontalface_default.xml``
    cascade on a grayscale version of the image (scale factor 1.1,
    5 minimum neighbours). The input array is not modified.

    Args:
        np_img: RGB image array.

    Returns:
        ``(image, altered)`` - a copy of the input with each detected face
        rectangle blurred, and a flag telling whether any face was found.
    """
    cascade_file = cv2.data.haarcascades + "haarcascade_frontalface_default.xml"
    detector = cv2.CascadeClassifier(cascade_file)

    result = np_img.copy()
    grayscale = cv2.cvtColor(result, cv2.COLOR_RGB2GRAY)

    found_face = False
    for left, top, width, height in detector.detectMultiScale(grayscale, 1.1, 5):
        bottom = top + height
        right = left + width
        face_region = result[top:bottom, left:right]
        # Large kernel and sigma so facial features are unrecognisable.
        result[top:bottom, left:right] = cv2.GaussianBlur(face_region, (51, 51), 30)
        found_face = True

    return result, found_face

def _new_temp_path(prefix, suffix):
    """Create an empty, uniquely named file in the temp directory and return its path."""
    # Local import: tempfile is not among this file's top-level imports.
    import tempfile

    fd, path = tempfile.mkstemp(prefix=prefix, suffix=suffix)
    # Only the reserved unique name is needed; the image writer reopens the
    # path itself, so release the descriptor right away.
    os.close(fd)
    return path


def redact_document(filepath, redact_text=True, redact_faces=True, custom_input=""):
    """Redact an uploaded document and return page previews plus a PDF.

    The file is converted to page images; each page optionally has sensitive
    text blacked out and faces blurred. Pages on which nothing was altered
    are stamped with a green "No sensitive info found" note. All pages are
    then written to a single multi-page PDF in the system temp directory.

    Args:
        filepath: Path of the uploaded image, PDF, or DOCX file.
        redact_text: Run OCR-based text redaction when True.
        redact_faces: Run face blurring when True.
        custom_input: Comma-separated extra keywords to redact; may be empty
            or None.

    Returns:
        ``(pages, path)`` - the list of processed PIL images and the path of
        the generated PDF. On any failure, a one-element list holding an
        error image and the path of that image saved as PNG are returned
        instead; exceptions are logged, not raised.
    """
    try:
        custom_words = [w.strip() for w in custom_input.split(",")] if custom_input else []
        pages = convert_to_images(filepath)
        if not pages:
            # Fail with a clear message rather than an IndexError when saving.
            raise ValueError("No pages could be read from the document")
        redacted_pages = []

        for page in pages:
            img_array = np.array(page)
            text_altered = face_altered = False

            if redact_text:
                img_array, text_altered = blur_sensitive_text(page, custom_words)

            if redact_faces:
                img_array, face_altered = blur_faces(img_array)

            if not text_altered and not face_altered:
                # ASCII only: OpenCV's Hershey fonts cannot draw emoji or
                # other non-ASCII symbols and would render them as "?".
                cv2.putText(img_array, "No sensitive info found", (50, 100),
                            cv2.FONT_HERSHEY_SIMPLEX, 1.2, (0, 255, 0), 3)

            redacted_pages.append(Image.fromarray(img_array))

        # Keep the timestamp for readability, but let mkstemp add a unique
        # part so two requests in the same second cannot overwrite each
        # other, and so the location follows the platform's temp directory.
        ts = datetime.now().strftime("%Y%m%d_%H%M%S")
        output_pdf = _new_temp_path(f"redacted_{ts}_", ".pdf")
        first_page, *remaining_pages = redacted_pages
        first_page.save(output_pdf, save_all=True, append_images=remaining_pages)
        return redacted_pages, output_pdf

    except Exception as e:
        # Top-level boundary for the UI callback: log the traceback and hand
        # back an image describing the failure instead of raising.
        print("❌ Error:", traceback.format_exc())
        img = Image.new("RGB", (800, 200), "white")
        draw = ImageDraw.Draw(img)
        draw.text((10, 90), f"Error: {e}", fill="red")
        ts = datetime.now().strftime("%Y%m%d_%H%M%S")
        fallback = _new_temp_path(f"error_{ts}_", ".png")
        img.save(fallback)
        return [img], fallback

# Gradio UI: the four inputs map positionally onto redact_document's
# parameters (file path, text toggle, face toggle, custom keywords); the two
# outputs receive its (page images, output file path) return value.
iface = gr.Interface(
    fn=redact_document,
    inputs=[
        gr.File(label="Upload image, PDF, or DOCX", type="filepath"),
        gr.Checkbox(label="Redact Sensitive Text", value=True),
        gr.Checkbox(label="Redact Faces", value=True),
        gr.Textbox(label="Custom words/phrases (comma separated)", placeholder="e.g., Harshita, PAN, 123456")
    ],
    outputs=[
        gr.Gallery(label="Redacted Preview", columns=1),
        gr.File(label="Download Redacted PDF")
    ],
    title="🔐 Smart Doc Redactor",
    description="Redact sensitive info (emails, Aadhaar, PAN, phone, card numbers, faces). Add custom keywords too."
)

# Start the web app when this file is executed.
iface.launch()