Spaces:

HarshitaSuri
/

DocRedactorV2.0

Sleeping

App Files Community

HarshitaSuri committed on Aug 19, 2025

Commit

9365935

verified ·

1 Parent(s): 2d44ba0

Update app.py

Browse files

Files changed (1) hide show

app.py +14 -18

app.py CHANGED Viewed

@@ -12,9 +12,12 @@ import shutil
 from datetime import datetime
 # Auto-detect system tesseract
-pytesseract.pytesseract.tesseract_cmd = shutil.which("tesseract")
-# -------------------- File Conversion --------------------
 def convert_to_images(filepath):
     images = []
     ext = os.path.splitext(filepath)[1].lower()
@@ -40,7 +43,6 @@ def convert_to_images(filepath):
         images.append(img)
     return images
-# -------------------- Text Redaction --------------------
 def blur_sensitive_text(pil_img, custom_words=None):
     np_img = np.array(pil_img)
     img = cv2.cvtColor(np_img, cv2.COLOR_RGB2BGR)
@@ -48,24 +50,21 @@ def blur_sensitive_text(pil_img, custom_words=None):
     data = pytesseract.image_to_data(gray, output_type=pytesseract.Output.DICT)
     altered = False
-    # Built-in regex patterns
     patterns = [
-        r"[a-zA-Z0-9_.+-]+@[a-zA-Z0-9-]+\.[a-zA-Z0-9-.]+",  # Email
-        r"\b\d{10}\b",                                     # Phone (10 digits)
-        r"\b\d{4}[-\s]?\d{4}[-\s]?\d{4,6}\b",              # Card numbers
-        r"\b\d{5,}\b",                                     # Long numbers
-        r"\b\d{4}\s\d{4}\s\d{4}\b",                        # Aadhaar
-        r"\b[A-Z]{5}\d{4}[A-Z]\b",                         # PAN
-        r"(?i)(rcpt|txn|order|ref|payment|utr)[^\s]{3,}",  # References
     ]
-    # Add custom words (regex word boundaries)
     if custom_words:
         for w in custom_words:
             if w.strip():
                 patterns.append(rf"(?i)\b{re.escape(w.strip())}\b")
-    # Redact detected tokens
     for i, word in enumerate(data['text']):
         try:
             if int(data['conf'][i]) < 60:
@@ -87,7 +86,6 @@ def blur_sensitive_text(pil_img, custom_words=None):
     return cv2.cvtColor(img, cv2.COLOR_BGR2RGB), altered
-# -------------------- Face Redaction --------------------
 def blur_faces(np_img):
     img = np_img.copy()
     altered = False
@@ -99,7 +97,6 @@ def blur_faces(np_img):
         altered = True
     return img, altered
-# -------------------- Main Function --------------------
 def redact_document(filepath, redact_text=True, redact_faces=True, custom_input=""):
     try:
         custom_words = [w.strip() for w in custom_input.split(",")] if custom_input else []
@@ -136,21 +133,20 @@ def redact_document(filepath, redact_text=True, redact_faces=True, custom_input=
         img.save(fallback)
         return [img], fallback
-# -------------------- Gradio Interface --------------------
 iface = gr.Interface(
     fn=redact_document,
     inputs=[
         gr.File(label="Upload image, PDF, or DOCX", type="filepath"),
         gr.Checkbox(label="Redact Sensitive Text", value=True),
         gr.Checkbox(label="Redact Faces", value=True),
-        gr.Textbox(label="Custom words/phrases (comma separated)", placeholder="e.g., Harshita, Dronacharya, 123456")
     ],
     outputs=[
         gr.Gallery(label="Redacted Preview", columns=1),
         gr.File(label="Download Redacted PDF")
     ],
     title="🔐 Smart Doc Redactor",
-    description="Automatically redact sensitive info (emails, Aadhaar, PAN, phone, card numbers, faces). Add your own custom keywords too."
 )
 iface.launch()

 from datetime import datetime
 # Auto-detect system tesseract
+tess_path = shutil.which("tesseract")
+if tess_path:
+    pytesseract.pytesseract.tesseract_cmd = tess_path
+else:
+    print("⚠️ Tesseract not found. Install tesseract-ocr.")
 def convert_to_images(filepath):
     images = []
     ext = os.path.splitext(filepath)[1].lower()
         images.append(img)
     return images
 def blur_sensitive_text(pil_img, custom_words=None):
     np_img = np.array(pil_img)
     img = cv2.cvtColor(np_img, cv2.COLOR_RGB2BGR)
     data = pytesseract.image_to_data(gray, output_type=pytesseract.Output.DICT)
     altered = False
     patterns = [
+        r"[a-zA-Z0-9_.+-]+@[a-zA-Z0-9-]+\.[a-zA-Z0-9-.]+",
+        r"\b\d{10}\b",
+        r"\b\d{4}[-\s]?\d{4}[-\s]?\d{4,6}\b",
+        r"\b\d{5,}\b",
+        r"\b\d{4}\s\d{4}\s\d{4}\b",
+        r"\b[A-Z]{5}\d{4}[A-Z]\b",
+        r"(?i)(rcpt|txn|order|ref|payment|utr)[^\s]{3,}",
     ]
     if custom_words:
         for w in custom_words:
             if w.strip():
                 patterns.append(rf"(?i)\b{re.escape(w.strip())}\b")
     for i, word in enumerate(data['text']):
         try:
             if int(data['conf'][i]) < 60:
     return cv2.cvtColor(img, cv2.COLOR_BGR2RGB), altered
 def blur_faces(np_img):
     img = np_img.copy()
     altered = False
         altered = True
     return img, altered
 def redact_document(filepath, redact_text=True, redact_faces=True, custom_input=""):
     try:
         custom_words = [w.strip() for w in custom_input.split(",")] if custom_input else []
         img.save(fallback)
         return [img], fallback
 iface = gr.Interface(
     fn=redact_document,
     inputs=[
         gr.File(label="Upload image, PDF, or DOCX", type="filepath"),
         gr.Checkbox(label="Redact Sensitive Text", value=True),
         gr.Checkbox(label="Redact Faces", value=True),
+        gr.Textbox(label="Custom words/phrases (comma separated)", placeholder="e.g., Harshita, PAN, 123456")
     ],
     outputs=[
         gr.Gallery(label="Redacted Preview", columns=1),
         gr.File(label="Download Redacted PDF")
     ],
     title="🔐 Smart Doc Redactor",
+    description="Redact sensitive info (emails, Aadhaar, PAN, phone, card numbers, faces). Add custom keywords too."
 )
 iface.launch()