HarshitaSuri committed on
Commit
9365935
·
verified ·
1 Parent(s): 2d44ba0

Update app.py: guard Tesseract auto-detection, remove section comments, shorten UI text

Browse files
Files changed (1) hide show
  1. app.py +14 -18
app.py CHANGED
@@ -12,9 +12,12 @@ import shutil
12
  from datetime import datetime
13
 
14
  # Auto-detect system tesseract
15
- pytesseract.pytesseract.tesseract_cmd = shutil.which("tesseract")
 
 
 
 
16
 
17
- # -------------------- File Conversion --------------------
18
  def convert_to_images(filepath):
19
  images = []
20
  ext = os.path.splitext(filepath)[1].lower()
@@ -40,7 +43,6 @@ def convert_to_images(filepath):
40
  images.append(img)
41
  return images
42
 
43
- # -------------------- Text Redaction --------------------
44
  def blur_sensitive_text(pil_img, custom_words=None):
45
  np_img = np.array(pil_img)
46
  img = cv2.cvtColor(np_img, cv2.COLOR_RGB2BGR)
@@ -48,24 +50,21 @@ def blur_sensitive_text(pil_img, custom_words=None):
48
  data = pytesseract.image_to_data(gray, output_type=pytesseract.Output.DICT)
49
  altered = False
50
 
51
- # Built-in regex patterns
52
  patterns = [
53
- r"[a-zA-Z0-9_.+-]+@[a-zA-Z0-9-]+\.[a-zA-Z0-9-.]+", # Email
54
- r"\b\d{10}\b", # Phone (10 digits)
55
- r"\b\d{4}[-\s]?\d{4}[-\s]?\d{4,6}\b", # Card numbers
56
- r"\b\d{5,}\b", # Long numbers
57
- r"\b\d{4}\s\d{4}\s\d{4}\b", # Aadhaar
58
- r"\b[A-Z]{5}\d{4}[A-Z]\b", # PAN
59
- r"(?i)(rcpt|txn|order|ref|payment|utr)[^\s]{3,}", # References
60
  ]
61
 
62
- # Add custom words (regex word boundaries)
63
  if custom_words:
64
  for w in custom_words:
65
  if w.strip():
66
  patterns.append(rf"(?i)\b{re.escape(w.strip())}\b")
67
 
68
- # Redact detected tokens
69
  for i, word in enumerate(data['text']):
70
  try:
71
  if int(data['conf'][i]) < 60:
@@ -87,7 +86,6 @@ def blur_sensitive_text(pil_img, custom_words=None):
87
 
88
  return cv2.cvtColor(img, cv2.COLOR_BGR2RGB), altered
89
 
90
- # -------------------- Face Redaction --------------------
91
  def blur_faces(np_img):
92
  img = np_img.copy()
93
  altered = False
@@ -99,7 +97,6 @@ def blur_faces(np_img):
99
  altered = True
100
  return img, altered
101
 
102
- # -------------------- Main Function --------------------
103
  def redact_document(filepath, redact_text=True, redact_faces=True, custom_input=""):
104
  try:
105
  custom_words = [w.strip() for w in custom_input.split(",")] if custom_input else []
@@ -136,21 +133,20 @@ def redact_document(filepath, redact_text=True, redact_faces=True, custom_input=
136
  img.save(fallback)
137
  return [img], fallback
138
 
139
- # -------------------- Gradio Interface --------------------
140
  iface = gr.Interface(
141
  fn=redact_document,
142
  inputs=[
143
  gr.File(label="Upload image, PDF, or DOCX", type="filepath"),
144
  gr.Checkbox(label="Redact Sensitive Text", value=True),
145
  gr.Checkbox(label="Redact Faces", value=True),
146
- gr.Textbox(label="Custom words/phrases (comma separated)", placeholder="e.g., Harshita, Dronacharya, 123456")
147
  ],
148
  outputs=[
149
  gr.Gallery(label="Redacted Preview", columns=1),
150
  gr.File(label="Download Redacted PDF")
151
  ],
152
  title="🔐 Smart Doc Redactor",
153
- description="Automatically redact sensitive info (emails, Aadhaar, PAN, phone, card numbers, faces). Add your own custom keywords too."
154
  )
155
 
156
  iface.launch()
 
12
  from datetime import datetime
13
 
14
  # Auto-detect system tesseract
15
+ tess_path = shutil.which("tesseract")
16
+ if tess_path:
17
+ pytesseract.pytesseract.tesseract_cmd = tess_path
18
+ else:
19
+ print("⚠️ Tesseract not found. Install tesseract-ocr.")
20
 
 
21
  def convert_to_images(filepath):
22
  images = []
23
  ext = os.path.splitext(filepath)[1].lower()
 
43
  images.append(img)
44
  return images
45
 
 
46
  def blur_sensitive_text(pil_img, custom_words=None):
47
  np_img = np.array(pil_img)
48
  img = cv2.cvtColor(np_img, cv2.COLOR_RGB2BGR)
 
50
  data = pytesseract.image_to_data(gray, output_type=pytesseract.Output.DICT)
51
  altered = False
52
 
 
53
  patterns = [
54
+ r"[a-zA-Z0-9_.+-]+@[a-zA-Z0-9-]+\.[a-zA-Z0-9-.]+",
55
+ r"\b\d{10}\b",
56
+ r"\b\d{4}[-\s]?\d{4}[-\s]?\d{4,6}\b",
57
+ r"\b\d{5,}\b",
58
+ r"\b\d{4}\s\d{4}\s\d{4}\b",
59
+ r"\b[A-Z]{5}\d{4}[A-Z]\b",
60
+ r"(?i)(rcpt|txn|order|ref|payment|utr)[^\s]{3,}",
61
  ]
62
 
 
63
  if custom_words:
64
  for w in custom_words:
65
  if w.strip():
66
  patterns.append(rf"(?i)\b{re.escape(w.strip())}\b")
67
 
 
68
  for i, word in enumerate(data['text']):
69
  try:
70
  if int(data['conf'][i]) < 60:
 
86
 
87
  return cv2.cvtColor(img, cv2.COLOR_BGR2RGB), altered
88
 
 
89
  def blur_faces(np_img):
90
  img = np_img.copy()
91
  altered = False
 
97
  altered = True
98
  return img, altered
99
 
 
100
  def redact_document(filepath, redact_text=True, redact_faces=True, custom_input=""):
101
  try:
102
  custom_words = [w.strip() for w in custom_input.split(",")] if custom_input else []
 
133
  img.save(fallback)
134
  return [img], fallback
135
 
 
136
  iface = gr.Interface(
137
  fn=redact_document,
138
  inputs=[
139
  gr.File(label="Upload image, PDF, or DOCX", type="filepath"),
140
  gr.Checkbox(label="Redact Sensitive Text", value=True),
141
  gr.Checkbox(label="Redact Faces", value=True),
142
+ gr.Textbox(label="Custom words/phrases (comma separated)", placeholder="e.g., Harshita, PAN, 123456")
143
  ],
144
  outputs=[
145
  gr.Gallery(label="Redacted Preview", columns=1),
146
  gr.File(label="Download Redacted PDF")
147
  ],
148
  title="🔐 Smart Doc Redactor",
149
+ description="Redact sensitive info (emails, Aadhaar, PAN, phone, card numbers, faces). Add custom keywords too."
150
  )
151
 
152
  iface.launch()