Spaces:
Sleeping
Sleeping
import os
import re
import shutil
import tempfile
import textwrap
import traceback
from datetime import datetime

import cv2
import gradio as gr
import numpy as np
import pytesseract
from docx import Document
from pdf2image import convert_from_path
from PIL import Image, ImageDraw
# Auto-detect the system Tesseract binary on PATH and point pytesseract at it.
tess_path = shutil.which("tesseract")
if tess_path:
    pytesseract.pytesseract.tesseract_cmd = tess_path
else:
    # Warn instead of failing at import time. The warning sign in this message
    # had been mis-encoded (UTF-8 bytes decoded with a Greek code page).
    print("⚠️ Tesseract not found. Install tesseract-ocr.")
def convert_to_images(filepath):
    """Convert an uploaded file into a list of RGB PIL images.

    The file extension selects the conversion:

    * ``.pdf``  - one image per page, rendered with ``pdf2image``.
    * ``.docx`` - the paragraph text (only ``doc.paragraphs`` is read) is
      wrapped and drawn in black onto white 1200x1600 pages, using as many
      pages as the text needs.
    * anything else - opened with PIL as a single image.

    Any exception raised during conversion is logged and a small white image
    carrying the error message in red is appended to the result instead of
    propagating the exception.

    Args:
        filepath: Path of the file to convert.

    Returns:
        A list of ``PIL.Image`` objects in RGB mode.
    """
    # Layout used when rendering DOCX text with PIL's default font.
    page_size = (1200, 1600)
    margin = 10
    line_height = 16
    # NOTE(review): 120 characters per line assumes the default font's glyphs
    # stay under ~9 px wide so a full line fits the page width - confirm.
    chars_per_line = 120
    lines_per_page = (page_size[1] - 2 * margin) // line_height

    images = []
    ext = os.path.splitext(filepath)[1].lower()
    try:
        if ext == ".pdf":
            pages = convert_from_path(filepath)
            images.extend([page.convert("RGB") for page in pages])
        elif ext == ".docx":
            doc = Document(filepath)
            lines = []
            for para in doc.paragraphs:
                # An empty paragraph wraps to [], so keep it as a blank line.
                lines.extend(textwrap.wrap(para.text, width=chars_per_line) or [""])

            # Render into a local list so a mid-way failure adds no partial
            # pages; max(..., 1) still yields one blank page for empty text.
            rendered = []
            for start in range(0, max(len(lines), 1), lines_per_page):
                img = Image.new("RGB", page_size, color="white")
                draw = ImageDraw.Draw(img)
                for row, line in enumerate(lines[start:start + lines_per_page]):
                    draw.text((margin, margin + row * line_height), line, fill="black")
                rendered.append(img)
            images.extend(rendered)
        else:
            # convert() returns a new, fully loaded image, so the source file
            # handle can be released as soon as the conversion is done.
            with Image.open(filepath) as src:
                images.append(src.convert("RGB"))
    except Exception as e:
        # Deliberate best-effort boundary: report the failure as an image so
        # the caller still has something to display.
        print(f"❌ Conversion error: {e}")
        img = Image.new("RGB", (800, 200), "white")
        draw = ImageDraw.Draw(img)
        draw.text((10, 90), f"File error: {e}", fill="red")
        images.append(img)
    return images
def blur_sensitive_text(pil_img, custom_words=None):
    """Black out OCR-detected words that look like sensitive data.

    The image is OCR'd word by word with Tesseract. Every word whose OCR
    confidence is at least 60 is tested against a set of regular expressions
    (e-mail addresses, long digit runs, PAN-style identifiers, receipt or
    transaction references, plus any caller-supplied words). Matching words
    are covered with a filled black rectangle.

    Args:
        pil_img: Page image; converted with ``np.array`` and treated as RGB.
        custom_words: Optional iterable of extra words to redact. Each is
            matched case-insensitively against a whole OCR token.
            NOTE(review): matching is per OCR token, so a custom phrase that
            contains spaces presumably never matches - confirm.

    Returns:
        A tuple ``(rgb_array, altered)``: the redacted image as an RGB array
        and a bool telling whether at least one word was covered.
    """
    np_img = np.array(pil_img)
    img = cv2.cvtColor(np_img, cv2.COLOR_RGB2BGR)
    gray = cv2.cvtColor(img, cv2.COLOR_BGR2GRAY)
    data = pytesseract.image_to_data(gray, output_type=pytesseract.Output.DICT)

    patterns = [
        r"[a-zA-Z0-9_.+-]+@[a-zA-Z0-9-]+\.[a-zA-Z0-9-.]+",
        r"\b\d{10}\b",
        r"\b\d{4}[-\s]?\d{4}[-\s]?\d{4,6}\b",
        r"\b\d{5,}\b",
        r"\b\d{4}\s\d{4}\s\d{4}\b",
        r"\b[A-Z]{5}\d{4}[A-Z]\b",
        r"(?i)(rcpt|txn|order|ref|payment|utr)[^\s]{3,}",
    ]
    for custom in custom_words or ():
        custom = custom.strip()
        if custom:
            patterns.append(rf"(?i)\b{re.escape(custom)}\b")

    # Compile once instead of handing pattern strings to re.fullmatch per word.
    # Matching is case-insensitive for every candidate form of a token.
    matchers = [re.compile(pattern, re.IGNORECASE) for pattern in patterns]

    # Punctuation that commonly clings to a token ("9876543210,", "(a@b.com)",
    # "+919876543210") and would otherwise defeat a full-token match.
    edge_punct = ".,;:!?()[]{}<>\"'+"

    altered = False
    rows = zip(data['text'], data['conf'], data['left'],
               data['top'], data['width'], data['height'])
    for word, conf, x, y, w, h in rows:
        # Depending on the pytesseract/Tesseract versions the confidence may
        # be an int, a float, or a string such as "96.5"; int("96.5") raises,
        # which used to make every word get skipped silently.
        try:
            confidence = float(conf)
        except (TypeError, ValueError):
            continue
        if confidence < 60:
            continue

        # Coerce defensively in case the OCR layer hands back a non-string.
        token = str(word if word is not None else "").strip()
        if not token:
            continue

        trimmed = token.strip(edge_punct)
        candidates = {token, trimmed}
        # Also try each form with spaces and hyphens removed.
        candidates.update(
            [form.replace(" ", "").replace("-", "") for form in (token, trimmed)]
        )
        candidates.discard("")

        if any(m.fullmatch(c) for m in matchers for c in candidates):
            cv2.rectangle(img, (x, y), (x + w, y + h), (0, 0, 0), -1)
            altered = True

    return cv2.cvtColor(img, cv2.COLOR_BGR2RGB), altered
def blur_faces(np_img):
    """Gaussian-blur every frontal face detected in an RGB image array.

    Works on a copy of ``np_img``; the input array is left untouched.

    Args:
        np_img: Image array, converted to grayscale with ``COLOR_RGB2GRAY``
            for detection.

    Returns:
        A tuple ``(image, altered)``: the processed copy and a bool that is
        True when at least one face region was blurred.
    """
    output = np_img.copy()

    cascade_file = cv2.data.haarcascades + "haarcascade_frontalface_default.xml"
    detector = cv2.CascadeClassifier(cascade_file)

    grayscale = cv2.cvtColor(output, cv2.COLOR_RGB2GRAY)
    detections = detector.detectMultiScale(grayscale, 1.1, 5)

    for left, top, width, height in detections:
        # Blur the face rectangle in place on the copy.
        rows = slice(top, top + height)
        cols = slice(left, left + width)
        output[rows, cols] = cv2.GaussianBlur(output[rows, cols], (51, 51), 30)

    return output, len(detections) > 0
def redact_document(filepath, redact_text=True, redact_faces=True, custom_input=""):
    """Redact an uploaded document and write the result to a PDF.

    The file is converted to page images with ``convert_to_images``. Each
    page optionally goes through ``blur_sensitive_text`` and ``blur_faces``.
    A page on which neither step changed anything is stamped with a green
    "No sensitive info found" note. All pages are then saved into one PDF.

    Args:
        filepath: Path of the uploaded file.
        redact_text: Run the OCR-based text redaction when True.
        redact_faces: Run the face blurring when True.
        custom_input: Comma-separated extra words to redact; blanks ignored.

    Returns:
        A tuple ``(pages, path)``. On success ``pages`` is the list of
        redacted PIL images and ``path`` is the generated PDF. If anything
        fails, the traceback is printed and the tuple holds a single error
        image and the path of a PNG copy of it.
    """
    try:
        custom_words = [w.strip() for w in (custom_input or "").split(",") if w.strip()]
        pages = convert_to_images(filepath)
        if not pages:
            # Fail with a clear message rather than an IndexError further down.
            raise ValueError("No pages could be read from the uploaded file")

        redacted_pages = []
        for page in pages:
            img_array = np.array(page)
            text_altered = face_altered = False
            if redact_text:
                img_array, text_altered = blur_sensitive_text(page, custom_words)
            if redact_faces:
                img_array, face_altered = blur_faces(img_array)
            if not (text_altered or face_altered):
                # ASCII only: OpenCV's Hershey fonts cannot draw other glyphs.
                cv2.putText(img_array, "No sensitive info found", (50, 100),
                            cv2.FONT_HERSHEY_SIMPLEX, 1.2, (0, 255, 0), 3)
            redacted_pages.append(Image.fromarray(img_array))

        # Reserve a unique file name. A name built only from a one-second
        # timestamp lets simultaneous requests overwrite each other's output.
        ts = datetime.now().strftime("%Y%m%d_%H%M%S")
        with tempfile.NamedTemporaryFile(
            prefix=f"redacted_{ts}_", suffix=".pdf", delete=False
        ) as tmp:
            output_pdf = tmp.name
        redacted_pages[0].save(output_pdf, save_all=True, append_images=redacted_pages[1:])
        return redacted_pages, output_pdf
    except Exception as e:
        # Top-level boundary for the UI callback: log the traceback and hand
        # back an error image instead of letting the exception escape.
        print("❌ Error:", traceback.format_exc())
        img = Image.new("RGB", (800, 200), "white")
        draw = ImageDraw.Draw(img)
        draw.text((10, 90), f"Error: {e}", fill="red")
        ts = datetime.now().strftime("%Y%m%d_%H%M%S")
        with tempfile.NamedTemporaryFile(
            prefix=f"error_{ts}_", suffix=".png", delete=False
        ) as tmp:
            fallback = tmp.name
        img.save(fallback)
        return [img], fallback
# Input widgets, in the positional order redact_document expects its arguments.
upload_input = gr.File(label="Upload image, PDF, or DOCX", type="filepath")
text_toggle = gr.Checkbox(label="Redact Sensitive Text", value=True)
face_toggle = gr.Checkbox(label="Redact Faces", value=True)
keyword_box = gr.Textbox(
    label="Custom words/phrases (comma separated)",
    placeholder="e.g., Harshita, PAN, 123456",
)

# Output widgets, matching the (pages, file path) pair redact_document returns.
preview_gallery = gr.Gallery(label="Redacted Preview", columns=1)
download_file = gr.File(label="Download Redacted PDF")

iface = gr.Interface(
    fn=redact_document,
    inputs=[upload_input, text_toggle, face_toggle, keyword_box],
    outputs=[preview_gallery, download_file],
    title="π Smart Doc Redactor",
    description="Redact sensitive info (emails, Aadhaar, PAN, phone, card numbers, faces). Add custom keywords too.",
)

# Launch the interface when this script runs.
iface.launch()