Spaces:
Sleeping
Sleeping
aladhefafalquran committed on
Commit ·
404d73f
1
Parent(s): 7dcfccf
Switch to Full Page Rasterization (Nuclear Method)
Browse files
app.py
CHANGED
|
@@ -69,76 +69,68 @@ import fitz # PyMuPDF
|
|
| 69 |
from concurrent.futures import ThreadPoolExecutor
|
| 70 |
|
| 71 |
def process_pdf(pdf_file, image_editor_data):
|
| 72 |
-
"""
|
| 73 |
if pdf_file is None or image_editor_data is None:
|
| 74 |
return None
|
| 75 |
|
| 76 |
-
# 1.
|
| 77 |
full_mask = get_mask_from_dict(image_editor_data)
|
| 78 |
|
| 79 |
# Dilate mask slightly to be safe
|
| 80 |
kernel = np.ones((5,5), np.uint8)
|
| 81 |
-
full_mask = cv2.dilate(full_mask, kernel, iterations=
|
| 82 |
-
|
| 83 |
-
#
|
| 84 |
-
|
| 85 |
-
|
| 86 |
-
|
| 87 |
-
|
| 88 |
-
|
| 89 |
-
|
| 90 |
-
|
| 91 |
-
# Open PDF
|
| 92 |
-
doc = fitz.open(pdf_file.name)
|
| 93 |
-
total_pages = len(doc)
|
| 94 |
|
| 95 |
-
|
|
|
|
|
|
|
| 96 |
|
| 97 |
-
for i in
|
| 98 |
-
|
|
|
|
| 99 |
|
| 100 |
-
#
|
| 101 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 102 |
|
| 103 |
-
#
|
| 104 |
-
|
| 105 |
-
|
| 106 |
-
y1 * scale,
|
| 107 |
-
x2 * scale,
|
| 108 |
-
y2 * scale
|
| 109 |
-
)
|
| 110 |
-
|
| 111 |
-
# 1. Image Object Removal (For Logos)
|
| 112 |
-
# Scan for images that are fully inside our mask
|
| 113 |
-
images = page.get_images()
|
| 114 |
-
for img in images:
|
| 115 |
-
xref = img[0]
|
| 116 |
-
# Get image bbox
|
| 117 |
-
try:
|
| 118 |
-
img_rect = page.get_image_rect(xref)
|
| 119 |
-
# If image is mostly inside our delete-zone
|
| 120 |
-
if rect.intersects(img_rect):
|
| 121 |
-
intersection = rect & img_rect
|
| 122 |
-
if intersection.get_area() > (img_rect.get_area() * 0.8):
|
| 123 |
-
print(f"Deleting logo/image on page {i+1}")
|
| 124 |
-
page.delete_image(xref)
|
| 125 |
-
except:
|
| 126 |
-
pass
|
| 127 |
-
|
| 128 |
-
# 2. Text/Vector Removal (For Text Watermarks)
|
| 129 |
-
# We redact the area, but we tell it NOT to remove background images
|
| 130 |
-
page.add_redact_annot(rect, fill=[], stroke=[]) # Transparent redaction
|
| 131 |
-
|
| 132 |
-
# CRITICAL FLAG: fitz.PDF_REDACT_IMAGE_NONE
|
| 133 |
-
# This tells it: "Remove text and drawings, but LEAVE THE BACKGROUND IMAGE ALONE"
|
| 134 |
-
page.apply_redactions(images=fitz.PDF_REDACT_IMAGE_NONE)
|
| 135 |
|
| 136 |
-
#
|
| 137 |
output_path = tempfile.mktemp(suffix=".pdf")
|
| 138 |
-
|
| 139 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 140 |
|
| 141 |
-
return
|
| 142 |
|
| 143 |
|
| 144 |
# --- UI Construction ---
|
|
|
|
| 69 |
from concurrent.futures import ThreadPoolExecutor
|
| 70 |
|
| 71 |
def process_pdf(pdf_file, image_editor_data):
    """Nuclear Method: Full Page Rasterization.

    Rasterizes every page of the PDF at 300 DPI, inpaints the masked region
    of each page with ``lama``, and writes the cleaned pages to a new PDF.

    Args:
        pdf_file: Uploaded file object; only its ``.name`` attribute (the
            path handed to ``pdf2image``) is read.
        image_editor_data: Editor payload passed to ``get_mask_from_dict``
            to build the mask the user defined on the page-1 preview.

    Returns:
        Path of the generated PDF, or ``None`` when an input is missing,
        the PDF cannot be rasterized, or it yields no pages.
    """
    if pdf_file is None or image_editor_data is None:
        return None

    # 1. Get the mask defined by user on Page 1
    full_mask = get_mask_from_dict(image_editor_data)

    # Dilate mask slightly to be safe
    kernel = np.ones((5, 5), np.uint8)
    full_mask = cv2.dilate(full_mask, kernel, iterations=3)

    # 2. Convert ALL pages to High-Res Images (300 DPI)
    # This "flattens" vector graphics into pixels, solving the color profile mismatch.
    print("Rasterizing PDF to Images (300 DPI)...")
    try:
        pages = pdf2image.convert_from_path(pdf_file.name, dpi=300)
    except Exception as e:
        # Best-effort boundary: report the failure and give the caller no file.
        print(f"Error converting PDF: {e}")
        return None

    cleaned_pages = []
    total_pages = len(pages)
    print(f"Processing {total_pages} pages...")

    # Resized masks keyed by (height, width), so pages that share a size
    # reuse one resize instead of recomputing it for every page.
    masks_by_size = {tuple(full_mask.shape[:2]): full_mask}

    for i, page in enumerate(pages):
        # Convert PIL to Numpy
        img_np = np.array(page)

        # Ensure RGB
        if img_np.ndim == 2:
            img_np = cv2.cvtColor(img_np, cv2.COLOR_GRAY2RGB)
        elif img_np.ndim == 3 and img_np.shape[2] == 4:
            img_np = cv2.cvtColor(img_np, cv2.COLOR_RGBA2RGB)

        # Resize mask if page size differs from preview
        page_size = tuple(img_np.shape[:2])
        current_mask = masks_by_size.get(page_size)
        if current_mask is None:
            # cv2.resize takes (width, height); nearest-neighbour is the
            # interpolation the mask has always been resized with.
            current_mask = cv2.resize(
                full_mask,
                (img_np.shape[1], img_np.shape[0]),
                interpolation=cv2.INTER_NEAREST,
            )
            masks_by_size[page_size] = current_mask

        # Run AI (Inpainting)
        # Since input is now RGB pixels, the AI's RGB output will blend much better.
        result = lama.predict(img_np, current_mask)

        # Convert back to PIL for PDF saving
        cleaned_pages.append(Image.fromarray(result))
        print(f"Processed page {i+1}/{total_pages}")

    if not cleaned_pages:
        return None

    # 3. Save back to PDF with Max Quality
    # tempfile.mktemp() is deprecated and race-prone (another process can
    # claim the name before we write). Create the file atomically and keep
    # it on disk (delete=False) so the caller can serve it.
    with tempfile.NamedTemporaryFile(suffix=".pdf", delete=False) as tmp:
        output_path = tmp.name

    cleaned_pages[0].save(
        output_path,
        save_all=True,
        append_images=cleaned_pages[1:],
        quality=100,  # Max JPEG quality
        resolution=300.0,  # Maintain High DPI
        subsampling=0,  # Disable chroma subsampling for sharper colors
    )
    return output_path
|
| 134 |
|
| 135 |
|
| 136 |
# --- UI Construction ---
|