mastari committed on
Commit
943fefc
·
1 Parent(s): be56361
Files changed (3) hide show
  1. .DS_Store +0 -0
  2. handler.py +79 -162
  3. requirements.txt +7 -4
.DS_Store ADDED
Binary file (6.15 kB). View file
 
handler.py CHANGED
@@ -1,181 +1,98 @@
1
- import base64, cv2, numpy as np, importlib.util
2
- from typing import Dict, Any
3
-
 
 
 
 
 
4
 
5
  class EndpointHandler:
6
- """
7
- Robust hybrid text-removal handler:
8
- • Uses EasyOCR (pixel-level) if available
9
- • Falls back to EAST detector otherwise
10
- • Expands & merges masks for full caption coverage
11
- • Returns both mask overlay and inpainted (cleaned) image
12
- """
13
-
14
- def __init__(self, path: str = ""):
15
- easyocr_spec = importlib.util.find_spec("easyocr")
16
- if easyocr_spec:
17
- import easyocr
18
- self.reader = easyocr.Reader(["en"], gpu=False)
19
- self.use_easyocr = True
20
- print("[INIT] Using EasyOCR text detector")
21
- else:
22
- model_path = f"{path}/frozen_east_text_detection.pb"
23
- self.net = cv2.dnn.readNet(model_path)
24
- self.use_easyocr = False
25
- print(f"[INIT] Using EAST model from {model_path}")
26
-
27
- # ----------------------------- INFERENCE -----------------------------
28
- def __call__(self, data: Dict[str, Any]) -> Dict[str, Any]:
29
- inputs = data.get("inputs", data)
30
- image_b64 = inputs.get("image")
31
- if not image_b64:
32
- raise ValueError("Missing 'image' in inputs")
33
-
34
- img = self._decode_image(image_b64)
35
- mask = self._make_mask(img)
36
- cleaned = cv2.inpaint(img, mask, 3, cv2.INPAINT_TELEA)
37
 
38
- # visualize mask overlay
39
- vis = img.copy()
40
- contours, _ = cv2.findContours(mask, cv2.RETR_EXTERNAL, cv2.CHAIN_APPROX_SIMPLE)
41
- cv2.drawContours(vis, contours, -1, (0, 0, 255), 2)
42
 
43
- return {
44
- "mask_image": self._encode_image(vis),
45
- "cleaned_image": self._encode_image(cleaned),
46
- }
 
 
 
47
 
48
- # ----------------------------- UTILITIES -----------------------------
49
- def _decode_image(self, b64):
50
- data = base64.b64decode(b64)
51
- np_arr = np.frombuffer(data, np.uint8)
52
- return cv2.imdecode(np_arr, cv2.IMREAD_COLOR)
53
 
54
- def _encode_image(self, im):
55
- _, buf = cv2.imencode(".png", im)
56
- return base64.b64encode(buf).decode("utf-8")
 
57
 
58
- # ----------------------------- MASK CREATION -----------------------------
59
  def _make_mask(self, img):
60
  mask = np.zeros(img.shape[:2], np.uint8)
61
  h, w = img.shape[:2]
62
 
63
- if self.use_easyocr:
64
- results = self.reader.readtext(img)
65
- for det in results:
66
- try:
67
- box, text, conf = det
68
- if conf < 0.6:
69
- continue
70
-
71
- pts = np.array(box, np.int32)
72
- x, y, bw, bh = cv2.boundingRect(pts)
73
-
74
- # Skip very small noise
75
- if bw < 0.015 * w or bh < 0.015 * h:
76
- continue
77
-
78
- # Calculate local contrast
79
- roi = img[max(0, y):min(h, y + bh), max(0, x):min(w, x + bw)]
80
- gray = cv2.cvtColor(roi, cv2.COLOR_BGR2GRAY)
81
- contrast = gray.std()
82
-
83
- # Base padding — increase if high contrast or thick captions
84
- pad_scale = 0.03 # 3% of image width
85
- pad = max(int(w * pad_scale), 10)
86
- if contrast > 25:
87
- pad = int(pad * 1.5)
88
-
89
- # Expand more vertically — typical caption boxes have extra height
90
- pad_x = pad
91
- pad_y = int(pad * 1.4)
92
-
93
- x0, y0 = max(0, x - pad_x), max(0, y - pad_y)
94
- x1, y1 = min(w, x + bw + pad_x), min(h, y + bh + pad_y)
95
- cv2.rectangle(mask, (x0, y0), (x1, y1), 255, -1)
96
-
97
- except Exception as e:
98
- print(f"[WARN] Skipped invalid detection: {e}")
99
-
100
- else:
101
- boxes = self._east_boxes(img)
102
- for (x0, y0, x1, y1) in boxes:
103
- pad = 10
104
- cv2.rectangle(
105
- mask,
106
- (max(0, x0 - pad), max(0, y0 - pad)),
107
- (min(w, x1 + pad), min(h, y1 + pad)),
108
- 255,
109
- -1,
110
- )
111
-
112
- # Merge nearby boxes and smooth edges
113
  kernel = np.ones((9, 9), np.uint8)
114
  mask = cv2.morphologyEx(mask, cv2.MORPH_CLOSE, kernel, iterations=3)
115
  mask = cv2.dilate(mask, kernel, iterations=2)
116
-
117
- # Feather slightly to eliminate border seams
118
- mask = cv2.GaussianBlur(mask, (7, 7), 2)
119
  mask = (mask > 100).astype(np.uint8) * 255
120
 
121
  return mask
122
 
123
- # ----------------------------- EAST FALLBACK -----------------------------
124
- def _east_boxes(self, image, conf_threshold=0.5):
125
- h, w = image.shape[:2]
126
- new_w, new_h = 320, 320
127
- r_w, r_h = w / new_w, h / new_h
128
- blob = cv2.dnn.blobFromImage(
129
- image,
130
- 1.0,
131
- (new_w, new_h),
132
- (123.68, 116.78, 103.94),
133
- swapRB=True,
134
- crop=False,
135
- )
136
- self.net.setInput(blob)
137
- scores, geometry = self.net.forward(
138
- ["feature_fusion/Conv_7/Sigmoid", "feature_fusion/concat_3"]
139
- )
140
- rects, confidences = self._decode(scores, geometry, conf_threshold)
141
- indices = cv2.dnn.NMSBoxes(rects, confidences, conf_threshold, 0.4)
142
- boxes = []
143
- if len(indices) > 0:
144
- for i in indices.flatten():
145
- x0, y0, x1, y1 = rects[i]
146
- boxes.append(
147
- [
148
- max(0, int(x0 * r_w)),
149
- max(0, int(y0 * r_h)),
150
- min(w, int(x1 * r_w)),
151
- min(h, int(y1 * r_h)),
152
- ]
153
- )
154
- return boxes
155
-
156
- def _decode(self, scores, geometry, conf_threshold):
157
- num_rows, num_cols = scores.shape[2:4]
158
- rects, confidences = [], []
159
- for y in range(num_rows):
160
- scores_data = scores[0, 0, y]
161
- x0 = geometry[0, 0, y]
162
- x1 = geometry[0, 1, y]
163
- x2 = geometry[0, 2, y]
164
- x3 = geometry[0, 3, y]
165
- angles = geometry[0, 4, y]
166
- for x in range(num_cols):
167
- if scores_data[x] < conf_threshold:
168
- continue
169
- offset_x, offset_y = x * 4.0, y * 4.0
170
- angle = angles[x]
171
- cos, sin = np.cos(angle), np.sin(angle)
172
- h_ = x0[x] + x2[x]
173
- w_ = x1[x] + x3[x]
174
- end_x = int(offset_x + cos * x1[x] + sin * x2[x])
175
- end_y = int(offset_y - sin * x1[x] + cos * x2[x])
176
- start_x = int(end_x - w_)
177
- start_y = int(end_y - h_)
178
- rects.append((start_x, start_y, end_x, end_y))
179
- confidences.append(float(scores_data[x]))
180
- return rects, confidences
181
 
 
1
+ import base64
2
+ import io
3
+ import cv2
4
+ import numpy as np
5
+ from PIL import Image
6
+ import torch
7
+ from diffusers import StableDiffusionInpaintPipeline
8
+ import easyocr
9
 
10
class EndpointHandler:
    """
    Text-removal inference endpoint.

    Pipeline:
      • EasyOCR locates caption/text regions in the input image.
      • A padded, merged, feathered mask is built over those regions.
      • Stable Diffusion 2 inpainting fills the masked regions.

    Returns the cleaned image and a mask-overlay visualization, each as a
    base64-encoded PNG.
    """

    def __init__(self, path=""):
        """Load the OCR reader and the inpainting pipeline.

        Picks CUDA + float16 when a GPU is available, otherwise falls back
        to CPU + float32 instead of crashing on `.to("cuda")`.
        """
        print("[INIT] Loading EasyOCR and Stable Diffusion Inpainting model...")

        use_cuda = torch.cuda.is_available()

        # Text detector — only request GPU when one actually exists.
        self.reader = easyocr.Reader(["en"], gpu=use_cuda)

        # SOTA inpainting model; fp16 weights only make sense on CUDA.
        self.pipe = StableDiffusionInpaintPipeline.from_pretrained(
            "stabilityai/stable-diffusion-2-inpainting",
            torch_dtype=torch.float16 if use_cuda else torch.float32,
        ).to("cuda" if use_cuda else "cpu")

        print("[READY] Handler initialized successfully.")

    # Decode incoming base64 image → numpy (BGR, as OpenCV expects)
    def _decode_image(self, b64_image):
        """Decode a base64 string into a BGR uint8 numpy image."""
        img_bytes = base64.b64decode(b64_image)
        img = Image.open(io.BytesIO(img_bytes)).convert("RGB")
        return cv2.cvtColor(np.array(img), cv2.COLOR_RGB2BGR)

    # Encode numpy (BGR) → base64 PNG
    def _encode_image(self, img):
        """Encode a BGR numpy image as a base64-encoded PNG string."""
        _, buffer = cv2.imencode(".png", img)
        return base64.b64encode(buffer).decode("utf-8")

    # Make mask from detected text boxes
    def _make_mask(self, img):
        """Build a binary (0/255) mask covering detected text regions.

        Detections below confidence 0.6 or smaller than ~2% of image width /
        1.5% of height are treated as noise. Surviving boxes are padded
        (more vertically — caption boxes tend to have extra height), merged
        morphologically, then feathered and re-binarized so the inpainting
        region has no hard seams.
        """
        mask = np.zeros(img.shape[:2], np.uint8)
        h, w = img.shape[:2]

        results = self.reader.readtext(img)
        for det in results:
            try:
                box, text, conf = det
                if conf < 0.6:
                    continue

                pts = np.array(box, np.int32)
                x, y, bw, bh = cv2.boundingRect(pts)
                if bw < 0.02 * w or bh < 0.015 * h:
                    continue  # skip very small noise

                pad_scale = 0.03  # padding relative to image width
                pad = max(int(w * pad_scale), 12)
                pad_x, pad_y = pad, int(pad * 1.4)
                x0, y0 = max(0, x - pad_x), max(0, y - pad_y)
                x1, y1 = min(w, x + bw + pad_x), min(h, y + bh + pad_y)
                cv2.rectangle(mask, (x0, y0), (x1, y1), 255, -1)
            except Exception:
                # A malformed detection tuple should not fail the request.
                continue

        # Merge and feather mask
        kernel = np.ones((9, 9), np.uint8)
        mask = cv2.morphologyEx(mask, cv2.MORPH_CLOSE, kernel, iterations=3)
        mask = cv2.dilate(mask, kernel, iterations=2)
        mask = cv2.GaussianBlur(mask, (9, 9), 3)
        mask = (mask > 100).astype(np.uint8) * 255

        return mask

    def __call__(self, data):
        """Handle one inference request.

        Accepts either {"inputs": {"image": <b64>}} or a bare
        {"image": <b64>} payload (the previous handler's contract);
        raises ValueError when no image is supplied.

        Returns {"image": <b64 PNG>, "mask_overlay": <b64 PNG>}.
        """
        # Tolerate both wrapped and bare payloads instead of raising
        # KeyError on a missing "inputs" key.
        inputs = data.get("inputs", data)
        if "image" not in inputs:
            raise ValueError("Missing 'image' field in inputs")

        # Decode input image
        img = self._decode_image(inputs["image"])
        mask = self._make_mask(img)

        # Convert to PIL for pipeline
        img_pil = Image.fromarray(cv2.cvtColor(img, cv2.COLOR_BGR2RGB))
        mask_pil = Image.fromarray(mask)

        # Run inpainting (prompt left blank to stay realistic)
        print("[INPAINT] Running Stable Diffusion 2 inpainting...")
        out = self.pipe(prompt="", image=img_pil, mask_image=mask_pil).images[0]

        # The pipeline works at its own resolution (512px default,
        # multiples of 8) — resize back so the cleaned image matches the
        # input and the overlay.
        if out.size != img_pil.size:
            out = out.resize(img_pil.size, Image.LANCZOS)
        cleaned = cv2.cvtColor(np.array(out), cv2.COLOR_RGB2BGR)

        # Optional mask overlay for visualization
        mask_overlay = img.copy()
        contours, _ = cv2.findContours(mask, cv2.RETR_EXTERNAL, cv2.CHAIN_APPROX_SIMPLE)
        cv2.drawContours(mask_overlay, contours, -1, (0, 0, 255), 2)

        # Encode results
        return {
            "image": self._encode_image(cleaned),
            "mask_overlay": self._encode_image(mask_overlay),
        }
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
98
 
requirements.txt CHANGED
@@ -1,7 +1,10 @@
 
 
 
 
 
1
  opencv-python-headless>=4.8.0
2
- numpy>=1.26.0
3
- Pillow
4
- # Optional craft replacement – pure Python, compatible with Py3.11
5
  easyocr>=1.7.1
6
- torch>=2.1.0
 
7
 
 
1
+ torch>=2.1.0
2
+ torchvision
3
+ diffusers>=0.29.0
4
+ transformers>=4.41.0
5
+ accelerate
6
  opencv-python-headless>=4.8.0
 
 
 
7
  easyocr>=1.7.1
8
+ Pillow>=10.2.0
9
+ numpy>=1.26.0
10