mastari committed on
Commit
286af0e
·
1 Parent(s): 943fefc
Files changed (2) hide show
  1. handler.py +31 -70
  2. requirements.txt +0 -1
handler.py CHANGED
@@ -4,95 +4,56 @@ import cv2
4
  import numpy as np
5
  from PIL import Image
6
  import torch
7
- from diffusers import StableDiffusionInpaintPipeline
8
- import easyocr
9
 
10
  class EndpointHandler:
11
  def __init__(self, path=""):
12
- print("[INIT] Loading EasyOCR and Stable Diffusion Inpainting model...")
13
 
14
- # Text detector
15
- self.reader = easyocr.Reader(["en"], gpu=True)
16
-
17
- # SOTA inpainting model
18
- self.pipe = StableDiffusionInpaintPipeline.from_pretrained(
19
- "stabilityai/stable-diffusion-2-inpainting",
20
  torch_dtype=torch.float16,
 
21
  ).to("cuda")
22
 
23
- print("[READY] Handler initialized successfully.")
 
 
24
 
25
- # Decode incoming base64 image → numpy
26
  def _decode_image(self, b64_image):
27
  img_bytes = base64.b64decode(b64_image)
28
  img = Image.open(io.BytesIO(img_bytes)).convert("RGB")
29
- return cv2.cvtColor(np.array(img), cv2.COLOR_RGB2BGR)
30
-
31
- # Encode numpy → base64 PNG
32
- def _encode_image(self, img):
33
- _, buffer = cv2.imencode(".png", img)
34
- return base64.b64encode(buffer).decode("utf-8")
35
-
36
- # Make mask from detected text boxes
37
- def _make_mask(self, img):
38
- mask = np.zeros(img.shape[:2], np.uint8)
39
- h, w = img.shape[:2]
40
-
41
- results = self.reader.readtext(img)
42
- for det in results:
43
- try:
44
- box, text, conf = det
45
- if conf < 0.6:
46
- continue
47
 
48
- pts = np.array(box, np.int32)
49
- x, y, bw, bh = cv2.boundingRect(pts)
50
- if bw < 0.02 * w or bh < 0.015 * h:
51
- continue
52
-
53
- pad_scale = 0.03
54
- pad = max(int(w * pad_scale), 12)
55
- pad_x, pad_y = pad, int(pad * 1.4)
56
- x0, y0 = max(0, x - pad_x), max(0, y - pad_y)
57
- x1, y1 = min(w, x + bw + pad_x), min(h, y + bh + pad_y)
58
- cv2.rectangle(mask, (x0, y0), (x1, y1), 255, -1)
59
- except Exception:
60
- continue
61
-
62
- # Merge and feather mask
63
- kernel = np.ones((9, 9), np.uint8)
64
- mask = cv2.morphologyEx(mask, cv2.MORPH_CLOSE, kernel, iterations=3)
65
- mask = cv2.dilate(mask, kernel, iterations=2)
66
- mask = cv2.GaussianBlur(mask, (9, 9), 3)
67
- mask = (mask > 100).astype(np.uint8) * 255
68
-
69
- return mask
70
 
71
  def __call__(self, data):
72
  if "image" not in data["inputs"]:
73
  raise ValueError("Missing 'image' field in inputs")
74
 
75
- # Decode input image
76
- img = self._decode_image(data["inputs"]["image"])
77
- mask = self._make_mask(img)
 
78
 
79
- # Convert to PIL for pipeline
80
- img_pil = Image.fromarray(cv2.cvtColor(img, cv2.COLOR_BGR2RGB))
81
- mask_pil = Image.fromarray(mask)
82
 
83
- # Run inpainting (prompt left blank to stay realistic)
84
- print("[INPAINT] Running Stable Diffusion 2 inpainting...")
85
- out = self.pipe(prompt="", image=img_pil, mask_image=mask_pil).images[0]
86
- cleaned = cv2.cvtColor(np.array(out), cv2.COLOR_RGB2BGR)
 
 
 
 
 
87
 
88
- # Optional mask overlay for visualization
89
- mask_overlay = img.copy()
90
- contours, _ = cv2.findContours(mask, cv2.RETR_EXTERNAL, cv2.CHAIN_APPROX_SIMPLE)
91
- cv2.drawContours(mask_overlay, contours, -1, (0, 0, 255), 2)
92
 
93
- # Encode results
94
- return {
95
- "image": self._encode_image(cleaned),
96
- "mask_overlay": self._encode_image(mask_overlay),
97
- }
98
 
 
4
  import numpy as np
5
  from PIL import Image
6
  import torch
7
+ from diffusers import AutoPipelineForInpainting
 
8
 
9
  class EndpointHandler:
10
  def __init__(self, path=""):
11
+ print("[INIT] Loading Nano Banana SDXL Inpainting pipeline...")
12
 
13
+ # Load Nano Banana (SDXL fine-tuned)
14
+ self.pipe = AutoPipelineForInpainting.from_pretrained(
15
+ "SG161222/RealVisXL_V4.0_Nano-Banana",
 
 
 
16
  torch_dtype=torch.float16,
17
+ variant="fp16"
18
  ).to("cuda")
19
 
20
+ # Default high-level removal instruction
21
+ self.default_prompt = "remove text captions, natural background, realistic restoration"
22
+ print("[READY] Nano Banana model loaded successfully.")
23
 
 
24
  def _decode_image(self, b64_image):
25
  img_bytes = base64.b64decode(b64_image)
26
  img = Image.open(io.BytesIO(img_bytes)).convert("RGB")
27
+ return img
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
28
 
29
+ def _encode_image(self, pil_img):
30
+ buf = io.BytesIO()
31
+ pil_img.save(buf, format="PNG")
32
+ return base64.b64encode(buf.getvalue()).decode("utf-8")
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
33
 
34
  def __call__(self, data):
35
  if "image" not in data["inputs"]:
36
  raise ValueError("Missing 'image' field in inputs")
37
 
38
+ prompt = data["inputs"].get("prompt", self.default_prompt)
39
+
40
+ # Decode base64 → PIL
41
+ img_pil = self._decode_image(data["inputs"]["image"])
42
 
43
+ print(f"[PROCESS] Running Nano Banana with prompt: '{prompt}'")
 
 
44
 
45
+ # Inpaint the whole image (no mask full generative clean-up)
46
+ result = self.pipe(
47
+ prompt=prompt,
48
+ image=img_pil,
49
+ mask_image=None,
50
+ guidance_scale=3.0,
51
+ strength=0.85,
52
+ num_inference_steps=25
53
+ ).images[0]
54
 
55
+ # Encode result back to base64
56
+ cleaned_b64 = self._encode_image(result)
 
 
57
 
58
+ return {"image": cleaned_b64}
 
 
 
 
59
 
requirements.txt CHANGED
@@ -4,7 +4,6 @@ diffusers>=0.29.0
4
  transformers>=4.41.0
5
  accelerate
6
  opencv-python-headless>=4.8.0
7
- easyocr>=1.7.1
8
  Pillow>=10.2.0
9
  numpy>=1.26.0
10
 
 
4
  transformers>=4.41.0
5
  accelerate
6
  opencv-python-headless>=4.8.0
 
7
  Pillow>=10.2.0
8
  numpy>=1.26.0
9