ynyg committed on
Commit
1c20dbd
·
1 Parent(s): 7bd5dc8

refactor: 重构分块推理逻辑并移除 Albumentations 依赖

Browse files

- 替换 Albumentations 的图像预处理逻辑,直接使用 PyTorch 实现归一化操作
- 引入自定义的分块推理方法 `_tiled_infer`,支持 Tiling + Overlap 机制
- 优化模型加载流程,动态检测和加载权重文件
- 使用 Torch 和 OpenCV 完成输入/输出的处理,移除了 Albumentations 相关代码
- 支持大尺寸图像的高效推理,提升内存和性能表现

Files changed (1) hide show
  1. app.py +184 -109
app.py CHANGED
@@ -1,11 +1,12 @@
1
  import json
 
2
  from contextlib import asynccontextmanager
3
  from pathlib import Path
4
 
5
- import albumentations as A
6
  import cv2
7
  import numpy as np
8
  import torch
 
9
  from anyio.to_thread import run_sync
10
  from fastapi import FastAPI, Request, UploadFile, File
11
  from fastapi.responses import Response
@@ -15,45 +16,168 @@ from segmentation_models_pytorch import UnetPlusPlus
15
  MODEL_PATH = "models/InkErase"
16
  # 設備
17
  device = "cuda" if torch.cuda.is_available() else "cpu"
18
- # 分块大小
19
- TRAIN_SIZE = 512
20
-
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
21
 
22
  def load_model() -> UnetPlusPlus:
23
  """加載模型"""
24
- # 模型路径
25
  path = Path(MODEL_PATH)
26
- # 读取配置文件
27
  cfg = json.loads((path / "config.json").read_text(encoding="utf-8"))
28
- # 加載模型
29
- return UnetPlusPlus(
30
  encoder_name=cfg.get("encoder_name", "resnet50"),
31
- encoder_weights=None,
32
  in_channels=int(cfg.get("in_channels", 3)),
33
  classes=int(cfg.get("classes", 3)),
34
  decoder_attention_type=cfg.get("decoder_attention_type"),
35
  activation=cfg.get("activation", "sigmoid"),
36
  )
37
-
38
-
39
- def get_preprocessing() -> A.Compose:
40
- """获取Albumentations 預處理 pipeline"""
41
- return A.Compose([
42
- A.Normalize(mean=(0, 0, 0), std=(1, 1, 1), max_pixel_value=255.0),
43
- A.ToTensorV2()
44
- ])
 
 
 
 
 
 
 
 
 
 
45
 
46
 
47
  @asynccontextmanager
48
  async def lifespan(instance: FastAPI):
49
- """
50
- FastAPI 應用程序的生命周期管理器。
51
- :param instance: FastAPI 應用程序實例
52
- """
53
- # 加載模型
54
  instance.state.model = load_model()
55
- # 初始化預處理函數
56
- instance.state.preprocess_fn = get_preprocessing()
57
  yield
58
 
59
 
@@ -63,105 +187,56 @@ app = FastAPI(lifespan=lifespan)
63
  @app.post("/predict")
64
  async def predict(request: Request, file: UploadFile = File(...)):
65
  """
66
- 笔迹擦除
67
- :param request: 请求对象
68
- :param file: 待处理的图片
69
- :return: 預測結果,包括文本、預測類別和置信度
70
  """
71
- # 1. 使用 OpenCV 直接從內存讀取圖片
72
  content = await file.read()
73
- # 將 bytes 轉換為 numpy array
74
  nparr = np.frombuffer(content, np.uint8)
75
- # 解碼圖片 (默認 BGR)
76
- original_image = cv2.imdecode(nparr, cv2.IMREAD_COLOR)
77
- # 转换为 RGB
78
- original_image = cv2.cvtColor(original_image, cv2.COLOR_BGR2RGB)
79
-
80
- # 获取图片尺寸
81
- orig_h, orig_w = original_image.shape[:2]
82
- # 获取模型和处理流
83
  model = request.app.state.model
84
- preprocess_fn = request.app.state.preprocess_fn
85
 
86
  def _inference_logic():
87
- with torch.no_grad():
88
- # ==============================
89
- # 情況 A: 圖片大於 512,進行切塊處理
90
- # ==============================
91
- if orig_w > TRAIN_SIZE or orig_h > TRAIN_SIZE:
92
- # 1. 計算新的寬高(補齊為 512 的倍數)
93
- new_w = (orig_w // TRAIN_SIZE + (1 if orig_w % TRAIN_SIZE != 0 else 0)) * TRAIN_SIZE
94
- new_h = (orig_h // TRAIN_SIZE + (1 if orig_h % TRAIN_SIZE != 0 else 0)) * TRAIN_SIZE
95
-
96
- # 2. Padding 原圖 (使用 NumPy 創建黑色畫布)
97
- padded_img = np.zeros((new_h, new_w, 3), dtype=np.uint8)
98
- # 將原圖貼到左上角 (NumPy 切片賦值)
99
- padded_img[:orig_h, :orig_w, :] = original_image
100
-
101
- # 創建結果掩碼畫布 (單通道)
102
- result_mask = np.zeros((new_h, new_w), dtype=np.uint8)
103
-
104
- # 3. 循環切割
105
- for y in range(0, new_h, TRAIN_SIZE):
106
- for x in range(0, new_w, TRAIN_SIZE):
107
- # NumPy 切片代替 crop
108
- patch = padded_img[y:y + TRAIN_SIZE, x:x + TRAIN_SIZE]
109
-
110
- # Albumentations 處理 (patch 已經是 numpy array)
111
- transformed = preprocess_fn(image=patch)
112
- input_tensor = transformed["image"].unsqueeze(0).to(device)
113
-
114
- output = model(input_tensor)
115
-
116
- pred_mask = (output > 0.5).float().squeeze().cpu().numpy()
117
- pred_mask = (pred_mask * 255).astype(np.uint8)
118
-
119
- # 將預測結果貼回大圖
120
- result_mask[y:y + TRAIN_SIZE, x:x + TRAIN_SIZE] = pred_mask
121
-
122
- # 裁剪回原始尺寸
123
- final_image = result_mask[:orig_h, :orig_w]
124
-
125
- # ==============================
126
- # 情況 B: 圖片小於等於 512
127
- # ==============================
128
- else:
129
- # 創建黑色畫布
130
- padded_img = np.zeros((TRAIN_SIZE, TRAIN_SIZE, 3), dtype=np.uint8)
131
- padded_img[:orig_h, :orig_w, :] = original_image
132
-
133
- # Albumentations 處理
134
- transformed = preprocess_fn(image=padded_img)
135
- input_tensor = transformed["image"].unsqueeze(0).to(device)
136
-
137
- output = model(input_tensor)
138
-
139
- pred_mask = (output > 0.5).float().squeeze().cpu().numpy()
140
- pred_mask = (pred_mask * 255).astype(np.uint8)
141
-
142
- # 裁剪回原始尺寸
143
- final_image = pred_mask[:orig_h, :orig_w]
144
-
145
- return final_image
146
-
147
- # 執行推理
148
- result_image = await run_sync(_inference_logic)
149
-
150
- # 返回圖片流 (使用 cv2.imencode)
151
- # result_image 是单通道灰度图,可以直接编码为 PNG
152
- success, encoded_image = cv2.imencode(".png", result_image)
153
  return Response(content=encoded_image.tobytes(), media_type="image/png")
154
 
155
 
156
  @app.get("/")
157
  def greet_json():
158
- """
159
- 返回一個 JSON 格式的歡迎訊息。
160
- """
161
  return {"Hello": "World!"}
162
 
163
 
164
  if __name__ == '__main__':
165
  import uvicorn
166
-
167
  uvicorn.run("app:app", host="0.0.0.0", port=8000)
 
1
  import json
2
+ import math
3
  from contextlib import asynccontextmanager
4
  from pathlib import Path
5
 
 
6
  import cv2
7
  import numpy as np
8
  import torch
9
+ import torch.nn.functional as F
10
  from anyio.to_thread import run_sync
11
  from fastapi import FastAPI, Request, UploadFile, File
12
  from fastapi.responses import Response
 
16
# Directory containing config.json and (optionally) model.safetensors.
MODEL_PATH = "models/InkErase"
# Inference device: prefer CUDA when available, otherwise fall back to CPU.
device = "cuda" if torch.cuda.is_available() else "cpu"
# Tile edge length for chunked inference (reference script default: 512).
TILE_SIZE = 512
# Overlap between adjacent tiles, cross-faded to hide seams (reference script default: 64).
OVERLAP = 64
23
+
24
+
25
+ # ==========================================
26
+ # 核心 Tiling 算法 (移植自 infer_hd.py)
27
+ # ==========================================
28
+
29
+ def _ceil_to_multiple(value: int, multiple: int) -> int:
30
+ if multiple <= 1:
31
+ return value
32
+ return int(math.ceil(value / multiple) * multiple)
33
+
34
+
35
+ def _build_starts(length: int, tile: int, stride: int) -> list[int]:
36
+ if length <= tile:
37
+ return [0]
38
+ starts = list(range(0, length - tile + 1, stride))
39
+ last = length - tile
40
+ if starts[-1] != last:
41
+ starts.append(last)
42
+ return starts
43
+
44
+
45
+ def _precompute_axis_weights(starts: list[int], tile: int, overlap: int) -> list[torch.Tensor]:
46
+ """预计算融合权重,用于消除拼接缝隙"""
47
+ max_start = starts[-1]
48
+ weights: list[torch.Tensor] = []
49
+ if overlap <= 0:
50
+ one = torch.ones(tile, dtype=torch.float32)
51
+ return [one for _ in starts]
52
+
53
+ # 创建渐变权重 (Ramp)
54
+ ramp_up = torch.linspace(0.0, 1.0, overlap, dtype=torch.float32)
55
+ ramp_down = torch.linspace(1.0, 0.0, overlap, dtype=torch.float32)
56
+
57
+ for start in starts:
58
+ w = torch.ones(tile, dtype=torch.float32)
59
+ if start > 0:
60
+ w[:overlap] *= ramp_up
61
+ if start < max_start:
62
+ w[-overlap:] *= ramp_down
63
+ weights.append(w)
64
+ return weights
65
+
66
+
67
def _tiled_infer(
    model: torch.nn.Module,
    x_cpu: torch.Tensor,
    tile_size: int = 512,
    overlap: int = 64,
    batch_size: int = 1,
    pad_multiple: int = 32,
    pad_mode: str = "replicate",
) -> torch.Tensor:
    """Run `model` over `x_cpu` tile by tile and blend the results seamlessly.

    x_cpu is a [1, C, H, W] float tensor on the CPU. The image is padded up
    to at least one full tile (and to a multiple of `pad_multiple`), split
    into overlapping tiles, inferred in batches on the global `device`, and
    recombined using linear cross-fade weights. Returns a [1, C, H, W]
    tensor clamped to [0, 1] and cropped back to the input size.
    """
    _, _, orig_h, orig_w = x_cpu.shape

    # 1. Pad so the canvas holds at least one tile and divides evenly.
    canvas_h = _ceil_to_multiple(max(orig_h, tile_size), pad_multiple)
    canvas_w = _ceil_to_multiple(max(orig_w, tile_size), pad_multiple)
    extra_h = canvas_h - orig_h
    extra_w = canvas_w - orig_w
    if extra_h or extra_w:
        x_cpu = F.pad(x_cpu, (0, extra_w, 0, extra_h), mode=pad_mode)

    # 2. Tile grid (top-left corners) and matching per-axis blend weights.
    step = tile_size - overlap
    rows = _build_starts(canvas_h, tile_size, step)
    cols = _build_starts(canvas_w, tile_size, step)
    row_w = _precompute_axis_weights(rows, tile_size, overlap)
    col_w = _precompute_axis_weights(cols, tile_size, overlap)

    # 3. Weighted accumulators. Output channel count is assumed to equal the
    #    input's (C == 3 for this model config) — NOTE(review): confirm if
    #    `classes` in config.json ever differs from `in_channels`.
    n_ch = x_cpu.shape[1]
    total = torch.zeros((1, n_ch, canvas_h, canvas_w), dtype=torch.float32)
    norm = torch.zeros((1, 1, canvas_h, canvas_w), dtype=torch.float32)

    grid = [
        (yy, xx, yi, xi)
        for yi, yy in enumerate(rows)
        for xi, xx in enumerate(cols)
    ]

    # 4. Batched inference; `model` is expected to already live on `device`.
    with torch.inference_mode():
        for offset in range(0, len(grid), batch_size):
            batch = grid[offset : offset + batch_size]

            # Gather this batch of tiles and move them to the compute device.
            stacked = torch.stack(
                [x_cpu[0, :, yy : yy + tile_size, xx : xx + tile_size] for yy, xx, _, _ in batch],
                dim=0,
            ).to(device)

            preds = model(stacked).float().detach().cpu()  # [B, C, tile, tile]

            # Blend each prediction back into the canvas under its 2-D weight.
            for idx, (yy, xx, yi, xi) in enumerate(batch):
                blend = (row_w[yi][:, None] * col_w[xi][None, :]).unsqueeze(0).unsqueeze(0)
                total[:, :, yy : yy + tile_size, xx : xx + tile_size] += preds[idx : idx + 1] * blend
                norm[:, :, yy : yy + tile_size, xx : xx + tile_size] += blend

    # 5. Normalise by accumulated weight, clamp, and drop the padding.
    merged = (total / norm.clamp_min(1e-8)).clamp(0, 1)
    return merged[:, :, :orig_h, :orig_w]
139
+
140
+
141
+ # ==========================================
142
+ # FastAPI 逻辑
143
+ # ==========================================
144
 
145
def load_model() -> UnetPlusPlus:
    """Build the UNet++ model described by config.json, best-effort load any
    local safetensors weights, and return it in eval mode on `device`."""
    model_dir = Path(MODEL_PATH)
    config = json.loads((model_dir / "config.json").read_text(encoding="utf-8"))

    net = UnetPlusPlus(
        encoder_name=config.get("encoder_name", "resnet50"),
        encoder_weights=None,  # pretrained encoder weights are not fetched here
        in_channels=int(config.get("in_channels", 3)),
        classes=int(config.get("classes", 3)),
        decoder_attention_type=config.get("decoder_attention_type"),
        activation=config.get("activation", "sigmoid"),
    )

    # Fine-tuned weights shipped alongside the config (see infer_hd.py).
    checkpoint = model_dir / "model.safetensors"
    if checkpoint.exists():
        try:
            from safetensors.torch import load_file

            raw_state = load_file(str(checkpoint))
            # Keep only keys the architecture actually declares, so a
            # checkpoint saved from a slightly different wrapper still loads.
            known = set(net.state_dict().keys())
            usable = {name: tensor for name, tensor in raw_state.items() if name in known}
            net.load_state_dict(usable, strict=False)
            print(f"Loaded weights from {checkpoint}")
        except Exception as e:
            # Deliberately non-fatal: fall back to randomly initialised weights.
            print(f"Failed to load weights: {e}")

    net.to(device)
    net.eval()
    return net
176
 
177
 
178
@asynccontextmanager
async def lifespan(instance: FastAPI):
    """Application lifespan: load the segmentation model once at startup.

    :param instance: the FastAPI application instance
    """
    # Stored on app state so request handlers can reach the model without globals.
    instance.state.model = load_model()
    yield
182
 
183
 
 
187
@app.post("/predict")
async def predict(request: Request, file: UploadFile = File(...)):
    """
    笔迹擦除 (使用 Tiling + Overlap)

    Ink/handwriting erasure endpoint: decodes the uploaded image, runs the
    tiled segmentation model over it, and streams the result back as a PNG.

    :param request: the incoming request (used to reach app.state.model)
    :param file: the uploaded image to process
    :return: PNG-encoded result image, or a 400/500 error response
    """
    content = await file.read()
    nparr = np.frombuffer(content, np.uint8)

    # 1. OpenCV decode -> BGR. imdecode returns None for invalid payloads;
    #    without this guard cvtColor would crash with a cryptic error.
    img_bgr = cv2.imdecode(nparr, cv2.IMREAD_COLOR)
    if img_bgr is None:
        return Response(content=b"invalid image payload", media_type="text/plain", status_code=400)
    # BGR -> RGB to match the model's expected channel order.
    img_rgb = cv2.cvtColor(img_bgr, cv2.COLOR_BGR2RGB)

    model = request.app.state.model

    def _inference_logic():
        # 2. Preprocess: NumPy (H, W, C) uint8 -> Tensor (1, C, H, W) in [0, 1]
        #    (equivalent to torchvision's TF.to_tensor used by the reference script).
        input_tensor = torch.from_numpy(img_rgb).permute(2, 0, 1).float() / 255.0
        input_tensor = input_tensor.unsqueeze(0)  # [1, 3, H, W]

        # 3. Tiled inference; batch_size can be raised when VRAM allows.
        output_tensor = _tiled_infer(
            model=model,
            x_cpu=input_tensor,
            tile_size=TILE_SIZE,
            overlap=OVERLAP,
            batch_size=1,
            pad_mode="replicate",
        )

        # 4. Postprocess: Tensor (1, C, H, W) [0, 1] -> NumPy (H, W, C) [0, 255].
        output_tensor = output_tensor.squeeze(0).permute(1, 2, 0)  # [H, W, C]
        return (output_tensor.numpy() * 255).astype(np.uint8)

    # Run the compute-heavy work in a worker thread to keep the event loop free.
    result_rgb = await run_sync(_inference_logic)

    # 5. Back to BGR for OpenCV's PNG encoder.
    result_bgr = cv2.cvtColor(result_rgb, cv2.COLOR_RGB2BGR)
    success, encoded_image = cv2.imencode(".png", result_bgr)
    if not success:
        # Should not happen for a valid uint8 image, but fail loudly instead of
        # dereferencing a None result.
        return Response(content=b"failed to encode result", media_type="text/plain", status_code=500)
    return Response(content=encoded_image.tobytes(), media_type="image/png")
233
 
234
 
235
@app.get("/")
def greet_json():
    """Return a static JSON greeting; doubles as a liveness probe."""
    return {"Hello": "World!"}
238
 
239
 
240
if __name__ == '__main__':
    import uvicorn
    # Dev-only entry point; production deployments launch the ASGI server externally.
    uvicorn.run("app:app", host="0.0.0.0", port=8000)