Spaces:

UEXdo
/

HeightAdaptor

Sleeping

App Files Files Community

PubAccount committed on about 1 month ago

Commit

12a114a

verified ·

1 Parent(s): 47ab827

Update app.py

Browse files

Files changed (1) hide show

app.py +124 -98

app.py CHANGED Viewed

@@ -12,7 +12,7 @@ except ImportError:
         def GPU(duration=120):
             return lambda fn: fn
-import os, io
 import torch
 import numpy as np
 import matplotlib; matplotlib.use("Agg")
@@ -25,30 +25,29 @@ from peft import PeftModel
 import gradio as gr
 import safetensors.torch
 import warnings
-import asyncio
-# 忽略 asyncio 事件循环析构时的 ResourceWarning
 warnings.filterwarnings("ignore", category=ResourceWarning)
 from networks.semantic_head import SemanticHead
 from networks.height_head   import HeightHead
 from networks.decoder       import Decoder
 def fix_lora_state_dict(state_dict: dict) -> dict:
     """把旧版 Linear proj_in/proj_out 的 2D LoRA 权重升维到 Conv2d 所需的 4D"""
     fixed = {}
     for k, v in state_dict.items():
         if ("proj_in" in k or "proj_out" in k) and v.ndim == 2:
-            v = v.unsqueeze(-1).unsqueeze(-1)  # (out, in) → (out, in, 1, 1)
         fixed[k] = v
     return fixed
 # ══════════════════════════════════════════════════════════════
 #  常量 & 配置
 # ══════════════════════════════════════════════════════════════
 RGB_LATENT_SCALE = 0.18215
-# 通过环境变量可覆盖，否则使用默认 HF Repo ID
 SD_MODEL_ID  = os.environ.get("SD_MODEL_ID",      "sd-research/stable-diffusion-2-1-base")
 ADAPTOR_REPO = os.environ.get("ADAPTOR_MODEL_ID", "UEXdo/HeightAdaptor-weight")
@@ -68,26 +67,26 @@ LABEL_COLORS = {
     },
 }
 # ══════════════════════════════════════════════════════════════
-#  启动时下载 Adaptor 权重（缓存到本地，后续无需重复下载）
 # ══════════════════════════════════════════════════════════════
 print(f"📦 Downloading adaptor weights from {ADAPTOR_REPO} ...")
 ADAPTOR_DIR = snapshot_download(repo_id=ADAPTOR_REPO)
 print(f"✅ Weights cached at: {ADAPTOR_DIR}")
 # ══════════════════════════════════════════════════════════════
-#  模型管理（主进程维护 CPU 模型，GPU 子进程 copy-on-use）
 # ══════════════════════════════════════════════════════════════
 _model     = None
-_model_key = None   # (dataset_name, h_type)
-def build_model(dataset_name: str, h_type: str) -> StableDiffusionPipeline:
-    """从 HF Hub 拉取基础模型，叠加 LoRA + 三个自定义 Head，返回 CPU 模型。"""
     classes_num = DATASET_CFG[dataset_name]["classes_num"]
     print(f"🔧 Building model — dataset={dataset_name}, h_type={h_type}")
-    # 1. 加载 SD v1.5 基础 Pipeline
     pipe = StableDiffusionPipeline.from_pretrained(
         SD_MODEL_ID,
         torch_dtype=torch.float32,
@@ -95,8 +94,6 @@ def build_model(dataset_name: str, h_type: str) -> StableDiffusionPipeline:
         requires_safety_checker=False,
     )
-    # 2. 用 PEFT 把 LoRA 权重注入 UNet
-    # 尝试加载 safetensors 或 pytorch bin
     lora_path = os.path.join(ADAPTOR_DIR, "lora")
     ckpt_file = os.path.join(lora_path, "adapter_model.safetensors")
     if os.path.exists(ckpt_file):
@@ -107,23 +104,20 @@ def build_model(dataset_name: str, h_type: str) -> StableDiffusionPipeline:
             os.path.join(lora_path, "adapter_model.bin"),
             map_location="cpu"
         )
-    fixed_sd = fix_lora_state_dict(raw_sd)
     pipe.unet = PeftModel.from_pretrained(pipe.unet, lora_path)
-    # 3. 加载 Decoder
     pipe.decoder = Decoder(in_channel=320)
     pipe.decoder.load_state_dict(
         torch.load(os.path.join(ADAPTOR_DIR, "decoder.pth"), map_location="cpu"))
     pipe.decoder.eval()
-    # 4. 加载 HeightHead
     pipe.height_head = HeightHead(in_channels=192, h_type=h_type)
     pipe.height_head.load_state_dict(
         torch.load(os.path.join(ADAPTOR_DIR, "height_head.pth"), map_location="cpu"))
     pipe.height_head.eval()
-    # 5. 加载 SemanticHead（类别数由 dataset 决定）
     pipe.semantic_head = SemanticHead(in_channels=192, num_classes=classes_num)
     pipe.semantic_head.load_state_dict(
         torch.load(os.path.join(ADAPTOR_DIR, "semantic_head.pth"), map_location="cpu"))
@@ -134,11 +128,6 @@ def build_model(dataset_name: str, h_type: str) -> StableDiffusionPipeline:
 def reload_model(dataset_name: str, h_type: str) -> str:
-    """
-    在主进程中重建模型，供 Gradio 按钮调用。
-    注意：此函数 **不加** @spaces.GPU，直接运行在主进程，
-    全局 _model 更新后，下一次 @spaces.GPU 调用会 fork 到新模型。
-    """
     global _model, _model_key
     key = (dataset_name, h_type)
     if _model is not None and _model_key == key:
@@ -153,11 +142,9 @@ reload_model("OpenDC", "ER")
 # ══════════════════════════════════════════════════════════════
-#  VAE / UNet forward（移除了 DistributedDataParallel 分支，
-#  Spaces 单卡场景不需要）
 # ══════════════════════════════════════════════════════════════
 def _vae_encode(pipe, x: torch.Tensor):
-    """通过 VAE Encoder 前向，返回 (最终特征, 中间特征列表)。"""
     enc   = pipe.vae.encoder
     x     = enc.conv_in(x)
     feats = []
@@ -168,7 +155,7 @@ def _vae_encode(pipe, x: torch.Tensor):
     x = enc.conv_norm_out(x)
     x = enc.conv_act(x)
     x = enc.conv_out(x)
-    return x, feats[:-1]   # 与原始代码一致，丢弃最后一层特征
 def _unet_forward(unet, sample, timestep, enc_hs):
@@ -194,17 +181,74 @@ def _unet_forward(unet, sample, timestep, enc_hs):
 # ══════════════════════════════════════════════════════════════
-#  GPU 推理（用 @spaces.GPU 装饰，申请最多 120s GPU）
 # ══════════════════════════════════════════════════════════════
-@spaces.GPU(duration=120)
 @torch.no_grad()
-def run_inference(
-    image:        Image.Image,
-    task:         str,
-    dataset_name: str,
-    h_type:       str,
-    mode_type:    str,
-):
     if image is None:
         return None, "⚠️ Please upload an image first."
     if _model is None:
@@ -212,71 +256,57 @@ def run_inference(
     device = "cuda"
     pipe   = _model
-    pipe.to(device)          # ZeroGPU 子进程拿到 CPU 副本后移到 GPU
     try:
-        # ── 1. 文本编码 ──────────────────────────────────────
-        tokens   = pipe.tokenizer(
-            "", padding="max_length", truncation=True,
-            max_length=pipe.tokenizer.model_max_length, return_tensors="pt")
-        text_emb = pipe.text_encoder(tokens.input_ids.to(device))[0].float()
-        # text_emb: [1, 77, 768]  (SD v1.5 的 text dim 为 768)
-        # ── 2. 图像预处理 → [1, 3, 512, 512] ∈ [-1, 1] ──────
-        img  = image.convert("RGB").resize((512, 512), Image.BILINEAR)
-        arr  = np.array(img, dtype=np.float32).transpose(2, 0, 1)
-        norm = (torch.from_numpy(arr) / 255.0 * 2.0 - 1.0).unsqueeze(0).to(device)
-        # ── 3. VAE 编码 ───────────────────────────────────────
-        h, h_list = _vae_encode(pipe, norm)
-        moments   = pipe.vae.quant_conv(h)
-        mean, lv  = torch.chunk(moments, 2, dim=1)
-        latents   = (mean + torch.exp(0.5 * lv) * torch.randn_like(mean)) * RGB_LATENT_SCALE
-        # ── 4. UNet + 自定义 Decoder ─────────────────────────
-        ts     = torch.ones([latents.shape[0]], device=device) * 999
-        unet_o = _unet_forward(pipe.unet, latents, ts, text_emb)
-        dec_o  = pipe.decoder(unet_o, res_list=h_list[::-1])
-        # ── 5. 任务 Head ──────────────────────────────────────
-        h_out = pipe.height_head(dec_o)
-        s_out = pipe.semantic_head(dec_o)
-        # ── 6. 后处理 & 可视化 ───────────────────────────────
-        if mode_type == "Height Map":
-            pred = F.interpolate(h_out[0].cpu(), (512, 512),
-                                 mode="bilinear", align_corners=False)
-            pred = ((pred + 1.0) / 2.0).clamp(0, 1).squeeze().numpy()
-            fig, ax = plt.subplots(figsize=(6, 5), tight_layout=True)
-            im = ax.imshow(pred, cmap="plasma")
-            fig.colorbar(im, ax=ax, fraction=0.046, pad=0.04)
-            ax.set_title("Predicted Height Map"); ax.axis("off")
-            buf = io.BytesIO()
-            fig.savefig(buf, format="png", dpi=150)
-            plt.close(fig); buf.seek(0)
-            out_img = Image.open(buf).copy()
-            info    = (f"Normalized range: [{pred.min():.4f}, {pred.max():.4f}]\n"
-                       "(0 ≈ 0 m,  1 ≈ 50 m  before denormalization)")
-        else:  # Semantic Map
-            pred   = F.interpolate(s_out, (512, 512), mode="bilinear", align_corners=False)
-            argmax = torch.argmax(pred, dim=1).squeeze().cpu().numpy()
-            canvas = np.zeros((512, 512, 3), dtype=np.uint8)
-            for lbl, col in LABEL_COLORS[dataset_name].items():
-                canvas[argmax == lbl] = col
-            out_img = Image.fromarray(canvas)
-            info    = f"Detected class indices: {np.unique(argmax).tolist()}"
-        return out_img, info
     finally:
-        # ZeroGPU 子进程结束后 GPU 内存自动释放，
-        # 这里显式移回 CPU 只是额外保险
         pipe.to("cpu")
         torch.cuda.empty_cache()
 # ══════════════════════════════════════════════════════════════
 #  Gradio UI
 # ══════════════════════════════════════════════════════════════
@@ -284,12 +314,10 @@ with gr.Blocks(title="HeightAdaptor") as demo:
     gr.Markdown("""
     # 🏙️ HeightAdaptor
     **Remote Sensing Image → Height Map / Semantic Segmentation**
     Backbone: `stable-diffusion-v1-5` + LoRA adaptor (`UEXdo/HeightAdaptor-weight`) + 自定义 Task Heads
     """)
     with gr.Row():
-        # ── 左栏：输入 & 配置 ──────────────────────────────────
         with gr.Column(scale=1):
             inp_img = gr.Image(type="pil", label="📷 Input RGB Image")
@@ -313,7 +341,6 @@ with gr.Blocks(title="HeightAdaptor") as demo:
             run_btn = gr.Button("🚀 Run Inference", variant="primary", size="lg")
-        # ── 右栏：输出 ──────────────────────────────────────────
         with gr.Column(scale=1):
             out_img  = gr.Image(type="pil", label="📊 Output")
             out_info = gr.Textbox(label="ℹ️ Info", interactive=False, lines=3)
@@ -324,7 +351,6 @@ with gr.Blocks(title="HeightAdaptor") as demo:
     > 图像会自动缩放至 512 × 512，GPU 推理约需 10–30 秒。
     """)
-    # ── 事件绑定 ─────────────────────────────────────────────────
     load_btn.click(
         fn=reload_model,
         inputs=[dataset_radio, h_type_radio],

         def GPU(duration=120):
             return lambda fn: fn
+import os, io, traceback          # ← 新增 traceback
 import torch
 import numpy as np
 import matplotlib; matplotlib.use("Agg")
 import gradio as gr
 import safetensors.torch
 import warnings
 warnings.filterwarnings("ignore", category=ResourceWarning)
 from networks.semantic_head import SemanticHead
 from networks.height_head   import HeightHead
 from networks.decoder       import Decoder
 def fix_lora_state_dict(state_dict: dict) -> dict:
     """Return a new dict in which 2-D proj_in/proj_out entries are made 4-D.

     Any entry whose key contains "proj_in" or "proj_out" and whose value has
     ``ndim == 2`` gets two trailing singleton dimensions appended; every
     other entry is passed through unchanged. The intent, per the original
     note, is to lift legacy Linear-style LoRA weights to the 4-D shape that
     Conv2d expects.
     """
     fixed = {}
     for k, v in state_dict.items():
         if ("proj_in" in k or "proj_out" in k) and v.ndim == 2:
+            v = v.unsqueeze(-1).unsqueeze(-1)  # (out, in) -> (out, in, 1, 1)
         fixed[k] = v
     return fixed
 # ══════════════════════════════════════════════════════════════
 #  常量 & 配置
 # ══════════════════════════════════════════════════════════════
 RGB_LATENT_SCALE = 0.18215
 SD_MODEL_ID  = os.environ.get("SD_MODEL_ID",      "sd-research/stable-diffusion-2-1-base")
 ADAPTOR_REPO = os.environ.get("ADAPTOR_MODEL_ID", "UEXdo/HeightAdaptor-weight")
     },
 }
 # ══════════════════════════════════════════════════════════════
+#  下载 Adaptor 权重
 # ══════════════════════════════════════════════════════════════
 print(f"📦 Downloading adaptor weights from {ADAPTOR_REPO} ...")
 ADAPTOR_DIR = snapshot_download(repo_id=ADAPTOR_REPO)
 print(f"✅ Weights cached at: {ADAPTOR_DIR}")
 # ══════════════════════════════════════════════════════════════
+#  模型管理
 # ══════════════════════════════════════════════════════════════
 _model     = None
+_model_key = None
+def build_model(dataset_name: str, h_type: str):
     classes_num = DATASET_CFG[dataset_name]["classes_num"]
     print(f"🔧 Building model — dataset={dataset_name}, h_type={h_type}")
     pipe = StableDiffusionPipeline.from_pretrained(
         SD_MODEL_ID,
         torch_dtype=torch.float32,
         requires_safety_checker=False,
     )
     lora_path = os.path.join(ADAPTOR_DIR, "lora")
     ckpt_file = os.path.join(lora_path, "adapter_model.safetensors")
     if os.path.exists(ckpt_file):
             os.path.join(lora_path, "adapter_model.bin"),
             map_location="cpu"
         )
+    fixed_sd = fix_lora_state_dict(raw_sd)   # noqa: F841（修复后暂存，PeftModel 会读文件）
     pipe.unet = PeftModel.from_pretrained(pipe.unet, lora_path)
     pipe.decoder = Decoder(in_channel=320)
     pipe.decoder.load_state_dict(
         torch.load(os.path.join(ADAPTOR_DIR, "decoder.pth"), map_location="cpu"))
     pipe.decoder.eval()
     pipe.height_head = HeightHead(in_channels=192, h_type=h_type)
     pipe.height_head.load_state_dict(
         torch.load(os.path.join(ADAPTOR_DIR, "height_head.pth"), map_location="cpu"))
     pipe.height_head.eval()
     pipe.semantic_head = SemanticHead(in_channels=192, num_classes=classes_num)
     pipe.semantic_head.load_state_dict(
         torch.load(os.path.join(ADAPTOR_DIR, "semantic_head.pth"), map_location="cpu"))
 def reload_model(dataset_name: str, h_type: str) -> str:
     global _model, _model_key
     key = (dataset_name, h_type)
     if _model is not None and _model_key == key:
 # ══════════════════════════════════════════════════════════════
+#  VAE / UNet forward
 # ══════════════════════════════════════════════════════════════
 def _vae_encode(pipe, x: torch.Tensor):
     enc   = pipe.vae.encoder
     x     = enc.conv_in(x)
     feats = []
     x = enc.conv_norm_out(x)
     x = enc.conv_act(x)
     x = enc.conv_out(x)
+    return x, feats[:-1]
 def _unet_forward(unet, sample, timestep, enc_hs):
 # ══════════════════════════════════════════════════════════════
+#  核心推理逻辑（与 @spaces.GPU 解耦，可独立用 CPU 测试）
 # ══════════════════════════════════════════════════════════════
 @torch.no_grad()
+def _run_inference_core(pipe, device, image, task, dataset_name, h_type, mode_type):
+    """
+    Pure inference logic with no dependency on @spaces.GPU.
+    ``pipe`` and all tensors must already be on the same ``device``.
+
+    Returns an ``(out_img, info)`` pair: the rendered output image and a
+    short text summary of the prediction.
+
+    NOTE(review): ``task`` and ``h_type`` are accepted but never read in this
+    body; ``dataset_name`` is only used to pick the palette from LABEL_COLORS
+    in the semantic branch.
+    """
+    # ── 1. Text encoding (the prompt is the empty string) ─────
+    tokens   = pipe.tokenizer(
+        "", padding="max_length", truncation=True,
+        max_length=pipe.tokenizer.model_max_length, return_tensors="pt")
+    text_emb = pipe.text_encoder(tokens.input_ids.to(device))[0].float()
+    # ── 2. Image preprocessing → [1, 3, 512, 512] ∈ [-1, 1] ──
+    img  = image.convert("RGB").resize((512, 512), Image.BILINEAR)
+    arr  = np.array(img, dtype=np.float32).transpose(2, 0, 1)
+    norm = (torch.from_numpy(arr) / 255.0 * 2.0 - 1.0).unsqueeze(0).to(device)
+    # ── 3. VAE encoding ───────────────────────────────────────
+    h, h_list = _vae_encode(pipe, norm)
+    moments   = pipe.vae.quant_conv(h)
+    mean, lv  = torch.chunk(moments, 2, dim=1)
+    # Draws fresh random noise on every call, so the result is not repeatable
+    # unless the RNG is seeded (``lv`` is presumably the log-variance — confirm).
+    latents   = (mean + torch.exp(0.5 * lv) * torch.randn_like(mean)) * RGB_LATENT_SCALE
+    # ── 4. UNet + custom Decoder (constant timestep 999) ──────
+    ts     = torch.ones([latents.shape[0]], device=device) * 999
+    unet_o = _unet_forward(pipe.unet, latents, ts, text_emb)
+    dec_o  = pipe.decoder(unet_o, res_list=h_list[::-1])
+    # ── 5. Task heads (both are evaluated regardless of mode_type) ──
+    h_out = pipe.height_head(dec_o)
+    s_out = pipe.semantic_head(dec_o)
+    # ── 6. Post-processing & visualization ────────────────────
+    if mode_type == "Height Map":
+        pred = F.interpolate(h_out[0].cpu(), (512, 512),
+                             mode="bilinear", align_corners=False)
+        # Shift/scale by (x + 1) / 2 and clamp into [0, 1] before plotting.
+        pred = ((pred + 1.0) / 2.0).clamp(0, 1).squeeze().numpy()
+        fig, ax = plt.subplots(figsize=(6, 5), tight_layout=True)
+        im = ax.imshow(pred, cmap="plasma")
+        fig.colorbar(im, ax=ax, fraction=0.046, pad=0.04)
+        ax.set_title("Predicted Height Map"); ax.axis("off")
+        # Render the figure into an in-memory PNG; .copy() detaches the image
+        # from the buffer it was opened from.
+        buf = io.BytesIO()
+        fig.savefig(buf, format="png", dpi=150)
+        plt.close(fig); buf.seek(0)
+        out_img = Image.open(buf).copy()
+        info    = (f"Normalized range: [{pred.min():.4f}, {pred.max():.4f}]\n"
+                   "(0 ≈ 0 m,  1 ≈ 50 m  before denormalization)")
+    else:  # Semantic Map
+        pred   = F.interpolate(s_out, (512, 512), mode="bilinear", align_corners=False)
+        argmax = torch.argmax(pred, dim=1).squeeze().cpu().numpy()
+        # Pixels whose class index has no entry in the palette stay black.
+        canvas = np.zeros((512, 512, 3), dtype=np.uint8)
+        for lbl, col in LABEL_COLORS[dataset_name].items():
+            canvas[argmax == lbl] = col
+        out_img = Image.fromarray(canvas)
+        info    = f"Detected class indices: {np.unique(argmax).tolist()}"
+    return out_img, info
+# ══════════════════════════════════════════════════════════════
+#  GPU 推理入口（Gradio 按钮触发）
+# ══════════════════════════════════════════════════════════════
+@spaces.GPU(duration=120)
+def run_inference(image, task, dataset_name, h_type, mode_type):
     if image is None:
         return None, "⚠️ Please upload an image first."
     if _model is None:
     device = "cuda"
     pipe   = _model
+    pipe.to(device)
     try:
+        return _run_inference_core(pipe, device, image, task, dataset_name, h_type, mode_type)
+    except Exception as e:
+        traceback.print_exc()          # ← 终端打完整 stack trace
+        return None, f"❌ Inference error: {e}"
     finally:
         pipe.to("cpu")
         torch.cuda.empty_cache()
+# ══════════════════════════════════════════════════════════════
+#  ★ 启动测试：用 Demo1.png 在 CPU 上跑一次完整推理
+#    成功 → 打印结果范围，并把输出图存到 Demo1_result.png
+#    失败 → 打印完整 traceback，方便定位错误
+# ══════════════════════════════════════════════════════════════
+_DEMO_IMG_PATH = "Demo1.png"
+print(f"\n{'='*60}")
+print(f"🧪 Startup inference test — {_DEMO_IMG_PATH} (device=cpu)")
+print(f"{'='*60}")
+try:
+    if not os.path.exists(_DEMO_IMG_PATH):
+        print(f"⚠️  {_DEMO_IMG_PATH} not found, skipping test.")
+    else:
+        _test_img = Image.open(_DEMO_IMG_PATH)
+        print(f"   Image size : {_test_img.size}, mode: {_test_img.mode}")
+        # 把模型组件移到 CPU（此时本来就在 CPU，仅做显式确认）
+        _model.to("cuda")
+        _out_img, _info = _run_inference_core(
+            _model, "cuda",
+            _test_img,
+            "Height Estimation",   # task
+            "OpenDC",              # dataset_name
+            "ER",                  # h_type
+            "Height Map",          # mode_type
+        )
+        _out_img.save("Demo1_result.png")
+        print(f"✅ Test PASSED")
+        print(f"   Info       : {_info}")
+        print(f"   Saved to   : Demo1_result.png")
+except Exception:
+    print("❌ Test FAILED — full traceback below:")
+    traceback.print_exc()
+print(f"{'='*60}\n")
 # ══════════════════════════════════════════════════════════════
 #  Gradio UI
 # ══════════════════════════════════════════════════════════════
     gr.Markdown("""
     # 🏙️ HeightAdaptor
     **Remote Sensing Image → Height Map / Semantic Segmentation**
     Backbone: `stable-diffusion-v1-5` + LoRA adaptor (`UEXdo/HeightAdaptor-weight`) + 自定义 Task Heads
     """)
     with gr.Row():
         with gr.Column(scale=1):
             inp_img = gr.Image(type="pil", label="📷 Input RGB Image")
             run_btn = gr.Button("🚀 Run Inference", variant="primary", size="lg")
         with gr.Column(scale=1):
             out_img  = gr.Image(type="pil", label="📊 Output")
             out_info = gr.Textbox(label="ℹ️ Info", interactive=False, lines=3)
     > 图像会自动缩放至 512 × 512，GPU 推理约需 10–30 秒。
     """)
     load_btn.click(
         fn=reload_model,
         inputs=[dataset_radio, h_type_radio],