saim1309 commited on
Commit
ec7f44e
·
verified ·
1 Parent(s): 420eb14

Upload 6 files

Browse files
Files changed (6) hide show
  1. app.py +74 -0
  2. best_model.pth +3 -0
  3. plot.py +105 -0
  4. requirements.txt +15 -0
  5. test.py +118 -0
  6. train.py +314 -0
app.py ADDED
@@ -0,0 +1,74 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import gradio as gr
2
+ import os
3
+ import tempfile
4
+ import cv2
5
+ from test import predict_one
6
+ from plot import (
7
+ autocrop, get_json_corners, extract_points_from_xml,
8
+ draw_feature_matching, stack_images_side_by_side
9
+ )
10
+
11
+ # Hard-coded model checkpoint path
12
+ MODEL_CKPT = "best_model.pth"
13
+
14
+
15
+ # --------------------
16
+ # Pipeline
17
+ # --------------------
18
def run_pipeline(flat_img, pers_img, mockup_json, xml_gt):
    """Run keypoint prediction on one sample and build a GT-vs-prediction image.

    Args:
        flat_img: path to the flat (mockup background) image.
        pers_img: path to the perspective (visual) image.
        mockup_json: path to mockup.json describing the print area.
        xml_gt: path to the ground-truth visual XML.

    Returns:
        (result_path, xml_pred_path): paths to the stacked comparison PNG and
        the predicted XML file, both inside a fresh temp directory.

    Raises:
        ValueError: if either input image cannot be read.
    """
    # Temp dir for prediction + result. Gradio serves these files back to the
    # browser, so they must outlive this call (no cleanup here on purpose).
    tmpdir = tempfile.mkdtemp()
    xml_pred_path = os.path.join(tmpdir, "pred.xml")
    result_path = os.path.join(tmpdir, "result.png")

    # Run model inference; writes predicted corner points to xml_pred_path.
    predict_one(mockup_json, pers_img, MODEL_CKPT, out_path=xml_pred_path)

    # --- Visualization ---
    flat_bgr = cv2.imread(flat_img)
    pers_bgr = cv2.imread(pers_img)
    if flat_bgr is None or pers_bgr is None:
        # cv2.imread returns None (no exception) for unreadable/missing files;
        # fail early with a clear message instead of crashing in cvtColor.
        raise ValueError("Could not read one of the input images.")
    img_json = autocrop(cv2.cvtColor(flat_bgr, cv2.COLOR_BGR2RGB))
    img_xml = autocrop(cv2.cvtColor(pers_bgr, cv2.COLOR_BGR2RGB))

    json_pts = get_json_corners(mockup_json)
    gt_pts = extract_points_from_xml(xml_gt)
    pred_pts = extract_points_from_xml(xml_pred_path)

    # Left panel: flat vs ground truth; right panel: flat vs prediction.
    match_json_gt = draw_feature_matching(img_json.copy(), json_pts, img_xml.copy(), gt_pts, draw_boxes=True)
    match_json_pred = draw_feature_matching(img_json.copy(), json_pts, img_xml.copy(), pred_pts, draw_boxes=True)

    stacked = stack_images_side_by_side(match_json_gt, match_json_pred)

    # Save result (convert back to BGR, which imwrite expects).
    cv2.imwrite(result_path, cv2.cvtColor(stacked, cv2.COLOR_RGB2BGR))

    return result_path, xml_pred_path
44
+
45
+
46
+ # --------------------
47
+ # Gradio UI
48
+ # --------------------
49
# Declarative UI: two image inputs, two file inputs, one trigger button,
# and two outputs (comparison image + downloadable predicted XML).
with gr.Blocks() as demo:
    gr.Markdown("## Mesh Key Point Transformer Demo")

    with gr.Row():
        # type="filepath" hands run_pipeline paths on disk, not pixel arrays.
        flat_in = gr.Image(type="filepath", label="Flat Image", width=300, height=300)
        pers_in = gr.Image(type="filepath", label="Perspective Image", width=300, height=300)

    with gr.Row():
        mockup_json_in = gr.File(type="filepath", label="Mockup JSON")
        xml_gt_in = gr.File(type="filepath", label="Ground Truth XML")

    run_btn = gr.Button("Run Prediction + Visualization")

    with gr.Row():
        out_img = gr.Image(type="filepath", label="Comparison Output", width=800, height=600)
        out_xml = gr.File(type="filepath", label="Predicted XML")

    # Wire the button to the pipeline; outputs map 1:1 to run_pipeline's
    # (result_path, xml_pred_path) return tuple.
    run_btn.click(
        fn=run_pipeline,
        inputs=[flat_in, pers_in, mockup_json_in, xml_gt_in],
        outputs=[out_img, out_xml]
    )


if __name__ == "__main__":
    # share=True additionally creates a public Gradio link.
    demo.launch(share=True)
best_model.pth ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:2d1e28e3d30f64b129be39c8a6f3a2e88f042f8b24e0a1526e77cdd4c27b20f7
3
+ size 14292417
plot.py ADDED
@@ -0,0 +1,105 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import json
2
+ import cv2
3
+ import numpy as np
4
+ import matplotlib.pyplot as plt
5
+ from lxml import etree
6
+
7
+ # ========= Crop black borders =========
8
def autocrop(image, tol=0):
    """Trim surrounding rows/columns whose pixels are all <= tol (black borders)."""
    # Per-pixel "content" mask; collapse the channel axis for color images.
    content = image > tol
    if content.ndim == 3:
        content = content.any(axis=2)
    if not content.any():
        # Fully dark image: nothing to crop, return unchanged.
        return image
    rows = np.flatnonzero(content.any(axis=1))
    cols = np.flatnonzero(content.any(axis=0))
    return image[rows[0]:rows[-1] + 1, cols[0]:cols[-1] + 1]
20
+
21
+ # ========= Stack horizontally =========
22
def stack_images_side_by_side(img1, img2):
    """Hstack two images after rescaling each to their common (max) height."""
    common_h = max(img1.shape[0], img2.shape[0])

    def _to_height(img):
        # Preserve aspect ratio while forcing the shared height.
        new_w = int(img.shape[1] * (common_h / img.shape[0]))
        return cv2.resize(img, (new_w, common_h))

    return np.hstack([_to_height(img1), _to_height(img2)])
32
+
33
+ # ========= Extract rectangle from JSON =========
34
def get_json_corners(json_file):
    """Return the rotated print-area corners (TL, TR, BR, BL) as int pixels."""
    with open(json_file, 'r') as f:
        spec = json.load(f)

    area = spec['printAreas'][0]
    width, height = area['width'], area['height']
    theta = np.radians(area['rotation'])

    # Rectangle center in page coordinates.
    center = np.array([area['position']['x'] + width / 2,
                       area['position']['y'] + height / 2])

    # Half-extents -> corner offsets in TL, TR, BR, BL order.
    hx, hy = width / 2, height / 2
    offsets = np.array([[-hx, -hy], [hx, -hy], [hx, hy], [-hx, hy]])

    # Rotate the offsets about the center by the area's rotation angle.
    rot = np.array([[np.cos(theta), -np.sin(theta)],
                    [np.sin(theta), np.cos(theta)]])
    return (offsets @ rot.T + center).astype(int)
51
+
52
+ # ========= Extract polygon from XML =========
53
def extract_points_from_xml(xml_file):
    """Extract the four corner points (TL, TR, BR, BL) from a visual.xml file.

    Returns:
        np.ndarray of shape (4, 2), dtype float32, in pixel coordinates.

    Raises:
        ValueError: if the file contains no <transform> element.
        KeyError: if one of the four named corner points is missing.
    """
    # Stdlib parser instead of lxml: keeps this module consistent with
    # test.py/train.py, which already use xml.etree.ElementTree for the
    # same file format, and drops a third-party dependency.
    import xml.etree.ElementTree as ET

    root = ET.parse(xml_file).getroot()
    transform = root.find('.//transform')
    if transform is None:
        # Previously this crashed with an opaque AttributeError on None.
        raise ValueError(f"No <transform> element found in {xml_file}")

    points = {}
    for pt in transform.findall('.//point'):
        points[pt.attrib['type']] = (float(pt.attrib['x']), float(pt.attrib['y']))

    order = ['TopLeft', 'TopRight', 'BottomRight', 'BottomLeft']
    return np.array([points[p] for p in order], dtype=np.float32)
63
+
64
+ # ========= Draw correspondences and (optional) boxes =========
65
def draw_feature_matching(img1, pts1, img2, pts2, draw_boxes=True):
    """Draw point correspondences between two images of possibly different sizes.

    Both images are rescaled to a shared height and concatenated horizontally;
    each point pair gets a randomly colored dot-dot-line correspondence.
    """
    # Bring both images to the same height so no padding bars are needed.
    target_h = max(img1.shape[0], img2.shape[0])
    scale1 = target_h / img1.shape[0]
    scale2 = target_h / img2.shape[0]
    left = cv2.resize(img1, (int(img1.shape[1] * scale1), target_h))
    right = cv2.resize(img2, (int(img2.shape[1] * scale2), target_h))

    # Points must be rescaled with the same factors as their images.
    pts1_scaled = (pts1 * scale1).astype(int)
    pts2_scaled = (pts2 * scale2).astype(int)

    # x-offset of the right image inside the combined canvas.
    x_off = left.shape[1]
    canvas = np.concatenate([left, right], axis=1)

    # Optionally outline both quadrilaterals in green.
    if draw_boxes:
        cv2.polylines(canvas, [pts1_scaled.reshape((-1, 1, 2))], True, (0, 255, 0), 3)
        cv2.polylines(canvas, [pts2_scaled.reshape((-1, 1, 2)) + np.array([x_off, 0])], True, (0, 255, 0), 3)

    # One random color per correspondence: a dot on each side plus a line.
    for (x1, y1), (x2, y2) in zip(pts1_scaled, pts2_scaled):
        color = tuple(np.random.randint(0, 255, 3).tolist())
        cv2.circle(canvas, (x1, y1), 6, color, -1)
        cv2.circle(canvas, (x2 + x_off, y2), 6, color, -1)
        cv2.line(canvas, (x1, y1), (x2 + x_off, y2), color, 2)

    return canvas
104
+
105
+
requirements.txt ADDED
@@ -0,0 +1,15 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Core ML / DL
2
+ torch
3
+ torchvision
4
+ Pillow
5
+ numpy
6
+ opencv-python
7
+ shapely
8
+ pathlib
9
+ gradio
10
+ fastapi
11
+ starlette
12
+ pydantic
13
+ uvicorn
14
+ matplotlib
15
+ lxml
test.py ADDED
@@ -0,0 +1,118 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import torch
2
+ import xml.etree.ElementTree as ET
3
+ from PIL import Image
4
+ import numpy as np
5
+ from pathlib import Path
6
+ import json
7
+
8
+ from train import SimpleTransformer, flat_corners_from_mockup
9
+
10
+ # --------------------
11
+ # Utility: order 4 points (same as old)
12
+ # --------------------
13
def order_points_clockwise(pts):
    """Order 4 points as TL, TR, BR, BL using a y-split then an x-comparison."""
    pts = np.array(pts, dtype="float32")

    # Split into the two uppermost and two lowermost points.
    by_y = pts[np.argsort(pts[:, 1]), :]
    top, bottom = by_y[:2, :], by_y[2:, :]

    # Within each pair, the smaller x is the left-hand point.
    tl, tr = (top[0], top[1]) if top[0][0] < top[1][0] else (top[1], top[0])
    bl, br = (bottom[0], bottom[1]) if bottom[0][0] < bottom[1][0] else (bottom[1], bottom[0])

    return np.array([tl, tr, br, bl], dtype="float32")
31
+
32
+ # --------------------
33
+ # Utility: save XML prediction
34
+ # --------------------
35
def save_prediction_xml(pred_pts, out_path, img_w, img_h):
    """Serialize 4 predicted corner points to the visual.xml format.

    Points are first ordered TL, TR, BR, BL; the written XML mirrors the
    structure of the ground-truth files (effects / background / transforms /
    overlays / ruler).
    """
    tl, tr, br, bl = order_points_clockwise(pred_pts)

    root = ET.Element("visualization", version="1.0")
    ET.SubElement(root, "effects", surfacecolor="", iswood="0")
    ET.SubElement(root, "background",
                  width=str(img_w), height=str(img_h),
                  color1="#C4CDE4", color2="", color3="")

    transforms_node = ET.SubElement(root, "transforms")
    transform = ET.SubElement(transforms_node, "transform",
                              type="FourPoint", offsetX="0", offsetY="0", offsetZ="0.0",
                              rotationX="0.0", rotationY="0.0", rotationZ="0.0",
                              name="Region", posCode="REGION", posName="Region",
                              posDef="0", techCode="EMBF03", techName="Embroidery Fixed",
                              techDef="0", areaWidth="100", areaHeight="100",
                              maxColors="12", defaultLogoSize="100", sizeX="100", sizeY="100")

    # Named corner points of the FourPoint transform.
    for ptype, (x, y) in (("TopLeft", tl), ("TopRight", tr),
                          ("BottomRight", br), ("BottomLeft", bl)):
        ET.SubElement(transform, "point",
                      type=ptype, x=str(float(x)), y=str(float(y)),
                      z="0.0", warp="0", warpShift="0")

    # Overlay polygon repeats the ordered corners.
    overlay = ET.SubElement(ET.SubElement(root, "overlays"), "overlay")
    for x, y in (tl, tr, br, bl):
        ET.SubElement(overlay, "point", type="Next",
                      x=str(float(x)), y=str(float(y)), z="0.0")

    # Ruler spans the TL -> BR diagonal.
    ET.SubElement(root, "ruler",
                  startX=str(tl[0]), startY=str(tl[1]),
                  stopX=str(br[0]), stopY=str(br[1]), value="100")

    ET.ElementTree(root).write(out_path, encoding="utf-8", xml_declaration=True)
71
+
72
+
73
+ # --------------------
74
+ # Predict one sample
75
+ # --------------------
76
def predict_one(mockup_json, pers_img_path, model_ckpt, out_path="prediction.xml"):
    """Predict perspective corner points for one sample and save them as XML.

    Args:
        mockup_json: path to mockup.json with the flat print-area geometry.
        pers_img_path: path to the perspective (visual) image.
        model_ckpt: checkpoint path -- either a resume-style dict containing
            "model_state" or a bare state_dict.
        out_path: where to write the predicted visual.xml.

    Returns:
        The path of the written XML file.
    """
    device = "cuda" if torch.cuda.is_available() else "cpu"

    # The perspective image is only needed for its pixel size (to de-normalize).
    pers_img = Image.open(pers_img_path).convert("RGB")
    orig_w, orig_h = pers_img.size

    # Normalized flat corners (TL, TR, BR, BL) -> flattened (1, 8) model input.
    _, flat_norm = flat_corners_from_mockup(mockup_json)
    flat_in = torch.tensor(flat_norm.flatten(), dtype=torch.float32).unsqueeze(0).to(device)

    # Load model weights. NOTE(security): weights_only=False unpickles
    # arbitrary objects -- only load checkpoints from trusted sources.
    model = SimpleTransformer().to(device)
    state = torch.load(model_ckpt, map_location=device, weights_only=False)
    if "model_state" in state:  # resume-checkpoint format
        model.load_state_dict(state["model_state"])
    else:  # plain state_dict (final model)
        model.load_state_dict(state)
    model.eval()

    # Inference: (1, 8) normalized prediction -> (4, 2) points.
    with torch.no_grad():
        pred = model(flat_in).view(4, 2).cpu().numpy()

    # Convert normalized coords to pixel coords of the perspective image.
    pred_px = pred.copy()
    pred_px[:, 0] *= orig_w
    pred_px[:, 1] *= orig_h

    # Save prediction and hand the path back to the caller (used by app.py).
    save_prediction_xml(pred_px, out_path, orig_w, orig_h)
    print(f"Saved prediction -> {out_path}")
    return out_path
109
+
110
+
111
+ # --------------------
112
+ # Example usage
113
+ # --------------------
114
if __name__ == "__main__":
    # Example usage on one hard-coded sample from the test split.
    demo_json = "Transformer/test/100847_TD/front/LAS02/mockup.json"
    demo_img = "Transformer/test/100847_TD/front/LAS02/4BC13E58-1D8A-4E5D-8A40-C1F4B1248893_visual.jpg"
    demo_ckpt = "Transformer/transformer_model.pth"
    predict_one(demo_json, demo_img, demo_ckpt, out_path="Transformer/Prediction/pred3.xml")
train.py ADDED
@@ -0,0 +1,314 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import os
2
+ import json
3
+ import glob
4
+ import xml.etree.ElementTree as ET
5
+ import numpy as np
6
+ from PIL import Image
7
+ from torch.utils.data import Dataset, DataLoader
8
+ import torchvision.transforms as T
9
+ import torch
10
+ import torch.nn as nn
11
+ import torch.optim as optim
12
+ from shapely.geometry import Polygon
13
+ from pathlib import Path
14
+
15
+ # =====================
16
+ # Data Utils
17
+ # # =====================
18
+
19
+ import numpy as np
20
+ import json
21
+
22
def flat_corners_from_mockup(mockup_path):
    """Compute the print-area corners (TL, TR, BR, BL) from mockup.json.

    Returns:
        (pixels, normalized): two (4, 2) float32 arrays -- corner coordinates
        in background pixels, and the same corners normalized to [0, 1] by
        the background width/height.
    """
    spec = json.loads(Path(mockup_path).read_text())
    bg_w = spec["background"]["width"]
    bg_h = spec["background"]["height"]

    area = spec["printAreas"][0]
    w, h = area["width"], area["height"]
    cx = area["position"]["x"] + w / 2.0
    cy = area["position"]["y"] + h / 2.0

    # Axis-aligned half-extent offsets in TL, TR, BR, BL order.
    hx, hy = w / 2.0, h / 2.0
    offsets = np.array([[-hx, -hy], [hx, -hy], [hx, hy], [-hx, hy]], dtype=np.float32)

    # Rotate the offsets about the center by the area's rotation (degrees).
    theta = np.deg2rad(area["rotation"])
    rot_mat = np.array([[np.cos(theta), -np.sin(theta)],
                        [np.sin(theta), np.cos(theta)]], dtype=np.float32)
    pixels = offsets @ rot_mat.T + np.array([cx, cy], dtype=np.float32)

    # Normalize each axis independently by the background size.
    normalized = np.empty_like(pixels)
    normalized[:, 0] = pixels[:, 0] / bg_w
    normalized[:, 1] = pixels[:, 1] / bg_h
    return pixels.astype(np.float32), normalized.astype(np.float32)
48
+
49
def parse_xml_points(xml_path):
    """Read the FourPoint transform corners from a visual.xml.

    Returns:
        (4, 2) float32 array of (TL, TR, BR, BL) coordinates, normalized by
        the background width/height. Fewer rows if some corners are absent.
    """
    root = ET.parse(xml_path).getroot()
    bg = root.find("background")
    bg_w = int(bg.get("width"))
    bg_h = int(bg.get("height"))

    corner_order = ["TopLeft", "TopRight", "BottomRight", "BottomLeft"]
    points = []
    for transform in root.findall(".//transform"):
        if transform.get("type") == "FourPoint":
            for corner in corner_order:
                node = transform.find(f".//point[@type='{corner}']")
                if node is not None:
                    points.append([float(node.get("x")) / bg_w,
                                   float(node.get("y")) / bg_h])
        break  # only the first transform is considered

    return np.array(points, dtype=np.float32)
72
+
73
class KP4Dataset(Dataset):
    """Keypoint dataset pairing flat/perspective images with 4-corner labels.

    Walks `root` recursively for `*_visual*.xml` label files and collects,
    per sample: the perspective image, its XML, the flat background image,
    and the mockup.json describing the flat print area.
    """

    def __init__(self, root, img_size=512):
        self.root = Path(root)
        self.img_size = img_size
        self.samples = []  # list of (img_file, xml_file, flat_img, json_file)

        # Transform pipeline (resize + tensor + normalize to roughly [-1, 1])
        self.transform = T.Compose([
            T.Resize((img_size, img_size)),
            T.ToTensor(),
            T.Normalize(mean=[0.5, 0.5, 0.5], std=[0.5, 0.5, 0.5]),
        ])

        # Walk recursively; every *_visual*.xml anchors one potential sample.
        for xml_file in self.root.rglob("*.xml"):
            if "_visual" not in xml_file.stem:
                continue

            # Find the matching perspective image (same stem, image extension).
            base = xml_file.stem
            img_file = None
            for ext in [".png", ".jpg", ".jpeg"]:
                cand = xml_file.with_suffix(ext)
                if cand.exists():
                    img_file = cand
                    break
            if img_file is None:
                continue

            # Flat image (background): *_background sibling, .png then .jpg.
            flat_img = xml_file.parent / (base.replace("_visual", "_background") + ".png")
            if not flat_img.exists():
                flat_img = xml_file.parent / (base.replace("_visual", "_background") + ".jpg")
            if not flat_img.exists():
                continue

            # mockup.json must sit in the same directory as the XML.
            json_file = xml_file.parent / "mockup.json"
            if not json_file.exists():
                continue

            self.samples.append((img_file, xml_file, flat_img, json_file))

        if not self.samples:
            raise RuntimeError(f"No valid samples found under {root}")

    def __len__(self):
        # Number of discovered (image, xml, flat, json) sample tuples.
        return len(self.samples)

    def __getitem__(self, idx):
        img_file, xml_file, flat_img, json_file = self.samples[idx]

        img = self.transform(Image.open(img_file).convert("RGB"))
        flat = self.transform(Image.open(flat_img).convert("RGB"))

        # Flat (mockup) corner points, normalized to [0, 1].
        _, flat_norm = flat_corners_from_mockup(json_file)
        flat_pts = torch.tensor(flat_norm, dtype=torch.float32)

        # Perspective (ground-truth) corner points, normalized to [0, 1].
        persp_norm = parse_xml_points(xml_file)
        persp_pts = torch.tensor(persp_norm, dtype=torch.float32)

        return {
            "persp_img": img,
            "flat_img": flat,
            "flat_pts": flat_pts,
            "persp_pts": persp_pts,
            "xml": str(xml_file),
            "json": str(json_file),
        }
144
+
145
+ # =====================
146
+ # Model
147
+ # =====================
148
class SimpleTransformer(nn.Module):
    """Maps 4 flat corners (8 coords) to 4 perspective corners via a tiny transformer."""

    def __init__(self, d_model=128, nhead=4, num_layers=2):
        super().__init__()
        # Project the 8 input coordinates (4 corners * 2) into the model width.
        self.fc_in = nn.Linear(8, d_model)
        encoder_layer = nn.TransformerEncoderLayer(d_model=d_model, nhead=nhead, batch_first=True)
        self.transformer = nn.TransformerEncoder(encoder_layer, num_layers=num_layers)
        # Project back to 8 output coordinates ((x, y) * 4 corners).
        self.fc_out = nn.Linear(d_model, 8)

    def forward(self, x):
        # (B, 8) -> (B, 1, d_model): treat the sample as a single-token sequence.
        hidden = self.fc_in(x).unsqueeze(1)
        hidden = self.transformer(hidden)
        # (B, 1, d_model) -> (B, 8)
        return self.fc_out(hidden).squeeze(1)
161
+
162
+
163
+ # =====================
164
+ # Metrics
165
+ # =====================
166
def mse_loss(pred, gt):
    """Mean squared error between prediction and ground truth."""
    diff = pred - gt
    return (diff * diff).mean()
168
+
169
def mean_corner_error(pred, gt, img_w, img_h):
    """Mean Euclidean corner distance in pixels, given normalized coordinates."""
    # De-normalize both point sets before measuring distance.
    pred_px = pred * torch.tensor([img_w, img_h], device=pred.device)
    gt_px = gt * torch.tensor([img_w, img_h], device=gt.device)
    return torch.norm(pred_px - gt_px, dim=-1).mean().item()
174
+
175
def iou_quad(pred, gt):
    """Intersection-over-union of two quadrilaterals (0.0 on invalid/empty)."""
    quad_a = Polygon(pred.tolist())
    quad_b = Polygon(gt.tolist())
    # Self-intersecting (bow-tie) quads are invalid polygons in shapely.
    if not (quad_a.is_valid and quad_b.is_valid):
        return 0.0
    union_area = quad_a.union(quad_b).area
    if union_area <= 0:
        return 0.0
    return quad_a.intersection(quad_b).area / union_area
183
+
184
+
185
+ # =====================
186
+ # Training
187
+ # =====================
188
def train_model(
    train_root,
    test_root,
    epochs=20,
    batch_size=8,
    lr=1e-3,
    img_size=256,
    save_dir="Transformer/checkpoints",
    resume_path=None
):
    """Train SimpleTransformer to map flat corners to perspective corners.

    Args:
        train_root: dataset root for training samples (see KP4Dataset).
        test_root: dataset root for validation samples.
        epochs: total number of epochs (absolute, including resumed ones).
        batch_size: training batch size; validation always uses batch size 1.
        lr: Adam learning rate.
        img_size: square resize applied to images by KP4Dataset.
        save_dir: where epoch/best/final checkpoints are written.
        resume_path: optional checkpoint dict to resume from.

    Returns:
        The trained model (last-epoch weights, not necessarily the best).
    """
    device = "cuda" if torch.cuda.is_available() else "cpu"

    train_ds = KP4Dataset(train_root, img_size=img_size)
    val_ds = KP4Dataset(test_root, img_size=img_size)
    train_loader = DataLoader(train_ds, batch_size=batch_size, shuffle=True)
    val_loader = DataLoader(val_ds, batch_size=1, shuffle=False)

    model = SimpleTransformer().to(device)
    optimizer = optim.Adam(model.parameters(), lr=lr)
    start_epoch = 0
    best_iou = -1.0
    best_model_path = os.path.join(save_dir, "best_model.pth")

    os.makedirs(save_dir, exist_ok=True)

    # Resume training from a checkpoint, if provided.
    if resume_path is not None and os.path.exists(resume_path):
        print(f"Loading checkpoint from {resume_path}")
        checkpoint = torch.load(resume_path, map_location=device)
        model.load_state_dict(checkpoint["model_state"])
        optimizer.load_state_dict(checkpoint["optimizer_state"])
        start_epoch = checkpoint["epoch"]
        # Carry best-so-far IoU forward so a resumed run cannot overwrite a
        # better earlier best_model.pth with a worse one.
        best_iou = checkpoint.get("best_iou", checkpoint.get("val_iou", -1.0))
        print(f"Resumed from epoch {start_epoch}")

    for epoch in range(start_epoch, epochs):
        # -------- Training --------
        model.train()
        total_loss = 0.0
        for batch in train_loader:
            flat_pts = batch["flat_pts"].to(device)
            persp_pts = batch["persp_pts"].to(device)

            # (B, 4, 2) -> (B, 8): flattened corner input/target.
            flat_pts_in = flat_pts.view(flat_pts.size(0), -1)
            target = persp_pts.view(persp_pts.size(0), -1)

            pred = model(flat_pts_in)
            loss = mse_loss(pred, target)

            optimizer.zero_grad()
            loss.backward()
            optimizer.step()
            total_loss += loss.item()

        print(f"Epoch {epoch+1}/{epochs} - Train Loss: {total_loss/len(train_loader):.6f}")

        # -------- Validation --------
        model.eval()
        mse_all, ce_all, iou_all = [], [], []
        with torch.no_grad():
            for batch in val_loader:
                flat_pts = batch["flat_pts"].to(device)
                persp_pts = batch["persp_pts"].to(device)

                flat_pts_in = flat_pts.view(1, -1)
                target = persp_pts.view(1, -1)

                pred = model(flat_pts_in)
                mse_all.append(mse_loss(pred, target).item())

                pred_quad = pred.view(4, 2).cpu()
                gt_quad = persp_pts.view(4, 2).cpu()

                # persp_img is (B, C, H, W): width is dim 3, height is dim 2.
                # BUGFIX: dims 2/1 were used before, taking H as the width and
                # the channel count (3) as the height, which corrupted the
                # reported pixel corner error.
                w = batch["persp_img"].shape[3]
                h = batch["persp_img"].shape[2]
                ce_all.append(mean_corner_error(pred_quad, gt_quad, w, h))
                iou_all.append(iou_quad(pred_quad, gt_quad))

        val_mse = np.mean(mse_all)
        val_ce = np.mean(ce_all)
        val_iou = np.mean(iou_all)
        print(f" Val MSE: {val_mse:.6f}, CornerErr(px): {val_ce:.2f}, IoU: {val_iou:.3f}")

        # -------- Periodic epoch checkpoint (every 100 epochs) --------
        if (epoch + 1) % 100 == 0:
            checkpoint_path = os.path.join(save_dir, f"epoch_{epoch+1}.pth")
            torch.save({
                "epoch": epoch + 1,
                "model_state": model.state_dict(),
                "optimizer_state": optimizer.state_dict(),
                "val_iou": val_iou,
            }, checkpoint_path)
            print(f"Checkpoint saved: {checkpoint_path}")

        # -------- Save best model (by validation IoU) --------
        if val_iou > best_iou:
            best_iou = val_iou
            torch.save({
                "epoch": epoch + 1,
                "model_state": model.state_dict(),
                "optimizer_state": optimizer.state_dict(),
                "best_iou": best_iou,
            }, best_model_path)
            print(f"Best model updated at epoch {epoch+1} (IoU={val_iou:.3f})")

    # Save final weights as a plain state_dict (matches predict_one's loader).
    final_path = os.path.join(save_dir, "final_model.pth")
    torch.save(model.state_dict(), final_path)
    print(f"Final model saved at {final_path}")
    print(f"Best model saved at {best_model_path} with IoU={best_iou:.3f}")

    return model
300
+
301
+
302
+ # =====================
303
+ # Main
304
+ # =====================
305
if __name__ == "__main__":
    # Training configuration for the default run.
    config = dict(
        train_root="Transformer/train",
        test_root="Transformer/test",
        epochs=3000,
        batch_size=4,
        lr=1e-3,
        img_size=256,
        resume_path=None,
    )
    model = train_model(**config)