zhaxie commited on
Commit
fcd99cd
·
verified ·
1 Parent(s): b82e456

Add files using upload-large-folder tool

Browse files
Files changed (50) hide show
  1. OFFICIAL_UVDoc_对标说明.txt +41 -0
  2. UVDoc_official/.gitignore +136 -0
  3. UVDoc_official/LICENSE +21 -0
  4. UVDoc_official/README.md +160 -0
  5. UVDoc_official/__pycache__/data_UVDoc.cpython-310.pyc +0 -0
  6. UVDoc_official/__pycache__/data_UVDoc.cpython-313.pyc +0 -0
  7. UVDoc_official/__pycache__/data_custom_augmentations.cpython-310.pyc +0 -0
  8. UVDoc_official/__pycache__/data_doc3D.cpython-312.pyc +0 -0
  9. UVDoc_official/__pycache__/data_utils.cpython-313.pyc +0 -0
  10. UVDoc_official/compute_uvdoc_grid3d_stats.py +73 -0
  11. UVDoc_official/data/readme.txt +1 -0
  12. UVDoc_official/data_UVDoc.py +232 -0
  13. UVDoc_official/data_custom_augmentations.py +148 -0
  14. UVDoc_official/data_doc3D.py +95 -0
  15. UVDoc_official/data_mixDataset.py +23 -0
  16. UVDoc_official/data_utils.py +175 -0
  17. UVDoc_official/demo.py +55 -0
  18. UVDoc_official/docUnet_eval.py +106 -0
  19. UVDoc_official/docUnet_pred.py +179 -0
  20. UVDoc_official/model.py +374 -0
  21. UVDoc_official/requirements_demo.txt +3 -0
  22. UVDoc_official/requirements_eval.txt +10 -0
  23. UVDoc_official/requirements_train.txt +5 -0
  24. UVDoc_official/run_official_overfit_train_infer.sh +101 -0
  25. UVDoc_official/train.py +552 -0
  26. UVDoc_official/utils.py +66 -0
  27. UVDoc_official/uvdocBenchmark_eval.py +129 -0
  28. UVDoc_official/uvdocBenchmark_metric.py +152 -0
  29. UVDoc_official/uvdocBenchmark_pred.py +131 -0
  30. UVDoc_official/verify_ckpt_val_pipeline.py +153 -0
  31. UVDoc_official/verify_uvdoc_train_infer_preprocess.py +169 -0
  32. baseline_resnet_unet/__init__.py +5 -0
  33. baseline_resnet_unet/dataset.py +197 -0
  34. baseline_resnet_unet/model.py +89 -0
  35. baseline_resnet_unet/train.py +187 -0
  36. baseline_resnet_unet/warp.py +24 -0
  37. log_full_uvdoc_gpu0.bak_20260411_122217/nohup.out +9 -0
  38. log_full_uvdoc_gpu0.bak_20260411_122217/params8_lr=0.0002_nepochs50_nepochsdecay20_alpha5.0_beta5.0_gamma=1.0_gammastartep10_datauvdoc.txt +2 -0
  39. log_full_uvdoc_gpu0/nohup.out +0 -0
  40. log_full_uvdoc_gpu0/params8_lr=0.0002_nepochs25_nepochsdecay10_alpha5.0_beta5.0_gamma=1.0_gammastartep10_datauvdoc.txt +111 -0
  41. log_full_uvdoc_gpu0/verify_val_ep12_infer/metrics.txt +1001 -0
  42. requirements_baseline.txt +6 -0
  43. requirements_uvdoc_train.txt +9 -0
  44. run_overfit_official_uvdoc.sh +27 -0
  45. run_overfit_train_infer_consistency.sh +75 -0
  46. run_train_full_uvdoc_gpu0.sh +40 -0
  47. run_train_official_config.sh +80 -0
  48. run_train_uvdoc_baseline.py +11 -0
  49. unzip_extract.log +1 -0
  50. uvdoc_文档矫正_colab_技术路线(gemini_可执行版).md +212 -0
OFFICIAL_UVDoc_对标说明.txt ADDED
@@ -0,0 +1,41 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ 【官方 UVDoc 代码(已克隆 + 小补丁)】
2
+ 路径: UvDoc/UVDoc_official/
3
+ 上游: https://github.com/tanguymagne/UVDoc
4
+
5
+ 【与论文/仓库一致的默认设定】
6
+ - 模型: model.UVDocnet(稀疏 grid2D + grid3D,utils.bilinear_unwarping + grid_sample)
7
+ - 损失: alpha * L1(grid2D) + beta * L1(grid3D) + gamma * L1(重建),gamma 从 ep_gamma_start 起启用(与 train.py 一致)
8
+ - 官方默认数据模式: --data_to_use both(Doc3D + UVDoc 混合训练);验证集为 Doc3D 的 val(与原版一致)
9
+
10
+ 【严格对标时请准备】
11
+ 1) UVDoc_final 解压目录(含 img/ grid2d/ grid3d/ metadata_sample/ 等)
12
+ 2) Doc3D 数据 + 作者提供的 Doc3D_grid(见官方 README 链接)
13
+
14
+ 【仅 UVDoc_final、无 Doc3D 时(扩展模式)】
15
+ 本目录对官方 train.py 增加了 data_to_use=uvdoc:
16
+ - 在同一套 UVDoc 上按样本 id 做 train/val 划分(默认 val_ratio=0.05,可改)
17
+ - 验证仍用重建 MSE(与原版 val 形式一致,但数据域是 UVDoc 而非 Doc3D)
18
+ 注意:这与论文「Doc3D 上 val」不完全相同,仅便于本地先跑通官方网络与损失。
19
+
20
+ 【运行示例】
21
+ cd /mnt/zsn/zsn_workspace/dzx/UvDoc/UVDoc_official
22
+
23
+ # A) 官方默认(需 Doc3D + UVDoc)
24
+ python train.py --data_to_use both \
25
+ --data_path_doc3D /path/to/data/doc3D/ \
26
+ --data_path_UVDoc /mnt/zsn/zsn_workspace/dzx/UvDoc/UVDoc_final \
27
+ --logdir ./log/uvdoc_official
28
+
29
+ # B) 仅 UVDoc(无 Doc3D)
30
+ python train.py --data_to_use uvdoc \
31
+ --data_path_UVDoc /mnt/zsn/zsn_workspace/dzx/UvDoc/UVDoc_final \
32
+ --logdir ./log/uvdoc_only
33
+
34
+ # 依赖见 requirements_train.txt(版本较老;若你环境已是 torch 2.x,多数情况可直接试跑)
35
+
36
+ 【评估 / 推理】
37
+ 仓库内 demo、uvdocBenchmark_pred.py、docUnet_pred.py 等与上游一致,checkpoint 键名 model_state。
38
+
39
+ 【与 baseline_resnet_unet 的区别】
40
+ - baseline_resnet_unet: ResNet50+UNet 密集 UV,技术笔记里的简化路线
41
+ - UVDoc_official: 与 SIGGRAPH Asia 论文实现一致的网络与监督(grid2D+grid3D)
UVDoc_official/.gitignore ADDED
@@ -0,0 +1,136 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Byte-compiled / optimized / DLL files
2
+ __pycache__/
3
+ *.py[cod]
4
+ *$py.class
5
+
6
+ # C extensions
7
+ *.so
8
+
9
+ # Distribution / packaging
10
+ .Python
11
+ build/
12
+ develop-eggs/
13
+ dist/
14
+ downloads/
15
+ eggs/
16
+ .eggs/
17
+ lib/
18
+ lib64/
19
+ parts/
20
+ sdist/
21
+ var/
22
+ wheels/
23
+ pip-wheel-metadata/
24
+ share/python-wheels/
25
+ *.egg-info/
26
+ .installed.cfg
27
+ *.egg
28
+ MANIFEST
29
+
30
+ # PyInstaller
31
+ # Usually these files are written by a python script from a template
32
+ # before PyInstaller builds the exe, so as to inject date/other infos into it.
33
+ *.manifest
34
+ *.spec
35
+
36
+ # Installer logs
37
+ pip-log.txt
38
+ pip-delete-this-directory.txt
39
+
40
+ # Unit test / coverage reports
41
+ htmlcov/
42
+ .tox/
43
+ .nox/
44
+ .coverage
45
+ .coverage.*
46
+ .cache
47
+ nosetests.xml
48
+ coverage.xml
49
+ *.cover
50
+ *.py,cover
51
+ .hypothesis/
52
+ .pytest_cache/
53
+
54
+ # Translations
55
+ *.mo
56
+ *.pot
57
+
58
+ # Django stuff:
59
+ *.log
60
+ local_settings.py
61
+ db.sqlite3
62
+ db.sqlite3-journal
63
+
64
+ # Flask stuff:
65
+ instance/
66
+ .webassets-cache
67
+
68
+ # Scrapy stuff:
69
+ .scrapy
70
+
71
+ # Sphinx documentation
72
+ docs/_build/
73
+
74
+ # PyBuilder
75
+ target/
76
+
77
+ # Jupyter Notebook
78
+ .ipynb_checkpoints
79
+
80
+ # IPython
81
+ profile_default/
82
+ ipython_config.py
83
+
84
+ # pyenv
85
+ .python-version
86
+
87
+ # pipenv
88
+ # According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control.
89
+ # However, in case of collaboration, if having platform-specific dependencies or dependencies
90
+ # having no cross-platform support, pipenv may install dependencies that don't work, or not
91
+ # install all needed dependencies.
92
+ #Pipfile.lock
93
+
94
+ # PEP 582; used by e.g. github.com/David-OConnor/pyflow
95
+ __pypackages__/
96
+
97
+ # Celery stuff
98
+ celerybeat-schedule
99
+ celerybeat.pid
100
+
101
+ # SageMath parsed files
102
+ *.sage.py
103
+
104
+ # Environments
105
+ .env
106
+ .venv
107
+ env/
108
+ venv/
109
+ ENV/
110
+ env.bak/
111
+ venv.bak/
112
+
113
+ # Spyder project settings
114
+ .spyderproject
115
+ .spyproject
116
+
117
+ # Rope project settings
118
+ .ropeproject
119
+
120
+ # mkdocs documentation
121
+ /site
122
+
123
+ # mypy
124
+ .mypy_cache/
125
+ .dmypy.json
126
+ dmypy.json
127
+
128
+ # Pyre type checker
129
+ .pyre/
130
+
131
+ # Data
132
+
133
+ data/doc3D/*
134
+ data/DocUNet/*
135
+ data/UVDoc/*
136
+ data/UVDoc_benchmark/*
UVDoc_official/LICENSE ADDED
@@ -0,0 +1,21 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ MIT License
2
+
3
+ Copyright (c) 2023 Tanguy MAGNE
4
+
5
+ Permission is hereby granted, free of charge, to any person obtaining a copy
6
+ of this software and associated documentation files (the "Software"), to deal
7
+ in the Software without restriction, including without limitation the rights
8
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9
+ copies of the Software, and to permit persons to whom the Software is
10
+ furnished to do so, subject to the following conditions:
11
+
12
+ The above copyright notice and this permission notice shall be included in all
13
+ copies or substantial portions of the Software.
14
+
15
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21
+ SOFTWARE.
UVDoc_official/README.md ADDED
@@ -0,0 +1,160 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # UVDoc: Neural Grid-based Document Unwarping
2
+
3
+ ![Header](img/header.jpg)
4
+
5
+ This repository contains the code for the "UVDoc: Neural Grid-based Document Unwarping" paper.
6
+ If you are looking for (more information about) the UVDoc dataset, you can find it [here](https://github.com/tanguymagne/UVDoc-Dataset).
7
+ The full UVDoc paper can be found [here](https://igl.ethz.ch/projects/uvdoc/).
8
+
9
+ Three requirements files are provided for the three use cases made available in this repo.
10
+ Each use case is detailed below.
11
+
12
+
13
+ ## Demo
14
+ > **Note** : Requirements
15
+ >
16
+ > Before trying to unwarp a document using our model, you need to install the requirements. To do so, we advise you to create a virtual environment. Then run `pip install -r requirements_demo.txt`.
17
+
18
+ To try our model (available in this repo at `model/best_model.pkl`) on your custom images, run the following:
19
+ ```shell
20
+ python demo.py --img-path [PATH/TO/IMAGE]
21
+ ```
22
+
23
+ You can also use a model you trained yourself by specifying the path to the model like this:
24
+ ```shell
25
+ python demo.py --img-path [PATH/TO/IMAGE] --ckpt-path [PATH/TO/MODEL]
26
+ ```
27
+
28
+
29
+ ## Model training
30
+ > **Note** : Requirements
31
+ >
32
+ > Before training a model, you need to install the requirements. To do so, we advise you to create a virtual environment. Then run `pip install -r requirements_train.txt`.
33
+
34
+ To train a model, you first need to get the data:
35
+ - UVDoc dataset can be accessed [here](https://igl.ethz.ch/projects/uvdoc/UVDoc_final.zip).
36
+ - The Doc3D dataset can be downloaded from [here](https://github.com/cvlab-stonybrook/doc3D-dataset). We augmented this dataset with 2D grids and 3D grids that are available [here](https://igl.ethz.ch/projects/uvdoc/Doc3D_grid.zip).
37
+
38
+ Then, unzip the downloaded archive into the data folder. The final structure of the data folder should be as follows:
39
+ ```
40
+ data/
41
+ ├── doc3D
42
+ │   ├── grid2D
43
+ │   ├── grid3D
44
+ │   ├── bm
45
+ │   └── img
46
+ └── UVDoc
47
+ ├── grid2d
48
+ ├── grid3d
49
+ ├── img
50
+ ├── img_geom
51
+ ├── metadata_geom
52
+ ├── metadata_sample
53
+ ├── seg
54
+ ├── textures
55
+ ├── uvmap
56
+ ├── warped_textures
57
+ └── wc
58
+ ```
59
+
60
+ Once this is done, run the following:
61
+ ```shell
62
+ python train.py
63
+ ```
64
+
65
+ Several hyperparameters, such as data augmentations, number of epochs, learning rate, or batch size can be tuned. To learn about them, please run the following:
66
+ ```shell
67
+ python train.py --help
68
+ ```
69
+
70
+
71
+ ## Evaluation
72
+ > **Note** : Requirements
73
+ >
74
+ > Before evaluating a model, you need to install the requirements. To do so, we advise you to create a virtual environment. Then run `pip install -r requirements_eval.txt`.
75
+ >
76
+ > You will also need to install `matlab.engine`, to allow interfacing matlab with python. To do so, you first need to find the location of your matlab installation (for instance, by running `matlabroot` from within matlab). Then go to `<matlabroot>/extern/engines/python` and run `python setup.py install`. You can open a python prompt and run `import matlab.engine` followed by `eng = matlab.engine.start_matlab()` to see if it was successful.
77
+ >
78
+ > Finally you might need to install `tesseract` via `sudo apt install tesseract-ocr libtesseract-dev`.
79
+
80
+ You can easily evaluate our model or a model you trained yourself using the provided script.
81
+ Our model is available in this repo at `model/best_model.pkl`.
82
+
83
+ ### DocUNet benchmark
84
+ To make predictions using a model on the DocUNet benchmark, please first download the DocUNet Benchmark (available [here](https://www3.cs.stonybrook.edu/~cvl/docunet.html)) and place it under data to have the following structure:
85
+ ```
86
+ data/
87
+ └── DocUNet
88
+    ├── crop
89
+    ├── original
90
+    └── scan
91
+ ```
92
+
93
+ Then run:
94
+ ```shell
95
+ python docUnet_pred.py --ckpt-path [PATH/TO/MODEL]
96
+ ```
97
+ This will create a `docunet` folder next to the model, containing the unwarped images.
98
+
99
+ Then to compute the metrics over these predictions, please run the following:
100
+ ```shell
101
+ python docUnet_eval.py --pred-path [PATH/TO/UNWARPED]
102
+ ```
103
+ ### UVDoc benchmark
104
+ To make predictions using a model on the UVDoc benchmark, please first download the UVDoc Benchmark (available [here](https://igl.ethz.ch/projects/uvdoc/)) and place it under data to have the following structure:
105
+ ```
106
+ data/
107
+ └── UVDoc_benchmark
108
+    ├── grid2d
109
+    ├── grid3d
110
+    └── ...
111
+ ```
112
+ Then run:
113
+ ```shell
114
+ python uvdocBenchmark_pred.py --ckpt-path [PATH/TO/MODEL]
115
+ ```
116
+ This will create a `output_uvdoc` folder next to the model, containing the unwarped images.
117
+
118
+ Then to compute the metrics over these predictions, please run the following:
119
+ ```shell
120
+ python uvdocBenchmark_eval.py --pred-path [PATH/TO/UNWARPED]
121
+ ```
122
+
123
+ #### :exclamation: Erratum
124
+ The MS-SSIM and AD values for the UVDoc benchmark reported in our paper mistakenly were calculated based on only half of the UVDoc benchmark (for our method as well as related works).
125
+ We here report the old and the corrected values on the entire UVDoc benchmark:
126
+ | :white_check_mark: New :white_check_mark: | MS-SSIM | AD |
127
+ |-----------|---------|-------|
128
+ | DewarpNet | 0.589 | 0.193 |
129
+ | DocTr | 0.697 | 0.160 |
130
+ | DDCP | 0.585 | 0.290 |
131
+ | RDGR | 0.610 | 0.280 |
132
+ | DocGeoNet | 0.706 | 0.168 |
133
+ | Ours | 0.785 | 0.119 |
134
+
135
+ | :x: Old :x: | MS-SSIM | AD |
136
+ |-----------|---------|-------|
137
+ | DewarpNet | 0.6 | 0.189 |
138
+ | DocTr | 0.684 | 0.176 |
139
+ | DDCP | 0.591 | 0.334 |
140
+ | RDGR | 0.603 | 0.314 |
141
+ | DocGeoNet | 0.714 | 0.167 |
142
+ | Ours | 0.784 | 0.122 |
143
+
144
+ ## Resulting images
145
+ You can download the unwarped images that we used in our paper:
146
+ * [Our results for the DocUNet benchmark](https://igl.ethz.ch/projects/uvdoc/DocUnet_results.zip)
147
+ * [Our results for the UVDoc benchmark](https://igl.ethz.ch/projects/uvdoc/UVDocBenchmark_results.zip)
148
+ * [The results of related work for the UVDoc benchmark](https://igl.ethz.ch/projects/uvdoc/UVDocBenchmark_results_RelatedWorks.zip) (generated using their respective published pretrained models)
149
+
150
+ ## Citation
151
+ If you used this code or the UVDoc dataset, please consider citing our work:
152
+ ```
153
+ @inproceedings{UVDoc,
154
+ title={{UVDoc}: Neural Grid-based Document Unwarping},
155
+ author={Floor Verhoeven and Tanguy Magne and Olga Sorkine-Hornung},
156
+ booktitle = {SIGGRAPH ASIA, Technical Papers},
157
+ year = {2023},
158
+ url={https://doi.org/10.1145/3610548.3618174}
159
+ }
160
+ ```
UVDoc_official/__pycache__/data_UVDoc.cpython-310.pyc ADDED
Binary file (6.7 kB). View file
 
UVDoc_official/__pycache__/data_UVDoc.cpython-313.pyc ADDED
Binary file (11.4 kB). View file
 
UVDoc_official/__pycache__/data_custom_augmentations.cpython-310.pyc ADDED
Binary file (4.17 kB). View file
 
UVDoc_official/__pycache__/data_doc3D.cpython-312.pyc ADDED
Binary file (5.48 kB). View file
 
UVDoc_official/__pycache__/data_utils.cpython-313.pyc ADDED
Binary file (8.02 kB). View file
 
UVDoc_official/compute_uvdoc_grid3d_stats.py ADDED
@@ -0,0 +1,73 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #!/usr/bin/env python3
2
+ """
3
+ Scan UVDoc-style data_root (grid3d/*.mat) and write min/max per channel for raw grid3d.
4
+ Output JSON is consumed by UVDocDataset via --uvdoc_grid3d_stats.
5
+
6
+ Uses one file per geom name (same as training), so cost scales with unique geometries, not images.
7
+ """
8
+ import argparse
9
+ import json
10
+ import os
11
+ from os.path import join as pjoin
12
+
13
+ import h5py as h5
14
+ import numpy as np
15
+
16
+
17
def main():
    """CLI entry point: scan <data_path>/grid3d/*.mat and write per-channel min/max stats to JSON."""
    parser = argparse.ArgumentParser(description="Compute grid3d min/max stats for UVDoc normalization.")
    parser.add_argument(
        "--data_path",
        type=str,
        required=True,
        help="Dataset root containing grid3d/*.mat (HDF5 mat with dataset 'grid3d').",
    )
    parser.add_argument(
        "--out",
        type=str,
        required=True,
        help="Output JSON path (x_max, x_min, y_max, y_min, z_max, z_min).",
    )
    args = parser.parse_args()

    grid3d_dir = pjoin(args.data_path, "grid3d")
    if not os.path.isdir(grid3d_dir):
        raise FileNotFoundError(f"Missing grid3d directory: {grid3d_dir}")

    # One .mat file per unique geometry; sorting keeps the scan order reproducible.
    geom_names = sorted(fname[:-4] for fname in os.listdir(grid3d_dir) if fname.endswith(".mat"))
    if not geom_names:
        raise RuntimeError(f"No .mat files under {grid3d_dir}")

    # Running per-channel (x, y, z) extrema accumulated over every grid3d file.
    mins = np.full(3, np.inf)
    maxs = np.full(3, -np.inf)
    for name in geom_names:
        with h5.File(pjoin(grid3d_dir, f"{name}.mat"), "r") as handle:
            # .T matches the loading convention used by the dataset classes.
            points = np.asarray(handle["grid3d"][:].T).reshape(-1, 3)
        mins = np.minimum(mins, points.min(axis=0))
        maxs = np.maximum(maxs, points.max(axis=0))

    # Plain Python floats so the dict is JSON-serializable.
    stats = {
        "x_max": float(maxs[0]),
        "x_min": float(mins[0]),
        "y_max": float(maxs[1]),
        "y_min": float(mins[1]),
        "z_max": float(maxs[2]),
        "z_min": float(mins[2]),
        "num_grid3d_files": len(geom_names),
    }
    out_dir = os.path.dirname(os.path.abspath(args.out))
    if out_dir:
        os.makedirs(out_dir, exist_ok=True)
    with open(args.out, "w", encoding="utf-8") as f:
        json.dump(stats, f, indent=2)
    print(f"Wrote {args.out} from {len(geom_names)} grid3d files.")


if __name__ == "__main__":
    main()
UVDoc_official/data/readme.txt ADDED
@@ -0,0 +1 @@
 
 
1
+ Add doc3D and UVDoc data here
UVDoc_official/data_UVDoc.py ADDED
@@ -0,0 +1,232 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import json
2
+ import math
3
+ import os
4
+ import random
5
+ import warnings
6
+ from os.path import join as pjoin
7
+ from typing import List, Optional, Tuple
8
+
9
+ import cv2
10
+ import h5py as h5
11
+ import numpy as np
12
+ import torch
13
+
14
+ from data_utils import BaseDataset, get_geometric_transform
15
+ from utils import GRID_SIZE, IMG_SIZE, bilinear_unwarping
16
+
17
# Default stats from the original UVDoc release (x_max, x_min, y_max, y_min, z_max, z_min).
# Used to scale raw grid3d coordinates to [0, 1] when no --uvdoc_grid3d_stats JSON is supplied;
# override with compute_uvdoc_grid3d_stats.py output when training on a different capture.
DEFAULT_GRID3D_NORMALIZATION = (
    0.11433014,
    -0.12551452,
    0.12401487,
    -0.12401487,
    0.1952378,
    -0.1952378,
)
26
+
27
+
28
def load_grid3d_stats_json(path: str) -> Tuple[float, float, float, float, float, float]:
    """Load (x_max, x_min, y_max, y_min, z_max, z_min) from JSON written by compute_uvdoc_grid3d_stats.py.

    Raises KeyError when any of the six required keys is absent.
    """
    required = ("x_max", "x_min", "y_max", "y_min", "z_max", "z_min")
    with open(path, "r", encoding="utf-8") as handle:
        stats = json.load(handle)
    missing = [k for k in required if k not in stats]
    if missing:
        raise KeyError(f"grid3d stats JSON missing keys {missing}: {path}")
    return tuple(float(stats[k]) for k in required)  # type: ignore[return-value]
37
+
38
+
39
def _split_samples_by_id(
    ids: List[str],
    split: str,
    val_ratio: float,
    split_seed: int,
) -> List[str]:
    """Deterministically shuffle sample ids and return the requested partition.

    The last max(1, round(len * val_ratio)) shuffled ids form the val set;
    everything before them is the train set. The input list is not mutated.
    """
    shuffled = list(ids)
    random.Random(split_seed).shuffle(shuffled)
    n_val = max(1, int(round(len(shuffled) * float(val_ratio))))
    cut = len(shuffled) - n_val
    return shuffled[:cut] if split == "train" else shuffled[cut:]
53
+
54
+
55
def _split_samples_by_geom(
    ids: List[str],
    dataroot: str,
    split: str,
    val_ratio: float,
    split_seed: int,
) -> Optional[List[str]]:
    """
    Split so that no geom_name appears in both train and val.
    Returns None if splitting by geom is impossible (e.g. a single unique geometry).
    """
    # Group sample ids by their geometry, preserving first-seen geom order.
    samples_per_geom = {}
    for sample_id in ids:
        meta_path = pjoin(dataroot, "metadata_sample", f"{sample_id}.json")
        with open(meta_path, "r", encoding="utf-8") as handle:
            geom = json.load(handle)["geom_name"]
        samples_per_geom.setdefault(geom, []).append(sample_id)

    if len(samples_per_geom) <= 1:
        warnings.warn(
            "UVDocDataset: split_mode=geom but unique geom_name count is <= 1; "
            "falling back to sample-level split.",
            UserWarning,
            stacklevel=3,
        )
        return None

    # Deterministic shuffle of geometries; the tail becomes the held-out val set.
    shuffled_geoms = list(samples_per_geom)
    random.Random(split_seed).shuffle(shuffled_geoms)
    n_val_geoms = max(1, int(round(len(shuffled_geoms) * float(val_ratio))))
    # Always keep at least one geometry on the train side.
    n_val_geoms = min(n_val_geoms, len(shuffled_geoms) - 1)
    held_out = set(shuffled_geoms[-n_val_geoms:])

    train_side, val_side = [], []
    for geom, sample_ids in samples_per_geom.items():
        (val_side if geom in held_out else train_side).extend(sample_ids)

    train_side.sort()
    val_side.sort()
    return train_side if split == "train" else val_side
103
+
104
+
105
class UVDocDataset(BaseDataset):
    """
    Torch dataset class for the UVDoc dataset.

    Each item is a tuple (img_RGB, img_RGB_unwarped, grid2D, grid3D): the two
    images are float tensors scaled to [0, 1], grid2D holds pixel-space
    keypoints of the warped page, and grid3D holds the (optionally normalized)
    3D grid coordinates.
    """

    def __init__(
        self,
        data_path="./data/UVdoc",
        appearance_augmentation=[],  # NOTE(review): mutable default — shared across calls; only read here, but confirm BaseDataset does not mutate it
        geometric_augmentations=[],
        grid_size=GRID_SIZE,
        split=None,  # None -> use all samples; "train"/"val" -> deterministic subset
        val_ratio=0.05,
        split_seed=42,
        split_mode="sample",  # "sample": split by image id; "geom": keep each geometry entirely on one side
        grid3d_stats_path: Optional[str] = None,  # JSON from compute_uvdoc_grid3d_stats.py; default: release stats
        deterministic_crop=None,  # None -> deterministic crop only for the val split
        max_samples=None,  # optional cap on dataset size, applied before splitting
        overfit=False,  # use every id regardless of split (tiny overfit runs)
    ) -> None:
        super().__init__(
            data_path=data_path,
            appearance_augmentation=appearance_augmentation,
            img_size=IMG_SIZE,
            grid_size=grid_size,
        )
        self.original_grid_size = (89, 61)  # size of the captured data
        # Normalization stats for grid3d: measured on this dataset if a JSON is given,
        # otherwise the defaults shipped with the original UVDoc release.
        if grid3d_stats_path:
            self.grid3d_normalization = load_grid3d_stats_json(grid3d_stats_path)
        else:
            self.grid3d_normalization = DEFAULT_GRID3D_NORMALIZATION
        self.geometric_transform = get_geometric_transform(geometric_augmentations, gridsize=self.original_grid_size)

        # Sample ids are the stems of img/*.png; sorting keeps ordering reproducible.
        ids = sorted([x[:-4] for x in os.listdir(pjoin(self.dataroot, "img")) if x.endswith(".png")])
        if max_samples is not None:
            ids = ids[: int(max_samples)]
        if overfit:
            self.all_samples = ids
        elif split in ("train", "val"):
            if split_mode == "geom":
                # Geometry-level split returns None when impossible (single geometry);
                # fall back to the sample-level split in that case.
                assigned = _split_samples_by_geom(ids, self.dataroot, split, val_ratio, split_seed)
                if assigned is None:
                    assigned = _split_samples_by_id(ids, split, val_ratio, split_seed)
                self.all_samples = assigned
            elif split_mode == "sample":
                self.all_samples = _split_samples_by_id(ids, split, val_ratio, split_seed)
            else:
                raise ValueError(f"split_mode must be 'sample' or 'geom', got {split_mode!r}")
        else:
            self.all_samples = ids

        # Validation defaults to a deterministic crop so metrics are stable across epochs.
        if deterministic_crop is None:
            self.deterministic_crop = split == "val"
        else:
            self.deterministic_crop = bool(deterministic_crop)

    def __getitem__(self, index):
        """Load, augment, crop and subsample one sample; see class docstring for the returned tuple."""
        # Get all paths. Several samples can share one geometry, resolved via metadata_sample/<id>.json.
        sample_id = self.all_samples[index]
        with open(pjoin(self.dataroot, "metadata_sample", f"{sample_id}.json"), "r", encoding="utf-8") as f:
            sample_name = json.load(f)["geom_name"]
        img_path = pjoin(self.dataroot, "img", f"{sample_id}.png")
        grid2D_path = pjoin(self.dataroot, "grid2d", f"{sample_name}.mat")
        grid3D_path = pjoin(self.dataroot, "grid3d", f"{sample_name}.mat")

        # Load 2D grid, 3D grid and image. Normalize 3D grid
        with h5.File(grid2D_path, "r") as file:
            grid2D_ = np.array(file["grid2d"][:].T.transpose(2, 0, 1))  # scale in range of img resolution

        with h5.File(grid3D_path, "r") as file:
            grid3D = np.array(file["grid3d"][:].T)

        if self.normalize_3Dgrid:  # scale grid3D to [0,1], based on stats computed over the entire dataset
            # NOTE(review): normalize_3Dgrid is presumably set by BaseDataset — confirm there.
            xmx, xmn, ymx, ymn, zmx, zmn = self.grid3d_normalization
            eps = 1e-12
            for c, cmn, cmx in ((0, xmn, xmx), (1, ymn, ymx), (2, zmn, zmx)):
                denom = cmx - cmn
                if abs(denom) < eps:
                    # Degenerate stats (flat channel): map to 0 rather than dividing by ~0.
                    grid3D[:, :, c] = 0.0
                else:
                    grid3D[:, :, c] = (grid3D[:, :, c] - cmn) / denom
        grid3D = np.array(grid3D, dtype=np.float32)
        grid3D = torch.from_numpy(grid3D.transpose(2, 0, 1))  # HWC -> CHW

        img_RGB_ = cv2.cvtColor(cv2.imread(img_path), cv2.COLOR_BGR2RGB)

        # Pixel-wise augmentation
        img_RGB_ = self.appearance_transform(image=img_RGB_)["image"]

        # Geometric Augmentations, applied jointly to the image and the 2D keypoints.
        with warnings.catch_warnings():
            warnings.filterwarnings("ignore")
            transformed = self.geometric_transform(
                image=img_RGB_,
                keypoints=grid2D_.transpose(1, 2, 0).reshape(-1, 2),
            )
        img_RGB_ = transformed["image"]

        grid2D_ = np.array(transformed["keypoints"]).reshape(*self.original_grid_size, 2).transpose(2, 0, 1)

        # A horizontal flip reorders the 2D keypoints inside the transform itself;
        # the 3D grid must be mirrored to stay consistent.
        flipped = False
        for x in transformed["replay"]["transforms"]:
            if "SafeHorizontalFlip" in x["__class_fullname__"]:
                flipped = x["applied"]
        if flipped:
            # NOTE(review): inverts channel 1 (y per the normalization order) and
            # reverses the column axis — verify this matches the upstream convention.
            grid3D[1] = 1 - grid3D[1]
            grid3D = torch.flip(grid3D, dims=(2,))

        # Tight crop
        grid2Dtmp = grid2D_
        img_RGB, grid2D = self.crop_tight(img_RGB_, grid2Dtmp, deterministic=self.deterministic_crop)

        # Subsample grids to desired resolution
        row_sampling_factor = math.ceil(self.original_grid_size[0] / self.grid_size[0])
        col_sampling_factor = math.ceil(self.original_grid_size[1] / self.grid_size[1])
        grid3D = grid3D[:, ::row_sampling_factor, ::col_sampling_factor]
        grid2D = grid2D[:, ::row_sampling_factor, ::col_sampling_factor]
        grid2D = torch.from_numpy(grid2D).float()

        # Unwarp the image according to grid: this is the reconstruction target for the image loss.
        img_RGB_unwarped = bilinear_unwarping(img_RGB.unsqueeze(0), grid2D.unsqueeze(0), self.img_size).squeeze()

        return (
            img_RGB.float() / 255.0,
            img_RGB_unwarped.float() / 255.0,
            grid2D,
            grid3D,
        )
UVDoc_official/data_custom_augmentations.py ADDED
@@ -0,0 +1,148 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import albumentations as A
2
+ import cv2
3
+ import numpy as np
4
+
5
+ from utils import GRID_SIZE
6
+
7
+
8
class SafeHorizontalFlip(A.HorizontalFlip):
    """
    Horizontal Flip that changes the order of the keypoints so that the top left one remains in the top left position.

    After the flip, each row of the keypoint grid is reversed so the grid stays
    in its canonical (top-left anchored) ordering.
    """

    def __init__(self, gridsize=GRID_SIZE, always_apply: bool = False, p: float = 0.5):
        super().__init__(always_apply, p)
        self.gridsize = gridsize  # (rows, cols) of the keypoint grid

    def apply_to_keypoints(self, keypoints, **params):
        """Flip keypoints, then reverse the column order within each grid row."""
        keypoints = super().apply_to_keypoints(keypoints, **params)

        # Reshape to (rows, cols, fields), reverse the columns, flatten back.
        keypoints = np.array(keypoints).reshape(*self.gridsize, -1)[:, ::-1, :]
        # np.prod replaces np.product, which was deprecated and removed in NumPy 2.0.
        keypoints = keypoints.reshape(np.prod(self.gridsize), -1)
        return keypoints

    def get_transform_init_args_names(self):
        # Required by albumentations' replay/serialization machinery.
        return ("gridsize",)
26
+
27
+
28
class SafePerspective(A.Perspective):
    """
    Perspective augmentation that keeps all keypoints in the image visible.
    Mostly copied from the original Perspective augmentation from Albumentation.

    The four source-quad corners are sampled strictly outside the keypoints'
    bounding box, so the warp never pushes a keypoint out of frame.
    """

    def __init__(
        self,
        scale=(0.05, 0.1),
        keep_size=True,
        pad_mode=cv2.BORDER_CONSTANT,
        pad_val=0,
        mask_pad_val=0,
        fit_output=False,
        interpolation=cv2.INTER_LINEAR,
        always_apply=False,
        p=0.5,
    ):
        # Pass everything through positionally, matching A.Perspective's signature.
        super().__init__(
            scale,
            keep_size,
            pad_mode,
            pad_val,
            mask_pad_val,
            fit_output,
            interpolation,
            always_apply,
            p,
        )

    @property
    def targets_as_params(self):
        # Keypoints are needed (besides the image) to compute safe corner positions.
        return ["image", "keypoints"]

    def get_params_dependent_on_targets(self, params):
        """Sample a perspective matrix whose source quad encloses all keypoints."""
        h, w = params["image"].shape[:2]
        # Keypoint coordinates normalized to [0, 1] relative to the image size.
        keypoints = np.array(params["keypoints"])[:, :2] / np.array([w, h])
        left = np.min(keypoints[:, 0])
        right = np.max(keypoints[:, 0])
        top = np.min(keypoints[:, 1])
        bottom = np.max(keypoints[:, 1])

        # Each corner is drawn outside the keypoints' bounding box, with a 0.01
        # margin (or halfway to the image border when the box nearly touches it).
        points = np.zeros([4, 2])
        # Top Left point
        points[0, 0] = A.random_utils.uniform(0, max(left - 0.01, left / 2))
        points[0, 1] = A.random_utils.uniform(0, max(top - 0.01, top / 2))
        # Top right point
        points[1, 0] = A.random_utils.uniform(min(right + 0.01, (right + 1) / 2), 1)
        points[1, 1] = A.random_utils.uniform(0, max(top - 0.01, top / 2))
        # Bottom Right point
        points[2, 0] = A.random_utils.uniform(min(right + 0.01, (right + 1) / 2), 1)
        points[2, 1] = A.random_utils.uniform(min(bottom + 0.01, (bottom + 1) / 2), 1)
        # Bottom Left point
        points[3, 0] = A.random_utils.uniform(0, max(left - 0.01, left / 2))
        points[3, 1] = A.random_utils.uniform(min(bottom + 0.01, (bottom + 1) / 2), 1)

        # Back to pixel coordinates.
        points[:, 0] *= w
        points[:, 1] *= h

        # Obtain a consistent order of the points and unpack them individually.
        # Warning: don't just do (tl, tr, br, bl) = _order_points(...)
        # here, because the reordered points is used further below.
        points = self._order_points(points)
        tl, tr, br, bl = points

        # compute the width of the new image, which will be the
        # maximum distance between bottom-right and bottom-left
        # x-coordinates or the top-right and top-left x-coordinates
        min_width = None
        max_width = None
        while min_width is None or min_width < 2:
            width_top = np.sqrt(((tr[0] - tl[0]) ** 2) + ((tr[1] - tl[1]) ** 2))
            width_bottom = np.sqrt(((br[0] - bl[0]) ** 2) + ((br[1] - bl[1]) ** 2))
            max_width = int(max(width_top, width_bottom))
            min_width = int(min(width_top, width_bottom))
            if min_width < 2:
                # Degenerate quad: widen it symmetrically until both edges are >= 2 px.
                step_size = (2 - min_width) / 2
                tl[0] -= step_size
                tr[0] += step_size
                bl[0] -= step_size
                br[0] += step_size

        # compute the height of the new image, which will be the maximum distance between the top-right
        # and bottom-right y-coordinates or the top-left and bottom-left y-coordinates
        min_height = None
        max_height = None
        while min_height is None or min_height < 2:
            height_right = np.sqrt(((tr[0] - br[0]) ** 2) + ((tr[1] - br[1]) ** 2))
            height_left = np.sqrt(((tl[0] - bl[0]) ** 2) + ((tl[1] - bl[1]) ** 2))
            max_height = int(max(height_right, height_left))
            min_height = int(min(height_right, height_left))
            if min_height < 2:
                # Same symmetric stretch as above, along the vertical axis.
                step_size = (2 - min_height) / 2
                tl[1] -= step_size
                tr[1] -= step_size
                bl[1] += step_size
                br[1] += step_size

        # now that we have the dimensions of the new image, construct
        # the set of destination points to obtain a "birds eye view",
        # (i.e. top-down view) of the image, again specifying points
        # in the top-left, top-right, bottom-right, and bottom-left order
        # do not use width-1 or height-1 here, as for e.g. width=3, height=2
        # the bottom right coordinate is at (3.0, 2.0) and not (2.0, 1.0)
        dst = np.array(
            [[0, 0], [max_width, 0], [max_width, max_height], [0, max_height]],
            dtype=np.float32,
        )

        # compute the perspective transform matrix and then apply it
        m = cv2.getPerspectiveTransform(points, dst)

        if self.fit_output:
            m, max_width, max_height = self._expand_transform(m, (h, w))

        return {
            "matrix": m,
            "max_height": max_height,
            "max_width": max_width,
            "interpolation": self.interpolation,
        }
UVDoc_official/data_doc3D.py ADDED
@@ -0,0 +1,95 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from os.path import join as pjoin
2
+
3
+ import cv2
4
+ import h5py as h5
5
+ import numpy as np
6
+ import torch
7
+
8
+ from data_utils import BaseDataset
9
+ from utils import GRID_SIZE, IMG_SIZE, bilinear_unwarping
10
+
11
+
12
class doc3DDataset(BaseDataset):
    """
    Torch dataset class for the Doc3D dataset.

    Each sample yields a tuple of:
      - cropped warped RGB image in [0, 1], CHW float tensor,
      - ground-truth unwarped RGB image in [0, 1], CHW float tensor,
      - 2D grid in normalized [-1, 1] crop coordinates,
      - 3D grid scaled with dataset-wide statistics.
    """

    def __init__(
        self,
        data_path="./data/doc3D",
        split="train",
        appearance_augmentation=[],
        grid_size=GRID_SIZE,
    ):
        """
        Args:
            data_path: root directory of the Doc3D data.
            split: "train" or "val"; selects the sample list file.
            appearance_augmentation: list of pixel-wise augmentation names,
                see data_utils.get_appearance_transform.
            grid_size: size of the regressed grid.

        Raises:
            ValueError: if `split` is neither "train" nor "val".
        """
        super().__init__(
            data_path=data_path,
            appearance_augmentation=appearance_augmentation,
            img_size=IMG_SIZE,
            grid_size=grid_size,
        )
        # Per-axis (max, min) statistics precomputed over the entire dataset,
        # used to scale the 3D grid to [0, 1].
        self.grid3d_normalization = (1.2539363, -1.2442188, 1.2396319, -1.2289206, 0.6436657, -0.67492497)

        if split == "train":
            path = pjoin(self.dataroot, "traindoc.txt")
        elif split == "val":
            path = pjoin(self.dataroot, "valdoc3D.txt")
        else:
            # Fail fast with a clear message instead of hitting an
            # UnboundLocalError on `path` below.
            raise ValueError(f"Unknown split '{split}', expected 'train' or 'val'.")

        with open(path, "r") as files:
            file_list = tuple(files)
        self.all_samples = np.array([id_.rstrip() for id_ in file_list], dtype=np.string_)

    def __getitem__(self, index):
        """Load, augment and preprocess the sample at `index`."""
        # Resolve all file paths for this sample
        im_name = self.all_samples[index].decode("UTF-8")
        img_path = pjoin(self.dataroot, "img", im_name + ".png")
        grid2D_path = pjoin(self.dataroot, "grid2D", im_name + ".mat")
        grid3D_path = pjoin(self.dataroot, "grid3D", im_name + ".mat")
        bm_path = pjoin(self.dataroot, "bm", im_name + ".mat")

        # Load 2D grid (pixel coordinates at image resolution)
        with h5.File(grid2D_path, "r") as file:
            grid2D_ = np.array(file["grid2D"][:].T.transpose(2, 0, 1))

        with h5.File(grid3D_path, "r") as file:
            grid3D = np.array(file["grid3D"][:].T)

        if self.normalize_3Dgrid:
            # Scale grid3D to [0, 1] using the dataset-wide statistics.
            # NOTE(review): channel 0 is scaled with the z statistics and
            # channel 2 with the x statistics — this mirrors the official
            # implementation's channel ordering; confirm against the data.
            xmx, xmn, ymx, ymn, zmx, zmn = self.grid3d_normalization
            grid3D[:, :, 0] = (grid3D[:, :, 0] - zmn) / (zmx - zmn)
            grid3D[:, :, 1] = (grid3D[:, :, 1] - ymn) / (ymx - ymn)
            grid3D[:, :, 2] = (grid3D[:, :, 2] - xmn) / (xmx - xmn)
        grid3D = np.array(grid3D, dtype=np.float32)
        # Flip the y axis (row order then value) to match the 2D grid convention.
        grid3D[:, :, 1] = grid3D[:, :, 1][:, ::-1]
        grid3D[:, :, 1] = 1 - grid3D[:, :, 1]
        grid3D = torch.from_numpy(grid3D.transpose(2, 0, 1))

        img_RGB_ = cv2.cvtColor(cv2.imread(img_path, cv2.IMREAD_COLOR), cv2.COLOR_BGR2RGB)

        # Pixel-wise appearance augmentation
        img_RGB_ = self.appearance_transform(image=img_RGB_)["image"]

        # Build the ground-truth unwarped image from the backward mapping.
        with h5.File(bm_path, "r") as file:
            bm = np.array(file["bm"][:].T.transpose(2, 0, 1))
        # Backward mapping is stored in 448x448 pixel coordinates; map to [-1, 1].
        bm = ((bm / 448) - 0.5) * 2.0
        bm = torch.from_numpy(bm).float()

        img_RGB_unwarped = bilinear_unwarping(
            torch.from_numpy(img_RGB_.transpose(2, 0, 1)).float().unsqueeze(0),
            bm.unsqueeze(0),
            self.img_size,
        ).squeeze()

        # Tight crop around the document; also rescales grid2D to [-1, 1].
        grid2Dtmp = grid2D_
        img_RGB, grid2D = self.crop_tight(img_RGB_, grid2Dtmp)

        grid2D = torch.from_numpy(grid2D).float()

        return (
            img_RGB.float() / 255.0,
            img_RGB_unwarped.float() / 255.0,
            grid2D,
            grid3D,
        )
UVDoc_official/data_mixDataset.py ADDED
@@ -0,0 +1,23 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import torch
2
+
3
+
4
class mixDataset(torch.utils.data.Dataset):
    """
    Samples a UVDoc dataset and a Doc3D dataset in lock-step.

    The longer of the two datasets is indexed directly; the shorter one is
    cycled by taking the index modulo its length, so every index up to
    max(len(a), len(b)) yields a pair of samples.
    """

    def __init__(self, *datasets):
        self.datasets = datasets

    def __getitem__(self, ii):
        d0, d1 = self.datasets[0], self.datasets[1]
        if len(d0) < len(d1):
            # First dataset is shorter: wrap its index around.
            return d0[ii % len(d0)], d1[ii]
        # Second dataset is shorter (or equal): wrap its index around.
        return d0[ii], d1[ii % len(d1)]

    def __len__(self):
        return max(len(d) for d in self.datasets)
UVDoc_official/data_utils.py ADDED
@@ -0,0 +1,175 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import random
2
+
3
+ import albumentations as A
4
+ import cv2
5
+ import numpy as np
6
+ import torch
7
+
8
+ from data_custom_augmentations import SafeHorizontalFlip, SafePerspective
9
+ from utils import GRID_SIZE, IMG_SIZE
10
+
11
+
12
def get_appearance_transform(transform_types):
    """
    Build an albumentations Compose of pixel-wise augmentations.

    transform_types is a list of augmentation names; recognized names are
    'shadow', 'blur', 'visual', 'noise' and 'color'. Unknown names are
    silently ignored.
    """
    pipeline = []

    if "shadow" in transform_types:
        pipeline.append(A.RandomShadow(p=0.1))

    if "blur" in transform_types:
        blur_ops = [
            A.Defocus(p=0.05),
            A.Downscale(p=0.15, interpolation=cv2.INTER_LINEAR),
            A.GaussianBlur(p=0.65),
            A.MedianBlur(p=0.15),
        ]
        pipeline.append(A.OneOf(transforms=blur_ops, p=0.75))

    if "visual" in transform_types:
        visual_ops = [
            A.ToSepia(p=0.15),
            A.ToGray(p=0.20),
            A.Equalize(p=0.15),
            A.Sharpen(p=0.20),
        ]
        pipeline.append(A.OneOf(transforms=visual_ops, p=0.5))

    if "noise" in transform_types:
        noise_ops = [
            A.GaussNoise(var_limit=(10.0, 20.0), p=0.70),
            A.ISONoise(intensity=(0.1, 0.25), p=0.30),
        ]
        pipeline.append(A.OneOf(transforms=noise_ops, p=0.6))

    if "color" in transform_types:
        color_ops = [
            A.ColorJitter(p=0.05),
            A.HueSaturationValue(p=0.10),
            A.RandomBrightnessContrast(brightness_limit=[-0.05, 0.25], p=0.85),
        ]
        pipeline.append(A.OneOf(transforms=color_ops, p=0.95))

    return A.Compose(transforms=pipeline)
70
+
71
+
72
def get_geometric_transform(transform_types, gridsize):
    """
    Build an albumentations ReplayCompose of geometric augmentations.

    transform_types is a list of augmentation names; recognized names are
    'rotate', 'flip' and 'perspective'. Keypoints are tracked in "xy"
    format and are kept even when they fall outside the image.
    """
    pipeline = []

    if "rotate" in transform_types:
        rotation = A.SafeRotate(
            limit=[-30, 30],
            interpolation=cv2.INTER_LINEAR,
            border_mode=cv2.BORDER_REPLICATE,
            p=0.5,
        )
        pipeline.append(rotation)

    if "flip" in transform_types:
        pipeline.append(SafeHorizontalFlip(gridsize=gridsize, p=0.25))

    if "perspective" in transform_types:
        pipeline.append(SafePerspective(p=0.5))

    return A.ReplayCompose(
        transforms=pipeline,
        keypoint_params=A.KeypointParams(format="xy", remove_invisible=False),
    )
100
+
101
+
102
def crop_image_tight(img, grid2D, deterministic=False):
    """
    Crop `img` tightly around the document keypoints in `grid2D`.

    A margin `s` (at most 20 px, limited by the padding actually available
    around the document) is kept on every side, then part of that margin is
    shaved off again — by a random amount per side, or a fixed amount when
    `deterministic` is True.

    Returns the cropped image plus the top/bottom/left/right offsets of the
    crop inside the original image.
    """
    h, w = img.shape[0], img.shape[1]

    # Bounding box of the document keypoints, rounded outwards.
    minx = np.floor(np.amin(grid2D[0, :, :])).astype(int)
    maxx = np.ceil(np.amax(grid2D[0, :, :])).astype(int)
    miny = np.floor(np.amin(grid2D[1, :, :])).astype(int)
    maxy = np.ceil(np.amax(grid2D[1, :, :])).astype(int)

    # Margin: 20 px, but never more than the natural padding available on
    # any side of the bounding box.
    s = min(20, minx, miny, w - 1 - maxx, h - 1 - maxy)

    # First crop slightly larger than necessary, with the full margin.
    img = img[miny - s : maxy + s, minx - s : maxx + s, :]

    # Then shave a (possibly random) part of the margin off each side.
    hi = max(s - 5, 1)
    if deterministic:
        cx1 = cy1 = max(hi // 2, 0)
        cx2 = cy2 = max(hi // 2, 0) + 1
    else:
        cx1 = random.randint(0, hi)
        cx2 = random.randint(0, hi) + 1
        cy1 = random.randint(0, hi)
        cy2 = random.randint(0, hi) + 1
    img = img[cy1:-cy2, cx1:-cx2, :]

    # Offsets of the final crop with respect to the original image.
    top = miny - s + cy1
    bot = h - maxy - s + cy2
    left = minx - s + cx1
    right = w - maxx - s + cx2
    return img, top, bot, left, right
135
+
136
+
137
class BaseDataset(torch.utils.data.Dataset):
    """
    Common base class for all unwarping datasets.

    Stores the dataset root, target image/grid sizes and the pixel-wise
    appearance augmentation pipeline, and provides the shared tight-crop
    preprocessing step.
    """

    def __init__(
        self,
        data_path,
        appearance_augmentation=[],
        img_size=IMG_SIZE,
        grid_size=GRID_SIZE,
    ) -> None:
        super().__init__()

        self.dataroot = data_path
        self.img_size = img_size
        self.grid_size = grid_size
        # 3D grids are rescaled to [0, 1] using dataset-wide statistics.
        self.normalize_3Dgrid = True

        self.appearance_transform = get_appearance_transform(appearance_augmentation)

        # Subclasses populate this with the sample identifiers.
        self.all_samples = []

    def __len__(self):
        return len(self.all_samples)

    def crop_tight(self, img_RGB, grid2D, deterministic=False):
        """
        Tightly crop `img_RGB` around the document, resize the crop to
        `self.img_size` and rescale `grid2D` (given in pixel coordinates of
        the uncropped image) to normalized [-1, 1] crop coordinates.

        Note: `grid2D` is rescaled in place before being returned.
        """
        orig_h, orig_w = img_RGB.shape[0], img_RGB.shape[1]
        img, top, bot, left, right = crop_image_tight(img_RGB, grid2D, deterministic=deterministic)
        img = cv2.resize(img, self.img_size).transpose(2, 0, 1)
        img = torch.from_numpy(img).float()

        # Map pixel coordinates of the original image to [0, 1] inside the
        # crop, then to [-1, 1].
        grid2D[0, :, :] = (grid2D[0, :, :] - left) / (orig_w - left - right)
        grid2D[1, :, :] = (grid2D[1, :, :] - top) / (orig_h - top - bot)
        grid2D = (grid2D * 2.0) - 1.0

        return img, grid2D
UVDoc_official/demo.py ADDED
@@ -0,0 +1,55 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import argparse
2
+ import os
3
+
4
+ import cv2
5
+ import numpy as np
6
+ import torch
7
+
8
+ from utils import IMG_SIZE, bilinear_unwarping, load_model
9
+
10
+
11
def unwarp_img(ckpt_path, img_path, img_size):
    """
    Unwarp a document image with the model from `ckpt_path` and save the
    result next to the input as `<img stem>_unwarp.png`.

    Args:
        ckpt_path: path to the model weights (pkl).
        img_path: path to the warped document image.
        img_size: (width, height) input resolution expected by the model.

    Raises:
        FileNotFoundError: if the image cannot be read.
    """
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

    # Load model
    model = load_model(ckpt_path)
    model.to(device)
    model.eval()

    # Load image
    img = cv2.imread(img_path)
    if img is None:
        # cv2.imread silently returns None on a bad path; fail loudly instead.
        raise FileNotFoundError(f"Could not read image: {img_path}")
    img = cv2.cvtColor(img, cv2.COLOR_BGR2RGB).astype(np.float32) / 255
    inp = torch.from_numpy(cv2.resize(img, img_size).transpose(2, 0, 1)).unsqueeze(0)

    # Make prediction; no_grad avoids building an autograd graph at inference.
    inp = inp.to(device)
    with torch.no_grad():
        point_positions2D, _ = model(inp)

        # Unwarp at the original image resolution
        size = img.shape[:2][::-1]
        unwarped = bilinear_unwarping(
            warped_img=torch.from_numpy(img.transpose(2, 0, 1)).unsqueeze(0).to(device),
            point_positions=torch.unsqueeze(point_positions2D[0], dim=0),
            img_size=tuple(size),
        )
    unwarped = (unwarped[0].detach().cpu().numpy().transpose(1, 2, 0) * 255).astype(np.uint8)

    # Save result
    unwarped_BGR = cv2.cvtColor(unwarped, cv2.COLOR_RGB2BGR)
    cv2.imwrite(os.path.splitext(img_path)[0] + "_unwarp.png", unwarped_BGR)
43
+
44
+
45
if __name__ == "__main__":
    # Command-line entry point: unwarp a single image with a trained model.
    parser = argparse.ArgumentParser()
    parser.add_argument(
        "--ckpt-path", type=str, default="./model/best_model.pkl", help="Path to the model weights as pkl."
    )
    parser.add_argument("--img-path", type=str, help="Path to the document image to unwarp.")
    cli_args = parser.parse_args()

    unwarp_img(cli_args.ckpt_path, cli_args.img_path, IMG_SIZE)
UVDoc_official/docUnet_eval.py ADDED
@@ -0,0 +1,106 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import argparse
2
+ import json
3
+ import multiprocessing as mp
4
+ import os
5
+
6
+ from utils import get_version
7
+
8
+
9
def visual_metrics_process(queue, docunet_path, preds_path, verbose):
    """
    Subprocess entry point computing the visual metrics (MS-SSIM, LD, AD)
    via the official MATLAB evaluation script; the result dict is put on
    `queue`.
    """
    import matlab.engine

    engine = matlab.engine.start_matlab()
    engine.cd(r"./eval/eval_code/", nargout=0)

    scan_dir = os.path.join(docunet_path, "scan")
    mean_ms, mean_ld, mean_ad = engine.evalScript(scan_dir, preds_path, verbose, nargout=3)
    queue.put(dict(ms=mean_ms, ld=mean_ld, ad=mean_ad))
20
+
21
+
22
def ocr_process(queue, docunet_path, preds_path, crop_type):
    """
    Subprocess entry point computing the OCR metrics (CER, ED). Per-image
    results are dumped to ocr_res.json in `preds_path`; the mean values are
    put on `queue`.
    """
    from eval.ocr_eval.ocr_eval import OCR_eval_docunet

    scan_dir = os.path.join(docunet_path, "scan")
    crop_dir = os.path.join(docunet_path, crop_type)
    CERmean, EDmean, OCR_dict_results = OCR_eval_docunet(scan_dir, preds_path, crop_dir)
    with open(os.path.join(preds_path, "ocr_res.json"), "w") as f:
        json.dump(OCR_dict_results, f)
    queue.put(dict(cer=CERmean, ed=EDmean))
34
+
35
+
36
def compute_metrics(docunet_path, preds_path, crop_type, verbose=False):
    """
    Compute and save all DocUNet benchmark metrics.

    Spawns two subprocesses (MATLAB visual metrics and OCR metrics),
    gathers their results from a shared queue and writes them to res.txt
    inside `preds_path`.
    """
    if not preds_path.endswith("/"):
        preds_path += "/"
    q = mp.Queue()

    # Create process to compute MS-SSIM, LD, AD
    p1 = mp.Process(target=visual_metrics_process, args=(q, docunet_path, preds_path, verbose))
    p1.start()

    # Create process to compute OCR metrics
    p2 = mp.Process(target=ocr_process, args=(q, docunet_path, preds_path, crop_type))
    p2.start()

    # Each worker puts exactly one dict on the queue. Drain the queue
    # BEFORE joining: Queue.qsize() is unreliable (NotImplementedError on
    # macOS), and joining a process whose queue is still full can deadlock.
    res = {}
    for _ in range(2):
        res.update(q.get())

    p1.join()
    p2.join()

    # Print and saves results
    print("--- Results ---")
    print(f" Mean MS-SSIM : {res['ms']}")
    print(f" Mean LD : {res['ld']}")
    print(f" Mean AD : {res['ad']}")
    print(f" Mean CER : {res['cer']}")
    print(f" Mean ED : {res['ed']}")

    with open(os.path.join(preds_path, "res.txt"), "w") as f:
        f.write(f"Mean MS-SSIM : {res['ms']}\n")
        f.write(f"Mean LD : {res['ld']}\n")
        f.write(f"Mean AD : {res['ad']}\n")
        f.write(f"Mean CER : {res['cer']}\n")
        f.write(f"Mean ED : {res['ed']}\n")

        # Append the model/hardware info written at prediction time, if any.
        model_info_path = os.path.join(preds_path, "model_info.txt")
        if os.path.isfile(model_info_path):
            with open(model_info_path) as modinf_f:
                for x in modinf_f.readlines():
                    f.write(x)

        f.write("\n--- Module Version ---\n")
        for module, version in get_version().items():
            f.write(f"{module:25s}: {version}\n")
+
87
+
88
+ if __name__ == "__main__":
89
+ parser = argparse.ArgumentParser()
90
+
91
+ parser.add_argument(
92
+ "--docunet-path", type=str, default="./data/DocUNet/", help="Path to the DocUNet scans. Needs to be absolute."
93
+ )
94
+ parser.add_argument("--pred-path", type=str, help="Path to the DocUnet predictions. Needs to be absolute.")
95
+ parser.add_argument(
96
+ "--crop-type",
97
+ type=str,
98
+ default="crop",
99
+ help="The type of cropping to use as input of the model : 'crop' or 'original'",
100
+ )
101
+ parser.add_argument("-v", "--verbose", action="store_true")
102
+ args = parser.parse_args()
103
+
104
+ compute_metrics(
105
+ os.path.abspath(args.docunet_path), os.path.abspath(args.pred_path), args.crop_type, verbose=args.verbose
106
+ )
UVDoc_official/docUnet_pred.py ADDED
@@ -0,0 +1,179 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import argparse
2
+ import os
3
+ import platform
4
+ import re
5
+ import subprocess
6
+ import time
7
+
8
+ import cv2
9
+ import numpy as np
10
+ import torch
11
+ from tqdm import tqdm
12
+
13
+ from utils import IMG_SIZE, bilinear_unwarping, load_model
14
+
15
+
16
def get_processor_name():
    """
    Return a human-readable name of the CPU, or "" if it cannot be determined.

    Adapted from https://stackoverflow.com/a/13078519.
    """
    system = platform.system()
    if system == "Windows":
        return platform.processor()
    elif system == "Darwin":
        os.environ["PATH"] = os.environ["PATH"] + os.pathsep + "/usr/sbin"
        # Pass the command as an argument list: the original single-string
        # form without shell=True is treated as one executable name and
        # raises FileNotFoundError. Decode so the return type is str.
        command = ["sysctl", "-n", "machdep.cpu.brand_string"]
        return subprocess.check_output(command).decode().strip()
    elif system == "Linux":
        all_info = subprocess.check_output(["cat", "/proc/cpuinfo"]).decode().strip()
        for line in all_info.split("\n"):
            if "model name" in line:
                # count must be keyword: the positional form is deprecated
                # since Python 3.13.
                return re.sub(".*model name.*:", "", line, count=1)
    return ""
34
+
35
+
36
def count_parameters(model):
    """
    Return the number of trainable parameters of `model`.

    Taken from https://discuss.pytorch.org/t/how-do-i-check-the-number-of-parameters-of-a-model/4325/9.
    """
    trainable = (p.numel() for p in model.parameters() if p.requires_grad)
    return sum(trainable)
42
+
43
+
44
class docUnetLoader(torch.utils.data.Dataset):
    """
    Torch dataset for the DocUNet benchmark images.

    Yields (image tensor resized to `img_size`, file name) pairs; images
    are RGB floats in [0, 1] with CHW layout.
    """

    def __init__(
        self,
        data_path,
        crop="original",
        img_size=(488, 712),
    ):
        self.dataroot = data_path
        self.crop = crop
        # One sample per file in the chosen crop directory.
        self.im_list = os.listdir(os.path.join(self.dataroot, self.crop))
        self.img_size = img_size

    def __len__(self):
        return len(self.im_list)

    def __getitem__(self, index):
        im_name = self.im_list[index]
        img_path = os.path.join(self.dataroot, self.crop, im_name)
        bgr = cv2.imread(img_path)
        rgb = cv2.cvtColor(bgr, cv2.COLOR_BGR2RGB).astype(np.float32) / 255.0
        chw = cv2.resize(rgb, tuple(self.img_size)).transpose(2, 0, 1)
        return torch.from_numpy(chw), im_name
69
+
70
+
71
def infer_docUnet(model, dataloader, device, save_path):
    """
    Unwarp every image of the DocUNet benchmark and save the results to
    `save_path`, timing the model inference and the unwarping step.

    Returns (avg inference time, avg inference time incl. GPU transfers,
    number of trainable model parameters).
    """
    model.eval()
    inference_times = []
    inferenceGPU_times = []
    dataset = dataloader.dataset
    for img_RGB, im_names in tqdm(dataloader):
        # Forward pass, timed with and without the transfer to the device.
        start_toGPU = time.time()
        img_RGB = img_RGB.to(device)
        start_inf = time.time()
        point_positions2D, _ = model(img_RGB)
        end_inf = time.time()

        # Re-open the warped image at full resolution (the dataloader
        # downsamples it for the model input).
        warped = cv2.imread(os.path.join(dataset.dataroot, dataset.crop, im_names[0]))
        warped = cv2.cvtColor(warped, cv2.COLOR_BGR2RGB)
        warped = torch.from_numpy(warped.transpose(2, 0, 1) / 255.0).float()

        # To unwarp using the GT aspect ratio instead, load the matching
        # scan (dataset.dataroot/scan/<id>.png) and use its shape as `size`.
        size = warped.shape[1:][::-1]

        # Unwarp at full resolution.
        start_unwarp = time.time()
        unwarped = bilinear_unwarping(
            warped_img=torch.unsqueeze(warped, dim=0).to(device),
            point_positions=torch.unsqueeze(point_positions2D[0], dim=0),
            img_size=tuple(size),
        )
        end_unwarp = time.time()
        unwarped = (unwarped[0].detach().cpu().numpy().transpose(1, 2, 0) * 255).astype(np.uint8)
        unwarped_BGR = cv2.cvtColor(unwarped, cv2.COLOR_RGB2BGR)
        end_toGPU = time.time()

        out_name = im_names[0].split(" ")[0].split(".")[0] + ".png"
        cv2.imwrite(os.path.join(save_path, out_name), unwarped_BGR)

        inference_times.append(end_inf - start_inf + end_unwarp - start_unwarp)
        inferenceGPU_times.append(end_inf - start_toGPU + end_toGPU - start_unwarp)

    # Average timings and model size.
    avg_inference_time = np.mean(inference_times)
    avg_inferenceGPU_time = np.mean(inferenceGPU_times)
    n_params = count_parameters(model)
    return avg_inference_time, avg_inferenceGPU_time, n_params
129
+
130
+
131
def create_results(ckpt_path, docUnet_path, crop, img_size):
    """
    Run the model over the DocUNet benchmark, save the unwarped images and
    a model_info.txt with timing/hardware metadata.

    Returns the directory the results were written to.
    """
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

    # Model, dataset and output directory.
    model = load_model(ckpt_path)
    model.to(device)

    dataset = docUnetLoader(docUnet_path, crop, img_size=img_size)
    dataloader = torch.utils.data.DataLoader(dataset, batch_size=1, shuffle=False, drop_last=False)

    save_path = os.path.join("/".join(ckpt_path.split("/")[:-1]), "docunet", crop)
    # exist_ok=False: refuse to silently overwrite a previous run.
    os.makedirs(save_path, exist_ok=False)
    print(f" Results will be saved at {save_path}", flush=True)

    # Run inference, then persist the metadata.
    inference_time, inferenceGPU_time, n_params = infer_docUnet(model, dataloader, device, save_path)
    with open(os.path.join(save_path, "model_info.txt"), "w") as f:
        f.write("\n---Model and Hardware Information---\n")
        f.write(f"Inference Time : {inference_time:.5f}s\n")
        f.write(f" FPS : {1/inference_time:.1f}\n")
        f.write(f"Inference Time (Include Loading To/From GPU) : {inferenceGPU_time:.5f}s\n")
        f.write(f" FPS : {1/inferenceGPU_time:.1f}\n")
        f.write("Using :\n")
        f.write(f" CPU : {get_processor_name()}\n")
        f.write(f" GPU : {torch.cuda.get_device_name(0)}\n")
        f.write(f"Number of Parameters : {n_params:,}\n")
    return save_path
161
+
162
+
163
if __name__ == "__main__":
    # Command-line entry point: produce DocUNet benchmark predictions.
    parser = argparse.ArgumentParser()
    parser.add_argument(
        "--ckpt-path", type=str, default="./model/best_model.pkl", help="Path to the model weights as pkl."
    )
    parser.add_argument("--docunet-path", type=str, default="./data/DocUNet", help="Path to the docunet benchmark.")
    parser.add_argument(
        "--crop-type",
        type=str,
        default="crop",
        help="The type of cropping to use as input of the model : 'crop' or 'original'.",
    )
    cli_args = parser.parse_args()

    create_results(cli_args.ckpt_path, os.path.abspath(cli_args.docunet_path), cli_args.crop_type, IMG_SIZE)
UVDoc_official/model.py ADDED
@@ -0,0 +1,374 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import torch
2
+ import torch.nn as nn
3
+
4
+
5
def conv3x3(in_channels, out_channels, kernel_size, stride=1):
    """2D convolution with 'same' padding for odd kernel sizes."""
    padding = kernel_size // 2
    return nn.Conv2d(in_channels, out_channels, kernel_size=kernel_size, stride=stride, padding=padding)
13
+
14
+
15
def dilated_conv_bn_act(in_channels, out_channels, act_fn, BatchNorm, dilation):
    """3x3 dilated conv (no bias, 'same' padding) + normalization + activation."""
    conv = nn.Conv2d(
        in_channels,
        out_channels,
        bias=False,
        kernel_size=3,
        stride=1,
        padding=dilation,
        dilation=dilation,
    )
    return nn.Sequential(conv, BatchNorm(out_channels), act_fn)
30
+
31
+
32
def dilated_conv(in_channels, out_channels, kernel_size, dilation, stride=1):
    """Single dilated conv; padding keeps spatial size at stride 1 for odd kernels."""
    padding = dilation * (kernel_size // 2)
    conv = nn.Conv2d(
        in_channels,
        out_channels,
        kernel_size=kernel_size,
        stride=stride,
        padding=padding,
        dilation=dilation,
    )
    # Wrapped in Sequential to match the shape of the other factory helpers.
    return nn.Sequential(conv)
44
+
45
+
46
class ResidualBlockWithDilation(nn.Module):
    """
    Residual block whose two convolutions are plain (possibly strided)
    convs when stride != 1 or the block is the top of a stage, and
    dilation-3 convs otherwise.
    """

    def __init__(
        self,
        in_channels,
        out_channels,
        BatchNorm,
        kernel_size,
        stride=1,
        downsample=None,
        is_activation=True,
        is_top=False,
    ):
        super(ResidualBlockWithDilation, self).__init__()
        self.stride = stride
        self.downsample = downsample
        # Kept for interface compatibility; not consulted in forward().
        self.is_activation = is_activation
        self.is_top = is_top

        use_plain_convs = self.stride != 1 or self.is_top
        if use_plain_convs:
            self.conv1 = conv3x3(in_channels, out_channels, kernel_size, self.stride)
            self.conv2 = conv3x3(out_channels, out_channels, kernel_size)
        else:
            self.conv1 = dilated_conv(in_channels, out_channels, kernel_size, dilation=3)
            self.conv2 = dilated_conv(out_channels, out_channels, kernel_size, dilation=3)

        self.bn1 = BatchNorm(out_channels)
        self.relu = nn.ReLU(inplace=True)
        self.bn2 = BatchNorm(out_channels)

    def forward(self, x):
        # Identity shortcut, or a projection when shapes must be matched.
        shortcut = x if self.downsample is None else self.downsample(x)

        out = self.relu(self.bn1(self.conv1(x)))
        out = self.bn2(self.conv2(out))

        out = out + shortcut
        return self.relu(out)
85
+
86
+
87
class ResnetStraight(nn.Module):
    """
    Backbone of three sequential residual stages (no classification head).

    The channel widths follow num_filter * map_num[i]; only the first three
    entries of `block_nums` / `stride` are consumed.
    """

    def __init__(
        self,
        num_filter,
        map_num,
        BatchNorm,
        block_nums=[3, 4, 6, 3],
        block=ResidualBlockWithDilation,
        kernel_size=5,
        stride=[1, 1, 2, 2],
    ):
        super(ResnetStraight, self).__init__()
        self.in_channels = num_filter * map_num[0]
        self.stride = stride
        self.relu = nn.ReLU(inplace=True)
        self.block_nums = block_nums
        self.kernel_size = kernel_size

        self.layer1 = self.blocklayer(
            block,
            num_filter * map_num[0],
            self.block_nums[0],
            BatchNorm,
            kernel_size=self.kernel_size,
            stride=self.stride[0],
        )
        self.layer2 = self.blocklayer(
            block,
            num_filter * map_num[1],
            self.block_nums[1],
            BatchNorm,
            kernel_size=self.kernel_size,
            stride=self.stride[1],
        )
        self.layer3 = self.blocklayer(
            block,
            num_filter * map_num[2],
            self.block_nums[2],
            BatchNorm,
            kernel_size=self.kernel_size,
            stride=self.stride[2],
        )

    def blocklayer(self, block, out_channels, block_nums, BatchNorm, kernel_size, stride=1):
        """Build one stage: a top block (with optional projection shortcut)
        followed by block_nums - 1 identity-shortcut blocks."""
        downsample = None
        if stride != 1 or self.in_channels != out_channels:
            # Projection shortcut so the residual addition shapes match.
            downsample = nn.Sequential(
                conv3x3(self.in_channels, out_channels, kernel_size=kernel_size, stride=stride),
                BatchNorm(out_channels),
            )

        layers = [
            block(
                self.in_channels,
                out_channels,
                BatchNorm,
                kernel_size,
                stride,
                downsample,
                is_top=True,
            )
        ]
        self.in_channels = out_channels
        layers.extend(
            block(
                out_channels,
                out_channels,
                BatchNorm,
                kernel_size,
                is_activation=True,
                is_top=False,
            )
            for _ in range(1, block_nums)
        )
        return nn.Sequential(*layers)

    def forward(self, x):
        out = self.layer1(x)
        out = self.layer2(out)
        out = self.layer3(out)
        return out
175
+
176
+
177
class UVDocnet(nn.Module):
    """
    UVDoc network: a strided conv head, a dilated-ResNet backbone, six
    parallel dilated "bridge" branches fused by a 1x1 conv, and two output
    heads regressing the 2D unwarping grid and the 3D grid.
    """

    def __init__(self, num_filter, kernel_size=5):
        super(UVDocnet, self).__init__()
        self.num_filter = num_filter
        self.in_channels = 3
        self.kernel_size = kernel_size
        self.stride = [1, 2, 2, 2]

        BatchNorm = nn.BatchNorm2d
        act_fn = nn.ReLU(inplace=True)
        map_num = [1, 2, 4, 8, 16]
        pad = self.kernel_size // 2
        head_ch = self.num_filter * map_num[0]

        # Two stride-2 convs: the input is downsampled 4x before the backbone.
        self.resnet_head = nn.Sequential(
            nn.Conv2d(self.in_channels, head_ch, bias=False, kernel_size=self.kernel_size, stride=2, padding=pad),
            BatchNorm(head_ch),
            act_fn,
            nn.Conv2d(head_ch, head_ch, bias=False, kernel_size=self.kernel_size, stride=2, padding=pad),
            BatchNorm(head_ch),
            act_fn,
        )

        self.resnet_down = ResnetStraight(
            self.num_filter,
            map_num,
            BatchNorm,
            block_nums=[3, 4, 6, 3],
            block=ResidualBlockWithDilation,
            kernel_size=self.kernel_size,
            stride=self.stride,
        )

        map_num_i = 2
        bridge_ch = self.num_filter * map_num[map_num_i]

        def make_bridge(dilations):
            # One dilated conv-bn-act per dilation, chained sequentially.
            stages = [dilated_conv_bn_act(bridge_ch, bridge_ch, act_fn, BatchNorm, dilation=d) for d in dilations]
            return nn.Sequential(*stages)

        # Six parallel branches with increasing receptive fields.
        self.bridge_1 = make_bridge([1])
        self.bridge_2 = make_bridge([2])
        self.bridge_3 = make_bridge([5])
        self.bridge_4 = make_bridge([8, 3, 2])
        self.bridge_5 = make_bridge([12, 7, 4])
        self.bridge_6 = make_bridge([18, 12, 6])

        # Fuse the concatenated branch outputs back to the backbone width.
        self.bridge_concat = nn.Sequential(
            nn.Conv2d(bridge_ch * 6, self.num_filter * map_num[2], bias=False, kernel_size=1, stride=1, padding=0),
            BatchNorm(self.num_filter * map_num[2]),
            act_fn,
        )

        self.out_point_positions2D = self._make_head(map_num, out_channels=2)
        self.out_point_positions3D = self._make_head(map_num, out_channels=3)

        self._initialize_weights()

    def _make_head(self, map_num, out_channels):
        """Output head: reflect-padded conv-bn-PReLU then a conv projecting
        to `out_channels` (2 for the 2D grid, 3 for the 3D grid)."""
        mid_ch = self.num_filter * map_num[0]
        pad = self.kernel_size // 2
        return nn.Sequential(
            nn.Conv2d(
                self.num_filter * map_num[2],
                mid_ch,
                bias=False,
                kernel_size=self.kernel_size,
                stride=1,
                padding=pad,
                padding_mode="reflect",
            ),
            nn.BatchNorm2d(mid_ch),
            nn.PReLU(),
            nn.Conv2d(
                mid_ch,
                out_channels,
                kernel_size=self.kernel_size,
                stride=1,
                padding=pad,
                padding_mode="reflect",
            ),
        )

    def _initialize_weights(self):
        # Xavier init with a small gain for every conv layer.
        for m in self.modules():
            if isinstance(m, nn.Conv2d):
                nn.init.xavier_normal_(m.weight, gain=0.2)
            if isinstance(m, nn.ConvTranspose2d):
                assert m.kernel_size[0] == m.kernel_size[1]
                nn.init.xavier_normal_(m.weight, gain=0.2)

    def forward(self, x):
        feats = self.resnet_down(self.resnet_head(x))

        branch_outputs = [
            self.bridge_1(feats),
            self.bridge_2(feats),
            self.bridge_3(feats),
            self.bridge_4(feats),
            self.bridge_5(feats),
            self.bridge_6(feats),
        ]
        fused = self.bridge_concat(torch.cat(branch_outputs, dim=1))

        out_point_positions2D = self.out_point_positions2D(fused)
        out_point_positions3D = self.out_point_positions3D(fused)

        return out_point_positions2D, out_point_positions3D
UVDoc_official/requirements_demo.txt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ numpy==1.23.4
2
+ opencv_python_headless==4.7.0.68
3
+ torch==1.13.0
UVDoc_official/requirements_eval.txt ADDED
@@ -0,0 +1,10 @@
 
 
 
 
 
 
 
 
 
 
 
1
+ hdf5storage==0.1.18
2
+ jiwer==3.0.1
3
+ numpy==1.23.4
4
+ opencv_python_headless==4.7.0.68
5
+ Pillow==9.4.0
6
+ pytesseract==0.3.10
7
+ python_Levenshtein
8
+ scikit-image
9
+ torch==1.13.0
10
+ tqdm==4.64.1
UVDoc_official/requirements_train.txt ADDED
@@ -0,0 +1,5 @@
 
 
 
 
 
 
1
+ albumentations==1.3.0
2
+ h5py==3.7.0
3
+ numpy==1.23.4
4
+ opencv_python_headless==4.7.0.68
5
+ torch==1.13.0
UVDoc_official/run_official_overfit_train_infer.sh ADDED
@@ -0,0 +1,101 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
#!/usr/bin/env bash
# Official-style UVDoc overfit: training uses the same UVDocDataset settings as
# verify_ckpt_val_pipeline.py (deterministic crop, no appearance/geo aug).
# Inference MUST use verify_ckpt_val_pipeline.py — not demo.py (full-image resize).
#
# Default OVERFIT_N=8 with BATCH_SIZE=8 so each batch is full (BatchNorm behaves
# closer to normal training). Set OVERFIT_N=1 for a single sample if you accept BN quirks.
#
# Env (optional):
#   PYTHON, UV_DOC_ROOT, LOGDIR, OUT_DIR, DEVICE, NUM_WORKERS
#   OVERFIT_N (default 8), BATCH_SIZE (default 8)
#   SKIP_PREPROCESS_CHECK=1, SKIP_TRAIN=1, CKPT=/path/to/ep_*_best_model.pkl
#   N_EPOCHS (default 10), N_EPOCHS_DECAY (default 10)
set -euo pipefail

# Resolve paths relative to this script's directory so the pipeline can run from anywhere.
OFFICIAL_ROOT="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
UV_ROOT="$(cd "${OFFICIAL_ROOT}/.." && pwd)"
PY="${PYTHON:-python3}"
UV="${UV_DOC_ROOT:-${UV_ROOT}/UVDoc_final}"
LOGDIR="${LOGDIR:-${UV_ROOT}/log_official_overfit_train_infer}"
OUT_DIR="${OUT_DIR:-${LOGDIR}/verify_infer}"
DEVICE="${DEVICE:-cuda:0}"
NUM_WORKERS="${NUM_WORKERS:-4}"
OVERFIT_N="${OVERFIT_N:-8}"
BATCH_SIZE="${BATCH_SIZE:-8}"
N_EPOCHS="${N_EPOCHS:-10}"
N_EPOCHS_DECAY="${N_EPOCHS_DECAY:-10}"

cd "${OFFICIAL_ROOT}"

# Fail fast if the UVDoc dataset is missing before spending time on training.
if [[ ! -d "${UV}/img" ]]; then
    echo "ERROR: UVDoc data not found: ${UV}/img" >&2
    exit 1
fi

echo "== UVDoc root: ${UV}"
echo "== Log dir: ${LOGDIR}"
echo "== Overfit N: ${OVERFIT_N} batch: ${BATCH_SIZE}"

if [[ "${SKIP_PREPROCESS_CHECK:-0}" != "1" ]]; then
    echo "== (1) Preprocess: train vs verify_ckpt dataset tensors"
    "${PY}" verify_uvdoc_train_infer_preprocess.py \
        --data_path_UVDoc "${UV}" \
        --overfit_n "${OVERFIT_N}" \
        --check_dataloader \
        --batch_size "${BATCH_SIZE}" \
        --num_workers 0
fi

if [[ "${SKIP_TRAIN:-0}" != "1" ]]; then
    echo "== (2) Train (official defaults: lr=2e-4, alpha=beta=5, gamma=1, ep_gamma_start=10)"
    mkdir -p "${LOGDIR}"
    "${PY}" train.py \
        --data_to_use uvdoc \
        --data_path_UVDoc "${UV}" \
        --overfit_n "${OVERFIT_N}" \
        --batch_size "${BATCH_SIZE}" \
        --n_epochs "${N_EPOCHS}" \
        --n_epochs_decay "${N_EPOCHS_DECAY}" \
        --lr 0.0002 \
        --alpha_w 5.0 \
        --beta_w 5.0 \
        --gamma_w 1.0 \
        --ep_gamma_start 10 \
        --num_workers "${NUM_WORKERS}" \
        --device "${DEVICE}" \
        --log_eval_mse_train \
        --logdir "${LOGDIR}"
fi

# Auto-discover the newest checkpoint from the latest params* run unless CKPT was given.
if [[ -n "${CKPT:-}" ]]; then
    :
else
    EXP_DIR="$(ls -td "${LOGDIR}"/params* 2>/dev/null | head -1 || true)"
    if [[ -z "${EXP_DIR}" ]]; then
        echo "ERROR: No params* under ${LOGDIR}; set CKPT=... or run training." >&2
        exit 1
    fi
    CKPT="$(ls -t "${EXP_DIR}"/ep_*_best_model.pkl 2>/dev/null | head -1 || true)"
    if [[ -z "${CKPT}" ]]; then
        echo "ERROR: No ep_*_best_model.pkl under ${EXP_DIR}" >&2
        exit 1
    fi
fi

echo "== (3) Infer (same UVDocDataset kwargs as train val): ${CKPT}"
rm -rf "${OUT_DIR}"
mkdir -p "${OUT_DIR}"
"${PY}" verify_ckpt_val_pipeline.py \
    --ckpt "${CKPT}" \
    --data_path_UVDoc "${UV}" \
    --overfit_n "${OVERFIT_N}" \
    --out_dir "${OUT_DIR}" \
    --max_save_images "${OVERFIT_N}" \
    --device "${DEVICE}"

echo "== Done"
echo " Metrics: ${OUT_DIR}/metrics.txt"
echo " Images: ${OUT_DIR}/*.png"
echo "Compare mean_mse in metrics.txt to train log Val MSE and train_mse_eval (eval mode)."
cat "${OUT_DIR}/metrics.txt"
UVDoc_official/train.py ADDED
@@ -0,0 +1,552 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import argparse
2
+ import gc
3
+ import os
4
+ import sys
5
+
6
+ import torch
7
+
8
+ try:
9
+ from tqdm import tqdm
10
+ except ImportError:
11
+ tqdm = None
12
+
13
+
14
+ import data_UVDoc
15
+ import model
16
+ import utils
17
+ from data_mixDataset import mixDataset
18
+
19
# Module-level running state shared with main_worker via `global` declarations.
train_mse = 0.0  # per-epoch accumulated (then averaged) unwarp MSE in train mode
losscount = 0  # number of optimization steps folded into train_mse this epoch
gamma_w = 0.0  # reconstruction-loss weight; 0 until epoch >= args.ep_gamma_start
22
+
23
+
24
def setup_data(args):
    """
    Returns train and validation dataloader.

    args.data_to_use selects the dataset mix:
      - "uvdoc": UVDoc only (held-out split, or a fixed overfit subset).
      - "doc3d": Doc3D only for training.
      - "both":  mixed Doc3D+UVDoc training (official setup).
    Validation for "doc3d"/"both" uses Doc3D, matching the upstream UVDoc repo.
    """
    UVDoc = data_UVDoc.UVDocDataset
    traindata = "train"
    valdata = "val"

    if args.data_to_use == "uvdoc":
        if getattr(args, "overfit_n", 0) and int(args.overfit_n) > 0:
            # Same preprocessing as val / verify_ckpt_val_pipeline.py (deterministic crop, no aug).
            t_UVDoc_data = UVDoc(
                data_path=args.data_path_UVDoc,
                appearance_augmentation=[],
                geometric_augmentations=[],
                overfit=True,
                max_samples=int(args.overfit_n),
                deterministic_crop=True,
                grid3d_stats_path=args.uvdoc_grid3d_stats,
            )
            v_UVDoc_data = UVDoc(
                data_path=args.data_path_UVDoc,
                appearance_augmentation=[],
                geometric_augmentations=[],
                overfit=True,
                max_samples=int(args.overfit_n),
                deterministic_crop=True,
                grid3d_stats_path=args.uvdoc_grid3d_stats,
            )
        else:
            t_UVDoc_data = UVDoc(
                data_path=args.data_path_UVDoc,
                appearance_augmentation=args.appearance_augmentation,
                geometric_augmentations=args.geometric_augmentationsUVDoc,
                split="train",
                val_ratio=args.uvdoc_val_ratio,
                split_seed=args.uvdoc_split_seed,
                split_mode=args.uvdoc_split_mode,
                grid3d_stats_path=args.uvdoc_grid3d_stats,
            )
            v_UVDoc_data = UVDoc(
                data_path=args.data_path_UVDoc,
                appearance_augmentation=[],
                geometric_augmentations=[],
                split="val",
                val_ratio=args.uvdoc_val_ratio,
                split_seed=args.uvdoc_split_seed,
                split_mode=args.uvdoc_split_mode,
                grid3d_stats_path=args.uvdoc_grid3d_stats,
            )
        trainloader = torch.utils.data.DataLoader(
            t_UVDoc_data, batch_size=args.batch_size, num_workers=args.num_workers, shuffle=True, pin_memory=True
        )
        valloader = torch.utils.data.DataLoader(
            v_UVDoc_data, batch_size=args.batch_size, num_workers=args.num_workers, shuffle=False, pin_memory=True
        )
        return trainloader, valloader

    # Imported lazily so uvdoc-only runs do not require the Doc3D dependencies.
    import data_doc3D

    doc3D = data_doc3D.doc3DDataset

    # Training data
    t_doc3D_data = doc3D(
        data_path=args.data_path_doc3D,
        split=traindata,
        appearance_augmentation=args.appearance_augmentation,
    )
    if args.data_to_use == "both":
        # Build the UVDoc half (and the mix) only when it is actually used, so a
        # doc3d-only run does not require UVDoc data on disk.
        t_UVDoc_data = UVDoc(
            data_path=args.data_path_UVDoc,
            appearance_augmentation=args.appearance_augmentation,
            geometric_augmentations=args.geometric_augmentationsUVDoc,
            grid3d_stats_path=args.uvdoc_grid3d_stats,
        )
        t_mix_data = mixDataset(t_doc3D_data, t_UVDoc_data)
        trainloader = torch.utils.data.DataLoader(
            t_mix_data, batch_size=args.batch_size, num_workers=args.num_workers, shuffle=True, pin_memory=True
        )
    elif args.data_to_use == "doc3d":
        trainloader = torch.utils.data.DataLoader(
            t_doc3D_data, batch_size=args.batch_size, num_workers=args.num_workers, shuffle=True, pin_memory=True
        )
    else:
        raise ValueError(f"data_to_use should be either doc3d, both, or uvdoc, provided {args.data_to_use}.")

    # Validation data (doc3D only) — matches upstream UVDoc repo
    v_doc3D_data = doc3D(data_path=args.data_path_doc3D, split=valdata, appearance_augmentation=[])
    valloader = torch.utils.data.DataLoader(
        v_doc3D_data, batch_size=args.batch_size, num_workers=args.num_workers, shuffle=True, pin_memory=True
    )

    return trainloader, valloader
117
+
118
+
119
def get_scheduler(optimizer, args, epoch_start):
    """Return a learning rate scheduler
    Parameters:
        optimizer -- the optimizer of the network
        args -- stores all the experiment flags
        epoch_start -- the epoch number we started/continued from
    We keep the same learning rate for the first <args.n_epochs> epochs
    and linearly decay the rate to zero over the next <args.n_epochs_decay> epochs.
    """

    def decay_factor(epoch):
        # Constant (factor 1.0) until n_epochs, then a linear ramp down to zero.
        overshoot = max(0, epoch + epoch_start - args.n_epochs)
        return 1.0 - overshoot / float(args.n_epochs_decay + 1)

    return torch.optim.lr_scheduler.LambdaLR(optimizer, lr_lambda=decay_factor)
136
+
137
+
138
def update_learning_rate(scheduler, optimizer):
    """Update learning rates; called at the end of every epoch"""
    previous_lr = optimizer.param_groups[0]["lr"]
    scheduler.step()
    current_lr = optimizer.param_groups[0]["lr"]
    print("learning rate update from %.7f -> %.7f" % (previous_lr, current_lr))
    return current_lr
145
+
146
+
147
def write_log_file(log_file_name, loss, epoch, lrate, phase):
    """Append one '<phase> LRate/Epoch/MSE' record to the experiment log file."""
    record = "\n{} LRate: {} Epoch: {} MSE: {:.5f} ".format(phase, lrate, epoch, loss)
    with open(log_file_name, "a") as log:
        log.write(record)
150
+
151
+
152
def write_train_loss_detail_log(log_file_name, epoch, lrate, avg_net, avg_g2, avg_g3, avg_rec, gamma_w):
    """Append per-epoch mean L1 losses (same definition as netLoss) to the experiment log."""
    record = (
        "\nTrainLoss LRate: {} Epoch: {} net: {:.5f} L1_g2d: {:.5f} L1_g3d: {:.5f} L1_recon: {:.5f} gamma_w: {:.5f} ".format(
            lrate, epoch, avg_net, avg_g2, avg_g3, avg_rec, gamma_w
        )
    )
    with open(log_file_name, "a") as log:
        log.write(record)
160
+
161
+
162
def main_worker(args):
    """Full training run: data setup, optional resume, per-epoch train / val / checkpointing."""
    # setup training data
    trainloader, valloader = setup_data(args)

    device = torch.device(args.device)
    UVDocnet = model.UVDocnet(num_filter=32, kernel_size=5)
    UVDocnet.to(device)

    # define loss functions
    criterionL1 = torch.nn.L1Loss()  # used for the optimized grid/reconstruction terms
    criterionMSE = torch.nn.MSELoss()  # used only as a monitoring metric (not backpropagated)

    # initialize optimizers
    optimizer = torch.optim.Adam(UVDocnet.parameters(), lr=args.lr, betas=(0.9, 0.999))

    global gamma_w
    epoch_start = 0

    if args.resume is not None:
        if os.path.isfile(args.resume):
            print("Loading model and optimizer from checkpoint '{}'".format(args.resume))
            checkpoint = torch.load(args.resume)

            UVDocnet.load_state_dict(checkpoint["model_state"])
            optimizer.load_state_dict(checkpoint["optimizer_state"])
            print("Loaded checkpoint '{}' (epoch {})".format(args.resume, checkpoint["epoch"]))
            epoch_start = checkpoint["epoch"]
            # Re-enable the reconstruction loss when resuming past its start epoch.
            if epoch_start >= args.ep_gamma_start:
                gamma_w = args.gamma_w
        else:
            print("No checkpoint found at '{}'".format(args.resume))

    # initialize learning rate schedulers
    scheduler = get_scheduler(optimizer, args, epoch_start)

    # Log file:
    if not os.path.exists(args.logdir):
        os.makedirs(args.logdir)

    # Experiment name encodes the hyperparameters so runs are self-describing on disk.
    experiment_name = (
        "params"
        + str(args.batch_size)
        + "_lr="
        + str(args.lr)
        + "_nepochs"
        + str(args.n_epochs)
        + "_nepochsdecay"
        + str(args.n_epochs_decay)
        + "_alpha"
        + str(args.alpha_w)
        + "_beta"
        + str(args.beta_w)
        + "_gamma="
        + str(args.gamma_w)
        + "_gammastartep"
        + str(args.ep_gamma_start)
        + "_data"
        + args.data_to_use
    )
    if args.resume:
        experiment_name = "RESUME" + experiment_name

    log_file_name = os.path.join(args.logdir, experiment_name + ".txt")
    if os.path.isfile(log_file_name):
        log_file = open(log_file_name, "a")
    else:
        log_file = open(log_file_name, "w+")

    log_file.write("\n--------------- " + experiment_name + " ---------------\n")
    log_file.close()

    exp_log_dir = os.path.join(args.logdir, experiment_name, "")
    if not os.path.exists(exp_log_dir):
        os.makedirs(exp_log_dir)

    global losscount
    global train_mse

    # Run training
    best_val_mse = float("inf")
    for epoch in range(epoch_start, args.n_epochs + args.n_epochs_decay + 1):
        print(f"\n----- Epoch {epoch} -----")
        # The reconstruction loss is switched on from ep_gamma_start onwards.
        if epoch >= args.ep_gamma_start:
            gamma_w = args.gamma_w
            print("epoch ", epoch, "gamma_w is now", gamma_w)

        train_mse = 0.0
        losscount = 0
        sum_l1_g2 = 0.0
        sum_l1_g3 = 0.0
        sum_l1_rec = 0.0
        sum_net = 0.0

        # Train
        UVDocnet.train()

        train_iter = trainloader
        if tqdm is not None and not args.no_tqdm:
            train_iter = tqdm(
                trainloader,
                desc=f"train epoch {epoch}",
                dynamic_ncols=True,
                mininterval=2.0,
                file=sys.stdout,
            )
        for batch in train_iter:
            # Unpack according to the dataset mix; "both" yields a pair of tuples.
            if args.data_to_use == "both":
                (
                    imgs_doc3D_,
                    imgs_unwarped_doc3D_,
                    grid2D_doc3D_,
                    grid3D_doc3D_,
                ) = batch[0]
                (
                    imgs_UVDoc_,
                    imgs_unwarped_UVDoc_,
                    grid2D_UVDoc_,
                    grid3D_UVDoc_,
                ) = batch[1]
            elif args.data_to_use == "uvdoc":
                (
                    imgs_UVDoc_,
                    imgs_unwarped_UVDoc_,
                    grid2D_UVDoc_,
                    grid3D_UVDoc_,
                ) = batch
            elif args.data_to_use == "doc3d":
                (
                    imgs_doc3D_,
                    imgs_unwarped_doc3D_,
                    grid2D_doc3D_,
                    grid3D_doc3D_,
                ) = batch

            # Train Doc3D step (official default; skipped for uvdoc-only)
            if args.data_to_use in ("both", "doc3d"):
                imgs_doc3D = imgs_doc3D_.to(device, non_blocking=True)
                unwarped_GT_doc3D = imgs_unwarped_doc3D_.to(device, non_blocking=True)
                grid2D_GT_doc3D = grid2D_doc3D_.to(device, non_blocking=True)
                grid3D_GT_doc3D = grid3D_doc3D_.to(device, non_blocking=True)

                grid2D_pred_doc3D, grid3D_pred_doc3D = UVDocnet(imgs_doc3D)
                unwarped_pred_doc3D = utils.bilinear_unwarping(imgs_doc3D, grid2D_pred_doc3D, utils.IMG_SIZE)

                optimizer.zero_grad(set_to_none=True)

                recon_loss = criterionL1(unwarped_pred_doc3D, unwarped_GT_doc3D)
                loss_grid2D = criterionL1(grid2D_pred_doc3D, grid2D_GT_doc3D)
                loss_grid3D = criterionL1(grid3D_pred_doc3D, grid3D_GT_doc3D)

                # Weighted sum: alpha * 2D-grid L1 + beta * 3D-grid L1 + gamma * recon L1.
                netLoss = args.alpha_w * loss_grid2D + args.beta_w * loss_grid3D + gamma_w * recon_loss
                sum_l1_g2 += float(loss_grid2D.detach())
                sum_l1_g3 += float(loss_grid3D.detach())
                sum_l1_rec += float(recon_loss.detach())
                sum_net += float(netLoss.detach())
                netLoss.backward()
                optimizer.step()

                tmp_mse = criterionMSE(unwarped_pred_doc3D, unwarped_GT_doc3D)
                train_mse += float(tmp_mse)
                losscount += 1

            # Train UVDoc step
            if args.data_to_use in ("both", "uvdoc"):
                imgs_UVDoc = imgs_UVDoc_.to(device, non_blocking=True)
                unwarped_GT_UVDoc = imgs_unwarped_UVDoc_.to(device, non_blocking=True)
                grid2D_GT_UVDoc = grid2D_UVDoc_.to(device, non_blocking=True)
                grid3D_GT_UVDoc = grid3D_UVDoc_.to(device, non_blocking=True)

                grid2D_pred_UVDoc, grid3D_pred_UVDoc = UVDocnet(imgs_UVDoc)
                unwarped_pred_UVDoc = utils.bilinear_unwarping(imgs_UVDoc, grid2D_pred_UVDoc, utils.IMG_SIZE)

                optimizer.zero_grad(set_to_none=True)

                recon_loss = criterionL1(unwarped_pred_UVDoc, unwarped_GT_UVDoc)
                loss_grid2D = criterionL1(grid2D_pred_UVDoc, grid2D_GT_UVDoc)
                loss_grid3D = criterionL1(grid3D_pred_UVDoc, grid3D_GT_UVDoc)

                netLoss = args.alpha_w * loss_grid2D + args.beta_w * loss_grid3D + gamma_w * recon_loss
                sum_l1_g2 += float(loss_grid2D.detach())
                sum_l1_g3 += float(loss_grid3D.detach())
                sum_l1_rec += float(recon_loss.detach())
                sum_net += float(netLoss.detach())
                netLoss.backward()
                optimizer.step()

                tmp_mse = criterionMSE(unwarped_pred_UVDoc, unwarped_GT_UVDoc)
                train_mse += float(tmp_mse)
                losscount += 1
            gc.collect()

        train_mse = train_mse / max(1, losscount)
        curr_lr = update_learning_rate(scheduler, optimizer)
        write_log_file(log_file_name, train_mse, epoch + 1, curr_lr, "Train")
        if losscount > 0:
            avg_g2 = sum_l1_g2 / losscount
            avg_g3 = sum_l1_g3 / losscount
            avg_rec = sum_l1_rec / losscount
            avg_net = sum_net / losscount
            write_train_loss_detail_log(
                log_file_name, epoch + 1, curr_lr, avg_net, avg_g2, avg_g3, avg_rec, gamma_w
            )
            print(
                f"Epoch {epoch} train L1 | net={avg_net:.5f} g2d={avg_g2:.5f} g3d={avg_g3:.5f} recon={avg_rec:.5f} gamma_w={gamma_w}",
                flush=True,
            )

        # Evaluation
        train_mse_eval = None
        UVDocnet.eval()

        with torch.no_grad():
            mse_loss_val = 0.0
            val_iter = valloader
            if tqdm is not None and not args.no_tqdm:
                val_iter = tqdm(
                    valloader,
                    desc=f"val epoch {epoch}",
                    dynamic_ncols=True,
                    mininterval=2.0,
                    file=sys.stdout,
                    leave=False,
                )
            for imgs_val_, imgs_unwarped_val_, _, _ in val_iter:
                imgs_val = imgs_val_.to(device)
                unwarped_GT_val = imgs_unwarped_val_.to(device)

                grid2D_pred_val, grid3D_pred_val = UVDocnet(imgs_val)
                unwarped_pred_val = utils.bilinear_unwarping(imgs_val, grid2D_pred_val, utils.IMG_SIZE)

                loss_img_val = criterionMSE(unwarped_pred_val, unwarped_GT_val)
                mse_loss_val += float(loss_img_val)

            denom = max(1, len(valloader))
            val_mse = mse_loss_val / denom
            write_log_file(log_file_name, val_mse, epoch + 1, curr_lr, "Val")

            # Optional: unwarp MSE on the train loader in eval() mode, comparable to
            # verify_ckpt_val_pipeline.py (train-mode train_mse is not comparable).
            if getattr(args, "log_eval_mse_train", False) and args.data_to_use == "uvdoc":
                mse_tr = 0.0
                denom_tr = max(1, len(trainloader))
                for imgs_tr_, imgs_uw_tr_, _, _ in trainloader:
                    imgs_tr = imgs_tr_.to(device)
                    uw_gt_tr = imgs_uw_tr_.to(device)
                    g2_tr, _ = UVDocnet(imgs_tr)
                    pred_tr = utils.bilinear_unwarping(imgs_tr, g2_tr, utils.IMG_SIZE)
                    mse_tr += float(criterionMSE(pred_tr, uw_gt_tr))
                train_mse_eval = mse_tr / denom_tr
                write_log_file(log_file_name, train_mse_eval, epoch + 1, curr_lr, "TrainEval")

        if train_mse_eval is not None:
            print(
                f"Epoch {epoch} summary | train_mse={train_mse:.5f} val_mse={val_mse:.5f} "
                f"train_mse_eval={train_mse_eval:.5f} lr={curr_lr:.7f}",
                flush=True,
            )
        else:
            print(
                f"Epoch {epoch} summary | train_mse={train_mse:.5f} val_mse={val_mse:.5f} lr={curr_lr:.7f}",
                flush=True,
            )

        # save best models (and always save at the final epoch)
        if val_mse < best_val_mse or epoch == args.n_epochs + args.n_epochs_decay:
            best_val_mse = val_mse
            state = {
                "epoch": epoch + 1,
                "model_state": UVDocnet.state_dict(),
                "optimizer_state": optimizer.state_dict(),
            }
            model_path = exp_log_dir + f"ep_{epoch + 1}_{val_mse:.5f}_{train_mse:.5f}_best_model.pkl"
            torch.save(state, model_path)
433
+
434
+
435
if __name__ == "__main__":
    # CLI entry point: every hyperparameter is exposed as a flag; defaults match
    # the official UVDoc training setup (lr=2e-4, alpha=beta=5, gamma=1).
    parser = argparse.ArgumentParser(description="Hyperparams")

    parser.add_argument(
        "--data_path_doc3D", nargs="?", type=str, default="./data/doc3D/", help="Data path to load Doc3D data."
    )
    parser.add_argument(
        "--data_path_UVDoc", nargs="?", type=str, default="./data/UVDoc/", help="Data path to load UVDoc data."
    )
    parser.add_argument(
        "--device", type=str, default="cuda:0", help="Torch device, e.g. cuda:0 or cpu."
    )
    parser.add_argument(
        "--uvdoc_val_ratio",
        type=float,
        default=0.05,
        help="Hold-out ratio for UVDoc train/val when data_to_use=uvdoc.",
    )
    parser.add_argument(
        "--uvdoc_split_seed",
        type=int,
        default=42,
        help="Random seed for UVDoc train/val split when data_to_use=uvdoc.",
    )
    parser.add_argument(
        "--uvdoc_split_mode",
        type=str,
        default="sample",
        choices=["sample", "geom"],
        help="UVDoc train/val split: random by image id (sample) or by geom_name so no geometry appears in both splits (geom).",
    )
    parser.add_argument(
        "--uvdoc_grid3d_stats",
        type=str,
        default=None,
        help="Optional JSON from compute_uvdoc_grid3d_stats.py; overrides built-in grid3d min/max normalization.",
    )
    parser.add_argument(
        "--overfit_n",
        type=int,
        default=0,
        help="If >0 (with data_to_use=uvdoc), use only the first N sorted samples for BOTH train and val to sanity-check fitting.",
    )
    parser.add_argument(
        "--data_to_use",
        type=str,
        default="both",
        choices=["both", "doc3d", "uvdoc"],
        help="Dataset: both (Doc3D+UVDoc, official), doc3d only, or uvdoc (UVDoc only; extension for local training without Doc3D).",
    )
    parser.add_argument("--batch_size", nargs="?", type=int, default=8, help="Batch size.")
    parser.add_argument(
        "--n_epochs",
        nargs="?",
        type=int,
        default=10,
        help="Number of epochs with initial (constant) learning rate.",
    )
    parser.add_argument(
        "--n_epochs_decay",
        nargs="?",
        type=int,
        default=10,
        help="Number of epochs to linearly decay learning rate to zero.",
    )
    parser.add_argument("--lr", nargs="?", type=float, default=0.0002, help="Initial learning rate.")
    parser.add_argument("--alpha_w", nargs="?", type=float, default=5.0, help="Weight for the 2D grid L1 loss.")
    parser.add_argument("--beta_w", nargs="?", type=float, default=5.0, help="Weight for the 3D grid L1 loss.")
    parser.add_argument(
        "--gamma_w", nargs="?", type=float, default=1.0, help="Weight for the image reconstruction loss."
    )
    parser.add_argument(
        "--ep_gamma_start",
        nargs="?",
        type=int,
        default=10,
        help="Epoch from which to start using image reconstruction loss.",
    )
    parser.add_argument(
        "--resume",
        nargs="?",
        type=str,
        default=None,
        help="Path to previous saved model to restart from.",
    )
    parser.add_argument("--logdir", nargs="?", type=str, default="./log/default", help="Path to store the logs.")
    parser.add_argument(
        "-a",
        "--appearance_augmentation",
        nargs="*",
        type=str,
        default=["visual", "noise", "color"],
        choices=["shadow", "blur", "visual", "noise", "color"],
        help="Appearance augmentations to use.",
    )
    parser.add_argument(
        "-gUVDoc",
        "--geometric_augmentationsUVDoc",
        nargs="*",
        type=str,
        default=["rotate"],
        choices=["rotate", "flip", "perspective"],
        help="Geometric augmentations to use for the UVDoc dataset.",
    )
    parser.add_argument("--num_workers", type=int, default=8, help="Number of workers to use for the dataloaders.")
    parser.add_argument(
        "--no_tqdm",
        action="store_true",
        help="Disable tqdm progress bars (use plain loops + epoch summary prints).",
    )
    parser.add_argument(
        "--log_eval_mse_train",
        action="store_true",
        help="After val, also MSE(unwarp) in eval() on UVDoc train loader; matches verify_ckpt_val_pipeline.py (not train-mode train_mse).",
    )

    args = parser.parse_args()
    main_worker(args)
UVDoc_official/utils.py ADDED
@@ -0,0 +1,66 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import os
2
+
3
+ import torch
4
+ import torch.nn.functional as F
5
+
6
+ from model import UVDocnet
7
+
8
# Network input / unwarp resolution as [width, height] (bilinear_unwarping takes [w, h]).
IMG_SIZE = [488, 712]
# Grid resolution; not referenced in this file — presumably the predicted 2D/3D
# grid dimensions. TODO confirm axis order at the call sites.
GRID_SIZE = [45, 31]
10
+
11
+
12
def load_model(ckpt_path):
    """
    Load UVDocnet model.

    Args:
        ckpt_path: path to a checkpoint dict containing a "model_state" entry.

    Returns:
        UVDocnet with the checkpoint weights loaded (on CPU; move it with .to()).
    """
    model = UVDocnet(num_filter=32, kernel_size=5)
    # map_location="cpu" lets GPU-trained checkpoints load on CPU-only hosts;
    # callers move the model to their target device afterwards.
    ckpt = torch.load(ckpt_path, map_location="cpu")
    model.load_state_dict(ckpt["model_state"])
    return model
20
+
21
+
22
def get_version():
    """
    Returns the version of the various packages used for evaluation.

    Versions are scraped from `pip list`; a package that is not installed is
    reported as "unknown" instead of raising IndexError on the empty output.
    """
    import pytesseract

    def _pip_version(name):
        # `pip list | grep <name>` emits "<pkg> <version>"; guard the empty case.
        tokens = os.popen("pip list | grep " + name).read().split()
        return tokens[-1] if tokens else "unknown"

    return {
        "tesseract": str(pytesseract.get_tesseract_version()),
        # Label fixed from the original "pyesseract" typo; only written to the
        # human-readable results file.
        "pytesseract": _pip_version("pytesseract"),
        "Levenshtein": _pip_version("Levenshtein"),
        "jiwer": _pip_version("jiwer"),
        "matlabengineforpython": _pip_version("matlab"),
    }
35
+
36
+
37
def bilinear_unwarping(warped_img, point_positions, img_size):
    """
    Utility function that unwarps an image.
    Unwarp warped_img based on the 2D grid point_positions with a size img_size.
    Args:
        warped_img : torch.Tensor of shape BxCxHxW (dtype float)
        point_positions: torch.Tensor of shape Bx2xGhxGw (dtype float)
        img_size: tuple of int [w, h]
    """
    # Densify the coarse grid to the full output resolution (note the [h, w] order).
    dense_grid = F.interpolate(
        point_positions, size=(img_size[1], img_size[0]), mode="bilinear", align_corners=True
    )
    # grid_sample expects the sampling coordinates channel-last: BxHxWx2.
    sampling_grid = dense_grid.permute(0, 2, 3, 1)
    return F.grid_sample(warped_img, sampling_grid, align_corners=True)
52
+
53
+
54
def bilinear_unwarping_from_numpy(warped_img, point_positions, img_size):
    """
    Utility function that unwarps an image.
    Unwarp warped_img based on the 2D grid point_positions with a size img_size.
    Accept numpy arrays as input.
    """
    # HWC numpy -> 1xCxHxW float tensors expected by bilinear_unwarping.
    img_tensor = torch.from_numpy(warped_img.transpose(2, 0, 1)).float().unsqueeze(0)
    grid_tensor = torch.from_numpy(point_positions.transpose(2, 0, 1)).float().unsqueeze(0)

    unwarped = bilinear_unwarping(img_tensor, grid_tensor, img_size)

    # Back to HWC numpy for the caller.
    return unwarped[0].numpy().transpose(1, 2, 0)
UVDoc_official/uvdocBenchmark_eval.py ADDED
@@ -0,0 +1,129 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import argparse
2
+ import json
3
+ import multiprocessing as mp
4
+ import os
5
+
6
+ from utils import get_version
7
+
8
N_LINES = 25  # number of stripe lines passed to compute_line_metric for the line metric
9
+
10
+
11
def visual_metrics_process(queue, uvdoc_path, preds_path, verbose):
    """
    Subprocess function that computes visual metrics (MS-SSIM, LD, and AD) based on a matlab script.
    """
    # Imported here so the parent process does not need the MATLAB engine.
    import matlab.engine

    engine = matlab.engine.start_matlab()
    engine.cd(r"./eval/eval_code/", nargout=0)
    mean_ms, mean_ad = engine.evalScriptUVDoc(uvdoc_path, preds_path, verbose, nargout=2)
    queue.put({"ms": mean_ms, "ad": mean_ad})
22
+
23
+
24
def ocr_process(queue, uvdoc_path, preds_path):
    """
    Subprocess function that computes OCR metrics (CER and ED).
    """
    from eval.ocr_eval.ocr_eval import OCR_eval_UVDoc

    cer_mean, ed_mean, per_sample_results = OCR_eval_UVDoc(uvdoc_path, preds_path)
    # Persist the per-sample OCR results next to the predictions.
    with open(os.path.join(preds_path, "ocr_res.json"), "w") as handle:
        json.dump(per_sample_results, handle)
    queue.put({"cer": cer_mean, "ed": ed_mean})
34
+
35
+
36
def new_line_metric_process(queue, uvdoc_path, preds_path, n_lines):
    """
    Subprocess function that computes the new line metrics on the UVDoc benchmark.
    """
    from uvdocBenchmark_metric import compute_line_metric

    horizontal, vertical = compute_line_metric(uvdoc_path, preds_path, n_lines)
    queue.put({"hor_line": horizontal, "ver_line": vertical})
44
+
45
+
46
def compute_metrics(uvdoc_path, pred_path, pred_type, verbose=False):
    """
    Compute and save all metrics.

    Spawns three subprocesses (MATLAB visual metrics, line metrics, OCR); each
    puts exactly one result dict on the shared queue, which is merged and
    written to resUVDoc.txt.
    """
    if not pred_path.endswith("/"):
        pred_path += "/"
    q = mp.Queue()

    # Create process to compute MS-SSIM, LD, AD
    p1 = mp.Process(
        target=visual_metrics_process,
        args=(q, os.path.join(uvdoc_path, "texture_sample"), os.path.join(pred_path, pred_type), verbose),
    )
    p1.start()

    # Create process to compute new line metrics
    p2 = mp.Process(
        target=new_line_metric_process,
        args=(q, uvdoc_path, os.path.join(pred_path, "bm"), N_LINES),
    )
    p2.start()

    # Create process to compute OCR metrics
    p3 = mp.Process(
        target=ocr_process, args=(q, os.path.join(uvdoc_path, "texture_sample"), os.path.join(pred_path, pred_type))
    )
    p3.start()

    # Get results: drain one dict per worker with blocking get() BEFORE joining.
    # Queue.qsize() is unreliable (NotImplementedError on macOS), and joining a
    # process before its queued data is consumed can deadlock.
    res = {}
    for _ in range(3):
        ret = q.get()
        for k, v in ret.items():
            res[k] = v

    p1.join()
    p2.join()
    p3.join()

    # Print and saves results
    print("--- Results ---")
    print(f" Mean MS-SSIM : {res['ms']}")
    print(f" Mean AD : {res['ad']}")
    print(f" Mean CER : {res['cer']}")
    print(f" Mean ED : {res['ed']}")
    print(f" Hor Line : {res['hor_line']}")
    print(f" Ver Line : {res['ver_line']}")

    with open(os.path.join(pred_path, pred_type, "resUVDoc.txt"), "w") as f:
        f.write(f"Mean MS-SSIM : {res['ms']}\n")
        f.write(f"Mean AD : {res['ad']}\n")
        f.write(f"Mean CER : {res['cer']}\n")
        f.write(f"Mean ED : {res['ed']}\n")
        f.write(f"Hor Line : {res['hor_line']}\n")
        f.write(f"Ver Line : {res['ver_line']}\n")

        f.write("\n--- Module Version ---\n")
        for module, version in get_version().items():
            f.write(f"{module:25s}: {version}\n")
106
+
107
if __name__ == "__main__":
    # CLI entry point for the UVDoc benchmark evaluation; paths are made
    # absolute before spawning the metric subprocesses.
    parser = argparse.ArgumentParser()

    parser.add_argument(
        "--uvdoc-path", type=str, default="./data/UVDoc_benchmark/", help="Path to the uvdoc benchmark dataset"
    )
    parser.add_argument("--pred-path", type=str, help="Path to the UVDoc benchmark predictions. Need to be absolute.")
    parser.add_argument(
        "--pred-type",
        type=str,
        default="uwp_texture",
        choices=["uwp_texture", "uwp_img"],
        help="Which type of prediction to compare. Either the unwarped textures or the unwarped litted images.",
    )
    parser.add_argument("-v", "--verbose", action="store_true")
    args = parser.parse_args()

    compute_metrics(
        uvdoc_path=os.path.abspath(args.uvdoc_path),
        pred_path=os.path.abspath(args.pred_path),
        pred_type=args.pred_type,
        verbose=args.verbose,
    )
UVDoc_official/uvdocBenchmark_metric.py ADDED
@@ -0,0 +1,152 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import json
2
+ import os
3
+ from os.path import join as pjoin
4
+
5
+ import hdf5storage as h5
6
+ import numpy as np
7
+ import torch
8
+ import torch.nn.functional as F
9
+ from skimage.morphology import binary_erosion
10
+ from tqdm import tqdm
11
+
12
+ from utils import bilinear_unwarping_from_numpy
13
+
14
+ WIDTH = 1000
15
+ HEIGHT = 1000
16
+
17
+
18
def create_vertical_stripe_texture(width, height, stripe_width=1, position=0):
    """
    Return a white (height, width, 3) uint8 image with a black vertical stripe.

    The stripe covers columns [position, position + stripe_width).
    """
    canvas = np.full((height, width, 3), 255, dtype=np.uint8)
    canvas[:, position : position + stripe_width] = 0
    return canvas
25
+
26
+
27
def create_horizontal_stripe_texture(width, height, stripe_width=1, position=0):
    """
    Return a white (height, width, 3) uint8 image with a black horizontal stripe.

    The stripe covers rows [position, position + stripe_width).
    """
    canvas = np.full((height, width, 3), 255, dtype=np.uint8)
    canvas[position : position + stripe_width, :] = 0
    return canvas
34
+
35
+
36
def warp_texture(texture, uvmap):
    """
    Warp an input texture based on the provided uvmap.

    texture: (H, W, 3) uint8 image.
    uvmap: (h, w, 2) UV coordinates in [0, 1]; rescaled to grid_sample's [-1, 1].
    Returns a (h, w, 3) uint8 warped image with background painted white.
    """
    # Warp the texture based on the uv
    torch_texture_unwarp = torch.from_numpy(np.expand_dims(texture.transpose(2, 0, 1), axis=0)).float()
    uvmap_torch = torch.from_numpy(np.expand_dims(uvmap * 2 - 1, axis=0)).float()
    warped_texture = F.grid_sample(torch_texture_unwarp, uvmap_torch, align_corners=False)
    warped_texture = np.clip(warped_texture[0].numpy().transpose(1, 2, 0), 0, 255) / 255

    # Postprocessing to have nicer results: pixels that sampled exactly 0.5 on
    # all channels are treated as outside the document, the resulting mask is
    # eroded by one pixel, and everything outside it is painted white.
    # NOTE(review): the 0.5 sentinel presumably matches how the benchmark's
    # uvmap encodes background — confirm against the uvmap generation code.
    grey = np.all(warped_texture == 0.5, axis=-1)
    warped_texture[grey] = np.nan
    mask = 1 - np.all(np.isnan(warped_texture), axis=-1).astype(int)
    mask_small = binary_erosion(mask).astype(int)
    mask_small = np.expand_dims(mask_small, axis=-1)
    warped_texture[np.repeat(~mask_small.astype(bool), 3, axis=-1)] = 1
    warped_texture = (warped_texture * 255).astype(np.uint8)

    return warped_texture
56
+
57
+
58
def compute_metric_single_line(uvmap, bm, pos, direction="horizontal"):
    """
    Compute the line metric for a single line.
    args:
        uvmap: uvmap of the document, shape (height, width, 2)
        bm: predicted backward mapping, shape (height, width, 2)
        pos: position of the line to compute the metric
        direction: direction of the line to compute the metric (horizontal or vertical)
    Returns the std of the unwarped line's coordinates along the axis that
    should be constant (0 for a perfectly straight line), or NaN when the line
    disappears entirely.
    """
    # Create the original straight line
    if direction == "horizontal":
        stripe = create_horizontal_stripe_texture(WIDTH, HEIGHT, stripe_width=1, position=pos)
    elif direction == "vertical":
        stripe = create_vertical_stripe_texture(WIDTH, HEIGHT, stripe_width=1, position=pos)
    else:
        raise ValueError("Direction must be horizontal or vertical")

    # Warp the stripe according to the ground truth uvmap and unwarp it according to the predicted bm
    warped_stripe = warp_texture(stripe, uvmap)
    unwarped_stripe = bilinear_unwarping_from_numpy(warped_stripe.astype(float) / 255.0, bm, (WIDTH, HEIGHT))

    # Binarize the result (one channel suffices: the stripe image is grayscale)
    THRESH = 0.5
    unwarped_stripe = unwarped_stripe[:, :, 0]
    unwarped_stripe[unwarped_stripe < THRESH] = 0
    unwarped_stripe[unwarped_stripe >= THRESH] = 1

    # Find the black pixels
    xs, ys = np.where(unwarped_stripe == 0)
    if len(xs) == 0 or len(ys) == 0:
        # No black pixels in the line, this means that the backward mapping is pretty bad
        return np.nan

    # Compute the metric: spread of row indices for a horizontal line,
    # spread of column indices for a vertical one.
    if direction == "horizontal":
        return np.std(xs)
    elif direction == "vertical":
        return np.std(ys)
96
+
97
+
98
def compute_sample_line_metric(uvdoc_path, pred_path, sample, n_lines):
    """
    Compute all lines metric for a given sample.

    args:
        uvdoc_path: root of the UVDoc benchmark dataset
        pred_path: directory containing the predicted backward mappings
        sample: sample id (file stem)
        n_lines: number of horizontal/vertical probe lines
    Returns (mean horizontal std, mean vertical std), ignoring NaN lines.
    """
    # Load ground truth UVmap once. Fix: the original reloaded the uvmap from
    # disk on every loop iteration, which was redundant I/O — neither
    # warp_texture nor compute_metric_single_line mutates the array.
    metadata_path = pjoin(uvdoc_path, "metadata_sample", f"{sample}.json")
    with open(metadata_path, "r") as f:
        metadata = json.load(f)
    uvmap_path = pjoin(uvdoc_path, "uvmap", f"{metadata['geom_name']}.mat")
    uvmap = h5.loadmat(uvmap_path)["uv"]

    # Load predicted backward mapping
    bm_path = pjoin(pred_path, f"{sample}.mat")
    bm = h5.loadmat(bm_path)["bm"]

    # Compute the metric for evenly spaced probe lines away from the borders
    stds_hor = []
    stds_ver = []
    for pos in np.linspace(50, 950, n_lines, dtype=int):
        stds_hor.append(compute_metric_single_line(uvmap, bm, pos, direction="horizontal"))
        stds_ver.append(compute_metric_single_line(uvmap, bm, pos, direction="vertical"))

    return np.nanmean(stds_hor), np.nanmean(stds_ver)
122
+
123
+
124
def compute_line_metric(uvdoc_path, pred_path, n_lines=25):
    """
    Compute the line metric over the whole UVDoc dataset.

    Writes per-sample results to line_metric.json and the dataset means to
    line_metric_mean.json (inside pred_path), then returns
    (mean horizontal std, mean vertical std).
    """
    # Find all samples (image file names without their 4-char ".png" extension)
    all_samples = sorted([x[:-4] for x in os.listdir(pjoin(uvdoc_path, "img"))])

    # Compute the metric for each sample
    lines = []
    cols = []
    for sample in tqdm(all_samples):
        hor, ver = compute_sample_line_metric(uvdoc_path, pred_path, sample, n_lines)
        lines.append(hor)
        cols.append(ver)

    # Saves all results including individual ones
    with open(os.path.join(pred_path, "line_metric.json"), "w") as f:
        json.dump(
            {sample: {"hor": lines[i], "ver": cols[i]} for i, sample in enumerate(all_samples)},
            f,
        )

    with open(os.path.join(pred_path, "line_metric_mean.json"), "w") as f:
        json.dump(
            {"hor": np.mean(lines), "ver": np.mean(cols)},
            f,
        )

    # NOTE: np.mean (not nanmean) — a sample whose every line vanished
    # propagates NaN into the dataset mean, which makes the failure visible.
    return np.mean(lines), np.mean(cols)
UVDoc_official/uvdocBenchmark_pred.py ADDED
@@ -0,0 +1,131 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import argparse
2
+ import os
3
+
4
+ import cv2
5
+ import hdf5storage as h5
6
+ import numpy as np
7
+ import torch
8
+ from tqdm import tqdm
9
+
10
+ from utils import IMG_SIZE, bilinear_unwarping, load_model
11
+
12
+
13
class UVDocBenchmarkLoader(torch.utils.data.Dataset):
    """
    Torch dataset class for the UVDoc benchmark dataset.

    Yields (CxHxW float32 RGB tensor in [0, 1], image file name).
    """

    def __init__(
        self,
        data_path,
        img_size=(488, 712),
    ):
        # img_size is (width, height): it is passed straight to cv2.resize,
        # which expects its target size as (W, H).
        self.dataroot = data_path
        self.im_list = os.listdir(os.path.join(self.dataroot, "img"))
        self.img_size = img_size

    def __len__(self):
        return len(self.im_list)

    def __getitem__(self, index):
        im_name = self.im_list[index]
        img_path = os.path.join(self.dataroot, "img", im_name)
        img_RGB = cv2.cvtColor(cv2.imread(img_path), cv2.COLOR_BGR2RGB).astype(np.float32) / 255.0
        img_RGB = torch.from_numpy(cv2.resize(img_RGB, self.img_size).transpose(2, 0, 1))
        return img_RGB, im_name
36
+
37
+
38
def infer_uvdoc(model, dataloader, device, save_path):
    """
    Unwarp all images in the UVDoc benchmark and save them, along with the mappings.

    Assumes the dataloader has batch_size=1: only im_names[0] /
    point_positions2D[0] are used per batch.
    Writes uwp_img/*.png, uwp_texture/*.png and bm/*.mat under save_path.
    """
    model.eval()

    os.makedirs(os.path.join(save_path, "uwp_img"), exist_ok=True)
    os.makedirs(os.path.join(save_path, "bm"), exist_ok=True)
    os.makedirs(os.path.join(save_path, "uwp_texture"), exist_ok=True)

    for img_RGB, im_names in tqdm(dataloader):
        # Inference
        img_RGB = img_RGB.to(device)
        point_positions2D, _ = model(img_RGB)

        # Warped image need to be re-open to get full resolution (downsampled in data loader)
        warped = cv2.imread(os.path.join(dataloader.dataset.dataroot, "img", im_names[0]))
        warped = cv2.cvtColor(warped, cv2.COLOR_BGR2RGB)
        warped = torch.from_numpy(warped.transpose(2, 0, 1) / 255.0).float()
        size = warped.shape[1:][::-1]  # (W, H) for bilinear_unwarping

        # Unwarping at the full original resolution
        unwarped = bilinear_unwarping(
            warped_img=torch.unsqueeze(warped, dim=0).to(device),
            point_positions=torch.unsqueeze(point_positions2D[0], dim=0),
            img_size=tuple(size),
        )
        unwarped = (unwarped[0].detach().cpu().numpy().transpose(1, 2, 0) * 255).astype(np.uint8)
        unwarped_BGR = cv2.cvtColor(unwarped, cv2.COLOR_RGB2BGR)

        cv2.imwrite(
            os.path.join(save_path, "uwp_img", im_names[0].split(" ")[0].split(".")[0] + ".png"),
            unwarped_BGR,
        )

        # Unwarp and save the texture
        warp_texture = cv2.imread(os.path.join(dataloader.dataset.dataroot, "warped_textures", im_names[0]))
        warp_texture = cv2.cvtColor(warp_texture, cv2.COLOR_BGR2RGB)
        warp_texture = torch.from_numpy(warp_texture.transpose(2, 0, 1) / 255.0).float()
        size = warp_texture.shape[1:][::-1]

        unwarped_texture = bilinear_unwarping(
            warped_img=torch.unsqueeze(warp_texture, dim=0).to(device),
            point_positions=torch.unsqueeze(point_positions2D[0], dim=0),
            img_size=tuple(size),
        )
        unwarped_texture = (unwarped_texture[0].detach().cpu().numpy().transpose(1, 2, 0) * 255).astype(np.uint8)
        unwarped_texture_BGR = cv2.cvtColor(unwarped_texture, cv2.COLOR_RGB2BGR)

        cv2.imwrite(
            os.path.join(save_path, "uwp_texture", im_names[0].split(" ")[0].split(".")[0] + ".png"),
            unwarped_texture_BGR,
        )

        # Save Backward Map as HxWx2 for the metric scripts
        h5.savemat(
            os.path.join(save_path, "bm", im_names[0].split(" ")[0].split(".")[0] + ".mat"),
            {"bm": point_positions2D[0].detach().cpu().numpy().transpose(1, 2, 0)},
        )
97
+
98
+
99
def create_uvdoc_results(ckpt_path, uvdoc_path, img_size):
    """
    Create results for the UVDoc benchmark.

    Loads the checkpoint, runs inference over the benchmark images and writes
    the unwarped outputs next to the checkpoint, returning that directory.
    """
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

    # Load model, create dataset and save directory
    model = load_model(ckpt_path)
    model.to(device)

    dataset = UVDocBenchmarkLoader(data_path=uvdoc_path, img_size=img_size)
    dataloader = torch.utils.data.DataLoader(dataset, batch_size=1, shuffle=False, drop_last=False)

    # os.path.dirname is more robust than splitting on "/" (Windows paths,
    # bare filenames) and yields the same result for POSIX paths.
    save_path = os.path.join(os.path.dirname(ckpt_path), "output_uvdoc")
    os.makedirs(save_path, exist_ok=True)
    print(f" Results will be saved at {save_path}", flush=True)

    # Infer results. Bug fix: the hard-coded "cuda:0" previously passed here
    # crashed on CPU-only machines despite the CPU fallback selected above.
    infer_uvdoc(model, dataloader, device, save_path)
    return save_path
119
+
120
+
121
if __name__ == "__main__":
    parser = argparse.ArgumentParser()
    parser.add_argument(
        "--ckpt-path", type=str, default="./model/best_model.pkl", help="Path to the model weights as pkl."
    )
    parser.add_argument(
        "--uvdoc-path", type=str, default="./data/UVDoc_benchmark/", help="Path to the UVDocBenchmark dataset."
    )
    args = parser.parse_args()

    # IMG_SIZE comes from utils; it must match the resolution the model
    # expects at inference time.
    create_uvdoc_results(args.ckpt_path, os.path.abspath(args.uvdoc_path), IMG_SIZE)
UVDoc_official/verify_ckpt_val_pipeline.py ADDED
@@ -0,0 +1,153 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #!/usr/bin/env python3
2
+ """
3
+ Run inference with the *same* preprocessing as train.py validation (UVDocDataset).
4
+
5
+ Official demo.py resizes the full image only; training uses tight crop + resize.
6
+ Use this script to check train/val vs inference consistency on UVDoc.
7
+
8
+ Mean MSE printed at the end matches train.py val when batch_size divides val set evenly
9
+ (mean of per-batch MSE with default MSELoss reduction='mean' equals mean per-image MSE here).
10
+ """
11
+ import argparse
12
+ import os
13
+
14
+ import cv2
15
+ import numpy as np
16
+ import torch
17
+ import torch.nn as nn
18
+
19
+ import data_UVDoc
20
+ import utils
21
+ from model import UVDocnet
22
+
23
+
24
def main():
    """Evaluate a checkpoint with the exact val-time preprocessing of train.py.

    Writes per-sample MSE lines plus a mean to <out_dir>/metrics.txt and,
    unless disabled, a (input, GT, prediction) PNG triplet per sample.
    """
    parser = argparse.ArgumentParser()
    parser.add_argument("--ckpt", type=str, required=True, help="Path to ep_*_best_model.pkl")
    parser.add_argument("--data_path_UVDoc", type=str, required=True)
    parser.add_argument(
        "--overfit_n",
        type=int,
        default=0,
        help="If >0, same as train.py: first N sorted samples, deterministic crop.",
    )
    parser.add_argument(
        "--uvdoc_val_ratio",
        type=float,
        default=0.05,
        help="Must match train.py when using split=val (not overfit).",
    )
    parser.add_argument(
        "--uvdoc_split_seed",
        type=int,
        default=42,
        help="Must match train.py when using split=val.",
    )
    parser.add_argument(
        "--uvdoc_split_mode",
        type=str,
        default="sample",
        choices=["sample", "geom"],
        help="Must match train.py when using split=val.",
    )
    parser.add_argument(
        "--uvdoc_grid3d_stats",
        type=str,
        default=None,
        help="Must match train.py if used during training.",
    )
    parser.add_argument("--out_dir", type=str, required=True, help="Directory for metrics.txt (and PNGs unless --no_save_images).")
    parser.add_argument(
        "--no_save_images",
        action="store_true",
        help="Only write metrics.txt + mean MSE (for full val set, avoids huge I/O).",
    )
    parser.add_argument(
        "--max_save_images",
        type=int,
        default=0,
        help="If >0, save at most this many samples' PNG triplets (first indices).",
    )
    parser.add_argument("--device", type=str, default="cuda:0")
    args = parser.parse_args()

    device = torch.device(args.device)

    # Dataset construction intentionally mirrors train.py (see module docstring):
    # overfit branch uses the first N sorted samples with a deterministic crop,
    # otherwise the hold-out val split with the same seed/ratio/mode.
    if args.overfit_n and int(args.overfit_n) > 0:
        ds = data_UVDoc.UVDocDataset(
            data_path=args.data_path_UVDoc,
            appearance_augmentation=[],
            geometric_augmentations=[],
            overfit=True,
            max_samples=int(args.overfit_n),
            deterministic_crop=True,
            grid3d_stats_path=args.uvdoc_grid3d_stats,
        )
    else:
        ds = data_UVDoc.UVDocDataset(
            data_path=args.data_path_UVDoc,
            appearance_augmentation=[],
            geometric_augmentations=[],
            split="val",
            val_ratio=float(args.uvdoc_val_ratio),
            split_seed=int(args.uvdoc_split_seed),
            split_mode=args.uvdoc_split_mode,
            grid3d_stats_path=args.uvdoc_grid3d_stats,
            deterministic_crop=True,
        )

    os.makedirs(args.out_dir, exist_ok=True)

    # Restore the network weights on the requested device.
    net = UVDocnet(num_filter=32, kernel_size=5)
    ckpt = torch.load(args.ckpt, map_location=device)
    net.load_state_dict(ckpt["model_state"])
    net.to(device)
    net.eval()

    criterion_mse = nn.MSELoss()

    lines = []
    mses = []
    n_save = 0
    save_cap = args.max_save_images if args.max_save_images and args.max_save_images > 0 else None

    with torch.no_grad():
        # Per-image loop (batch size 1) so per-image MSE equals batch MSE.
        for idx in range(len(ds)):
            img_w, img_uw_gt, _, _ = ds[idx]
            x = img_w.unsqueeze(0).to(device)
            gt = img_uw_gt.unsqueeze(0).to(device)

            g2d, _ = net(x)
            pred_uw = utils.bilinear_unwarping(x, g2d, tuple(utils.IMG_SIZE))

            mse = float(criterion_mse(pred_uw, gt))
            mses.append(mse)
            sid = ds.all_samples[idx]
            lines.append(f"{sid} mse={mse:.6f}\n")

            do_save = not args.no_save_images and (save_cap is None or n_save < save_cap)
            if do_save:

                def to_bgr_u8(t):
                    # BxCxHxW float [0,1] -> HxWx3 uint8 BGR for cv2.imwrite.
                    a = (t.squeeze(0).cpu().numpy().transpose(1, 2, 0) * 255.0).clip(0, 255).astype(np.uint8)
                    return cv2.cvtColor(a, cv2.COLOR_RGB2BGR)

                cv2.imwrite(os.path.join(args.out_dir, f"{sid}_in_warped.png"), to_bgr_u8(x))
                cv2.imwrite(os.path.join(args.out_dir, f"{sid}_gt_unwarp.png"), to_bgr_u8(gt))
                cv2.imwrite(os.path.join(args.out_dir, f"{sid}_pred_unwarp.png"), to_bgr_u8(pred_uw))
                n_save += 1

    mean_mse = float(np.mean(mses)) if mses else 0.0
    lines.append(f"mean_mse {mean_mse:.8f} n={len(mses)}\n")

    with open(os.path.join(args.out_dir, "metrics.txt"), "w") as f:
        f.writelines(lines)

    print(f"Wrote metrics for {len(mses)} samples to {args.out_dir}")
    print(f"mean_mse={mean_mse:.8f} (compare to train log Val MSE for the same checkpoint epoch)")
    if not args.no_save_images and save_cap is not None:
        print(f"Saved PNG triplets for first {n_save} samples (max_save_images={save_cap}).")
150
+
151
+
152
if __name__ == "__main__":
    # Script entry point.
    main()
UVDoc_official/verify_uvdoc_train_infer_preprocess.py ADDED
@@ -0,0 +1,169 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #!/usr/bin/env python3
2
+ """
3
+ Assert UVDoc preprocessing is identical between:
4
+
5
+ - train.py (overfit / val UVDocDataset kwargs)
6
+ - verify_ckpt_val_pipeline.py (same kwargs)
7
+
8
+ This catches silent drift if one path changes crop/aug/split/grid3d stats.
9
+
10
+ Optional: compare DataLoader batch tensors to manual stack (shuffle=False).
11
+ """
12
+ from __future__ import annotations
13
+
14
+ import argparse
15
+ import os
16
+ import sys
17
+
18
+ import torch
19
+ from torch.utils.data import DataLoader
20
+
21
+ import data_UVDoc
22
+
23
+
24
def _ds_train_overfit(args) -> data_UVDoc.UVDocDataset:
    """Mirror train.py setup_data() when overfit_n > 0 (train branch)."""
    # Kwargs are intentionally written out (not shared with the verify-side
    # constructor) so drift in either script is caught by the comparison.
    return data_UVDoc.UVDocDataset(
        data_path=args.data_path_UVDoc,
        appearance_augmentation=[],
        geometric_augmentations=[],
        overfit=True,
        max_samples=int(args.overfit_n),
        deterministic_crop=True,
        grid3d_stats_path=args.uvdoc_grid3d_stats,
    )
35
+
36
+
37
def _ds_verify_overfit(args) -> data_UVDoc.UVDocDataset:
    """Mirror verify_ckpt_val_pipeline.py when overfit_n > 0."""
    # Must stay textually in sync with that script's overfit branch.
    return data_UVDoc.UVDocDataset(
        data_path=args.data_path_UVDoc,
        appearance_augmentation=[],
        geometric_augmentations=[],
        overfit=True,
        max_samples=int(args.overfit_n),
        deterministic_crop=True,
        grid3d_stats_path=args.uvdoc_grid3d_stats,
    )
48
+
49
+
50
def _ds_train_val_split(args) -> data_UVDoc.UVDocDataset:
    """Mirror train.py val loader (non-overfit).

    Note: no explicit deterministic_crop here, matching train.py — the
    comparison against _ds_verify_val_split checks the dataset default agrees.
    """
    return data_UVDoc.UVDocDataset(
        data_path=args.data_path_UVDoc,
        appearance_augmentation=[],
        geometric_augmentations=[],
        split="val",
        val_ratio=float(args.uvdoc_val_ratio),
        split_seed=int(args.uvdoc_split_seed),
        split_mode=args.uvdoc_split_mode,
        grid3d_stats_path=args.uvdoc_grid3d_stats,
    )
62
+
63
+
64
def _ds_verify_val_split(args) -> data_UVDoc.UVDocDataset:
    """Mirror verify_ckpt_val_pipeline.py val branch (explicit deterministic_crop)."""
    return data_UVDoc.UVDocDataset(
        data_path=args.data_path_UVDoc,
        appearance_augmentation=[],
        geometric_augmentations=[],
        split="val",
        val_ratio=float(args.uvdoc_val_ratio),
        split_seed=int(args.uvdoc_split_seed),
        split_mode=args.uvdoc_split_mode,
        grid3d_stats_path=args.uvdoc_grid3d_stats,
        deterministic_crop=True,
    )
77
+
78
+
79
+ def _assert_close(name: str, a: torch.Tensor, b: torch.Tensor, rtol: float, atol: float) -> None:
80
+ if a.shape != b.shape:
81
+ raise AssertionError(f"{name}: shape mismatch {tuple(a.shape)} vs {tuple(b.shape)}")
82
+ if not torch.allclose(a, b, rtol=rtol, atol=atol):
83
+ d = (a.float() - b.float()).abs().max().item()
84
+ raise AssertionError(f"{name}: max abs diff {d} (rtol={rtol}, atol={atol})")
85
+
86
+
87
def _compare_sample(ds_a, ds_b, idx: int, rtol: float, atol: float) -> None:
    """Assert sample idx is identical across both datasets: same sample id and
    matching warped image, GT unwarped image, grid2d and grid3d tensors."""
    wa, ua, g2a, g3a = ds_a[idx]
    wb, ub, g2b, g3b = ds_b[idx]
    sid_a = ds_a.all_samples[idx]
    sid_b = ds_b.all_samples[idx]
    if sid_a != sid_b:
        raise AssertionError(f"index {idx}: sample id mismatch {sid_a!r} vs {sid_b!r}")
    _assert_close(f"[{sid_a}] warped", wa, wb, rtol, atol)
    _assert_close(f"[{sid_a}] unwarped_gt", ua, ub, rtol, atol)
    _assert_close(f"[{sid_a}] grid2d", g2a, g2b, rtol, atol)
    _assert_close(f"[{sid_a}] grid3d", g3a, g3b, rtol, atol)
98
+
99
+
100
def main() -> int:
    """Compare the train-side and verify-side UVDocDataset constructions.

    Returns 0 on success, 2 on argument/path errors; raises AssertionError on
    any tensor mismatch.
    """
    # CLI mirrors the knobs of train.py / verify_ckpt_val_pipeline.py so both
    # construction paths can be compared under identical settings.
    p = argparse.ArgumentParser()
    p.add_argument("--data_path_UVDoc", type=str, required=True)
    p.add_argument("--overfit_n", type=int, default=1, help=">0: compare overfit train vs verify constructors.")
    p.add_argument(
        "--mode",
        type=str,
        default="overfit",
        choices=["overfit", "val_split", "both"],
        help="overfit: first N sorted ids; val_split: hold-out val set kwargs alignment.",
    )
    p.add_argument("--uvdoc_val_ratio", type=float, default=0.05)
    p.add_argument("--uvdoc_split_seed", type=int, default=42)
    p.add_argument("--uvdoc_split_mode", type=str, default="sample", choices=["sample", "geom"])
    p.add_argument("--uvdoc_grid3d_stats", type=str, default=None)
    p.add_argument("--rtol", type=float, default=1e-6)
    p.add_argument("--atol", type=float, default=1e-6)
    p.add_argument(
        "--check_dataloader",
        action="store_true",
        help="Stack dataset tensors and compare to first batch (shuffle=False, matches index order).",
    )
    p.add_argument("--batch_size", type=int, default=8)
    p.add_argument("--num_workers", type=int, default=0)
    args = p.parse_args()

    if not os.path.isdir(args.data_path_UVDoc):
        print(f"data_path_UVDoc is not a directory: {args.data_path_UVDoc}", file=sys.stderr)
        return 2

    modes = ["overfit", "val_split"] if args.mode == "both" else [args.mode]

    for m in modes:
        # Build the pair of datasets to compare for this mode.
        if m == "overfit":
            if int(args.overfit_n) <= 0:
                print("--overfit_n must be > 0 for mode=overfit", file=sys.stderr)
                return 2
            a = _ds_train_overfit(args)
            b = _ds_verify_overfit(args)
            tag = f"overfit_n={args.overfit_n}"
        else:
            a = _ds_train_val_split(args)
            b = _ds_verify_val_split(args)
            tag = "val_split"

        # Compare every sample tensor-by-tensor.
        if len(a) != len(b):
            raise AssertionError(f"{tag}: len mismatch {len(a)} vs {len(b)}")
        for i in range(len(a)):
            _compare_sample(a, b, i, args.rtol, args.atol)

        # Optional: confirm default DataLoader collation preserves the tensors
        # (shuffle=False so the first batch covers indices 0..n-1 in order).
        if args.check_dataloader:
            loader = DataLoader(a, batch_size=args.batch_size, shuffle=False, num_workers=args.num_workers, pin_memory=False)
            imgs_w, imgs_uw, g2, g3 = next(iter(loader))
            n = imgs_w.shape[0]
            w_stack = torch.stack([a[i][0] for i in range(n)], dim=0)
            uw_stack = torch.stack([a[i][1] for i in range(n)], dim=0)
            g2_stack = torch.stack([a[i][2] for i in range(n)], dim=0)
            g3_stack = torch.stack([a[i][3] for i in range(n)], dim=0)
            _assert_close(f"{tag} dataloader warped", imgs_w, w_stack, args.rtol, args.atol)
            _assert_close(f"{tag} dataloader unwarped", imgs_uw, uw_stack, args.rtol, args.atol)
            _assert_close(f"{tag} dataloader grid2d", g2, g2_stack, args.rtol, args.atol)
            _assert_close(f"{tag} dataloader grid3d", g3, g3_stack, args.rtol, args.atol)

        print(f"OK: {tag} — train.py vs verify_ckpt_val_pipeline.py UVDocDataset tensors match ({len(a)} samples).")

    return 0
166
+
167
+
168
if __name__ == "__main__":
    # Propagate main()'s int return as the process exit code.
    raise SystemExit(main())
baseline_resnet_unet/__init__.py ADDED
@@ -0,0 +1,5 @@
 
 
 
 
 
 
1
+ """ResNet50 + U-Net UV baseline for UVDoc (see tech note in repo)."""
2
+
3
+ from .model import ResNet50UNetUV
4
+
5
+ __all__ = ["ResNet50UNetUV"]
baseline_resnet_unet/dataset.py ADDED
@@ -0,0 +1,197 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from __future__ import annotations
2
+
3
+ import json
4
+ import os
5
+ import random
6
+ from os.path import join as pjoin
7
+
8
+ import cv2
9
+ import h5py
10
+ import numpy as np
11
+ import torch
12
+ from torch.utils.data import Dataset
13
+
14
+ try:
15
+ import albumentations as A
16
+ except ImportError: # pragma: no cover
17
+ A = None
18
+
19
+ ORIGINAL_GRID_SIZE = (89, 61)
20
+
21
+
22
def _appearance_compose(names: list[str]):
    """Build an albumentations Compose from augmentation group names.

    names: any subset of {"visual", "noise", "color"}.
    Returns None when no group is requested or albumentations is unavailable.

    NOTE(review): the child p values inside A.OneOf (e.g. p=15, p=20) are far
    above 1 — presumably relied on as relative selection weights (OneOf
    normalizes children's p), likely copied from the official UVDoc config;
    confirm against the albumentations version in use.
    """
    if not names or A is None:
        return None
    transforms = []
    if "visual" in names:
        transforms.append(
            A.OneOf(
                [A.ToSepia(p=15), A.ToGray(p=20), A.Equalize(p=15), A.Sharpen(p=20)],
                p=0.5,
            )
        )
    if "noise" in names:
        transforms.append(
            A.OneOf(
                [
                    A.GaussNoise(var_limit=(10.0, 20.0), p=70),
                    A.ISONoise(intensity=(0.1, 0.25), p=30),
                ],
                p=0.6,
            )
        )
    if "color" in names:
        transforms.append(
            A.OneOf(
                [
                    A.ColorJitter(p=5),
                    A.HueSaturationValue(p=10),
                    A.RandomBrightnessContrast(brightness_limit=[-0.05, 0.25], p=85),
                ],
                p=0.95,
            )
        )
    return A.Compose(transforms=transforms) if transforms else None
55
+
56
+
57
def crop_image_tight(img: np.ndarray, grid2d: np.ndarray, deterministic: bool) -> tuple[np.ndarray, int, int, int, int]:
    """grid2d: 2 x Gh x Gw in pixel coords of img.

    Crops img tightly around the grid's bounding box plus a margin s
    (at most 20 px, shrunk so the crop stays inside the image), then trims a
    further border of up to s-5 px — random per side in training, centered
    when deterministic. Returns the crop and the (top, bot, left, right)
    margins removed so the caller can renormalize grid coordinates.
    """
    size = img.shape
    # Bounding box of the document grid, rounded outward to whole pixels.
    minx = int(np.floor(np.amin(grid2d[0, :, :])))
    maxx = int(np.ceil(np.amax(grid2d[0, :, :])))
    miny = int(np.floor(np.amin(grid2d[1, :, :])))
    maxy = int(np.ceil(np.amax(grid2d[1, :, :])))
    s = 20
    # Shrink the margin so [min-s, max+s] stays inside the image on all sides.
    s = min(min(s, minx), miny)
    s = min(min(s, size[1] - 1 - maxx), size[0] - 1 - maxy)

    img = img[miny - s : maxy + s, minx - s : maxx + s, :]
    if deterministic:
        # Centered trim; the +1 keeps the negative-index slices below valid.
        cx1 = cy1 = max((s - 5) // 2, 0)
        cx2 = cy2 = max((s - 5) // 2, 0) + 1
    else:
        cx1 = random.randint(0, max(s - 5, 1))
        cx2 = random.randint(0, max(s - 5, 1)) + 1
        cy1 = random.randint(0, max(s - 5, 1))
        cy2 = random.randint(0, max(s - 5, 1)) + 1

    img = img[cy1:-cy2, cx1:-cx2, :]
    # Total pixels removed from each side of the ORIGINAL image.
    top = miny - s + cy1
    bot = size[0] - maxy - s + cy2
    left = minx - s + cx1
    right = size[1] - maxx - s + cx2
    return img, top, bot, left, right
84
+
85
+
86
def crop_tight_resize(
    img_rgb: np.ndarray,
    grid2d: np.ndarray,
    out_wh: tuple[int, int],
    deterministic: bool,
) -> tuple[torch.Tensor, np.ndarray]:
    """Returns image tensor CxHxW float [0,255] and normalized grid2d 2xGh x Gw in [-1,1]."""
    size = img_rgb.shape
    img, top, bot, left, right = crop_image_tight(img_rgb, grid2d, deterministic)
    img = cv2.resize(img, out_wh)  # out_wh is (W, H), cv2 convention
    img = img.transpose(2, 0, 1)
    img_t = torch.from_numpy(img).float()

    # Renormalize the grid into the crop's coordinate frame: [0,1] within the
    # crop, then mapped to [-1,1] (grid_sample convention). Copy first so the
    # caller's array is not mutated.
    grid2d = grid2d.copy()
    grid2d[0, :, :] = (grid2d[0, :, :] - left) / (size[1] - left - right)
    grid2d[1, :, :] = (grid2d[1, :, :] - top) / (size[0] - top - bot)
    grid2d = (grid2d * 2.0) - 1.0
    return img_t, grid2d
104
+
105
+
106
class UVDocDenseUVDataset(Dataset):
    """
    UVDoc final dataset: img/*.png + grid2d from metadata geom + HDF5 grid2d.
    Produces dense UV GT at (out_h, out_w) via bilinear upsampling (official unwarp convention).
    """

    def __init__(
        self,
        root: str,
        split: str = "train",
        out_hw: tuple[int, int] = (256, 256),
        appearance_augmentation: list[str] | None = None,
        deterministic_crop: bool | None = None,
    ) -> None:
        super().__init__()
        self.root = root
        self.out_hw = out_hw
        self.out_wh = (out_hw[1], out_hw[0])  # cv2 (W,H)
        appearance_augmentation = appearance_augmentation or []
        self.appearance = _appearance_compose(appearance_augmentation)
        split_path = pjoin(root, "split.json")
        all_ids = [x[:-4] for x in os.listdir(pjoin(root, "img")) if x.endswith(".png")]
        all_ids.sort()
        if os.path.isfile(split_path):
            with open(split_path, "r", encoding="utf-8") as f:
                sp = json.load(f)
            key = "train" if split == "train" else "val"
            allowed = set(str(x) for x in sp.get(key, []))
            picked = [i for i in all_ids if i in allowed]
            # Fallbacks: if split.json yields no val samples, carve off the
            # last 10% of ids; if still nothing, use every sample.
            if not picked and split == "val" and all_ids:
                n = max(1, len(all_ids) // 10)
                picked = all_ids[-n:]
            if not picked:
                picked = list(all_ids)
            self.sample_ids = picked
        else:
            self.sample_ids = all_ids

        # Default: jittered crop in training, deterministic crop otherwise.
        if deterministic_crop is None:
            deterministic_crop = split != "train"
        self.deterministic_crop = deterministic_crop

    def __len__(self) -> int:
        return len(self.sample_ids)

    def __getitem__(self, index: int) -> dict[str, torch.Tensor]:
        # Resolve which geometry this sample was rendered with.
        sample_id = self.sample_ids[index]
        with open(pjoin(self.root, "metadata_sample", f"{sample_id}.json"), "r", encoding="utf-8") as f:
            geom_name = json.load(f)["geom_name"]

        img_path = pjoin(self.root, "img", f"{sample_id}.png")
        grid2d_path = pjoin(self.root, "grid2d", f"{geom_name}.mat")

        # MATLAB v7.3 file; transpose back to 2 x Gh x Gw pixel coordinates.
        with h5py.File(grid2d_path, "r") as f:
            grid2d = np.array(f["grid2d"][:].T.transpose(2, 0, 1), dtype=np.float32)

        img_bgr = cv2.imread(img_path)
        if img_bgr is None:
            raise FileNotFoundError(img_path)
        img_rgb = cv2.cvtColor(img_bgr, cv2.COLOR_BGR2RGB)

        # Appearance augmentation operates on uint8 RGB before crop/resize.
        if self.appearance is not None:
            img_rgb = self.appearance(image=img_rgb)["image"]

        img_t, grid2d_n = crop_tight_resize(
            img_rgb, grid2d, self.out_wh, deterministic=self.deterministic_crop
        )

        # Upsample the sparse grid to a dense per-pixel UV map in [-1, 1].
        grid_t = torch.from_numpy(grid2d_n).float()
        grid_dense = torch.nn.functional.interpolate(
            grid_t.unsqueeze(0),
            size=self.out_hw,
            mode="bilinear",
            align_corners=True,
        ).squeeze(0)

        # Supervision target: the image unwarped with the GT dense UV map.
        img_bchw = (img_t / 255.0).unsqueeze(0)
        unwarped = torch.nn.functional.grid_sample(
            img_bchw,
            grid_dense.permute(1, 2, 0).unsqueeze(0),
            mode="bilinear",
            padding_mode="border",
            align_corners=True,
        ).squeeze(0)

        im = img_t / 255.0
        return {
            "warped": im,
            "uv_gt": grid_dense,
            "unwarped": unwarped,
            "sample_id": sample_id,
        }
baseline_resnet_unet/model.py ADDED
@@ -0,0 +1,89 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from __future__ import annotations
2
+
3
+ import torch
4
+ import torch.nn as nn
5
+ import torch.nn.functional as F
6
+ from torchvision.models import ResNet50_Weights, resnet50
7
+
8
+
9
class ConvBNReLU(nn.Module):
    """3x3 convolution (no bias) -> BatchNorm -> ReLU; spatial size preserved."""

    def __init__(self, in_ch: int, out_ch: int) -> None:
        super().__init__()
        layers = [
            nn.Conv2d(in_ch, out_ch, 3, padding=1, bias=False),
            nn.BatchNorm2d(out_ch),
            nn.ReLU(inplace=True),
        ]
        self.net = nn.Sequential(*layers)

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        return self.net(x)
20
+
21
+
22
class UpSkip(nn.Module):
    """Transposed-conv 2x upsample, concat an encoder skip, then two conv blocks."""

    def __init__(self, in_ch: int, skip_ch: int, out_ch: int) -> None:
        super().__init__()
        self.up = nn.ConvTranspose2d(in_ch, out_ch, kernel_size=2, stride=2)
        self.conv = nn.Sequential(
            ConvBNReLU(out_ch + skip_ch, out_ch),
            ConvBNReLU(out_ch, out_ch),
        )

    def forward(self, x: torch.Tensor, skip: torch.Tensor) -> torch.Tensor:
        upsampled = self.up(x)
        # Odd encoder resolutions: snap to the skip's spatial size when the
        # exact 2x upsample misses it.
        if upsampled.shape[-2:] != skip.shape[-2:]:
            upsampled = F.interpolate(upsampled, size=skip.shape[-2:], mode="bilinear", align_corners=True)
        return self.conv(torch.cat([upsampled, skip], dim=1))
37
+
38
+
39
class UpNoSkip(nn.Module):
    """Transposed-conv 2x upsample followed by two conv blocks (no skip input)."""

    def __init__(self, in_ch: int, out_ch: int) -> None:
        super().__init__()
        self.up = nn.ConvTranspose2d(in_ch, out_ch, kernel_size=2, stride=2)
        self.conv = nn.Sequential(ConvBNReLU(out_ch, out_ch), ConvBNReLU(out_ch, out_ch))

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        upsampled = self.up(x)
        return self.conv(upsampled)
47
+
48
+
49
class ResNet50UNetUV(nn.Module):
    """
    ImageNet ResNet50 encoder + U-Net-style decoder.
    Output: B×2×H×W UV in [-1, 1] (Tanh), aligned with UVDoc grid_sample convention.

    Encoder stages layer1-4 emit 256/512/1024/2048 channels; the decoder
    upsamples back with skips from layer3/2/1 and two skip-free stages.
    """

    def __init__(self, pretrained: bool = True) -> None:
        super().__init__()
        w = ResNet50_Weights.IMAGENET1K_V1 if pretrained else None
        backbone = resnet50(weights=w)

        # Reuse the torchvision stem and residual stages as the encoder.
        self.conv1 = backbone.conv1
        self.bn1 = backbone.bn1
        self.relu = backbone.relu
        self.maxpool = backbone.maxpool
        self.layer1 = backbone.layer1
        self.layer2 = backbone.layer2
        self.layer3 = backbone.layer3
        self.layer4 = backbone.layer4

        # Skip connections from layer3/2/1; the last two upsamples have no
        # skip (the stem features are not kept).
        self.dec43 = UpSkip(2048, 1024, 1024)
        self.dec32 = UpSkip(1024, 512, 512)
        self.dec21 = UpSkip(512, 256, 256)
        self.dec10 = UpNoSkip(256, 128)
        self.dec00 = UpNoSkip(128, 64)
        self.head = nn.Conv2d(64, 2, kernel_size=1)
        self.act = nn.Tanh()

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        x0 = self.relu(self.bn1(self.conv1(x)))
        x0 = self.maxpool(x0)
        s1 = self.layer1(x0)
        s2 = self.layer2(s1)
        s3 = self.layer3(s2)
        s4 = self.layer4(s3)
        d3 = self.dec43(s4, s3)
        d2 = self.dec32(d3, s2)
        d1 = self.dec21(d2, s1)
        d0 = self.dec10(d1)
        d = self.dec00(d0)
        return self.act(self.head(d))
baseline_resnet_unet/train.py ADDED
@@ -0,0 +1,187 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from __future__ import annotations
2
+
3
+ import argparse
4
+ from pathlib import Path
5
+
6
+ import torch
7
+ import torch.nn as nn
8
+ from torch.cuda.amp import GradScaler, autocast
9
+ from torch.utils.data import DataLoader
10
+
11
+ from .dataset import UVDocDenseUVDataset
12
+ from .model import ResNet50UNetUV
13
+ from .warp import grid_sample_unwarp
14
+
15
+
16
# ImageNet channel statistics, shaped (1, 3, 1, 1) so they broadcast over
# B×3×H×W image batches; inputs are normalized with these before the
# ResNet encoder (see run_epoch).
IMAGENET_MEAN = torch.tensor([0.485, 0.456, 0.406]).view(1, 3, 1, 1)
IMAGENET_STD = torch.tensor([0.229, 0.224, 0.225]).view(1, 3, 1, 1)
18
+
19
+
20
def collate_batch(samples: list[dict]) -> dict[str, torch.Tensor]:
    """Stack the per-sample tensors of a mini-batch along a new batch dim.

    Returns a dict with the same three keys as each sample
    ("warped", "uv_gt", "unwarped"), each of shape B×... .
    """
    keys = ("warped", "uv_gt", "unwarped")
    return {key: torch.stack([s[key] for s in samples], dim=0) for key in keys}
26
+
27
+
28
def run_epoch(
    model: nn.Module,
    loader: DataLoader,
    device: torch.device,
    optimizer: torch.optim.Optimizer | None,
    scaler: GradScaler | None,
    l1: nn.Module,
    grad_accum: int,
    train: bool,
    lambda_uv: float,
    lambda_img: float,
) -> tuple[float, float]:
    """
    Run one pass over ``loader``.

    In train mode, performs an optimizer step every ``grad_accum`` mini-batches
    (with optional AMP via ``scaler``); in eval mode, gradients are disabled
    and only losses are computed.

    Args:
        optimizer: required when ``train`` is True; ignored otherwise.
        scaler: AMP GradScaler, or None to run full-precision backward.
        grad_accum: number of mini-batches to accumulate per optimizer step.
        lambda_uv / lambda_img: weights of the UV L1 and reconstruction L1 terms.

    Returns:
        (mean UV L1 loss, mean image L1 loss) averaged over all batches.
    """
    if train:
        model.train()
    else:
        model.eval()

    do_step = optimizer is not None and train
    if do_step:
        # Start clean so gradients from any previous epoch cannot leak in.
        optimizer.zero_grad(set_to_none=True)

    # Hoist the normalization constants out of the batch loop (they were
    # previously moved to the device once per batch).
    mean = IMAGENET_MEAN.to(device)
    std = IMAGENET_STD.to(device)

    tot_uv = 0.0
    tot_img = 0.0
    n = 0
    pending = False  # True while gradients are accumulated but not yet stepped

    with torch.set_grad_enabled(train):
        for step, batch in enumerate(loader):
            warped = batch["warped"].to(device, non_blocking=True)
            uv_gt = batch["uv_gt"].to(device, non_blocking=True)
            unwarped_gt = batch["unwarped"].to(device, non_blocking=True)

            warped_in = (warped - mean) / std

            with autocast(enabled=device.type == "cuda"):
                uv_pred = model(warped_in)
                unwarped_pred = grid_sample_unwarp(warped, uv_pred)
                loss_uv = l1(uv_pred, uv_gt)
                loss_img = l1(unwarped_pred, unwarped_gt)
                loss = lambda_uv * loss_uv + lambda_img * loss_img
                # Scale so accumulated gradients average over the window.
                loss_scaled = loss / grad_accum

            if do_step:
                if scaler is not None:
                    scaler.scale(loss_scaled).backward()
                else:
                    loss_scaled.backward()
                pending = True
                if (step + 1) % grad_accum == 0:
                    _apply_step(optimizer, scaler)
                    pending = False

            tot_uv += float(loss_uv.detach())
            tot_img += float(loss_img.detach())
            n += 1

    # Bug fix: when len(loader) is not divisible by grad_accum, the trailing
    # partial window previously never stepped and its gradients silently
    # leaked into the next epoch. Flush it here.
    if do_step and pending:
        _apply_step(optimizer, scaler)

    return tot_uv / max(1, n), tot_img / max(1, n)


def _apply_step(optimizer: torch.optim.Optimizer, scaler: GradScaler | None) -> None:
    """Perform one optimizer step (via the AMP scaler if given) and reset grads."""
    if scaler is not None:
        scaler.step(optimizer)
        scaler.update()
    else:
        optimizer.step()
    optimizer.zero_grad(set_to_none=True)
86
+
87
+
88
def main() -> None:
    """CLI entry point: train the ResNet50+UNet UV baseline on the UVDoc dataset."""
    parser = argparse.ArgumentParser(description="Train ResNet50+UNet UV baseline on UVDoc")
    parser.add_argument("--data_root", type=str, required=True, help="Path to UVDoc final folder (contains img/, grid2d/, ...)")
    parser.add_argument("--out_dir", type=str, default="./runs/uvdoc_baseline")
    parser.add_argument("--epochs", type=int, default=50)
    parser.add_argument("--batch_size", type=int, default=4)
    parser.add_argument("--lr", type=float, default=1e-4)
    parser.add_argument("--weight_decay", type=float, default=1e-2)
    parser.add_argument("--num_workers", type=int, default=4)
    parser.add_argument("--grad_accum", type=int, default=1)
    parser.add_argument("--lambda_uv", type=float, default=1.0)
    parser.add_argument("--lambda_img", type=float, default=1.0)
    parser.add_argument("--h", type=int, default=256)
    parser.add_argument("--w", type=int, default=256)
    parser.add_argument("--no_pretrained", action="store_true")
    parser.add_argument("--appearance_aug", nargs="*", default=["visual", "noise", "color"])
    args = parser.parse_args()

    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    on_cuda = device.type == "cuda"

    ckpt_dir = Path(args.out_dir)
    ckpt_dir.mkdir(parents=True, exist_ok=True)

    # Training split gets appearance augmentation; validation is deterministic.
    train_set = UVDocDenseUVDataset(
        args.data_root,
        split="train",
        out_hw=(args.h, args.w),
        appearance_augmentation=list(args.appearance_aug),
    )
    val_set = UVDocDenseUVDataset(
        args.data_root,
        split="val",
        out_hw=(args.h, args.w),
        appearance_augmentation=[],
        deterministic_crop=True,
    )

    loader_kwargs = dict(
        batch_size=args.batch_size,
        num_workers=args.num_workers,
        pin_memory=on_cuda,
        collate_fn=collate_batch,
    )
    train_loader = DataLoader(train_set, shuffle=True, **loader_kwargs)
    val_loader = DataLoader(val_set, shuffle=False, **loader_kwargs)

    model = ResNet50UNetUV(pretrained=not args.no_pretrained).to(device)
    optimizer = torch.optim.AdamW(model.parameters(), lr=args.lr, weight_decay=args.weight_decay)
    l1_loss = nn.L1Loss()
    scaler = GradScaler(enabled=on_cuda)

    for epoch in range(1, args.epochs + 1):
        tr_uv, tr_img = run_epoch(
            model,
            train_loader,
            device,
            optimizer,
            scaler,
            l1_loss,
            args.grad_accum,
            train=True,
            lambda_uv=args.lambda_uv,
            lambda_img=args.lambda_img,
        )
        # Validation: no optimizer/scaler, no gradient accumulation.
        va_uv, va_img = run_epoch(
            model,
            val_loader,
            device,
            None,
            None,
            l1_loss,
            1,
            train=False,
            lambda_uv=args.lambda_uv,
            lambda_img=args.lambda_img,
        )
        print(
            f"epoch {epoch:03d} | train L1_uv {tr_uv:.5f} L1_img {tr_img:.5f} | "
            f"val L1_uv {va_uv:.5f} L1_img {va_img:.5f}"
        )

        # Save both a rolling "last" checkpoint and a per-epoch snapshot.
        ckpt = {
            "epoch": epoch,
            "model": model.state_dict(),
            "optimizer": optimizer.state_dict(),
            "args": vars(args),
        }
        torch.save(ckpt, ckpt_dir / "last.pt")
        torch.save(ckpt, ckpt_dir / f"epoch_{epoch:03d}.pt")
185
+
186
# Script entry point: only run training when executed directly,
# not when imported as a module.
if __name__ == "__main__":
    main()
baseline_resnet_unet/warp.py ADDED
@@ -0,0 +1,24 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from __future__ import annotations
2
+
3
+ import torch
4
+ import torch.nn.functional as F
5
+
6
+
7
def upsample_uv_grid(grid_b2hw: torch.Tensor, out_hw: tuple[int, int]) -> torch.Tensor:
    """Bilinearly resize a sparse B×2×H×W control grid (UVDoc grid2d) to a
    dense map of spatial size ``out_hw`` (corner-aligned)."""
    dense = F.interpolate(grid_b2hw, size=out_hw, mode="bilinear", align_corners=True)
    return dense
10
+
11
+
12
def grid_sample_unwarp(
    warped_bchw: torch.Tensor,
    grid_b2hw: torch.Tensor,
    align_corners: bool = True,
) -> torch.Tensor:
    """
    Resample ``warped_bchw`` (B×3×H×W) at the locations in ``grid_b2hw``.

    ``grid_b2hw`` is B×2×H×W with channel 0 = x and channel 1 = y, both in
    [-1, 1] as expected by ``torch.nn.functional.grid_sample``. Out-of-range
    samples clamp to the border.
    """
    # grid_sample wants the coordinate channels last: B×H×W×2.
    sample_grid = grid_b2hw.permute(0, 2, 3, 1).contiguous()
    return F.grid_sample(
        warped_bchw,
        sample_grid,
        mode="bilinear",
        padding_mode="border",
        align_corners=align_corners,
    )
log_full_uvdoc_gpu0.bak_20260411_122217/nohup.out ADDED
@@ -0,0 +1,9 @@
 
 
 
 
 
 
 
 
 
 
1
+ /root/miniconda3/envs/o3dedit/lib/python3.10/site-packages/albumentations/check_version.py:147: UserWarning: Error fetching version info <urlopen error [SSL: UNEXPECTED_EOF_WHILE_READING] EOF occurred in violation of protocol (_ssl.c:1017)>
2
+ data = fetch_version_info()
3
+ /mnt/zsn/zsn_workspace/dzx/UvDoc/UVDoc_official/data_utils.py:51: UserWarning: Argument(s) 'var_limit' are not valid for transform GaussNoise
4
+ A.GaussNoise(var_limit=(10.0, 20.0), p=0.70),
5
+ /root/miniconda3/envs/o3dedit/lib/python3.10/site-packages/albumentations/core/composition.py:331: UserWarning: Got processor for keypoints, but no transform to process it.
6
+ self._set_keys()
7
+
8
+ ----- Epoch 0 -----
9
+
log_full_uvdoc_gpu0.bak_20260411_122217/params8_lr=0.0002_nepochs50_nepochsdecay20_alpha5.0_beta5.0_gamma=1.0_gammastartep10_datauvdoc.txt ADDED
@@ -0,0 +1,2 @@
 
 
 
1
+
2
+ --------------- params8_lr=0.0002_nepochs50_nepochsdecay20_alpha5.0_beta5.0_gamma=1.0_gammastartep10_datauvdoc ---------------
log_full_uvdoc_gpu0/nohup.out ADDED
The diff for this file is too large to render. See raw diff
 
log_full_uvdoc_gpu0/params8_lr=0.0002_nepochs25_nepochsdecay10_alpha5.0_beta5.0_gamma=1.0_gammastartep10_datauvdoc.txt ADDED
@@ -0,0 +1,111 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+
2
+ --------------- params8_lr=0.0002_nepochs25_nepochsdecay10_alpha5.0_beta5.0_gamma=1.0_gammastartep10_datauvdoc ---------------
3
+
4
+ Train LRate: 0.0002 Epoch: 1 MSE: 0.02950
5
+ TrainLoss LRate: 0.0002 Epoch: 1 net: 0.31961 L1_g2d: 0.04204 L1_g3d: 0.02189 L1_recon: 0.11414 gamma_w: 0.00000
6
+ Val LRate: 0.0002 Epoch: 1 MSE: 0.01576
7
+ Train LRate: 0.0002 Epoch: 2 MSE: 0.02713
8
+ TrainLoss LRate: 0.0002 Epoch: 2 net: 0.24550 L1_g2d: 0.03096 L1_g3d: 0.01814 L1_recon: 0.10817 gamma_w: 0.00000
9
+ Val LRate: 0.0002 Epoch: 2 MSE: 0.01688
10
+ Train LRate: 0.0002 Epoch: 3 MSE: 0.02606
11
+ TrainLoss LRate: 0.0002 Epoch: 3 net: 0.21935 L1_g2d: 0.02744 L1_g3d: 0.01643 L1_recon: 0.10558 gamma_w: 0.00000
12
+ Val LRate: 0.0002 Epoch: 3 MSE: 0.01589
13
+ Train LRate: 0.0002 Epoch: 4 MSE: 0.02522
14
+ TrainLoss LRate: 0.0002 Epoch: 4 net: 0.19993 L1_g2d: 0.02460 L1_g3d: 0.01539 L1_recon: 0.10323 gamma_w: 0.00000
15
+ Val LRate: 0.0002 Epoch: 4 MSE: 0.01283
16
+ Train LRate: 0.0002 Epoch: 5 MSE: 0.02427
17
+ TrainLoss LRate: 0.0002 Epoch: 5 net: 0.18300 L1_g2d: 0.02206 L1_g3d: 0.01454 L1_recon: 0.10088 gamma_w: 0.00000
18
+ Val LRate: 0.0002 Epoch: 5 MSE: 0.00968
19
+ Train LRate: 0.0002 Epoch: 6 MSE: 0.02299
20
+ TrainLoss LRate: 0.0002 Epoch: 6 net: 0.16705 L1_g2d: 0.01960 L1_g3d: 0.01381 L1_recon: 0.09772 gamma_w: 0.00000
21
+ Val LRate: 0.0002 Epoch: 6 MSE: 0.01058
22
+ Train LRate: 0.0002 Epoch: 7 MSE: 0.02203
23
+ TrainLoss LRate: 0.0002 Epoch: 7 net: 0.15387 L1_g2d: 0.01759 L1_g3d: 0.01318 L1_recon: 0.09528 gamma_w: 0.00000
24
+ Val LRate: 0.0002 Epoch: 7 MSE: 0.01079
25
+ Train LRate: 0.0002 Epoch: 8 MSE: 0.02145
26
+ TrainLoss LRate: 0.0002 Epoch: 8 net: 0.14555 L1_g2d: 0.01636 L1_g3d: 0.01275 L1_recon: 0.09364 gamma_w: 0.00000
27
+ Val LRate: 0.0002 Epoch: 8 MSE: 0.00838
28
+ Train LRate: 0.0002 Epoch: 9 MSE: 0.02079
29
+ TrainLoss LRate: 0.0002 Epoch: 9 net: 0.13673 L1_g2d: 0.01516 L1_g3d: 0.01219 L1_recon: 0.09175 gamma_w: 0.00000
30
+ Val LRate: 0.0002 Epoch: 9 MSE: 0.00880
31
+ Train LRate: 0.0002 Epoch: 10 MSE: 0.02033
32
+ TrainLoss LRate: 0.0002 Epoch: 10 net: 0.13079 L1_g2d: 0.01431 L1_g3d: 0.01185 L1_recon: 0.09071 gamma_w: 0.00000
33
+ Val LRate: 0.0002 Epoch: 10 MSE: 0.00919
34
+ Train LRate: 0.0002 Epoch: 11 MSE: 0.01994
35
+ TrainLoss LRate: 0.0002 Epoch: 11 net: 0.21906 L1_g2d: 0.01408 L1_g3d: 0.01181 L1_recon: 0.08961 gamma_w: 1.00000
36
+ Val LRate: 0.0002 Epoch: 11 MSE: 0.00806
37
+ Train LRate: 0.0002 Epoch: 12 MSE: 0.01935
38
+ TrainLoss LRate: 0.0002 Epoch: 12 net: 0.21137 L1_g2d: 0.01310 L1_g3d: 0.01159 L1_recon: 0.08793 gamma_w: 1.00000
39
+ Val LRate: 0.0002 Epoch: 12 MSE: 0.00672
40
+ Train LRate: 0.0002 Epoch: 13 MSE: 0.01916
41
+ TrainLoss LRate: 0.0002 Epoch: 13 net: 0.20786 L1_g2d: 0.01276 L1_g3d: 0.01135 L1_recon: 0.08733 gamma_w: 1.00000
42
+ Val LRate: 0.0002 Epoch: 13 MSE: 0.00796
43
+ Train LRate: 0.0002 Epoch: 14 MSE: 0.01898
44
+ TrainLoss LRate: 0.0002 Epoch: 14 net: 0.20452 L1_g2d: 0.01237 L1_g3d: 0.01116 L1_recon: 0.08685 gamma_w: 1.00000
45
+ Val LRate: 0.0002 Epoch: 14 MSE: 0.00735
46
+ Train LRate: 0.0002 Epoch: 15 MSE: 0.01891
47
+ TrainLoss LRate: 0.0002 Epoch: 15 net: 0.20277 L1_g2d: 0.01234 L1_g3d: 0.01088 L1_recon: 0.08666 gamma_w: 1.00000
48
+ Val LRate: 0.0002 Epoch: 15 MSE: 0.00655
49
+ Train LRate: 0.0002 Epoch: 16 MSE: 0.01871
50
+ TrainLoss LRate: 0.0002 Epoch: 16 net: 0.19896 L1_g2d: 0.01190 L1_g3d: 0.01068 L1_recon: 0.08606 gamma_w: 1.00000
51
+ Val LRate: 0.0002 Epoch: 16 MSE: 0.00819
52
+ Train LRate: 0.0002 Epoch: 17 MSE: 0.01849
53
+ TrainLoss LRate: 0.0002 Epoch: 17 net: 0.19557 L1_g2d: 0.01158 L1_g3d: 0.01046 L1_recon: 0.08541 gamma_w: 1.00000
54
+ Val LRate: 0.0002 Epoch: 17 MSE: 0.00602
55
+ Train LRate: 0.0002 Epoch: 18 MSE: 0.01825
56
+ TrainLoss LRate: 0.0002 Epoch: 18 net: 0.19157 L1_g2d: 0.01111 L1_g3d: 0.01029 L1_recon: 0.08459 gamma_w: 1.00000
57
+ Val LRate: 0.0002 Epoch: 18 MSE: 0.00702
58
+ Train LRate: 0.0002 Epoch: 19 MSE: 0.01806
59
+ TrainLoss LRate: 0.0002 Epoch: 19 net: 0.18915 L1_g2d: 0.01089 L1_g3d: 0.01013 L1_recon: 0.08407 gamma_w: 1.00000
60
+ Val LRate: 0.0002 Epoch: 19 MSE: 0.00634
61
+ Train LRate: 0.0002 Epoch: 20 MSE: 0.01788
62
+ TrainLoss LRate: 0.0002 Epoch: 20 net: 0.18630 L1_g2d: 0.01060 L1_g3d: 0.00996 L1_recon: 0.08348 gamma_w: 1.00000
63
+ Val LRate: 0.0002 Epoch: 20 MSE: 0.00589
64
+ Train LRate: 0.0002 Epoch: 21 MSE: 0.01795
65
+ TrainLoss LRate: 0.0002 Epoch: 21 net: 0.18629 L1_g2d: 0.01067 L1_g3d: 0.00986 L1_recon: 0.08361 gamma_w: 1.00000
66
+ Val LRate: 0.0002 Epoch: 21 MSE: 0.00598
67
+ Train LRate: 0.0002 Epoch: 22 MSE: 0.01767
68
+ TrainLoss LRate: 0.0002 Epoch: 22 net: 0.18384 L1_g2d: 0.01047 L1_g3d: 0.00970 L1_recon: 0.08299 gamma_w: 1.00000
69
+ Val LRate: 0.0002 Epoch: 22 MSE: 0.00614
70
+ Train LRate: 0.0002 Epoch: 23 MSE: 0.01750
71
+ TrainLoss LRate: 0.0002 Epoch: 23 net: 0.18043 L1_g2d: 0.01009 L1_g3d: 0.00951 L1_recon: 0.08240 gamma_w: 1.00000
72
+ Val LRate: 0.0002 Epoch: 23 MSE: 0.00605
73
+ Train LRate: 0.0002 Epoch: 24 MSE: 0.01747
74
+ TrainLoss LRate: 0.0002 Epoch: 24 net: 0.17951 L1_g2d: 0.01000 L1_g3d: 0.00942 L1_recon: 0.08240 gamma_w: 1.00000
75
+ Val LRate: 0.0002 Epoch: 24 MSE: 0.00744
76
+ Train LRate: 0.0002 Epoch: 25 MSE: 0.01731
77
+ TrainLoss LRate: 0.0002 Epoch: 25 net: 0.17702 L1_g2d: 0.00977 L1_g3d: 0.00927 L1_recon: 0.08180 gamma_w: 1.00000
78
+ Val LRate: 0.0002 Epoch: 25 MSE: 0.00602
79
+ Train LRate: 0.00018181818181818183 Epoch: 26 MSE: 0.01726
80
+ TrainLoss LRate: 0.00018181818181818183 Epoch: 26 net: 0.17589 L1_g2d: 0.00973 L1_g3d: 0.00913 L1_recon: 0.08159 gamma_w: 1.00000
81
+ Val LRate: 0.00018181818181818183 Epoch: 26 MSE: 0.00608
82
+ Train LRate: 0.00016363636363636363 Epoch: 27 MSE: 0.01683
83
+ TrainLoss LRate: 0.00016363636363636363 Epoch: 27 net: 0.17074 L1_g2d: 0.00917 L1_g3d: 0.00892 L1_recon: 0.08029 gamma_w: 1.00000
84
+ Val LRate: 0.00016363636363636363 Epoch: 27 MSE: 0.00621
85
+ Train LRate: 0.00014545454545454546 Epoch: 28 MSE: 0.01659
86
+ TrainLoss LRate: 0.00014545454545454546 Epoch: 28 net: 0.16742 L1_g2d: 0.00890 L1_g3d: 0.00867 L1_recon: 0.07958 gamma_w: 1.00000
87
+ Val LRate: 0.00014545454545454546 Epoch: 28 MSE: 0.00510
88
+ Train LRate: 0.00012727272727272728 Epoch: 29 MSE: 0.01617
89
+ TrainLoss LRate: 0.00012727272727272728 Epoch: 29 net: 0.16291 L1_g2d: 0.00849 L1_g3d: 0.00842 L1_recon: 0.07833 gamma_w: 1.00000
90
+ Val LRate: 0.00012727272727272728 Epoch: 29 MSE: 0.00602
91
+ Train LRate: 0.00010909090909090909 Epoch: 30 MSE: 0.01584
92
+ TrainLoss LRate: 0.00010909090909090909 Epoch: 30 net: 0.15880 L1_g2d: 0.00810 L1_g3d: 0.00822 L1_recon: 0.07720 gamma_w: 1.00000
93
+ Val LRate: 0.00010909090909090909 Epoch: 30 MSE: 0.00571
94
+ Train LRate: 9.090909090909092e-05 Epoch: 31 MSE: 0.01553
95
+ TrainLoss LRate: 9.090909090909092e-05 Epoch: 31 net: 0.15492 L1_g2d: 0.00774 L1_g3d: 0.00800 L1_recon: 0.07619 gamma_w: 1.00000
96
+ Val LRate: 9.090909090909092e-05 Epoch: 31 MSE: 0.00517
97
+ Train LRate: 7.272727272727273e-05 Epoch: 32 MSE: 0.01512
98
+ TrainLoss LRate: 7.272727272727273e-05 Epoch: 32 net: 0.15098 L1_g2d: 0.00742 L1_g3d: 0.00779 L1_recon: 0.07493 gamma_w: 1.00000
99
+ Val LRate: 7.272727272727273e-05 Epoch: 32 MSE: 0.00449
100
+ Train LRate: 5.4545454545454546e-05 Epoch: 33 MSE: 0.01474
101
+ TrainLoss LRate: 5.4545454545454546e-05 Epoch: 33 net: 0.14681 L1_g2d: 0.00704 L1_g3d: 0.00758 L1_recon: 0.07368 gamma_w: 1.00000
102
+ Val LRate: 5.4545454545454546e-05 Epoch: 33 MSE: 0.00450
103
+ Train LRate: 3.636363636363636e-05 Epoch: 34 MSE: 0.01435
104
+ TrainLoss LRate: 3.636363636363636e-05 Epoch: 34 net: 0.14318 L1_g2d: 0.00676 L1_g3d: 0.00740 L1_recon: 0.07237 gamma_w: 1.00000
105
+ Val LRate: 3.636363636363636e-05 Epoch: 34 MSE: 0.00439
106
+ Train LRate: 1.818181818181819e-05 Epoch: 35 MSE: 0.01392
107
+ TrainLoss LRate: 1.818181818181819e-05 Epoch: 35 net: 0.13949 L1_g2d: 0.00648 L1_g3d: 0.00721 L1_recon: 0.07104 gamma_w: 1.00000
108
+ Val LRate: 1.818181818181819e-05 Epoch: 35 MSE: 0.00406
109
+ Train LRate: 0.0 Epoch: 36 MSE: 0.01356
110
+ TrainLoss LRate: 0.0 Epoch: 36 net: 0.13632 L1_g2d: 0.00626 L1_g3d: 0.00705 L1_recon: 0.06977 gamma_w: 1.00000
111
+ Val LRate: 0.0 Epoch: 36 MSE: 0.00392
log_full_uvdoc_gpu0/verify_val_ep12_infer/metrics.txt ADDED
@@ -0,0 +1,1001 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ 13024 mse=0.013592
2
+ 11093 mse=0.004805
3
+ 08684 mse=0.010333
4
+ 15384 mse=0.019671
5
+ 02451 mse=0.006121
6
+ 05058 mse=0.004789
7
+ 13434 mse=0.003694
8
+ 04723 mse=0.001129
9
+ 00502 mse=0.013174
10
+ 14972 mse=0.003360
11
+ 17234 mse=0.019906
12
+ 03267 mse=0.005335
13
+ 14427 mse=0.005367
14
+ 03288 mse=0.002329
15
+ 10878 mse=0.009916
16
+ 16459 mse=0.000456
17
+ 19366 mse=0.002625
18
+ 01213 mse=0.005532
19
+ 16230 mse=0.010599
20
+ 04984 mse=0.006088
21
+ 12651 mse=0.002175
22
+ 12492 mse=0.000191
23
+ 19785 mse=0.005227
24
+ 10426 mse=0.016221
25
+ 01430 mse=0.005738
26
+ 13460 mse=0.003349
27
+ 07653 mse=0.018954
28
+ 12549 mse=0.001932
29
+ 18736 mse=0.004445
30
+ 07825 mse=0.012490
31
+ 06181 mse=0.003514
32
+ 10544 mse=0.006107
33
+ 14920 mse=0.012550
34
+ 12360 mse=0.000237
35
+ 17807 mse=0.002510
36
+ 18085 mse=0.002518
37
+ 10893 mse=0.004129
38
+ 11309 mse=0.007160
39
+ 15870 mse=0.001475
40
+ 17648 mse=0.001272
41
+ 14632 mse=0.006853
42
+ 04869 mse=0.004018
43
+ 16044 mse=0.010716
44
+ 19002 mse=0.004499
45
+ 09783 mse=0.004143
46
+ 09268 mse=0.007667
47
+ 00084 mse=0.009441
48
+ 08754 mse=0.001593
49
+ 16637 mse=0.004336
50
+ 11137 mse=0.020461
51
+ 13596 mse=0.003262
52
+ 19295 mse=0.022811
53
+ 02287 mse=0.006038
54
+ 04473 mse=0.002072
55
+ 16654 mse=0.003670
56
+ 06245 mse=0.003957
57
+ 12569 mse=0.003345
58
+ 04739 mse=0.001998
59
+ 18063 mse=0.002227
60
+ 14967 mse=0.003881
61
+ 07986 mse=0.001125
62
+ 15473 mse=0.007439
63
+ 15810 mse=0.006675
64
+ 13614 mse=0.003320
65
+ 08959 mse=0.003891
66
+ 16674 mse=0.008336
67
+ 09563 mse=0.006463
68
+ 16330 mse=0.003470
69
+ 05045 mse=0.003382
70
+ 03849 mse=0.011235
71
+ 15120 mse=0.002886
72
+ 03717 mse=0.002930
73
+ 19813 mse=0.004372
74
+ 03054 mse=0.006552
75
+ 13369 mse=0.013005
76
+ 18096 mse=0.008426
77
+ 00448 mse=0.004613
78
+ 18049 mse=0.004277
79
+ 10182 mse=0.008457
80
+ 05189 mse=0.006347
81
+ 07841 mse=0.012988
82
+ 00945 mse=0.005872
83
+ 03978 mse=0.003278
84
+ 15883 mse=0.004921
85
+ 14356 mse=0.007920
86
+ 09438 mse=0.003210
87
+ 04293 mse=0.000493
88
+ 17996 mse=0.005766
89
+ 08750 mse=0.000794
90
+ 06670 mse=0.008790
91
+ 01691 mse=0.003391
92
+ 14484 mse=0.009287
93
+ 16334 mse=0.011175
94
+ 11854 mse=0.011717
95
+ 16988 mse=0.005557
96
+ 06146 mse=0.006612
97
+ 01776 mse=0.005343
98
+ 13541 mse=0.003095
99
+ 15138 mse=0.011562
100
+ 17779 mse=0.004368
101
+ 00037 mse=0.009647
102
+ 16862 mse=0.000283
103
+ 13244 mse=0.008471
104
+ 03798 mse=0.012445
105
+ 10593 mse=0.004637
106
+ 08270 mse=0.006847
107
+ 10306 mse=0.001354
108
+ 02885 mse=0.006194
109
+ 14903 mse=0.003955
110
+ 01202 mse=0.004690
111
+ 18025 mse=0.009777
112
+ 13122 mse=0.007263
113
+ 09444 mse=0.003730
114
+ 16235 mse=0.007111
115
+ 13488 mse=0.002116
116
+ 10807 mse=0.006151
117
+ 05269 mse=0.010568
118
+ 03427 mse=0.014200
119
+ 10520 mse=0.015780
120
+ 10921 mse=0.002616
121
+ 13380 mse=0.007439
122
+ 11401 mse=0.005065
123
+ 15401 mse=0.005917
124
+ 02989 mse=0.020380
125
+ 16112 mse=0.014466
126
+ 14283 mse=0.010622
127
+ 05947 mse=0.010291
128
+ 09114 mse=0.020833
129
+ 14490 mse=0.001560
130
+ 17005 mse=0.005340
131
+ 15417 mse=0.021213
132
+ 05740 mse=0.002117
133
+ 01318 mse=0.005710
134
+ 18882 mse=0.002263
135
+ 08831 mse=0.000438
136
+ 00679 mse=0.003458
137
+ 18848 mse=0.004376
138
+ 19801 mse=0.006360
139
+ 02992 mse=0.011109
140
+ 00847 mse=0.004666
141
+ 08204 mse=0.001943
142
+ 19315 mse=0.014072
143
+ 14156 mse=0.004043
144
+ 01306 mse=0.005419
145
+ 02635 mse=0.009990
146
+ 14344 mse=0.003637
147
+ 16180 mse=0.004619
148
+ 17694 mse=0.006835
149
+ 16399 mse=0.000178
150
+ 08920 mse=0.005907
151
+ 13189 mse=0.009695
152
+ 09964 mse=0.005229
153
+ 15358 mse=0.010141
154
+ 04074 mse=0.003589
155
+ 14390 mse=0.003879
156
+ 09457 mse=0.005501
157
+ 18651 mse=0.002579
158
+ 10079 mse=0.004674
159
+ 05436 mse=0.002362
160
+ 01954 mse=0.005140
161
+ 19929 mse=0.002276
162
+ 04660 mse=0.001914
163
+ 08704 mse=0.001942
164
+ 04882 mse=0.004099
165
+ 07366 mse=0.017801
166
+ 17844 mse=0.004283
167
+ 03760 mse=0.007480
168
+ 13930 mse=0.004852
169
+ 06482 mse=0.008570
170
+ 08668 mse=0.005134
171
+ 07648 mse=0.003510
172
+ 02332 mse=0.006791
173
+ 14877 mse=0.004230
174
+ 09264 mse=0.004825
175
+ 09440 mse=0.001539
176
+ 07585 mse=0.006412
177
+ 01069 mse=0.006695
178
+ 09543 mse=0.005231
179
+ 15398 mse=0.005820
180
+ 19199 mse=0.009465
181
+ 13386 mse=0.004845
182
+ 00087 mse=0.003284
183
+ 05129 mse=0.009161
184
+ 02459 mse=0.007267
185
+ 18087 mse=0.012211
186
+ 05695 mse=0.002230
187
+ 07859 mse=0.004539
188
+ 05146 mse=0.007429
189
+ 02660 mse=0.004905
190
+ 08671 mse=0.005280
191
+ 06914 mse=0.003935
192
+ 06808 mse=0.002815
193
+ 03251 mse=0.003065
194
+ 01973 mse=0.008081
195
+ 18632 mse=0.009218
196
+ 10006 mse=0.004221
197
+ 14051 mse=0.001517
198
+ 09743 mse=0.005019
199
+ 14507 mse=0.006649
200
+ 14762 mse=0.002845
201
+ 19426 mse=0.008785
202
+ 17123 mse=0.009591
203
+ 07395 mse=0.005674
204
+ 13637 mse=0.001504
205
+ 18286 mse=0.027338
206
+ 03881 mse=0.002515
207
+ 08132 mse=0.019447
208
+ 02791 mse=0.001741
209
+ 09956 mse=0.007228
210
+ 18443 mse=0.012852
211
+ 04888 mse=0.005656
212
+ 04971 mse=0.009306
213
+ 01756 mse=0.007460
214
+ 18907 mse=0.003296
215
+ 02427 mse=0.015776
216
+ 16113 mse=0.008090
217
+ 13130 mse=0.004492
218
+ 02141 mse=0.006973
219
+ 03738 mse=0.006366
220
+ 15654 mse=0.006128
221
+ 08273 mse=0.020160
222
+ 01683 mse=0.001660
223
+ 06041 mse=0.005015
224
+ 11164 mse=0.014999
225
+ 14433 mse=0.001843
226
+ 15686 mse=0.004520
227
+ 15381 mse=0.005756
228
+ 09317 mse=0.009119
229
+ 18810 mse=0.007605
230
+ 00401 mse=0.000677
231
+ 03361 mse=0.007881
232
+ 16085 mse=0.008975
233
+ 04137 mse=0.007379
234
+ 09572 mse=0.007390
235
+ 18132 mse=0.004506
236
+ 01786 mse=0.006291
237
+ 01491 mse=0.003085
238
+ 08988 mse=0.013923
239
+ 04147 mse=0.005262
240
+ 19404 mse=0.005139
241
+ 00463 mse=0.006676
242
+ 09904 mse=0.014308
243
+ 05879 mse=0.003520
244
+ 11825 mse=0.006539
245
+ 07454 mse=0.008526
246
+ 09707 mse=0.000577
247
+ 06360 mse=0.005060
248
+ 03294 mse=0.008094
249
+ 09273 mse=0.023240
250
+ 17191 mse=0.003251
251
+ 09060 mse=0.003096
252
+ 15866 mse=0.007664
253
+ 07090 mse=0.007650
254
+ 06276 mse=0.004397
255
+ 06066 mse=0.006344
256
+ 17559 mse=0.002536
257
+ 03917 mse=0.002943
258
+ 10339 mse=0.008907
259
+ 06310 mse=0.004446
260
+ 03953 mse=0.002108
261
+ 07554 mse=0.002860
262
+ 08237 mse=0.007305
263
+ 10047 mse=0.014969
264
+ 08877 mse=0.003393
265
+ 14868 mse=0.006121
266
+ 11528 mse=0.002175
267
+ 10839 mse=0.003760
268
+ 18035 mse=0.002952
269
+ 13942 mse=0.005696
270
+ 11264 mse=0.005236
271
+ 17397 mse=0.012174
272
+ 18136 mse=0.004356
273
+ 15507 mse=0.009321
274
+ 12092 mse=0.015843
275
+ 19056 mse=0.004750
276
+ 10033 mse=0.002007
277
+ 07972 mse=0.003850
278
+ 13251 mse=0.002938
279
+ 07261 mse=0.004277
280
+ 09641 mse=0.004101
281
+ 03049 mse=0.005762
282
+ 00565 mse=0.007996
283
+ 14685 mse=0.002584
284
+ 16083 mse=0.013000
285
+ 15600 mse=0.005931
286
+ 07874 mse=0.012112
287
+ 18192 mse=0.002150
288
+ 16009 mse=0.001184
289
+ 19748 mse=0.015839
290
+ 07908 mse=0.005978
291
+ 02805 mse=0.003082
292
+ 06260 mse=0.005192
293
+ 16928 mse=0.000296
294
+ 00332 mse=0.000355
295
+ 18212 mse=0.004046
296
+ 09052 mse=0.008125
297
+ 09163 mse=0.023424
298
+ 11152 mse=0.004150
299
+ 08465 mse=0.000505
300
+ 11626 mse=0.004183
301
+ 06950 mse=0.003077
302
+ 15976 mse=0.003224
303
+ 05958 mse=0.007775
304
+ 10650 mse=0.003275
305
+ 16197 mse=0.008717
306
+ 00940 mse=0.009285
307
+ 11023 mse=0.003402
308
+ 19741 mse=0.003408
309
+ 18672 mse=0.006676
310
+ 15227 mse=0.003110
311
+ 07990 mse=0.009484
312
+ 14771 mse=0.002472
313
+ 09139 mse=0.006769
314
+ 02839 mse=0.002735
315
+ 08044 mse=0.008896
316
+ 10561 mse=0.004395
317
+ 08794 mse=0.000091
318
+ 14272 mse=0.003562
319
+ 15232 mse=0.005341
320
+ 15843 mse=0.003702
321
+ 17634 mse=0.004786
322
+ 08850 mse=0.000439
323
+ 17096 mse=0.001433
324
+ 05771 mse=0.008529
325
+ 14270 mse=0.002307
326
+ 05013 mse=0.003193
327
+ 15064 mse=0.008857
328
+ 11868 mse=0.001827
329
+ 14205 mse=0.011431
330
+ 03458 mse=0.002995
331
+ 16062 mse=0.006256
332
+ 13764 mse=0.011470
333
+ 00414 mse=0.000423
334
+ 11181 mse=0.002202
335
+ 16579 mse=0.000423
336
+ 02259 mse=0.004367
337
+ 12142 mse=0.014027
338
+ 14037 mse=0.005371
339
+ 17456 mse=0.006111
340
+ 11377 mse=0.006735
341
+ 01345 mse=0.002136
342
+ 18544 mse=0.005388
343
+ 03966 mse=0.004297
344
+ 09882 mse=0.004373
345
+ 03339 mse=0.012594
346
+ 07957 mse=0.006317
347
+ 02490 mse=0.010495
348
+ 06498 mse=0.005833
349
+ 12997 mse=0.007122
350
+ 12298 mse=0.009941
351
+ 18867 mse=0.009180
352
+ 10222 mse=0.009552
353
+ 00841 mse=0.009046
354
+ 18962 mse=0.005539
355
+ 19596 mse=0.001313
356
+ 10192 mse=0.005371
357
+ 05366 mse=0.002582
358
+ 19872 mse=0.006050
359
+ 05498 mse=0.002254
360
+ 12087 mse=0.005397
361
+ 08396 mse=0.000558
362
+ 15238 mse=0.010978
363
+ 07142 mse=0.005340
364
+ 18480 mse=0.002215
365
+ 03748 mse=0.004140
366
+ 15517 mse=0.009327
367
+ 04136 mse=0.012967
368
+ 07816 mse=0.005544
369
+ 04993 mse=0.005405
370
+ 00667 mse=0.001161
371
+ 06532 mse=0.012035
372
+ 08102 mse=0.003827
373
+ 01016 mse=0.011339
374
+ 01319 mse=0.003026
375
+ 02248 mse=0.003955
376
+ 07347 mse=0.005337
377
+ 11467 mse=0.006970
378
+ 01706 mse=0.004633
379
+ 17675 mse=0.005402
380
+ 00635 mse=0.002192
381
+ 15411 mse=0.005838
382
+ 02683 mse=0.007702
383
+ 08266 mse=0.007328
384
+ 13813 mse=0.006184
385
+ 09117 mse=0.017622
386
+ 19531 mse=0.004532
387
+ 11059 mse=0.010098
388
+ 10710 mse=0.004675
389
+ 14899 mse=0.002513
390
+ 06935 mse=0.012035
391
+ 10726 mse=0.003698
392
+ 12422 mse=0.000294
393
+ 08524 mse=0.007463
394
+ 01647 mse=0.009652
395
+ 05954 mse=0.003376
396
+ 15128 mse=0.006336
397
+ 04446 mse=0.000312
398
+ 14044 mse=0.007312
399
+ 02750 mse=0.010362
400
+ 00888 mse=0.017365
401
+ 18493 mse=0.004126
402
+ 19395 mse=0.005806
403
+ 12911 mse=0.003836
404
+ 00883 mse=0.001296
405
+ 19927 mse=0.003950
406
+ 04182 mse=0.002908
407
+ 15315 mse=0.009212
408
+ 05408 mse=0.004999
409
+ 13248 mse=0.005942
410
+ 11014 mse=0.011532
411
+ 12742 mse=0.000877
412
+ 00951 mse=0.008686
413
+ 19747 mse=0.003179
414
+ 14281 mse=0.010931
415
+ 08943 mse=0.006028
416
+ 07190 mse=0.009106
417
+ 15998 mse=0.003195
418
+ 07226 mse=0.004107
419
+ 17895 mse=0.009976
420
+ 17636 mse=0.004723
421
+ 13730 mse=0.002176
422
+ 12779 mse=0.000939
423
+ 09784 mse=0.012018
424
+ 11526 mse=0.002732
425
+ 00193 mse=0.003580
426
+ 15629 mse=0.003651
427
+ 12464 mse=0.000513
428
+ 18864 mse=0.010520
429
+ 00640 mse=0.001982
430
+ 19139 mse=0.010358
431
+ 12762 mse=0.001432
432
+ 13767 mse=0.006102
433
+ 06777 mse=0.004732
434
+ 02040 mse=0.011819
435
+ 13624 mse=0.003338
436
+ 15267 mse=0.015659
437
+ 17780 mse=0.016504
438
+ 10842 mse=0.001999
439
+ 13356 mse=0.009999
440
+ 13594 mse=0.004251
441
+ 02104 mse=0.007057
442
+ 07010 mse=0.004739
443
+ 05007 mse=0.002228
444
+ 12551 mse=0.001664
445
+ 07577 mse=0.010430
446
+ 04942 mse=0.006112
447
+ 04534 mse=0.002691
448
+ 02640 mse=0.003778
449
+ 17699 mse=0.006536
450
+ 10476 mse=0.010439
451
+ 11005 mse=0.006676
452
+ 08903 mse=0.007070
453
+ 07683 mse=0.009860
454
+ 09362 mse=0.002731
455
+ 02538 mse=0.003612
456
+ 14413 mse=0.002838
457
+ 08998 mse=0.007377
458
+ 07839 mse=0.004246
459
+ 15879 mse=0.007475
460
+ 17081 mse=0.005933
461
+ 09086 mse=0.003096
462
+ 08101 mse=0.005517
463
+ 08493 mse=0.011169
464
+ 14747 mse=0.007154
465
+ 15554 mse=0.004717
466
+ 05215 mse=0.006625
467
+ 14611 mse=0.000665
468
+ 17952 mse=0.004754
469
+ 13983 mse=0.004664
470
+ 16540 mse=0.000416
471
+ 14502 mse=0.012893
472
+ 10397 mse=0.002089
473
+ 18314 mse=0.007516
474
+ 17404 mse=0.007627
475
+ 15225 mse=0.001843
476
+ 04369 mse=0.000670
477
+ 14957 mse=0.004946
478
+ 03976 mse=0.003730
479
+ 08165 mse=0.007345
480
+ 18265 mse=0.014817
481
+ 01645 mse=0.006865
482
+ 15222 mse=0.003725
483
+ 16972 mse=0.005263
484
+ 05763 mse=0.010498
485
+ 07170 mse=0.004263
486
+ 13931 mse=0.006381
487
+ 03492 mse=0.004798
488
+ 00181 mse=0.004791
489
+ 04835 mse=0.000131
490
+ 07994 mse=0.005594
491
+ 13095 mse=0.002430
492
+ 16199 mse=0.018779
493
+ 12582 mse=0.001651
494
+ 06371 mse=0.004494
495
+ 18863 mse=0.014359
496
+ 13580 mse=0.001739
497
+ 14923 mse=0.006933
498
+ 02386 mse=0.009795
499
+ 15569 mse=0.003373
500
+ 08023 mse=0.010031
501
+ 01514 mse=0.002425
502
+ 00800 mse=0.000258
503
+ 04828 mse=0.000532
504
+ 06525 mse=0.010423
505
+ 07361 mse=0.010910
506
+ 10171 mse=0.006608
507
+ 07696 mse=0.007901
508
+ 03060 mse=0.002002
509
+ 10983 mse=0.004688
510
+ 16890 mse=0.000462
511
+ 09299 mse=0.004427
512
+ 02778 mse=0.004951
513
+ 05560 mse=0.009737
514
+ 15505 mse=0.011543
515
+ 16750 mse=0.007996
516
+ 07002 mse=0.004323
517
+ 14488 mse=0.005948
518
+ 14476 mse=0.005617
519
+ 15236 mse=0.010391
520
+ 10559 mse=0.008287
521
+ 19004 mse=0.002443
522
+ 14086 mse=0.015066
523
+ 06887 mse=0.005182
524
+ 09401 mse=0.002061
525
+ 09957 mse=0.008144
526
+ 00013 mse=0.005464
527
+ 17954 mse=0.010544
528
+ 13306 mse=0.040973
529
+ 09861 mse=0.015161
530
+ 18648 mse=0.009899
531
+ 05702 mse=0.004912
532
+ 12423 mse=0.000515
533
+ 13777 mse=0.004517
534
+ 06286 mse=0.006292
535
+ 04170 mse=0.012163
536
+ 18166 mse=0.014863
537
+ 09688 mse=0.004297
538
+ 13185 mse=0.007625
539
+ 10688 mse=0.016156
540
+ 13382 mse=0.023912
541
+ 10134 mse=0.004477
542
+ 16617 mse=0.001585
543
+ 09840 mse=0.010508
544
+ 04083 mse=0.006149
545
+ 10286 mse=0.012526
546
+ 10819 mse=0.011054
547
+ 02292 mse=0.006911
548
+ 14132 mse=0.004875
549
+ 11934 mse=0.003254
550
+ 06456 mse=0.019909
551
+ 17643 mse=0.002607
552
+ 17036 mse=0.006798
553
+ 00055 mse=0.005215
554
+ 14288 mse=0.014392
555
+ 01454 mse=0.006386
556
+ 08346 mse=0.000335
557
+ 06228 mse=0.005780
558
+ 18893 mse=0.005317
559
+ 12622 mse=0.014434
560
+ 03789 mse=0.003452
561
+ 16758 mse=0.001055
562
+ 14299 mse=0.021217
563
+ 10278 mse=0.007585
564
+ 11327 mse=0.004929
565
+ 14239 mse=0.003540
566
+ 19548 mse=0.009048
567
+ 03552 mse=0.022578
568
+ 01253 mse=0.005894
569
+ 08698 mse=0.000867
570
+ 19024 mse=0.002185
571
+ 05851 mse=0.008123
572
+ 08559 mse=0.005700
573
+ 03779 mse=0.007637
574
+ 00904 mse=0.008019
575
+ 10851 mse=0.003364
576
+ 17570 mse=0.002191
577
+ 13096 mse=0.005568
578
+ 16692 mse=0.002952
579
+ 11506 mse=0.003977
580
+ 09146 mse=0.008194
581
+ 02274 mse=0.004023
582
+ 09129 mse=0.013056
583
+ 10756 mse=0.008751
584
+ 13056 mse=0.015209
585
+ 06328 mse=0.006939
586
+ 00775 mse=0.000719
587
+ 07304 mse=0.005530
588
+ 07457 mse=0.005769
589
+ 10000 mse=0.002826
590
+ 11457 mse=0.005553
591
+ 15083 mse=0.003962
592
+ 06539 mse=0.003780
593
+ 07288 mse=0.015032
594
+ 15423 mse=0.010442
595
+ 01268 mse=0.010835
596
+ 12535 mse=0.007065
597
+ 03542 mse=0.005691
598
+ 05216 mse=0.008257
599
+ 08742 mse=0.001468
600
+ 08131 mse=0.011416
601
+ 13491 mse=0.009239
602
+ 10885 mse=0.002555
603
+ 05877 mse=0.005266
604
+ 00812 mse=0.003922
605
+ 13510 mse=0.007523
606
+ 05801 mse=0.013077
607
+ 05324 mse=0.009311
608
+ 07757 mse=0.002630
609
+ 05064 mse=0.003532
610
+ 13316 mse=0.008569
611
+ 18346 mse=0.004035
612
+ 11589 mse=0.010977
613
+ 03368 mse=0.010653
614
+ 08176 mse=0.004915
615
+ 06883 mse=0.012554
616
+ 11724 mse=0.010935
617
+ 01307 mse=0.000943
618
+ 11948 mse=0.005417
619
+ 10101 mse=0.003065
620
+ 01370 mse=0.001716
621
+ 04176 mse=0.001732
622
+ 14083 mse=0.004617
623
+ 04852 mse=0.012606
624
+ 18105 mse=0.004661
625
+ 19087 mse=0.002899
626
+ 12098 mse=0.002071
627
+ 01180 mse=0.032120
628
+ 17877 mse=0.010336
629
+ 04884 mse=0.003506
630
+ 02465 mse=0.008399
631
+ 19844 mse=0.005397
632
+ 00316 mse=0.000524
633
+ 18379 mse=0.007948
634
+ 14014 mse=0.005911
635
+ 18077 mse=0.006781
636
+ 14478 mse=0.004664
637
+ 05294 mse=0.021503
638
+ 08583 mse=0.002269
639
+ 04286 mse=0.000144
640
+ 10929 mse=0.005501
641
+ 00116 mse=0.015530
642
+ 01444 mse=0.004305
643
+ 09066 mse=0.004321
644
+ 19778 mse=0.003098
645
+ 03024 mse=0.010053
646
+ 01664 mse=0.007941
647
+ 08228 mse=0.004464
648
+ 16008 mse=0.003189
649
+ 16561 mse=0.003868
650
+ 08650 mse=0.002290
651
+ 06671 mse=0.003400
652
+ 11235 mse=0.010489
653
+ 06901 mse=0.009372
654
+ 09232 mse=0.004964
655
+ 08924 mse=0.021215
656
+ 05093 mse=0.005350
657
+ 19874 mse=0.009032
658
+ 03507 mse=0.003076
659
+ 03782 mse=0.022239
660
+ 08666 mse=0.003655
661
+ 04400 mse=0.000548
662
+ 03394 mse=0.002589
663
+ 09810 mse=0.012773
664
+ 18173 mse=0.003096
665
+ 00256 mse=0.000407
666
+ 17332 mse=0.013669
667
+ 09912 mse=0.005981
668
+ 17800 mse=0.002989
669
+ 14358 mse=0.007492
670
+ 05169 mse=0.015356
671
+ 09338 mse=0.008593
672
+ 12108 mse=0.004350
673
+ 08004 mse=0.005980
674
+ 02254 mse=0.010440
675
+ 11436 mse=0.006910
676
+ 04340 mse=0.000543
677
+ 19738 mse=0.006448
678
+ 16576 mse=0.004364
679
+ 06984 mse=0.003509
680
+ 17617 mse=0.004963
681
+ 02400 mse=0.015990
682
+ 03276 mse=0.005885
683
+ 18448 mse=0.008356
684
+ 15017 mse=0.003819
685
+ 00305 mse=0.000113
686
+ 02377 mse=0.011819
687
+ 10360 mse=0.005368
688
+ 14982 mse=0.011285
689
+ 09830 mse=0.006814
690
+ 04288 mse=0.000242
691
+ 12969 mse=0.005355
692
+ 08703 mse=0.000688
693
+ 07821 mse=0.005969
694
+ 10295 mse=0.003671
695
+ 06693 mse=0.004699
696
+ 08544 mse=0.005174
697
+ 10366 mse=0.004704
698
+ 17130 mse=0.003539
699
+ 18521 mse=0.002302
700
+ 19125 mse=0.003507
701
+ 13737 mse=0.003372
702
+ 02686 mse=0.003147
703
+ 01302 mse=0.005836
704
+ 19481 mse=0.005929
705
+ 18970 mse=0.009271
706
+ 08067 mse=0.002808
707
+ 18667 mse=0.007331
708
+ 03928 mse=0.028819
709
+ 13230 mse=0.004064
710
+ 07707 mse=0.005022
711
+ 02226 mse=0.012873
712
+ 19498 mse=0.004052
713
+ 02245 mse=0.006626
714
+ 06089 mse=0.002865
715
+ 02625 mse=0.008537
716
+ 16640 mse=0.006780
717
+ 01863 mse=0.015235
718
+ 05158 mse=0.005529
719
+ 17403 mse=0.012846
720
+ 16477 mse=0.000135
721
+ 15623 mse=0.016849
722
+ 19142 mse=0.006173
723
+ 01643 mse=0.003417
724
+ 01873 mse=0.006018
725
+ 10276 mse=0.010539
726
+ 01997 mse=0.012274
727
+ 17766 mse=0.002147
728
+ 18978 mse=0.003328
729
+ 01916 mse=0.006207
730
+ 07133 mse=0.010933
731
+ 09722 mse=0.008000
732
+ 06222 mse=0.003645
733
+ 05072 mse=0.002408
734
+ 15947 mse=0.002638
735
+ 18211 mse=0.003234
736
+ 13861 mse=0.018975
737
+ 09347 mse=0.007893
738
+ 14909 mse=0.009625
739
+ 08690 mse=0.001223
740
+ 12793 mse=0.001838
741
+ 00070 mse=0.019099
742
+ 12418 mse=0.000195
743
+ 05394 mse=0.008171
744
+ 01921 mse=0.004962
745
+ 13141 mse=0.003647
746
+ 07004 mse=0.004832
747
+ 15773 mse=0.003870
748
+ 15913 mse=0.005211
749
+ 13317 mse=0.002295
750
+ 05449 mse=0.003415
751
+ 07745 mse=0.001731
752
+ 03056 mse=0.020637
753
+ 00483 mse=0.003349
754
+ 17713 mse=0.004062
755
+ 01657 mse=0.011674
756
+ 03208 mse=0.007049
757
+ 18033 mse=0.011215
758
+ 14520 mse=0.002908
759
+ 02470 mse=0.007919
760
+ 08185 mse=0.003485
761
+ 15159 mse=0.003266
762
+ 09127 mse=0.004895
763
+ 06012 mse=0.002327
764
+ 13824 mse=0.007023
765
+ 04593 mse=0.008154
766
+ 14700 mse=0.015028
767
+ 17573 mse=0.011038
768
+ 06232 mse=0.004607
769
+ 06278 mse=0.005450
770
+ 08147 mse=0.004964
771
+ 03580 mse=0.005904
772
+ 11118 mse=0.004537
773
+ 13193 mse=0.011521
774
+ 01986 mse=0.008747
775
+ 03224 mse=0.001771
776
+ 01775 mse=0.006740
777
+ 15303 mse=0.009594
778
+ 13470 mse=0.003670
779
+ 13879 mse=0.007771
780
+ 11609 mse=0.009756
781
+ 14124 mse=0.003176
782
+ 03176 mse=0.010371
783
+ 03090 mse=0.010388
784
+ 06239 mse=0.015475
785
+ 13338 mse=0.006884
786
+ 15498 mse=0.002574
787
+ 07962 mse=0.007727
788
+ 15488 mse=0.003228
789
+ 18881 mse=0.009415
790
+ 18711 mse=0.002872
791
+ 04335 mse=0.000312
792
+ 17669 mse=0.001634
793
+ 07020 mse=0.003957
794
+ 15906 mse=0.002830
795
+ 09125 mse=0.015277
796
+ 07798 mse=0.017547
797
+ 16847 mse=0.000129
798
+ 02321 mse=0.005903
799
+ 10827 mse=0.004561
800
+ 01029 mse=0.005611
801
+ 02208 mse=0.003085
802
+ 07501 mse=0.005096
803
+ 01929 mse=0.002922
804
+ 02326 mse=0.025720
805
+ 00235 mse=0.003902
806
+ 07216 mse=0.004973
807
+ 19282 mse=0.003903
808
+ 07540 mse=0.010510
809
+ 18150 mse=0.015341
810
+ 19277 mse=0.006658
811
+ 00689 mse=0.001561
812
+ 11078 mse=0.008348
813
+ 02098 mse=0.002378
814
+ 07362 mse=0.013275
815
+ 08123 mse=0.017092
816
+ 03965 mse=0.005523
817
+ 14794 mse=0.003243
818
+ 16959 mse=0.014355
819
+ 14355 mse=0.003741
820
+ 12236 mse=0.004858
821
+ 13074 mse=0.003443
822
+ 10214 mse=0.004258
823
+ 06591 mse=0.003775
824
+ 17671 mse=0.010929
825
+ 06940 mse=0.002097
826
+ 13865 mse=0.005914
827
+ 17290 mse=0.006832
828
+ 08685 mse=0.016334
829
+ 05410 mse=0.008638
830
+ 18015 mse=0.009019
831
+ 15574 mse=0.011938
832
+ 04207 mse=0.006514
833
+ 04120 mse=0.007746
834
+ 17455 mse=0.008978
835
+ 02267 mse=0.011783
836
+ 15924 mse=0.008287
837
+ 02806 mse=0.005054
838
+ 02580 mse=0.003646
839
+ 18591 mse=0.005672
840
+ 07892 mse=0.003923
841
+ 01898 mse=0.004494
842
+ 07846 mse=0.002609
843
+ 10076 mse=0.005584
844
+ 11894 mse=0.010259
845
+ 03665 mse=0.009702
846
+ 00638 mse=0.004971
847
+ 16010 mse=0.004073
848
+ 10621 mse=0.004637
849
+ 19626 mse=0.007213
850
+ 00018 mse=0.006267
851
+ 17378 mse=0.015393
852
+ 17674 mse=0.002557
853
+ 05293 mse=0.014019
854
+ 12252 mse=0.008291
855
+ 19885 mse=0.017402
856
+ 06517 mse=0.008035
857
+ 19859 mse=0.004822
858
+ 09779 mse=0.002638
859
+ 03486 mse=0.003352
860
+ 16635 mse=0.007544
861
+ 05854 mse=0.007090
862
+ 16403 mse=0.001594
863
+ 08630 mse=0.002533
864
+ 00106 mse=0.012805
865
+ 14867 mse=0.007520
866
+ 05182 mse=0.009494
867
+ 14246 mse=0.013231
868
+ 09617 mse=0.001610
869
+ 03655 mse=0.018741
870
+ 11146 mse=0.005865
871
+ 08743 mse=0.001506
872
+ 17595 mse=0.005043
873
+ 03753 mse=0.015660
874
+ 00376 mse=0.000177
875
+ 18128 mse=0.011650
876
+ 08238 mse=0.006696
877
+ 17338 mse=0.012221
878
+ 15337 mse=0.005766
879
+ 19526 mse=0.010126
880
+ 12504 mse=0.001161
881
+ 12608 mse=0.005191
882
+ 02081 mse=0.014520
883
+ 19543 mse=0.006403
884
+ 13833 mse=0.010069
885
+ 05242 mse=0.008055
886
+ 05008 mse=0.005635
887
+ 03592 mse=0.004524
888
+ 01543 mse=0.011988
889
+ 02978 mse=0.005750
890
+ 16171 mse=0.007243
891
+ 16696 mse=0.011489
892
+ 04532 mse=0.004631
893
+ 07186 mse=0.009895
894
+ 11861 mse=0.005365
895
+ 13087 mse=0.004986
896
+ 19121 mse=0.010848
897
+ 14038 mse=0.003433
898
+ 19155 mse=0.008910
899
+ 08609 mse=0.001800
900
+ 17661 mse=0.007982
901
+ 18394 mse=0.006856
902
+ 08081 mse=0.016075
903
+ 04575 mse=0.010351
904
+ 08679 mse=0.003621
905
+ 04681 mse=0.000964
906
+ 15035 mse=0.012249
907
+ 12964 mse=0.005834
908
+ 16358 mse=0.004575
909
+ 06967 mse=0.028753
910
+ 10311 mse=0.004275
911
+ 18585 mse=0.014650
912
+ 06913 mse=0.008484
913
+ 02168 mse=0.015948
914
+ 08773 mse=0.002038
915
+ 13145 mse=0.003065
916
+ 10336 mse=0.017272
917
+ 01051 mse=0.008510
918
+ 07505 mse=0.003368
919
+ 01832 mse=0.009042
920
+ 10626 mse=0.000839
921
+ 07196 mse=0.008562
922
+ 18250 mse=0.001764
923
+ 08845 mse=0.000329
924
+ 12433 mse=0.000763
925
+ 15147 mse=0.011794
926
+ 05354 mse=0.001267
927
+ 08021 mse=0.010643
928
+ 17502 mse=0.002931
929
+ 05607 mse=0.004029
930
+ 02339 mse=0.002522
931
+ 08748 mse=0.000439
932
+ 06865 mse=0.002523
933
+ 11641 mse=0.008693
934
+ 12130 mse=0.005906
935
+ 05329 mse=0.002781
936
+ 11954 mse=0.003806
937
+ 14857 mse=0.004294
938
+ 09108 mse=0.014271
939
+ 12455 mse=0.000913
940
+ 03309 mse=0.005440
941
+ 07628 mse=0.005613
942
+ 02614 mse=0.002911
943
+ 09482 mse=0.008526
944
+ 07467 mse=0.006411
945
+ 01501 mse=0.002836
946
+ 02279 mse=0.006905
947
+ 06300 mse=0.012404
948
+ 18918 mse=0.005894
949
+ 11850 mse=0.005871
950
+ 09606 mse=0.002263
951
+ 18089 mse=0.017444
952
+ 02582 mse=0.003119
953
+ 12403 mse=0.000606
954
+ 04090 mse=0.003476
955
+ 17571 mse=0.002986
956
+ 15054 mse=0.006459
957
+ 01423 mse=0.003188
958
+ 08667 mse=0.008596
959
+ 19782 mse=0.019427
960
+ 11270 mse=0.006343
961
+ 11763 mse=0.005554
962
+ 03169 mse=0.016593
963
+ 12449 mse=0.000495
964
+ 03039 mse=0.002252
965
+ 03349 mse=0.010690
966
+ 11029 mse=0.004264
967
+ 07055 mse=0.003639
968
+ 05094 mse=0.008496
969
+ 09105 mse=0.002958
970
+ 11149 mse=0.010304
971
+ 13848 mse=0.004117
972
+ 05231 mse=0.030994
973
+ 00212 mse=0.007919
974
+ 09115 mse=0.005759
975
+ 19309 mse=0.017774
976
+ 14719 mse=0.015124
977
+ 07223 mse=0.005964
978
+ 13746 mse=0.001556
979
+ 17856 mse=0.005898
980
+ 06515 mse=0.008227
981
+ 18390 mse=0.007518
982
+ 00869 mse=0.004310
983
+ 19726 mse=0.015906
984
+ 16559 mse=0.003382
985
+ 07623 mse=0.011249
986
+ 07164 mse=0.016578
987
+ 03070 mse=0.003559
988
+ 00976 mse=0.002195
989
+ 01041 mse=0.007530
990
+ 13825 mse=0.003102
991
+ 19349 mse=0.017139
992
+ 02848 mse=0.013111
993
+ 17870 mse=0.004815
994
+ 03358 mse=0.013159
995
+ 04572 mse=0.011368
996
+ 07314 mse=0.004934
997
+ 08024 mse=0.005998
998
+ 09012 mse=0.019656
999
+ 00819 mse=0.004931
1000
+ 03648 mse=0.009436
1001
+ mean_mse 0.00672006 n=1000
requirements_baseline.txt ADDED
@@ -0,0 +1,6 @@
 
 
 
 
 
 
 
1
+ torch>=2.0
2
+ torchvision>=0.15
3
+ numpy>=1.22
4
+ opencv-python>=4.5
5
+ h5py>=3.8
6
+ albumentations>=1.3.0
requirements_uvdoc_train.txt ADDED
@@ -0,0 +1,9 @@
 
 
 
 
 
 
 
 
 
 
1
+ # UVDoc 官方训练 + 本地 baseline 共用(已用 conda env uvdoc 验证可装)
2
+ # 使用: conda activate uvdoc && pip install -r requirements_uvdoc_train.txt
3
+
4
+ torch>=2.0
5
+ torchvision>=0.15
6
+ numpy>=1.22
7
+ h5py>=3.8
8
+ opencv-python-headless>=4.7
9
+ albumentations>=1.3
run_overfit_official_uvdoc.sh ADDED
@@ -0,0 +1,27 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #!/usr/bin/env bash
2
+ # Single-sample overfit on UVDoc_final with OFFICIAL default hyperparameters:
3
+ # UVDocnet, lr=2e-4, batch=8, n_epochs=10, n_epochs_decay=10,
4
+ # alpha=beta=5, gamma=1, ep_gamma_start=10.
5
+ # Overfit branch uses deterministic crop + no aug (matches verify_ckpt_val_pipeline.py).
6
+
7
+ set -euo pipefail
8
+ ROOT="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
9
+ PY="${PYTHON:-/root/miniconda3/envs/o3dedit/bin/python}"
10
+ LOGDIR="${LOGDIR:-$ROOT/log_overfit_official_uvdoc}"
11
+ UV="${UV_DOC_ROOT:-$ROOT/UVDoc_final}"
12
+
13
+ exec "$PY" "$ROOT/UVDoc_official/train.py" \
14
+ --data_to_use uvdoc \
15
+ --data_path_UVDoc "$UV" \
16
+ --overfit_n 1 \
17
+ --batch_size 8 \
18
+ --n_epochs 10 \
19
+ --n_epochs_decay 10 \
20
+ --lr 0.0002 \
21
+ --alpha_w 5.0 \
22
+ --beta_w 5.0 \
23
+ --gamma_w 1.0 \
24
+ --ep_gamma_start 10 \
25
+ --num_workers "${NUM_WORKERS:-4}" \
26
+ --device "${DEVICE:-cuda:0}" \
27
+ --logdir "$LOGDIR"
run_overfit_train_infer_consistency.sh ADDED
@@ -0,0 +1,75 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #!/usr/bin/env bash
2
+ # 单样例过拟合 + 训练/推理数据管线一致性校验
3
+ #
4
+ # 1) 快速断言:train.py 与 verify_ckpt_val_pipeline.py 使用的 UVDocDataset 张量一致
5
+ # 2) 可选:单样例过拟合训练(与 run_overfit_official_uvdoc.sh 相同超参)
6
+ # 3) 用同一套预处理跑 verify_ckpt_val_pipeline.py,mean_mse 应与训练日志里该 epoch 的 Val MSE 对齐
7
+ #
8
+ # 用法:
9
+ # PREPROCESS_ONLY=1 ./run_overfit_train_infer_consistency.sh
10
+ # ./run_overfit_train_infer_consistency.sh
11
+ #
12
+ set -euo pipefail
13
+ ROOT="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
14
+ PY="${PYTHON:-python3}"
15
+ UV="${UV_DOC_ROOT:-$ROOT/UVDoc_final}"
16
+ OFF="$ROOT/UVDoc_official"
17
+ LOGDIR="${LOGDIR:-$ROOT/log_overfit_consistency}"
18
+ CKPT_GLOB="${CKPT_GLOB:-}"
19
+
20
+ cd "$OFF"
21
+
22
+ echo "== (1) Preprocess alignment: train vs verify_ckpt constructors =="
23
+ "$PY" verify_uvdoc_train_infer_preprocess.py \
24
+ --data_path_UVDoc "$UV" \
25
+ --overfit_n 1 \
26
+ --mode overfit \
27
+ --check_dataloader \
28
+ --batch_size 8 \
29
+ --num_workers 0
30
+
31
+ if [[ "${PREPROCESS_ONLY:-0}" == "1" ]]; then
32
+ echo "PREPROCESS_ONLY=1, skip training and checkpoint verify."
33
+ exit 0
34
+ fi
35
+
36
+ echo "== (2) Single-sample overfit training =="
37
+ "$PY" train.py \
38
+ --data_to_use uvdoc \
39
+ --data_path_UVDoc "$UV" \
40
+ --overfit_n 1 \
41
+ --batch_size 8 \
42
+ --n_epochs 10 \
43
+ --n_epochs_decay 10 \
44
+ --lr 0.0002 \
45
+ --alpha_w 5.0 \
46
+ --beta_w 5.0 \
47
+ --gamma_w 1.0 \
48
+ --ep_gamma_start 10 \
49
+ --num_workers "${NUM_WORKERS:-4}" \
50
+ --device "${DEVICE:-cuda:0}" \
51
+ --logdir "$LOGDIR"
52
+
53
+ # 取最新 best ckpt(按修改时间)
54
+ mapfile -t CKPTS < <(ls -t "$LOGDIR"/ep_*_best_model.pkl 2>/dev/null || true)
55
+ if [[ ${#CKPTS[@]} -eq 0 ]]; then
56
+ echo "No ep_*_best_model.pkl under $LOGDIR" >&2
57
+ exit 1
58
+ fi
59
+ CKPT="${CKPT_GLOB:-${CKPTS[0]}}"
60
+ echo "Using checkpoint: $CKPT"
61
+
62
+ OUT="$LOGDIR/verify_infer_same_preprocess"
63
+ rm -rf "$OUT"
64
+ mkdir -p "$OUT"
65
+
66
+ echo "== (3) Inference with SAME dataset kwargs as train val/overfit =="
67
+ "$PY" verify_ckpt_val_pipeline.py \
68
+ --ckpt "$CKPT" \
69
+ --data_path_UVDoc "$UV" \
70
+ --overfit_n 1 \
71
+ --out_dir "$OUT" \
72
+ --max_save_images 1 \
73
+ --device "${DEVICE:-cuda:0}"
74
+
75
+ echo "Done. Compare mean_mse in $OUT/metrics.txt to the Val MSE line in train log under $LOGDIR"
run_train_full_uvdoc_gpu0.sh ADDED
@@ -0,0 +1,40 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #!/usr/bin/env bash
2
+ # Full UVDoc training (no Doc3D), GPU 0, recommended hyperparameters.
3
+ # Stop with: kill <pid> (or Ctrl+C if foreground)
4
+
5
+ set -euo pipefail
6
+ ROOT="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
7
+ export CUDA_VISIBLE_DEVICES="${CUDA_VISIBLE_DEVICES:-0}"
8
+ PY="${PYTHON:-/root/miniconda3/envs/o3dedit/bin/python}"
9
+
10
+ UV_ROOT="${UV_ROOT:-$ROOT/UVDoc_final}"
11
+ LOGDIR="${LOGDIR:-$ROOT/log_full_uvdoc_gpu0}"
12
+
13
+ # Shorter uvdoc-only schedule (override with N_EPOCHS, N_DECAY env vars).
14
+ BS="${BS:-8}"
15
+ N_EPOCHS="${N_EPOCHS:-25}"
16
+ N_DECAY="${N_DECAY:-10}"
17
+ LR="${LR:-0.0002}"
18
+ EP_GAMMA="${EP_GAMMA:-10}"
19
+ VAL_RATIO="${VAL_RATIO:-0.05}"
20
+ SPLIT_SEED="${SPLIT_SEED:-42}"
21
+ NUM_WORKERS="${NUM_WORKERS:-8}"
22
+
23
+ exec "$PY" "$ROOT/UVDoc_official/train.py" \
24
+ --data_to_use uvdoc \
25
+ --data_path_UVDoc "$UV_ROOT" \
26
+ --uvdoc_val_ratio "$VAL_RATIO" \
27
+ --uvdoc_split_seed "$SPLIT_SEED" \
28
+ --batch_size "$BS" \
29
+ --n_epochs "$N_EPOCHS" \
30
+ --n_epochs_decay "$N_DECAY" \
31
+ --lr "$LR" \
32
+ --alpha_w 5.0 \
33
+ --beta_w 5.0 \
34
+ --gamma_w 1.0 \
35
+ --ep_gamma_start "$EP_GAMMA" \
36
+ --appearance_augmentation visual noise color \
37
+ --geometric_augmentationsUVDoc rotate \
38
+ --num_workers "$NUM_WORKERS" \
39
+ --device cuda:0 \
40
+ --logdir "$LOGDIR"
run_train_official_config.sh ADDED
@@ -0,0 +1,80 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #!/usr/bin/env bash
2
+ # UVDoc official-style training (matches UVDoc_official/train.py defaults).
3
+ #
4
+ # Mode A — paper/repo default: Doc3D + UVDoc mixed (--data_to_use both).
5
+ # Set DOC3D_ROOT to your Doc3D dataset; UVDoc to UVDoc_final (or official layout).
6
+ #
7
+ # Mode B — local UVDoc only: same LR/epoch/gamma schedule as official, no Doc3D.
8
+ # export TRAIN_MODE=uvdoc_only
9
+ #
10
+ # Defaults (official argparse):
11
+ # batch_size=8, n_epochs=10, n_epochs_decay=10, lr=2e-4,
12
+ # alpha_w=5 beta_w=5 gamma_w=1 ep_gamma_start=10,
13
+ # appearance: visual noise color, UVDoc geom: rotate
14
+
15
+ set -euo pipefail
16
+
17
+ ROOT="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
18
+ PY="${PYTHON:-python3}"
19
+ TRAIN="${ROOT}/UVDoc_official/train.py"
20
+
21
+ # --- edit paths ---
22
+ DOC3D_ROOT="${DOC3D_ROOT:-/path/to/data/doc3D}"
23
+ UV_DOC_ROOT="${UV_DOC_ROOT:-/mnt/zsn/zsn_workspace/dzx/UvDoc/UVDoc_final}"
24
+ LOGDIR="${LOGDIR:-${ROOT}/log/official_default}"
25
+
26
+ # TRAIN_MODE: both | uvdoc_only
27
+ TRAIN_MODE="${TRAIN_MODE:-both}"
28
+
29
+ # Official hyperparameters (explicit for clarity)
30
+ BS="${BS:-8}"
31
+ N_EPOCHS="${N_EPOCHS:-10}"
32
+ N_EPOCHS_DECAY="${N_EPOCHS_DECAY:-10}"
33
+ LR="${LR:-0.0002}"
34
+ ALPHA="${ALPHA:-5.0}"
35
+ BETA="${BETA:-5.0}"
36
+ GAMMA="${GAMMA:-1.0}"
37
+ EP_GAMMA_START="${EP_GAMMA_START:-10}"
38
+ UV_VAL_RATIO="${UV_VAL_RATIO:-0.05}"
39
+ UV_SPLIT_SEED="${UV_SPLIT_SEED:-42}"
40
+ NUM_WORKERS="${NUM_WORKERS:-8}"
41
+ DEVICE="${DEVICE:-cuda:0}"
42
+
43
+ common_args=(
44
+ --batch_size "$BS"
45
+ --n_epochs "$N_EPOCHS"
46
+ --n_epochs_decay "$N_EPOCHS_DECAY"
47
+ --lr "$LR"
48
+ --alpha_w "$ALPHA"
49
+ --beta_w "$BETA"
50
+ --gamma_w "$GAMMA"
51
+ --ep_gamma_start "$EP_GAMMA_START"
52
+ --appearance_augmentation visual noise color
53
+ --geometric_augmentationsUVDoc rotate
54
+ --num_workers "$NUM_WORKERS"
55
+ --device "$DEVICE"
56
+ --logdir "$LOGDIR"
57
+ )
58
+
59
+ if [[ "$TRAIN_MODE" == "both" ]]; then
60
+ if [[ ! -d "$DOC3D_ROOT" ]]; then
61
+ echo "ERROR: DOC3D_ROOT is not a directory: $DOC3D_ROOT"
62
+ echo "Set DOC3D_ROOT to your Doc3D dataset root, or use TRAIN_MODE=uvdoc_only."
63
+ exit 1
64
+ fi
65
+ exec "$PY" "$TRAIN" \
66
+ --data_to_use both \
67
+ --data_path_doc3D "$DOC3D_ROOT" \
68
+ --data_path_UVDoc "$UV_DOC_ROOT" \
69
+ "${common_args[@]}"
70
+ elif [[ "$TRAIN_MODE" == "uvdoc_only" ]]; then
71
+ exec "$PY" "$TRAIN" \
72
+ --data_to_use uvdoc \
73
+ --data_path_UVDoc "$UV_DOC_ROOT" \
74
+ --uvdoc_val_ratio "$UV_VAL_RATIO" \
75
+ --uvdoc_split_seed "$UV_SPLIT_SEED" \
76
+ "${common_args[@]}"
77
+ else
78
+ echo "TRAIN_MODE must be 'both' or 'uvdoc_only', got: $TRAIN_MODE"
79
+ exit 1
80
+ fi
run_train_uvdoc_baseline.py ADDED
@@ -0,0 +1,11 @@
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #!/usr/bin/env python3
2
+ """Entrypoint: run from UvDoc directory: python run_train_uvdoc_baseline.py --data_root ./UVDoc_final"""
3
+
4
+ import sys
5
+ from pathlib import Path
6
+
7
+ if __name__ == "__main__":
8
+ here = Path(__file__).resolve().parent
9
+ sys.path.insert(0, str(here))
10
+ from baseline_resnet_unet.train import main
11
+ main()
unzip_extract.log ADDED
@@ -0,0 +1 @@
 
 
1
+ caution: excluded filename not matched: */__MACOSX/*
uvdoc_文档矫正_colab_技术路线(gemini_可执行版).md ADDED
@@ -0,0 +1,212 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # UVDoc 文档矫正技术路线(Colab + Gemini 实现指南)
2
+
3
+ ---
4
+
5
+ # 🎯 目标
6
+
7
+ 在 Colab 上实现一个可训练、可验证的文档矫正模型:
8
+ - 输入:畸变文档图像
9
+ - 输出:UV map(H×W×2)
10
+ - 使用:grid_sample 生成矫正图
11
+ - 在 UVDoc benchmark 上评估性能
12
+
13
+ ---
14
+
15
+ # 🧠 核心思路
16
+
17
+ 本任务本质是:
18
+
19
+ > Dense per-pixel mapping(UV 映射预测)
20
+
21
+ 模型学习:
22
+
23
+ f(I) → UV
24
+
25
+ 然后:
26
+
27
+ I_rectified = grid_sample(I, UV)
28
+
29
+ ---
30
+
31
+ # 🏗️ 模型结构(推荐实现)
32
+
33
+ ## Backbone
34
+ - ResNet50(ImageNet 预训练)
35
+
36
+ ## Decoder
37
+ - U-Net 风格(带 skip connection)
38
+
39
+ ## 输出
40
+ - UV map: [B, H, W, 2]
41
+ - 值域:[-1, 1]
42
+
43
+ ---
44
+
45
+ # ⚙️ 训练流程
46
+
47
+ ## Step 1:加载预训练 backbone
48
+ - 使用 torchvision ResNet50
49
+
50
+ ## Step 2:构建 U-Net Decoder
51
+ - 上采样 + skip connection
52
+
53
+ ## Step 3:输出 UV map
54
+ - 最后一层用 Tanh
55
+
56
+ ## Step 4:warp 图像
57
+
58
+ 使用:
59
+
60
+ ```
61
+ F.grid_sample(input, UV_pred)
62
+ ```
63
+
64
+ ---
65
+
66
+ # 📉 Loss 设计
67
+
68
+ ## 1. UV Loss(核心)
69
+
70
+ L_uv = |UV_pred - UV_gt|
71
+
72
+ ## 2. Image Loss
73
+
74
+ L_img = |I_pred - I_gt|
75
+
76
+ ## 3. Perceptual Loss(可选)
77
+
78
+ 使用 VGG feature
79
+
80
+ ## 最终 Loss
81
+
82
+ L = L_uv + 1.0 * L_img + 0.1 * L_perc
83
+
84
+ ---
85
+
86
+ # 📦 数据准备
87
+
88
+ ## UVDoc 数据
89
+
90
+ 需要包含:
91
+ - input image
92
+ - GT UV map
93
+ - GT rectified image(可选)
94
+
95
+ ## 数据预处理
96
+
97
+ - resize 到 256×256
98
+ - normalize
99
+
100
+ ---
101
+
102
+ # 🚀 Colab 配置
103
+
104
+ ## 推荐设置
105
+
106
+ - GPU: T4 / A100
107
+ - batch size: 4~8
108
+ - epoch: 50+
109
+ - optimizer: AdamW
110
+ - lr: 1e-4
111
+
112
+ ---
113
+
114
+ # ⚡ 性能优化
115
+
116
+ ## 必做
117
+
118
+ - mixed precision(torch.cuda.amp)
119
+ - gradient accumulation
120
+
121
+ ## 推荐
122
+
123
+ - 随机 crop
124
+ - 数据增强(亮度、对比度)
125
+
126
+ ---
127
+
128
+ # 📊 评估指标
129
+
130
+ - L1 UV error
131
+ - PSNR
132
+ - SSIM
133
+
134
+ ---
135
+
136
+ # 🧪 训练策略(强烈推荐)
137
+
138
+ ## Phase 1(可选)
139
+
140
+ Synthetic 数据预训练:
141
+ - 使用 TPS 生成畸变
142
+ - 自动生成 UV GT
143
+
144
+ ## Phase 2
145
+
146
+ 在 UVDoc 上 finetune
147
+
148
+ ---
149
+
150
+ # ⚠️ 常见错误
151
+
152
+ - ❌ UV 坐标方向错误(forward/backward 混淆)
153
+ - ❌ 未使用 grid_sample
154
+ - ❌ 直接预测图像
155
+ - ❌ 分辨率过大导致 OOM
156
+
157
+ ---
158
+
159
+ # 🧾 Gemini Prompt(直接可用)
160
+
161
+ 将下面内容复制给 Gemini:
162
+
163
+ ---
164
+
165
+ 你需要在 Google Colab 上实现一个文档矫正模型,要求如下:
166
+
167
+ 1. 使用 PyTorch
168
+ 2. Backbone: ResNet50(ImageNet 预训练)
169
+ 3. Decoder: U-Net
170
+ 4. 输出:UV map(H×W×2,范围 [-1,1])
171
+ 5. 使用 torch.nn.functional.grid_sample 生成矫正图像
172
+
173
+ 训练部分:
174
+ - Loss = L1(UV) + L1(image)
175
+ - optimizer: AdamW
176
+ - 使用 mixed precision
177
+
178
+ 数据:
179
+ - 输入图像
180
+ - UV GT
181
+
182
+ 要求:
183
+ - 提供完整训练代码
184
+ - 包含 model、dataset、train loop
185
+ - 可直接在 Colab 运行
186
+
187
+ ---
188
+
189
+ # ✅ 最终效果
190
+
191
+ 你应该能得到:
192
+
193
+ - 一个可训练模型
194
+ - 输出 UV map
195
+ - 可视化矫正结果
196
+ - 在 UVDoc 上评估
197
+
198
+ ---
199
+
200
+ # 📌 备注
201
+
202
+ 如果效果不佳,可以升级:
203
+ - backbone → Swin-T
204
+ - 加 perceptual loss
205
+ - 使用 multi-scale
206
+
207
+ ---
208
+
209
+ # 🎯 一句话总结
210
+
211
+ 用 ResNet + U-Net 预测 UV map,再用 grid_sample 重建图像,这是 UVDoc 最稳的 baseline 方案。
212
+