boatbomber committed on Mar 14

Commit

3050f1b

0 Parent(s):

Initial release

Browse files

Files changed (48) hide show

.gitattributes +36 -0
.gitignore +176 -0
LICENSE.md +176 -0
README.md +447 -0
ae.safetensors +3 -0
assets/NisabaRelief-Logo.png +3 -0
assets/example_diff_0.png +3 -0
assets/example_diff_1.png +3 -0
assets/example_diff_2.png +3 -0
assets/example_diff_3.png +3 -0
assets/example_diff_4.png +3 -0
assets/example_input_0.png +3 -0
assets/example_input_1.png +3 -0
assets/example_input_2.png +3 -0
assets/example_input_3.png +3 -0
assets/example_input_4.png +3 -0
assets/example_output_0.png +3 -0
assets/example_output_1.png +3 -0
assets/example_output_2.png +3 -0
assets/example_output_3.png +3 -0
assets/example_output_4.png +3 -0
assets/example_truth_0.png +3 -0
assets/example_truth_1.png +3 -0
assets/example_truth_2.png +3 -0
assets/example_truth_3.png +3 -0
assets/example_truth_4.png +3 -0
data/val_tablet_ids.json +90 -0
dev_scripts/benchmark.py +149 -0
dev_scripts/evaluation.py +162 -0
dev_scripts/process_images.py +197 -0
dev_scripts/util/load_val_dataset.py +24 -0
dev_scripts/util/metrics.py +67 -0
dev_scripts/util/psnr_hvsm.py +137 -0
model.safetensors +3 -0
nisaba_relief/__init__.py +7 -0
nisaba_relief/constants.py +42 -0
nisaba_relief/flux/__init__.py +0 -0
nisaba_relief/flux/autoencoder.py +351 -0
nisaba_relief/flux/layers.py +341 -0
nisaba_relief/flux/model.py +147 -0
nisaba_relief/flux/sampling.py +92 -0
nisaba_relief/image_utils.py +153 -0
nisaba_relief/model.py +474 -0
nisaba_relief/py.typed +0 -0
nisaba_relief/weights.py +23 -0
prompt_embedding.safetensors +3 -0
pyproject.toml +69 -0
uv.lock +0 -0

.gitattributes ADDED Viewed

	@@ -0,0 +1,36 @@

+*.7z filter=lfs diff=lfs merge=lfs -text
+*.arrow filter=lfs diff=lfs merge=lfs -text
+*.bin filter=lfs diff=lfs merge=lfs -text
+*.bz2 filter=lfs diff=lfs merge=lfs -text
+*.ckpt filter=lfs diff=lfs merge=lfs -text
+*.ftz filter=lfs diff=lfs merge=lfs -text
+*.gz filter=lfs diff=lfs merge=lfs -text
+*.h5 filter=lfs diff=lfs merge=lfs -text
+*.joblib filter=lfs diff=lfs merge=lfs -text
+*.lfs.* filter=lfs diff=lfs merge=lfs -text
+*.mlmodel filter=lfs diff=lfs merge=lfs -text
+*.model filter=lfs diff=lfs merge=lfs -text
+*.msgpack filter=lfs diff=lfs merge=lfs -text
+*.npy filter=lfs diff=lfs merge=lfs -text
+*.npz filter=lfs diff=lfs merge=lfs -text
+*.onnx filter=lfs diff=lfs merge=lfs -text
+*.ot filter=lfs diff=lfs merge=lfs -text
+*.parquet filter=lfs diff=lfs merge=lfs -text
+*.pb filter=lfs diff=lfs merge=lfs -text
+*.pickle filter=lfs diff=lfs merge=lfs -text
+*.pkl filter=lfs diff=lfs merge=lfs -text
+*.pt filter=lfs diff=lfs merge=lfs -text
+*.pth filter=lfs diff=lfs merge=lfs -text
+*.rar filter=lfs diff=lfs merge=lfs -text
+*.safetensors filter=lfs diff=lfs merge=lfs -text
+saved_model/**/* filter=lfs diff=lfs merge=lfs -text
+*.tar.* filter=lfs diff=lfs merge=lfs -text
+*.tar filter=lfs diff=lfs merge=lfs -text
+*.tflite filter=lfs diff=lfs merge=lfs -text
+*.tgz filter=lfs diff=lfs merge=lfs -text
+*.wasm filter=lfs diff=lfs merge=lfs -text
+*.xz filter=lfs diff=lfs merge=lfs -text
+*.zip filter=lfs diff=lfs merge=lfs -text
+*.zst filter=lfs diff=lfs merge=lfs -text
+*tfevents* filter=lfs diff=lfs merge=lfs -text
+assets/*.png filter=lfs diff=lfs merge=lfs -text

.gitignore ADDED Viewed

	@@ -0,0 +1,176 @@

+data/*
+!data/val_tablet_ids.json
+# Byte-compiled / optimized / DLL files
+__pycache__/
+*.py[cod]
+*$py.class
+# C extensions
+*.so
+# Distribution / packaging
+.Python
+build/
+develop-eggs/
+dist/
+downloads/
+eggs/
+.eggs/
+lib/
+lib64/
+parts/
+sdist/
+var/
+wheels/
+share/python-wheels/
+*.egg-info/
+.installed.cfg
+*.egg
+MANIFEST
+# PyInstaller
+#  Usually these files are written by a python script from a template
+#  before PyInstaller builds the exe, so as to inject date/other infos into it.
+*.manifest
+*.spec
+# Installer logs
+pip-log.txt
+pip-delete-this-directory.txt
+# Unit test / coverage reports
+htmlcov/
+.tox/
+.nox/
+.coverage
+.coverage.*
+.cache
+nosetests.xml
+coverage.xml
+*.cover
+*.py,cover
+.hypothesis/
+.pytest_cache/
+cover/
+# Translations
+*.mo
+*.pot
+# Django stuff:
+*.log
+local_settings.py
+db.sqlite3
+db.sqlite3-journal
+# Flask stuff:
+instance/
+.webassets-cache
+# Scrapy stuff:
+.scrapy
+# Sphinx documentation
+docs/_build/
+# PyBuilder
+.pybuilder/
+target/
+# Jupyter Notebook
+.ipynb_checkpoints
+**/__marimo__/
+# IPython
+profile_default/
+ipython_config.py
+# pyenv
+#   For a library or package, you might want to ignore these files since the code is
+#   intended to run in multiple environments; otherwise, check them in:
+# .python-version
+# pipenv
+#   According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control.
+#   However, in case of collaboration, if having platform-specific dependencies or dependencies
+#   having no cross-platform support, pipenv may install dependencies that don't work, or not
+#   install all needed dependencies.
+#Pipfile.lock
+# UV
+#   Similar to Pipfile.lock, it is generally recommended to include uv.lock in version control.
+#   This is especially recommended for binary packages to ensure reproducibility, and is more
+#   commonly ignored for libraries.
+#uv.lock
+# poetry
+#   Similar to Pipfile.lock, it is generally recommended to include poetry.lock in version control.
+#   This is especially recommended for binary packages to ensure reproducibility, and is more
+#   commonly ignored for libraries.
+#   https://python-poetry.org/docs/basic-usage/#commit-your-poetrylock-file-to-version-control
+#poetry.lock
+# pdm
+#   Similar to Pipfile.lock, it is generally recommended to include pdm.lock in version control.
+#pdm.lock
+#   pdm stores project-wide configurations in .pdm.toml, but it is recommended to not include it
+#   in version control.
+#   https://pdm.fming.dev/latest/usage/project/#working-with-version-control
+.pdm.toml
+.pdm-python
+.pdm-build/
+# PEP 582; used by e.g. github.com/David-OConnor/pyflow and github.com/pdm-project/pdm
+__pypackages__/
+# Celery stuff
+celerybeat-schedule
+celerybeat.pid
+# SageMath parsed files
+*.sage.py
+# Environments
+.env
+.venv
+env/
+venv/
+ENV/
+env.bak/
+venv.bak/
+# Spyder project settings
+.spyderproject
+.spyproject
+# Rope project settings
+.ropeproject
+# mkdocs documentation
+/site
+# mypy
+.mypy_cache/
+.dmypy.json
+dmypy.json
+# Pyre type checker
+.pyre/
+# pytype static type analyzer
+.pytype/
+# Cython debug symbols
+cython_debug/
+# PyCharm
+#  JetBrains specific template is maintained in a separate JetBrains.gitignore that can
+#  be found at https://github.com/github/gitignore/blob/main/Global/JetBrains.gitignore
+#  and can be added to the global gitignore or merged into this file.  For a more nuclear
+#  option (not recommended) you can uncomment the following to ignore the entire idea folder.
+#.idea/
+# PyPI configuration file
+.pypirc

LICENSE.md ADDED Viewed

	@@ -0,0 +1,176 @@

+                             Apache License
+                       Version 2.0, January 2004
+                    http://www.apache.org/licenses/
+TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION
+1. Definitions.
+  "License" shall mean the terms and conditions for use, reproduction,
+  and distribution as defined by Sections 1 through 9 of this document.
+  "Licensor" shall mean the copyright owner or entity authorized by
+  the copyright owner that is granting the License.
+  "Legal Entity" shall mean the union of the acting entity and all
+  other entities that control, are controlled by, or are under common
+  control with that entity. For the purposes of this definition,
+  "control" means (i) the power, direct or indirect, to cause the
+  direction or management of such entity, whether by contract or
+  otherwise, or (ii) ownership of fifty percent (50%) or more of the
+  outstanding shares, or (iii) beneficial ownership of such entity.
+  "You" (or "Your") shall mean an individual or Legal Entity
+  exercising permissions granted by this License.
+  "Source" form shall mean the preferred form for making modifications,
+  including but not limited to software source code, documentation
+  source, and configuration files.
+  "Object" form shall mean any form resulting from mechanical
+  transformation or translation of a Source form, including but
+  not limited to compiled object code, generated documentation,
+  and conversions to other media types.
+  "Work" shall mean the work of authorship, whether in Source or
+  Object form, made available under the License, as indicated by a
+  copyright notice that is included in or attached to the work
+  (an example is provided in the Appendix below).
+  "Derivative Works" shall mean any work, whether in Source or Object
+  form, that is based on (or derived from) the Work and for which the
+  editorial revisions, annotations, elaborations, or other modifications
+  represent, as a whole, an original work of authorship. For the purposes
+  of this License, Derivative Works shall not include works that remain
+  separable from, or merely link (or bind by name) to the interfaces of,
+  the Work and Derivative Works thereof.
+  "Contribution" shall mean any work of authorship, including
+  the original version of the Work and any modifications or additions
+  to that Work or Derivative Works thereof, that is intentionally
+  submitted to Licensor for inclusion in the Work by the copyright owner
+  or by an individual or Legal Entity authorized to submit on behalf of
+  the copyright owner. For the purposes of this definition, "submitted"
+  means any form of electronic, verbal, or written communication sent
+  to the Licensor or its representatives, including but not limited to
+  communication on electronic mailing lists, source code control systems,
+  and issue tracking systems that are managed by, or on behalf of, the
+  Licensor for the purpose of discussing and improving the Work, but
+  excluding communication that is conspicuously marked or otherwise
+  designated in writing by the copyright owner as "Not a Contribution."
+  "Contributor" shall mean Licensor and any individual or Legal Entity
+  on behalf of whom a Contribution has been received by Licensor and
+  subsequently incorporated within the Work.
+2. Grant of Copyright License. Subject to the terms and conditions of
+  this License, each Contributor hereby grants to You a perpetual,
+  worldwide, non-exclusive, no-charge, royalty-free, irrevocable
+  copyright license to reproduce, prepare Derivative Works of,
+  publicly display, publicly perform, sublicense, and distribute the
+  Work and such Derivative Works in Source or Object form.
+3. Grant of Patent License. Subject to the terms and conditions of
+  this License, each Contributor hereby grants to You a perpetual,
+  worldwide, non-exclusive, no-charge, royalty-free, irrevocable
+  (except as stated in this section) patent license to make, have made,
+  use, offer to sell, sell, import, and otherwise transfer the Work,
+  where such license applies only to those patent claims licensable
+  by such Contributor that are necessarily infringed by their
+  Contribution(s) alone or by combination of their Contribution(s)
+  with the Work to which such Contribution(s) was submitted. If You
+  institute patent litigation against any entity (including a
+  cross-claim or counterclaim in a lawsuit) alleging that the Work
+  or a Contribution incorporated within the Work constitutes direct
+  or contributory patent infringement, then any patent licenses
+  granted to You under this License for that Work shall terminate
+  as of the date such litigation is filed.
+4. Redistribution. You may reproduce and distribute copies of the
+  Work or Derivative Works thereof in any medium, with or without
+  modifications, and in Source or Object form, provided that You
+  meet the following conditions:
+  (a) You must give any other recipients of the Work or
+      Derivative Works a copy of this License; and
+  (b) You must cause any modified files to carry prominent notices
+      stating that You changed the files; and
+  (c) You must retain, in the Source form of any Derivative Works
+      that You distribute, all copyright, patent, trademark, and
+      attribution notices from the Source form of the Work,
+      excluding those notices that do not pertain to any part of
+      the Derivative Works; and
+  (d) If the Work includes a "NOTICE" text file as part of its
+      distribution, then any Derivative Works that You distribute must
+      include a readable copy of the attribution notices contained
+      within such NOTICE file, excluding those notices that do not
+      pertain to any part of the Derivative Works, in at least one
+      of the following places: within a NOTICE text file distributed
+      as part of the Derivative Works; within the Source form or
+      documentation, if provided along with the Derivative Works; or,
+      within a display generated by the Derivative Works, if and
+      wherever such third-party notices normally appear. The contents
+      of the NOTICE file are for informational purposes only and
+      do not modify the License. You may add Your own attribution
+      notices within Derivative Works that You distribute, alongside
+      or as an addendum to the NOTICE text from the Work, provided
+      that such additional attribution notices cannot be construed
+      as modifying the License.
+  You may add Your own copyright statement to Your modifications and
+  may provide additional or different license terms and conditions
+  for use, reproduction, or distribution of Your modifications, or
+  for any such Derivative Works as a whole, provided Your use,
+  reproduction, and distribution of the Work otherwise complies with
+  the conditions stated in this License.
+5. Submission of Contributions. Unless You explicitly state otherwise,
+  any Contribution intentionally submitted for inclusion in the Work
+  by You to the Licensor shall be under the terms and conditions of
+  this License, without any additional terms or conditions.
+  Notwithstanding the above, nothing herein shall supersede or modify
+  the terms of any separate license agreement you may have executed
+  with Licensor regarding such Contributions.
+6. Trademarks. This License does not grant permission to use the trade
+  names, trademarks, service marks, or product names of the Licensor,
+  except as required for reasonable and customary use in describing the
+  origin of the Work and reproducing the content of the NOTICE file.
+7. Disclaimer of Warranty. Unless required by applicable law or
+  agreed to in writing, Licensor provides the Work (and each
+  Contributor provides its Contributions) on an "AS IS" BASIS,
+  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
+  implied, including, without limitation, any warranties or conditions
+  of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A
+  PARTICULAR PURPOSE. You are solely responsible for determining the
+  appropriateness of using or redistributing the Work and assume any
+  risks associated with Your exercise of permissions under this License.
+8. Limitation of Liability. In no event and under no legal theory,
+  whether in tort (including negligence), contract, or otherwise,
+  unless required by applicable law (such as deliberate and grossly
+  negligent acts) or agreed to in writing, shall any Contributor be
+  liable to You for damages, including any direct, indirect, special,
+  incidental, or consequential damages of any character arising as a
+  result of this License or out of the use or inability to use the
+  Work (including but not limited to damages for loss of goodwill,
+  work stoppage, computer failure or malfunction, or any and all
+  other commercial damages or losses), even if such Contributor
+  has been advised of the possibility of such damages.
+9. Accepting Warranty or Additional Liability. While redistributing
+  the Work or Derivative Works thereof, You may choose to offer,
+  and charge a fee for, acceptance of support, warranty, indemnity,
+  or other liability obligations and/or rights consistent with this
+  License. However, in accepting such obligations, You may act only
+  on Your own behalf and on Your sole responsibility, not on behalf
+  of any other Contributor, and only if You agree to indemnify,
+  defend, and hold each Contributor harmless for any liability
+  incurred by, or claims asserted against, such Contributor by reason
+  of your accepting any such warranty or additional liability.
+END OF TERMS AND CONDITIONS

README.md ADDED Viewed

	@@ -0,0 +1,447 @@

+---
+license: apache-2.0
+pipeline_tag: image-to-image
+base_model:
+- black-forest-labs/FLUX.2-klein-base-4B
+base_model_relation: finetune
+datasets:
+- boatbomber/CuneiformPhotosMSII
+tags:
+- image-to-image
+- cuneiform
+- geometry
+- curvature
+- multi-scale-integral-invariant
+- msii
+- Flux
+---
+<div align="center">
+<h1 align="center">
+NisabaRelief
+</h1>
+<img src="./assets/NisabaRelief-Logo.png" width="600"/>
+</div>
+# NisabaRelief
+NisabaRelief is a rectified flow transformer that converts ordinary photographs of cuneiform clay tablets into Multi-Scale Integral Invariant (MSII) curvature visualizations, without requiring 3D scanning hardware. Traditional MSII computation requires a high-resolution 3D scanner and GigaMesh postprocessing, averaging approximately 68 minutes per tablet. NisabaRelief processes a photograph in approximately 7 seconds.
+Photographic images introduce a variety of noise sources: lighting direction, clay color, surface sheen, photography conditions, and surface staining. Any of these can cause wedge impressions to appear as shadows or shadows to appear as wedge impressions. MSII filtering discards this photometric variation, retaining only the geometric signal pressed into the clay. See [What is MSII?](#what-is-msii) for full technical details.
+The model was built by fine-tuning [Flux.2 Klein Base 4B](https://huggingface.co/black-forest-labs/FLUX.2-klein-base-4B) on paired photo/MSII data generated from 3D scans in the [HeiCuBeDa](https://doi.org/10.11588/data/IE8CCN) corpus. The training data is available here: [CuneiformPhotosMSII](https://huggingface.co/datasets/boatbomber/CuneiformPhotosMSII).
+Named for Nisaba, the early Sumerian goddess of writing and scribes, NisabaRelief will serve as the preprocessing backbone of NabuOCR V2, a cuneiform OCR system currently in development.
+---
+## Contents
+- [NisabaRelief](#nisabarelief)
+  - [Contents](#contents)
+  - [Example Output](#example-output)
+  - [Quickstart](#quickstart)
+    - [Installation](#installation)
+    - [Usage](#usage)
+  - [Hardware Requirements](#hardware-requirements)
+  - [Performance](#performance)
+  - [What is MSII?](#what-is-msii)
+  - [Intended Use \& Limitations](#intended-use--limitations)
+  - [Evaluation](#evaluation)
+    - [Step Sweep](#step-sweep)
+  - [Training Data](#training-data)
+  - [Training Pipeline](#training-pipeline)
+    - [Key Technical Decision: Text-Encoder-Free Training](#key-technical-decision-text-encoder-free-training)
+    - [Key Technical Decision: VAE BatchNorm Domain Calibration](#key-technical-decision-vae-batchnorm-domain-calibration)
+    - [Stage 1: Pretrain (Domain Initialization)](#stage-1-pretrain-domain-initialization)
+    - [Stage 2: Train (Image-to-Image Adaptation)](#stage-2-train-image-to-image-adaptation)
+      - [Augmentation Pipeline](#augmentation-pipeline)
+      - [Loss](#loss)
+    - [Stage 3: Rectify (Trajectory Straightening)](#stage-3-rectify-trajectory-straightening)
+  - [Acknowledgements \& Citations](#acknowledgements--citations)
+---
+## Example Output
+<table>
+<thead>
+<tr>
+  <th align="center" width="25%">Input</th>
+  <th align="center" width="25%">Output</th>
+  <th align="center" width="25%">Ground Truth</th>
+  <th align="center" width="25%">Difference</th>
+</tr>
+</thead>
+<tbody>
+<tr>
+  <td align="center"><img src="./assets/example_input_0.png" width="200"/></td>
+  <td align="center"><img src="./assets/example_output_0.png" width="200"/></td>
+  <td align="center"><img src="./assets/example_truth_0.png" width="200"/></td>
+  <td align="center"><img src="./assets/example_diff_0.png" width="200"/></td>
+</tr>
+<tr>
+  <td colspan="4" align="center"><b>Dice: 0.9652</b> &nbsp;·&nbsp; RMSE: 0.0775 &nbsp;·&nbsp; MS-SSIM: 0.9295 &nbsp;·&nbsp; PSNR: 22.22 dB &nbsp;·&nbsp; PSNR-HVS-M: 17.77 dB &nbsp;·&nbsp; SRE: 58.34 dB</td>
+</tr>
+<tr>
+  <td align="center"><img src="./assets/example_input_1.png" width="200"/></td>
+  <td align="center"><img src="./assets/example_output_1.png" width="200"/></td>
+  <td align="center"><img src="./assets/example_truth_1.png" width="200"/></td>
+  <td align="center"><img src="./assets/example_diff_1.png" width="200"/></td>
+</tr>
+<tr>
+  <td colspan="4" align="center"><b>Dice: 0.9555</b> &nbsp;·&nbsp; RMSE: 0.0788 &nbsp;·&nbsp; MS-SSIM: 0.9219 &nbsp;·&nbsp; PSNR: 22.07 dB &nbsp;·&nbsp; PSNR-HVS-M: 17.80 dB &nbsp;·&nbsp; SRE: 57.89 dB</td>
+</tr>
+<tr>
+  <td align="center"><img src="./assets/example_input_2.png" width="200"/></td>
+  <td align="center"><img src="./assets/example_output_2.png" width="200"/></td>
+  <td align="center"><img src="./assets/example_truth_2.png" width="200"/></td>
+  <td align="center"><img src="./assets/example_diff_2.png" width="200"/></td>
+</tr>
+<tr>
+  <td colspan="4" align="center"><b>Dice: 0.9630</b> &nbsp;·&nbsp; RMSE: 0.1108 &nbsp;·&nbsp; MS-SSIM: 0.8513 &nbsp;·&nbsp; PSNR: 19.11 dB &nbsp;·&nbsp; PSNR-HVS-M: 14.65 dB &nbsp;·&nbsp; SRE: 59.60 dB</td>
+</tr>
+<tr>
+  <td align="center"><img src="./assets/example_input_3.png" width="200"/></td>
+  <td align="center"><img src="./assets/example_output_3.png" width="200"/></td>
+  <td align="center"><img src="./assets/example_truth_3.png" width="200"/></td>
+  <td align="center"><img src="./assets/example_diff_3.png" width="200"/></td>
+</tr>
+<tr>
+  <td colspan="4" align="center"><b>Dice: 0.9713</b> &nbsp;·&nbsp; RMSE: 0.1035 &nbsp;·&nbsp; MS-SSIM: 0.8748 &nbsp;·&nbsp; PSNR: 19.70 dB &nbsp;·&nbsp; PSNR-HVS-M: 15.33 dB &nbsp;·&nbsp; SRE: 59.41 dB</td>
+</tr>
+<tr>
+  <td align="center"><img src="./assets/example_input_4.png" width="200"/></td>
+  <td align="center"><img src="./assets/example_output_4.png" width="200"/></td>
+  <td align="center"><img src="./assets/example_truth_4.png" width="200"/></td>
+  <td align="center"><img src="./assets/example_diff_4.png" width="200"/></td>
+</tr>
+<tr>
+  <td colspan="4" align="center"><b>Dice: 0.9564</b> &nbsp;·&nbsp; RMSE: 0.1054 &nbsp;·&nbsp; MS-SSIM: 0.9325 &nbsp;·&nbsp; PSNR: 19.55 dB &nbsp;·&nbsp; PSNR-HVS-M: 15.18 dB &nbsp;·&nbsp; SRE: 57.36 dB</td>
+</tr>
+</tbody>
+</table>
+---
+## Quickstart
+### Installation
+**Prerequisites:**
+- Python >= 3.10
+- PyTorch with CUDA support. See https://pytorch.org/get-started/locally/.
+```bash
+# Install PyTorch (CUDA 12.8 example)
+pip install torch torchvision --index-url https://download.pytorch.org/whl/cu128
+# Windows only: install Triton (included automatically on Linux)
+pip install triton-windows
+```
+**Install:**
+```bash
+pip install nisaba-relief
+```
+### Usage
+```python
+from nisaba_relief import NisabaRelief
+model = NisabaRelief()  # downloads weights from HF Hub automatically if needed
+result = model.process("tablet.jpg")
+result.save("tablet_msii.png")
+```
+**Constructor parameters:**
+| Parameter | Default | Description |
+|---|---|---|
+| `device` | `"cuda"` if available | Device for inference |
+| `num_steps` | `2` | Denoising steps |
+| `weights_dir` | `None` | Local weights directory; if `None`, downloads from HF Hub or uses HF cache. Expected dir contents: `model.safetensors`, `ae.safetensors`, `prompt_embedding.safetensors` |
+| `batch_size` | `None` | Batch size for processing tiles during inference. `None` (default) auto-selects the largest batch that fits in available VRAM. Set an explicit integer to override. Higher values are faster, but see the reproducibility note below. |
+| `seed` | `None` | Optional random seed for reproducible noise generation; if `None`, randomized |
+| `compile` | `True` | Use `torch.compile` for faster repeated inference. Requires Triton. Set to `False` if Triton is not installed or for one-off runs. |
+> **Reproducibility note:** Results are pixel-exact across repeated runs with the same `batch_size` and `seed`. However, changing `batch_size` between runs (including letting `None` auto-select a different value as available VRAM changes) will produce outputs that differ by up to ~1-2 pixel values (mean < 0.25) due to GPU floating-point non-determinism: CUDA selects different kernel implementations for different matrix shapes, which changes the floating-point accumulation order in the transformer attention and linear layers. The visual difference is imperceptible. If exact cross-run reproducibility is required, set a constant `batch_size`.
+**`process()` parameters:**
+| Parameter | Default | Description |
+|---|---|---|
+| `image` | required | File path (str/Path) or PIL Image |
+| `show_pbar` | `None` | Progress bar visibility. `None` = auto (shows when >= 2 batches); `True`/`False` = always show/hide |
+**Returns:** Grayscale `PIL.Image.Image` containing the MSII visualization.
+**Input requirements:**
+- Any PIL-readable format (PNG, JPG, WEBP, ...)
+- Minimum 64 px on the short side; maximum aspect ratio 8:1
+**Large image support:**
+The model's native tile size is 1024 px. For images where either side exceeds 1024 px, the model automatically applies a sliding-window tiling pass. Tiles are blended with raised-cosine overlap weights to avoid seams. Each tile is also conditioned on a 128 px thumbnail of the full image with a red rectangle marking the tile's position, so the model retains global context while processing local detail.
+There is no practical upper limit on input resolution, though the model may perform unexpectedly if the 1024 px tile is only a small fraction of the total image area.
+---
+## Hardware Requirements
+While CPU inference is technically supported, it is too slow for practical use. A GPU with at least 9 GB of VRAM is required, with 12 GB or more recommended for better batching.
+The 9 GB figure is substantially lower than the ~18 GB a standard FLUX.2-klein-base-4B deployment would require because the Qwen3-4B text encoder is never loaded at runtime. The conditioning prompt is pre-computed once and shipped as a 7.8 MB embedding file alongside the model weights.
+---
+## Performance
+Traditional pipelines require a high-resolution 3D scanner and GigaMesh postprocessing: across the HeiCuBeDa corpus, this averages approximately 68 minutes per tablet, totalling over 2,200 hours for the full collection. NisabaRelief processes a tablet photograph in approximately 7 seconds, roughly 600x faster, with no scanning equipment required.
+On a 1064x2048px photo, an RTX 3090 performs as follows:
+| Run | Time |
+|---|---|
+| *compile warmup* | 11.61s |
+| 1 | 7.05s |
+| 2 | 7.07s |
+| 3 | 7.09s |
+| **Mean** | **7.07 ± 0.02s** |
+---
+## What is MSII?
+Multi-Scale Integral Invariant (MSII) filtering is a geometry-processing algorithm that computes a robust curvature measure at every point on a 3D surface mesh. At each vertex, a sphere of radius *r* is centered on the surface and the algorithm measures what fraction of the sphere's volume falls below the surface (the "interior" volume ratio). On a perfectly flat surface this ratio is exactly one half. Concave regions (such as the channel cut by a wedge impression) admit more of the sphere below the surface, pushing the ratio above 0.5. Convex regions such as ridges or the rounded back of a tablet expose less interior volume, pulling the ratio below 0.5. The signed difference from the flat baseline maps directly to the sign and magnitude of mean curvature at that point.
+The multi-scale component repeats this computation at several sphere radii simultaneously. Small radii resolve fine wedge tips and hairline details; large radii capture broader curvature trends such as the tablet's overall convexity. The per-vertex measurements across all radii form a compact feature vector, and the final scalar output, conventionally displayed as a grayscale image, is the maximum component of that feature vector, collapsing the strongest curvature response across all scales into a single value per pixel.
+By convention the scalar is displayed with its sign inverted relative to the mean curvature: concave regions (ratio > 0.5) map to darker pixel values and convex regions (ratio < 0.5) to lighter ones. This places the flat-surface baseline at mid-gray and renders wedge channels as dark strokes against a bright background, similar to ink on paper.
+Because the result depends only on the 3D shape of the surface rather than on lighting, clay color, or photograph angle, wedge impressions appear as consistent dark strokes against a bright background. This makes the surface structure considerably more legible to machine-vision OCR systems than raw photographs.
+---
+## Intended Use & Limitations
+Generating an MSII visualization of a tablet requires a high-resolution laser scanner and substantial per-vertex computation. The vast majority of cuneiform tablets do not have a 3D scan available, and the computational cost is difficult to scale across large corpora.
+To reduce this barrier and increase the availability of readable images, this model is trained to predict the MSII visualization directly from photographs.
+**Intended use:**
+- Preprocessing step for cuneiform OCR (specifically NabuOCR V2)
+- Visualizing cuneiform tablet geometry for research and digital humanities
+**Limitations:**
+- Trained exclusively using [HeiCuBeDa](https://doi.org/10.11588/data/IE8CCN) 3D-scan data; performance on tablet types or scribal traditions not well-represented in that corpus is unknown
+- Outputs are MSII approximations inferred from 2D photographs, not computed from true 3D geometry. They are suitable for OCR preprocessing but are not a substitute for physical scanning
+- Not a general-purpose MSII model; behavior on non-cuneiform inputs is undefined and out of distribution
+- Designed for photographs following [CDLI photography guidelines](https://cdli.earth/docs/images-acquisition-and-processing): high-resolution fatcross layout on a black background. The model may underperform on low-resolution or visually cluttered inputs such as older black-and-white excavation photographs where the background blends into the tablet
+---
+## Evaluation
+The model was evaluated on 704 held-out validation pairs, all drawn from tablets whose geometry was never seen during training (see [Training Data](#training-data)). Each validation image was processed through the model and the output compared against the ground-truth MSII visualization computed from the 3D scan. The evaluation was run with `seed=42` and `batch_size=4`.
+| Metric     |            Value |
+|------------|------------------|
+| Dice       |  0.9639 ± 0.0138 |
+| RMSE       |  0.0877 ± 0.0208 |
+| MS-SSIM    |  0.9026 ± 0.0308 |
+| PSNR       |  21.36 ± 1.91 dB |
+| PSNR-HVS-M |  16.98 ± 1.89 dB |
+| SRE        |  59.57 ± 1.92 dB |
+**Dice** (Binarized Dice Coefficient) thresholds both images to isolate wedge stroke regions, then measures overlap between predicted and ground-truth strokes on a 0-1 scale. This is the most task-relevant metric, as it directly measures whether the model correctly localizes wedge impressions for downstream OCR.
+**RMSE** (Root Mean Squared Error) measures average pixel-level reconstruction error; lower is better.
+**MS-SSIM** (Multi-Scale Structural Similarity Index) measures perceptual image similarity by comparing luminance, contrast, and local structure at multiple spatial scales simultaneously. Coarser scales capture global shape agreement; finer scales capture edge and texture detail. Scores range from 0 to 1, where 1 is a perfect match; higher is better.
+**PSNR** (Peak Signal-to-Noise Ratio) expresses reconstruction fidelity in decibels relative to the maximum pixel value; higher is better.
+**PSNR-HVS-M** (Peak Signal-to-Noise Ratio - Human Visual System and Masking) measures reconstruction fidelity in decibels relative to the maximum pixel value while taking into account the Contrast Sensitivity Function (CSF) and between-coefficient contrast masking of DCT basis functions; higher is better.
+**SRE** (Signal-to-Reconstruction Error ratio) measures reconstruction fidelity in decibels based on signal energy vs. error energy; higher is better.
+### Step Sweep
+A sweep of step counts on a subset of 175 validation samples found that 2 steps is ideal for this model, adding one corrective step over the already solid single-step result. The rectified flow field is extremely straight (straightness_ratio=0.9989, path_length_ratio=1.0011, velocity_std=0.1565). For near-perfectly straight ODE trajectories, a single Euler step is theoretically near-exact, so beyond the second step each additional step accumulates small model prediction errors faster than it reduces discretization error. Where throughput is the primary concern, one step is acceptable. The sweep was run with `seed=42` and `batch_size=4`.
+| Metric     | Steps=1          | Steps=2              | Steps=4          | Steps=8          |
+|------------|------------------|----------------------|------------------|------------------|
+| Dice       | 0.9582 ± 0.0153  | **0.9634** ± 0.0139  | 0.9612 ± 0.0142  | 0.9580 ± 0.0148  |
+| RMSE       | 0.0909 ± 0.0209  | **0.0859** ± 0.0212  | 0.0900 ± 0.0203  | 0.0949 ± 0.0197  |
+| MS-SSIM    | 0.8987 ± 0.0326  | **0.9081** ± 0.0310  | 0.9039 ± 0.0314  | 0.8959 ± 0.0326  |
+| PSNR       | 21.03 ± 1.83 dB  | **21.56** ± 1.97 dB  | 21.11 ± 1.84 dB  | 20.63 ± 1.72 dB  |
+| PSNR-HVS-M | 16.65 ± 1.80 dB  | **17.19** ± 1.96 dB  | 16.70 ± 1.83 dB  | 16.18 ± 1.70 dB  |
+| SRE        | 58.81 ± 1.81 dB  | **59.07** ± 1.87 dB  | 58.85 ± 1.87 dB  | 58.61 ± 1.86 dB  |
+---
+## Training Data
+Training uses the [CuneiformPhotosMSII](https://huggingface.co/datasets/boatbomber/CuneiformPhotosMSII) dataset: 13,928 image pairs generated from 1,741 tablets sourced from the HeiCuBeDa (Heidelberg Cuneiform Benchmark Dataset), a professional research collection of 3D-scanned clay tablets. Each tablet was rendered multiple times in Blender at up to 4096 px, producing synthetic photographs alongside their corresponding MSII curvature visualizations.
+Each render variant randomizes which faces of the tablet are shown, camera focal length (80-150 mm), tablet rotation (±5° Euler XYZ), lighting position/color/intensity, and background (fabric, grunge, stone, or none). This diversity encourages the model to generalize across realistic shooting conditions rather than overfitting to a specific lighting or composition style.
+The dataset was split tablet-wise: 13,224 pairs (~95% of tablets) for training and 704 pairs (~5% of tablets) held out for validation. Because the split is by tablet identity, the model never sees a validation tablet's geometry during training.
+---
+## Training Pipeline
+Training proceeded in three sequential stages: Pretrain, Train, and Rectify. Each stage builds directly on the weights from the previous one.
+### Key Technical Decision: Text-Encoder-Free Training
+All three stages skip the Qwen3-4B text encoder entirely. Text embeddings are pre-computed once and cached to disk, reducing VRAM consumption from ~18 GB to ~9 GB without any loss in conditioning fidelity.
+### Key Technical Decision: VAE BatchNorm Domain Calibration
+The FLUX.2 VAE contains a BatchNorm layer whose running statistics (`running_mean` and `running_var` across 128 channels: 32 latent channels × 2×2 patch size) were originally computed on diverse natural images. Applying this encoder to cuneiform tablets and MSII renderings introduces a latent-space distribution shift that manifests as screen-door dithering artifacts in decoded outputs.
+To correct this, the BatchNorm statistics were recalibrated on the target domain before training began. 3,000 CDLI cuneiform tablet photographs and 2,000 synthetic MSII visualizations (5,000 images total) were encoded through the frozen VAE encoder; running mean and variance were accumulated across 19,301,093 spatial samples using float64 accumulators for numerical stability. Images from both domains were interleaved to ensure balanced sampling. The calibrated statistics are baked directly into the `ae.safetensors` weights shipped with this model.
+---
+### Stage 1: Pretrain (Domain Initialization)
+The pretrain stage adapts the base FLUX.2 model to the cuneiform domain before any image-to-image translation is attempted. It runs standard text-to-image flow-matching training on two sources of real cuneiform imagery:
+- ~60% CDLI archive photographs: real museum photos of tablets, paired with per-image text embeddings generated from CDLI metadata (period, material, object type, provenience, genre, language). Eight prompt templates were used and varied randomly.
+- ~40% synthetic MSII renders: MSII visualization images from the training set, paired with MSII-specific text embeddings emphasizing curvature, surface topology, and wedge impression terminology.
+Each image has its own unique cached embedding rather than a shared prompt, preventing the model from memorizing specimen identifiers and encouraging generalization.
+| Hyperparameter | Value |
+|---|---|
+| Steps | 75,000 |
+| Learning rate | 2e-4 (cosine decay, 1k warmup) |
+| Effective batch size | 2 (batch 1, grad accum 2) |
+| LoRA rank | 256 |
+| LoRA init | PiSSA (8-iteration fast SVD) |
+| Optimizer | 8-bit Adam |
+| Precision | bfloat16 autocast |
+| Timestep sampling | Logit-normal (mean=0, std=1) |
+| Gradient clipping | 1.0 |
+Images are resized to fit within 1 megapixel and rounded to 128-pixel multiples. Light augmentations are applied (horizontal flip, ±5° rotation, minor color jitter). Validation generates text-conditioned images across four aspect ratios every 1,000 steps.
+---
+### Stage 2: Train (Image-to-Image Adaptation)
+The main training stage fine-tunes the pretrained weights for the target task: translating cuneiform tablet photographs into MSII visualizations. This stage introduces two significant changes over standard FLUX.2 fine-tuning.
+**Tile and global context conditioning**
+Rather than processing full images, the model trains on dynamic tile crops (128-1024 px, depending on image resolution) while simultaneously receiving a downscaled 128 px thumbnail of the full image with a red rectangle marking the tile's location, providing both local detail and global context.
+**Paired crop with geometric consistency**
+The same crop coordinates and geometric transforms (flip, rotation, perspective distortion) are applied to both the input photograph and the target MSII image, ensuring the model always receives spatially aligned pairs.
+#### Augmentation Pipeline
+Augmentations are split into two categories applied in sequence:
+Geometric (applied identically to input and target):
+- Horizontal flip (50%), vertical flip (40%), rotation ±8° (50%), perspective distortion strength 0.02 (30%)
+Domain adaptation (applied to input only, to simulate real photographic variation):
+- Perlin noise illumination (20%), vignette (40%), directional lighting gradient (50%), dust particles (50%), Gaussian noise (80%), gamma correction (50%), contrast adjustment (50%), brightness shift (50%), hue/saturation shift (40%), Gaussian blur (30%), grayscale conversion (3%)
+Spatially-dependent effects (Perlin noise, vignette, gradient) use crop coordinates so the tile and its global thumbnail receive matching effects.
+#### Loss
+Flow-matching loss with Min-SNR-γ weighting (γ=5.0) to down-weight noisy high-timestep predictions, plus a multi-scale latent gradient loss weighted at 0.25. The gradient loss computes spatial gradient differences between predicted and target latents at four downsampling scales, encouraging sharp edge structure in outputs.
+| Hyperparameter | Value |
+|---|---|
+| Steps | 150,000 |
+| Learning rate | 3e-4 (cosine decay to 6e-6, 1k warmup) |
+| Effective batch size | 8 (batch 1, grad accum 8) |
+| LoRA rank | 256, alpha √rank, RSLoRA |
+| LoRA init | PiSSA (8-iteration fast SVD) |
+| EMA decay | 0.999 (used for validation and final save) |
+| Optimizer | 8-bit Adam |
+| Gradient clipping | 0.8 (with spike detection: skip if >2.5× EMA norm) |
+| Precision | bfloat16 autocast |
+| Gradient loss weight | 0.25 |
+| Min-SNR-γ | 5.0 |
+| Timestep sampling | Logit-normal (mean=0, std=1) |
+Validation runs every 2,000 steps, generating 8 sample images with 8 denoising steps.
+---
+### Stage 3: Rectify (Trajectory Straightening)
+The rectify stage implements [Rectified Flow](https://arxiv.org/abs/2209.03003) to reduce the number of inference steps required at runtime.
+Standard flow-matching trains on random (noise, real target) pairs, producing curved ODE trajectories that require 25-50 denoising steps to traverse accurately. Rectified training instead pairs each noise sample with the output the fully-trained model generates from that noise, creating straight-line trajectories that can be traversed in 1-4 steps without quality loss.
+Before training, a one-time preprocessing pass runs the trained model over the training set. Each image is cropped deterministically (seeded RNG, same tile-sizing logic as training), then fully denoised with the trained weights to produce a (noise, generated_output) coupled pair saved to disk. This eliminates VAE encoding from the training loop, reducing VRAM further.
+The loss trains the model to predict the velocity between a coupled (noise, generated) pair at a random interpolated timestep. A pseudo-Huber loss replaces the MSE used in earlier stages, providing better gradient stability when predictions are far from target.
+| Hyperparameter | Value |
+|---|---|
+| Steps | 50,000 |
+| Learning rate | 3e-6 (cosine decay, 500 warmup) |
+| Effective batch size | 4 (batch 1, grad accum 4) |
+| LoRA rank | 256 |
+| LoRA init | Loaded from Stage 2 weights (warm-start) |
+| Loss | Pseudo-Huber (c=0.001) |
+| Optimizer | 8-bit Adam |
+| Gradient clipping | 1.0 |
+| Precision | bfloat16 autocast |
+| Timestep sampling | Logit-normal (mean=0, std=1) |
+Validation runs every 2,000 steps using real validation images (not coupled pairs), generating outputs with only 2 denoising steps to directly measure few-step inference quality.
+The result is usable MSII visualizations in 1-2 denoising steps, compared to the 25-50 steps standard flow-matching requires.
+---
+## Acknowledgements & Citations
+**3D Scan Data (HeiCuBeDa)**
+3D scans used to generate the training dataset are from the Heidelberg Cuneiform Benchmark Dataset (HeiCuBeDa):
+> Bogacz, B., Gertz, M., & Mara, H. (2015). *Character Proposals for Cuneiform Script Digitization*. Proceedings of the 15th International Conference on Frontiers in Handwriting Recognition (ICFHR). doi:[10.11588/data/IE8CCN](https://doi.org/10.11588/data/IE8CCN)
+**Archive Photographs (CDLI)**
+Real tablet photographs used in Stage 1 pretraining are sourced from the [Cuneiform Digital Library Initiative (CDLI)](https://cdli.mpiwg-berlin.mpg.de/).
+**MSII Curvature (GigaMesh)**
+MSII curvature values embedded in the HeiCuBeDa PLY files were computed using the [GigaMesh Software Framework](https://gigamesh.eu/).
+**Rectified Flow**
+Stage 3 (Rectify) implements the trajectory-straightening approach from:
+> Liu, X., et al. (2022). *Flow Straight and Fast: Learning to Generate and Transfer Data with Rectified Flow*. arXiv:[2209.03003](https://arxiv.org/abs/2209.03003)
+**Base Model (FLUX.2 Klein Base 4B)**
+Fine-tuned from [FLUX.2-klein-base-4B](https://huggingface.co/black-forest-labs/FLUX.2-klein-base-4B) by Black Forest Labs.

ae.safetensors ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:570cc44d0301b006a34b2604735cf296ef6083a95564b45042c1788eae246977
+size 336211292

assets/NisabaRelief-Logo.png ADDED Viewed

Git LFS Details

SHA256: 62b1fa428e2dea3b963eae0bf5d58cb369e0b2674d6e951fe96206c7e7b9becf
Pointer size: 132 Bytes
Size of remote file: 2.16 MB

assets/example_diff_0.png ADDED Viewed

Git LFS Details

SHA256: 837769a8e9d4a223e9575476e22f809f0d0b4305937586eb36d8cf235999a3bd
Pointer size: 132 Bytes
Size of remote file: 1.03 MB

assets/example_diff_1.png ADDED Viewed

Git LFS Details

SHA256: 6b155cd60651da23389b43cce096fe2ed6ec5e5c730ba69feec166e2728a402d
Pointer size: 132 Bytes
Size of remote file: 1.91 MB

assets/example_diff_2.png ADDED Viewed

Git LFS Details

SHA256: 4f457cfedc797934ffb66c6878c342e26d2d1c5e89bf19bc3c3287136a971a7a
Pointer size: 131 Bytes
Size of remote file: 927 kB

assets/example_diff_3.png ADDED Viewed

Git LFS Details

SHA256: b0aa7d54ff0ec6483c0ceda87793563a1b3bf7049ab45cafae96680dc1e6fb42
Pointer size: 132 Bytes
Size of remote file: 1.04 MB

assets/example_diff_4.png ADDED Viewed

Git LFS Details

SHA256: 2b154de211ec623532f835473aedcaf42f6a2969b53a35c79125ced14393c336
Pointer size: 131 Bytes
Size of remote file: 962 kB

assets/example_input_0.png ADDED Viewed

Git LFS Details

SHA256: e131fa5b64a19d113db548d9bf10181e1ffa9e5dce97185cf667ebd80113b228
Pointer size: 132 Bytes
Size of remote file: 6.2 MB

assets/example_input_1.png ADDED Viewed

Git LFS Details

SHA256: 3c0b2547da571247bd18452b8e956c850b47978ad8371b00ef0c6c6e1425c675
Pointer size: 132 Bytes
Size of remote file: 7.08 MB

assets/example_input_2.png ADDED Viewed

Git LFS Details

SHA256: 2b7dd866b8a9ff119b15b7471b67519bcf60af901255a18f6ff4d580d4181bbc
Pointer size: 132 Bytes
Size of remote file: 4.75 MB

assets/example_input_3.png ADDED Viewed

Git LFS Details

SHA256: 84843078d6da5c46f9afe2b9470dab73536ad62aaff33b9f3144b4cc7c61ee42
Pointer size: 132 Bytes
Size of remote file: 6.65 MB

assets/example_input_4.png ADDED Viewed

Git LFS Details

SHA256: f18ca16e4e4d18c7bbe6936bd80ba8d1d25ffebf90091c581ff2605778837a50
Pointer size: 132 Bytes
Size of remote file: 4.96 MB

assets/example_output_0.png ADDED Viewed

Git LFS Details

SHA256: 41b7420375095bfa79b929a45afe81a3076b5accf274971d6d6c154725e97ddc
Pointer size: 132 Bytes
Size of remote file: 1.74 MB

assets/example_output_1.png ADDED Viewed

Git LFS Details

SHA256: 317e6785571d72fe2e4de4629af384e03b59812d22a6c82781b5166392199f0c
Pointer size: 132 Bytes
Size of remote file: 2.38 MB

assets/example_output_2.png ADDED Viewed

Git LFS Details

SHA256: 99f48ab700e9f5aad5dc797a3b3e2afbe9e42ed2d408fdf392b12d683a7cc3ce
Pointer size: 132 Bytes
Size of remote file: 1.61 MB

assets/example_output_3.png ADDED Viewed

Git LFS Details

SHA256: da55bebf52e374b84769f4845e0f0ac7d3f7ec22e50a1f5003f0e6fe05414748
Pointer size: 132 Bytes
Size of remote file: 2.07 MB

assets/example_output_4.png ADDED Viewed

Git LFS Details

SHA256: 83275746a8232b448331d6b6738f64140a5cd091afc74cdb12851a54514c9325
Pointer size: 132 Bytes
Size of remote file: 1.67 MB

assets/example_truth_0.png ADDED Viewed

Git LFS Details

SHA256: d6df5e794dddc87e61fe021d6d85d17f18f3e06f675c7907a4a3d44ff8bfd09a
Pointer size: 132 Bytes
Size of remote file: 3.39 MB

assets/example_truth_1.png ADDED Viewed

Git LFS Details

SHA256: f79ffd7d9b76de5f11c12c29abdab46f1248e6727c385668561c81880c8ed31d
Pointer size: 132 Bytes
Size of remote file: 4.81 MB

assets/example_truth_2.png ADDED Viewed

Git LFS Details

SHA256: 503dbab87683109c6df9848b1ccfa3b1de79503bb5e9ed7b64f87d4a5893030e
Pointer size: 132 Bytes
Size of remote file: 3.21 MB

assets/example_truth_3.png ADDED Viewed

Git LFS Details

SHA256: 5413190d0c3473a03f15d2d9d6b30d7c1ef769e68abae37a7819705b23a6edfd
Pointer size: 132 Bytes
Size of remote file: 4.15 MB

assets/example_truth_4.png ADDED Viewed

Git LFS Details

SHA256: debdccfc969a8329bd79ba3fc9439d768a2a5373185141d1de76e2b24a26a5e2
Pointer size: 132 Bytes
Size of remote file: 3.09 MB

data/val_tablet_ids.json ADDED Viewed

	@@ -0,0 +1,90 @@

+[
+  "HS_1746",
+  "HS_1059",
+  "HS_1660",
+  "HS_2631",
+  "HS_2072",
+  "HS_890",
+  "HS_883",
+  "HS_0713",
+  "HS_919",
+  "HS_0459",
+  "HS_1327",
+  "HS_736",
+  "HS_1200",
+  "HS_294",
+  "HS_0205",
+  "HS_0362",
+  "HS_510",
+  "HS_1122",
+  "HS_2467",
+  "HS_1650",
+  "HS_2590",
+  "HS_2616",
+  "HS_1336",
+  "HS_2355",
+  "HS_0449",
+  "HS_1770",
+  "HS_0898",
+  "HS_2309",
+  "HS_2084",
+  "HS_566",
+  "HS_0199",
+  "HS_843",
+  "HS_1275",
+  "HS_2556",
+  "HS_1506",
+  "HS_1643",
+  "HS_0661",
+  "HS_1774",
+  "HS_0626",
+  "HS_933",
+  "HS_1485",
+  "HS_665",
+  "HS_1175",
+  "HS_1045",
+  "HS_901",
+  "HS_1494",
+  "HS_194a",
+  "HS_491",
+  "HS_1052",
+  "HS_841",
+  "HS_653",
+  "HS_0102",
+  "HS_848",
+  "HS_1304",
+  "HS_2503",
+  "HS_2061",
+  "HS_1186",
+  "HS_1944",
+  "HS_929",
+  "HS_501",
+  "HS_2673",
+  "HS_535",
+  "HS_1139",
+  "HS_2373",
+  "HS_0151",
+  "HS_2550",
+  "HS_2249",
+  "HS_1210",
+  "HS_1182",
+  "HS_0628",
+  "HS_0158b",
+  "HS_0164",
+  "HS_1949",
+  "HS_2511",
+  "HS_0570",
+  "HS_2337",
+  "HS_598",
+  "HS_435",
+  "HS_0717",
+  "HS_588",
+  "HS_1010",
+  "HS_1192",
+  "HS_1235",
+  "HS_1298",
+  "HS_600",
+  "HS_0147",
+  "HS_0749",
+  "HS_2641"
+]

dev_scripts/benchmark.py ADDED Viewed

	@@ -0,0 +1,149 @@

+"""Benchmark script for NisabaRelief inference pipeline."""
+import argparse
+import statistics
+import time
+from datetime import datetime
+from pathlib import Path
+import numpy as np
+from PIL import Image
+from rich.console import Console
+from rich.progress import (
+    BarColumn,
+    MofNCompleteColumn,
+    Progress,
+    TextColumn,
+    TimeElapsedColumn,
+)
+from rich.table import Table
+from nisaba_relief import NisabaRelief
+from util.load_val_dataset import load_val_dataset
+# Directory (relative to the repository root) that receives benchmark run images.
+BENCHMARK_DIR = Path(__file__).parent.parent / "data" / "benchmark"
+# Reference output image; main() creates it from the current run when it is missing
+# and diffs against it otherwise.
+BASELINE = BENCHMARK_DIR / "benchmark_baseline.png"
+# Leading runs whose timings are shown dimmed and excluded from the mean/stdev.
+WARMUP_RUNS = 2
+# Runs after the warmup that contribute to the reported mean/stdev.
+BENCH_RUNS = 3
+def build_timing_table(timings: list[float], n_warmup: int) -> Table:
+    bench_timings = timings[n_warmup:]
+    mean = statistics.mean(bench_timings)
+    stdev = statistics.stdev(bench_timings) if len(bench_timings) > 1 else 0.0
+    table = Table(title="Inference Timings")
+    table.add_column("Run", justify="right")
+    table.add_column("Time", justify="right")
+    for i, t in enumerate(timings, 1):
+        label = f"[dim]{i} (warmup)[/dim]" if i <= n_warmup else str(i - n_warmup)
+        time_str = f"[dim]{t:.2f}s[/dim]" if i <= n_warmup else f"{t:.2f}s"
+        table.add_row(label, time_str)
+    table.add_section()
+    table.add_row("[bold]Mean[/bold]", f"[bold]{mean:.2f} ± {stdev:.2f}s[/bold]")
+    return table
+def build_diff_table(flat: np.ndarray, max_diff: int) -> Table:
+    percentile_vals = np.percentile(flat, [50, 90, 95, 96, 97, 98, 99])
+    p98 = percentile_vals[5]
+    status = "PASS" if p98 <= 1 else "FAIL"
+    status_style = "green" if status == "PASS" else "red"
+    table = Table(
+        title=f"Pixel Diff vs Baseline  —  [{status_style}]{status}[/{status_style}]"
+    )
+    table.add_column("Stat", style="bold")
+    table.add_column("Value", justify="right")
+    table.add_row("Mean", f"{flat.mean():.4f}")
+    for label, val in zip(
+        ["p50", "p90", "p95", "p96", "p97", "p98", "p99"], percentile_vals
+    ):
+        table.add_row(label, f"{val:.0f}")
+    table.add_row("Max", str(max_diff))
+    return table
+def main():
+    parser = argparse.ArgumentParser(
+        description="Benchmark NisabaRelief inference pipeline"
+    )
+    parser.add_argument(
+        "--weights-dir",
+        default=".",
+        metavar="PATH",
+        help="path to weights directory (default: .)",
+    )
+    parser.add_argument(
+        "--device",
+        default=None,
+        metavar="DEVICE",
+        help="device to run inference on, e.g. cuda, cpu (default: cuda if available, else cpu)",
+    )
+    args = parser.parse_args()
+    console = Console()
+    rows = load_val_dataset()
+    test_image = rows[0]["photo"]
+    max_dim = max(test_image.size)
+    if max_dim > 2048:
+        scale = 2048 / max_dim
+        new_size = (round(test_image.width * scale), round(test_image.height * scale))
+        test_image = test_image.resize(new_size, Image.LANCZOS)
+    console.print(f"Input size: [cyan]{test_image.width}x{test_image.height}[/cyan]")
+    model_kwargs = dict(seed=42, weights_dir=Path(args.weights_dir))
+    if args.device is not None:
+        model_kwargs["device"] = args.device
+    model = NisabaRelief(**model_kwargs)
+    timings = []
+    output = None
+    total_runs = WARMUP_RUNS + BENCH_RUNS
+    progress = Progress(
+        TextColumn("[progress.description]{task.description}"),
+        BarColumn(),
+        MofNCompleteColumn(),
+        TimeElapsedColumn(),
+    )
+    with progress:
+        task = progress.add_task("Benchmarking", total=total_runs)
+        for i in range(total_runs):
+            t0 = time.perf_counter()
+            result = model.process(test_image, show_pbar=False)
+            timings.append(time.perf_counter() - t0)
+            progress.advance(task)
+            if i == WARMUP_RUNS:
+                output = result
+    console.print(build_timing_table(timings, WARMUP_RUNS))
+    timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
+    run_path = BENCHMARK_DIR / f"benchmark_{timestamp}.png"
+    run_path.parent.mkdir(parents=True, exist_ok=True)
+    output.save(run_path)
+    console.print(f"Run image saved to [cyan]{run_path}[/cyan]")
+    output_arr = np.array(output)
+    if not BASELINE.exists():
+        output.save(BASELINE)
+        console.print(f"Baseline saved to [cyan]{BASELINE}[/cyan]")
+    else:
+        baseline_arr = np.array(Image.open(BASELINE))
+        diff = np.abs(output_arr.astype(int) - baseline_arr.astype(int))
+        flat = diff.flatten()
+        max_diff = int(flat.max())
+        console.print(build_diff_table(flat, max_diff))
+        if max_diff > 0:
+            diff_img = Image.fromarray(
+                np.clip(diff * (255 // max_diff), 0, 255).astype("uint8")
+            )
+            diff_path = Path(f"benchmark_{timestamp}_diff.png")
+            diff_img.save(diff_path)
+            console.print(
+                f"Diff image saved to [cyan]{diff_path}[/cyan] (amplified {255 // max_diff}x)"
+            )
+if __name__ == "__main__":
+    main()

dev_scripts/evaluation.py ADDED Viewed

	@@ -0,0 +1,162 @@

+"""
+Evaluate NisabaRelief on the validation set, optionally sweeping over step counts.
+Usage:
+    python evaluation.py                # full dataset, num_steps=2
+    python evaluation.py --sweep        # subset, steps=[1,2,4,8]
+"""
+import argparse
+import time
+from datetime import timedelta
+from pathlib import Path
+import numpy as np
+from PIL import Image
+from rich.console import Console, Group
+from rich.live import Live
+from rich.progress import (
+    BarColumn,
+    MofNCompleteColumn,
+    Progress,
+    TextColumn,
+    TimeElapsedColumn,
+)
+from rich.table import Table
+from nisaba_relief import NisabaRelief
+from util.metrics import compute_metrics, METRIC_NAMES, LABELS
+from util.load_val_dataset import load_val_dataset
+# Step counts evaluated side by side when --sweep is passed.
+SWEEP_STEPS = [1, 2, 4, 8]
+# Step count used for the default (non-sweep) evaluation.
+DEFAULT_STEPS = 2
+# In sweep mode only every SWEEP_STRIDE-th validation row is evaluated...
+SWEEP_STRIDE = 4
+# ...and at most SWEEP_MAX rows are taken.
+SWEEP_MAX = 175
+# Predictions are written here as PNGs and re-read on later runs instead of re-inferring.
+EVALS_DIR = Path(__file__).parent.parent / "data" / "evals"
+def _eta(n_done: int, n_total: int, elapsed: float) -> str:
+    if n_done >= n_total > 0:
+        return "Done"
+    if n_done > 0:
+        return str(timedelta(seconds=int(elapsed / n_done * (n_total - n_done))))
+    return "?"
+def build_table(
+    results: dict,
+    n_done: int = 0,
+    n_total: int = 0,
+    elapsed: float = 0.0,
+) -> Table:
+    eta = _eta(n_done, n_total, elapsed)
+    steps = list(results.keys())
+    table = Table(title=f"Results  —  ETA: {eta}")
+    table.add_column("Metric", style="bold")
+    for s in steps:
+        table.add_column(f"Steps={s}", justify="right")
+    for name in METRIC_NAMES:
+        cells = []
+        for s in steps:
+            arr = np.array(results[s][name])
+            if len(arr) == 0:
+                cells.append("—")
+            elif name in ("psnr", "psnr_hvsm", "sre"):
+                cells.append(f"{arr.mean():.2f} ± {arr.std():.2f} dB")
+            else:
+                cells.append(f"{arr.mean():.4f} ± {arr.std():.4f}")
+        table.add_row(LABELS[name], *cells)
+    return table
+def load_grayscale(img: Image.Image) -> np.ndarray:
+    return np.array(img.convert("L"))
+def main():
+    """Evaluate NisabaRelief on the validation set and print a metrics table.
+    Without --sweep every validation row is evaluated at DEFAULT_STEPS; with
+    --sweep a strided subset is evaluated at each step count in SWEEP_STEPS.
+    Predictions are cached as PNGs under EVALS_DIR and reused when present.
+    """
+    parser = argparse.ArgumentParser(description="Evaluate NisabaRelief model")
+    parser.add_argument(
+        "--weights-dir",
+        default=".",
+        metavar="PATH",
+        help="path to weights directory (default: .)",
+    )
+    parser.add_argument(
+        "--sweep",
+        action="store_true",
+        help="sweep over steps=[1,2,4,8] on a dataset subset",
+    )
+    args = parser.parse_args()
+    rows = load_val_dataset()
+    if args.sweep:
+        # Take every SWEEP_STRIDE-th row, capped at SWEEP_MAX rows in total.
+        rows = rows.select(
+            range(0, min(len(rows), SWEEP_MAX * SWEEP_STRIDE), SWEEP_STRIDE)
+        )
+        steps_to_run = SWEEP_STEPS
+    else:
+        steps_to_run = [DEFAULT_STEPS]
+    # results[step_count][metric_name] -> list of per-image values.
+    results = {s: {m: [] for m in METRIC_NAMES} for s in steps_to_run}
+    model = NisabaRelief(seed=42, batch_size=4, weights_dir=Path(args.weights_dir))
+    progress = Progress(
+        TextColumn("[progress.description]{task.description}"),
+        BarColumn(),
+        MofNCompleteColumn(),
+        TimeElapsedColumn(),
+        TextColumn("[cyan]{task.fields[hs_number]}"),
+    )
+    task_desc = "Step Sweep" if args.sweep else "Evaluating"
+    task = progress.add_task(task_desc, total=len(rows), hs_number="")
+    start_time = time.monotonic()
+    # The progress bar and the running metrics table share one Live display.
+    with Live(
+        Group(progress, build_table(results)),
+        refresh_per_second=4,
+        transient=True,
+    ) as live:
+        for n_done, row in enumerate(rows):
+            progress.update(task, hs_number=row["hs_number"])
+            gt = load_grayscale(row["msii"])
+            for num_steps in steps_to_run:
+                model.num_steps = num_steps
+                # NOTE(review): the cache key is tablet id, variation and step count only;
+                # cached files from different weights or seed would be reused — confirm intended.
+                save_name = f"{row['hs_number']}_photo_fullview_{int(row['variation']):02d}-step{num_steps}.png"
+                save_path = EVALS_DIR / save_name
+                save_path.parent.mkdir(parents=True, exist_ok=True)
+                if save_path.exists():
+                    pred_img = Image.open(save_path)
+                else:
+                    pred_img = model.process(row["photo"], show_pbar=False)
+                    pred_img.save(save_path)
+                pred = load_grayscale(pred_img)
+                pred_img.close()
+                # Resize the prediction to the ground-truth size before computing metrics.
+                if pred.shape != gt.shape:
+                    pred = np.array(
+                        Image.fromarray(pred).resize(
+                            (gt.shape[1], gt.shape[0]), Image.LANCZOS
+                        )
+                    )
+                m = compute_metrics(pred, gt)
+                for name, val in m.items():
+                    results[num_steps][name].append(val)
+                elapsed = time.monotonic() - start_time
+                # NOTE(review): n_done + 1 is passed after each step count, so in sweep
+                # mode the row is counted as done before all of its step counts have run.
+                live.update(
+                    Group(progress, build_table(results, n_done + 1, len(rows), elapsed))
+                )
+            progress.advance(task)
+    # The Live display is transient, so print the final table once more after it closes.
+    final_elapsed = time.monotonic() - start_time
+    Console().print(build_table(results, len(rows), len(rows), final_elapsed))
+if __name__ == "__main__":
+    main()

dev_scripts/process_images.py ADDED Viewed

	@@ -0,0 +1,197 @@

+"""Process a directory of images through NisabaRelief and save as PNG."""
+import argparse
+from pathlib import Path
+from PIL import Image
+from rich.console import Console
+from rich.progress import (
+    BarColumn,
+    MofNCompleteColumn,
+    Progress,
+    ProgressColumn,
+    SpinnerColumn,
+    Task,
+    TextColumn,
+    TimeElapsedColumn,
+)
+from rich.text import Text
+from nisaba_relief import NisabaRelief
+from nisaba_relief.constants import MAX_TILE, MIN_IMAGE_DIMENSION
+# Lift Pillow's maximum-pixel-count limit so very large images can be opened.
+# NOTE(review): this also disables Pillow's decompression-bomb check; run on trusted inputs only.
+Image.MAX_IMAGE_PIXELS = None
+# File suffixes (matched lower-cased) that are treated as input images.
+IMAGE_EXTENSIONS = {".jpg", ".jpeg", ".png", ".tif", ".tiff", ".bmp", ".webp"}
+class SimpleTimeRemainingColumn(ProgressColumn):
+    """Estimates remaining time from the average duration of the last 10 iterations.
+    Only recomputes when a new step completes so the display is stable.
+    """
+    def __init__(self, window: int = 10) -> None:
+        super().__init__()
+        self._last_completed: float = 0
+        self._last_elapsed: float = 0.0
+        self._durations: list[float] = []
+        self._window: int = window
+        self._cached: Text = Text("-:--:--", style="progress.remaining")
+    def render(self, task: Task) -> Text:
+        if task.completed <= self._last_completed:
+            return self._cached
+        elapsed = task.finished_time if task.finished else task.elapsed
+        if not elapsed or not task.completed:
+            self._last_completed = task.completed
+            self._cached = Text("-:--:--", style="progress.remaining")
+            return self._cached
+        step_duration = elapsed - self._last_elapsed
+        steps = task.completed - self._last_completed
+        if steps > 0 and self._last_completed > 0:
+            per_step = step_duration / steps
+            self._durations.append(per_step)
+            if len(self._durations) > self._window:
+                self._durations = self._durations[-self._window :]
+        self._last_completed = task.completed
+        self._last_elapsed = elapsed
+        if not self._durations:
+            self._cached = Text("-:--:--", style="progress.remaining")
+            return self._cached
+        avg = sum(self._durations) / len(self._durations)
+        remaining = task.total - task.completed
+        eta_seconds = avg * remaining
+        hours, rem = divmod(int(eta_seconds), 3600)
+        minutes, seconds = divmod(rem, 60)
+        if hours:
+            self._cached = Text(
+                f"{hours}:{minutes:02d}:{seconds:02d}", style="progress.remaining"
+            )
+        else:
+            self._cached = Text(f"{minutes}:{seconds:02d}", style="progress.remaining")
+        return self._cached
+def main():
+    parser = argparse.ArgumentParser(
+        description="Process images through NisabaRelief and save as PNG."
+    )
+    parser.add_argument(
+        "--input-dir", type=Path, required=True, help="Source image directory"
+    )
+    parser.add_argument(
+        "--output-dir", type=Path, required=True, help="Destination directory (created if needed)"
+    )
+    parser.add_argument(
+        "--max-size", type=int, default=MAX_TILE * 5,
+        help="Downsample images larger than this before processing (default: %(default)s)",
+    )
+    parser.add_argument(
+        "--min-size", type=int, default=1536,
+        help="Skip images where max dimension < this (default: %(default)s)",
+    )
+    parser.add_argument("--seed", type=int, default=None, help="Reproducibility seed")
+    parser.add_argument("--weights-dir", type=Path, default=None, help="Local weights directory")
+    parser.add_argument("--batch-size", type=int, default=None, help="Tile batch size")
+    parser.add_argument("--num-steps", type=int, default=2, help="Solver steps (default: %(default)s)")
+    parser.add_argument("--device", default="cuda", help="Torch device (default: %(default)s)")
+    parser.add_argument(
+        "--overwrite", action="store_true", help="Re-process even if output file exists"
+    )
+    args = parser.parse_args()
+    console = Console()
+    input_dir: Path = args.input_dir
+    output_dir: Path = args.output_dir
+    if not input_dir.is_dir():
+        console.print(f"[red]Input directory not found:[/red] [cyan]{input_dir}[/cyan]")
+        return
+    input_images = sorted(
+        p for p in input_dir.iterdir() if p.suffix.lower() in IMAGE_EXTENSIONS
+    )
+    if not input_images:
+        console.print(f"[red]No images found in[/red] [cyan]{input_dir}[/cyan]")
+        return
+    output_dir.mkdir(parents=True, exist_ok=True)
+    to_process = []
+    skipped_existing = 0
+    skipped_small = 0
+    for src in input_images:
+        dst = output_dir / (src.stem + ".png")
+        if not args.overwrite and dst.exists():
+            skipped_existing += 1
+            continue
+        with Image.open(src) as img:
+            if max(img.size) < args.min_size or min(img.size) < MIN_IMAGE_DIMENSION:
+                skipped_small += 1
+                continue
+        to_process.append((src, dst))
+    if skipped_existing:
+        console.print(
+            f"[dim]Skipping {skipped_existing} already-processed image(s)[/dim]"
+        )
+    if skipped_small:
+        console.print(
+            f"[dim]Skipping {skipped_small} image(s) smaller than {args.min_size}px[/dim]"
+        )
+    if not to_process:
+        console.print("[green]All images already processed.[/green]")
+        return
+    console.print(
+        f"Processing [bold]{len(to_process)}[/bold] / {len(input_images)} images  "
+        f"[dim]({input_dir} → {output_dir})[/dim]"
+    )
+    model_kwargs = dict(num_steps=args.num_steps, device=args.device)
+    if args.seed is not None:
+        model_kwargs["seed"] = args.seed
+    if args.weights_dir is not None:
+        model_kwargs["weights_dir"] = args.weights_dir
+    if args.batch_size is not None:
+        model_kwargs["batch_size"] = args.batch_size
+    model = NisabaRelief(**model_kwargs)
+    progress = Progress(
+        SpinnerColumn(),
+        TextColumn("[progress.description]{task.description}"),
+        BarColumn(),
+        MofNCompleteColumn(),
+        TimeElapsedColumn(),
+        TextColumn("eta"),
+        SimpleTimeRemainingColumn(),
+    )
+    with progress:
+        task = progress.add_task("Processing", total=len(to_process))
+        for src, dst in to_process:
+            progress.update(task, description=f"[cyan]{src.name}[/cyan]")
+            image = Image.open(src).convert("RGB")
+            original_size = image.size
+            if max(image.size) > args.max_size:
+                scale = args.max_size / max(image.size)
+                new_size = (
+                    round(image.width * scale) // 16 * 16,
+                    round(image.height * scale) // 16 * 16,
+                )
+                image = image.resize(new_size, Image.LANCZOS)
+            result = model.process(image, show_pbar=False)
+            if result.size != original_size:
+                result = result.resize(original_size, Image.LANCZOS)
+            result.save(dst)
+            progress.advance(task)
+    console.print(
+        f"[green]Done.[/green] {len(to_process)} image(s) saved to [cyan]{output_dir}[/cyan]"
+    )
+if __name__ == "__main__":
+    main()

dev_scripts/util/load_val_dataset.py ADDED Viewed

	@@ -0,0 +1,24 @@

+"""
+Load the validation set from the CuneiformPhotosMSII dataset.
+"""
+from datasets import load_dataset, Dataset
+from pathlib import Path
+import json
+VAL_IDS_PATH = Path(__file__).parent.parent.parent / "data" / "val_tablet_ids.json"
+VAL_IDS = set(json.load(open(VAL_IDS_PATH)))
+def load_val_dataset() -> Dataset:
+    ds = load_dataset("boatbomber/CuneiformPhotosMSII", split="train", num_proc=4)
+    # First pass: parquet column projection reads only the ID strings, skipping image bytes
+    indices = [
+        i
+        for i, row in enumerate(ds.select_columns(["hs_number"]))
+        if row["hs_number"] in VAL_IDS
+    ]
+    return ds.select(indices)

dev_scripts/util/metrics.py ADDED Viewed

	@@ -0,0 +1,67 @@

+"""Shared metric computation for NisabaRelief evaluation scripts."""
+from concurrent.futures import ThreadPoolExecutor
+import numpy as np
+from image_similarity_measures.quality_metrics import (
+    rmse,
+    psnr,
+    sre,
+)
+import torch
+from pytorch_msssim import ms_ssim as _pt_msssim
+from util.psnr_hvsm import psnr_hvsm
+# Intensity cutoff used to binarize images for the Dice score: compute_metrics
+# treats pixels strictly greater than this value as foreground
+DICE_THRESHOLD = 130
+# Keys of the metrics returned by compute_metrics
+METRIC_NAMES = [
+    "dice",
+    "rmse",
+    "msssim",
+    "psnr",
+    "psnr_hvsm",
+    "sre",
+]
+# Display label for each metric key ("dice" carries Markdown bold markers)
+LABELS = {
+    "dice": "**Dice**",
+    "rmse": "RMSE",
+    "msssim": "MS-SSIM",
+    "psnr": "PSNR",
+    "psnr_hvsm": "PSNR-HVS-M",
+    "sre": "SRE",
+}
+def _to_tensor(arr: np.ndarray) -> torch.Tensor:
+    return torch.from_numpy(arr).float().unsqueeze(0).unsqueeze(0)
+def _msssim(gt: np.ndarray, pred: np.ndarray) -> float:
+    return _pt_msssim(
+        _to_tensor(gt), _to_tensor(pred), data_range=255, size_average=True
+    ).item()
+def compute_metrics(pred: np.ndarray, gt: np.ndarray) -> dict[str, float]:
+    """Compute all metrics for a pair of equal-shape grayscale uint8 images."""
+    pred_3d = pred[:, :, np.newaxis]
+    gt_3d = gt[:, :, np.newaxis]
+    pred_bin = pred > DICE_THRESHOLD
+    gt_bin = gt > DICE_THRESHOLD
+    denom = pred_bin.sum() + gt_bin.sum()
+    dice = float(2 * np.logical_and(pred_bin, gt_bin).sum() / denom) if denom > 0 else 1.0
+    tasks = {
+        "rmse": lambda: rmse(gt_3d, pred_3d, max_p=255),
+        "psnr": lambda: psnr(gt_3d, pred_3d, max_p=255),
+        "msssim": lambda: _msssim(gt, pred),
+        "sre": lambda: sre(gt_3d, pred_3d),
+        "psnr_hvsm": lambda: psnr_hvsm(gt, pred)[0],
+        "dice": lambda: dice,
+    }
+    with ThreadPoolExecutor(max_workers=len(tasks)) as executor:
+        futures = {name: executor.submit(fn) for name, fn in tasks.items()}
+        return {name: future.result() for name, future in futures.items()}

dev_scripts/util/psnr_hvsm.py ADDED Viewed

	@@ -0,0 +1,137 @@

+"""PSNR-HVS-M and PSNR-HVS metrics (Ponomarenko et al., 2006/2007).
+Direct Python translation of the MATLAB reference implementation at
+https://www.ponomarenko.info/psnrhvsm.m
+Returns (p_hvs_m, p_hvs) as a tuple.
+Uses CUDA if available, otherwise falls back to CPU.
+"""
+import math
+import numpy as np
+import torch
+_N = 8
+def _make_dct_matrix() -> torch.Tensor:
+    """8x8 orthonormal DCT-II matrix: D[0,n]=1/√N, D[k>0,n]=√(2/N)·cos(π·k·(2n+1)/(2N))."""
+    k = torch.arange(_N, dtype=torch.float64).unsqueeze(1)
+    n = torch.arange(_N, dtype=torch.float64).unsqueeze(0)
+    D = torch.cos(math.pi * k * (2 * n + 1) / (2 * _N))
+    D[0] = D[0] / math.sqrt(_N)
+    D[1:] = D[1:] * math.sqrt(2.0 / _N)
+    return D
+_DCT8 = _make_dct_matrix()  # (8, 8), CPU float64
+# 8x8 per-coefficient CSF weights: psnr_hvsm multiplies the absolute DCT
+# differences by this table before squaring and summing them
+_CSF = torch.tensor(
+    [
+        [1.608443, 2.339554, 2.573509, 1.608443, 1.072295, 0.643377, 0.504610, 0.421887],
+        [2.144591, 2.144591, 1.838221, 1.354478, 0.989811, 0.443708, 0.428918, 0.467911],
+        [1.838221, 1.979622, 1.608443, 1.072295, 0.643377, 0.451493, 0.372972, 0.459555],
+        [1.838221, 1.513829, 1.169777, 0.887417, 0.504610, 0.295806, 0.321689, 0.415082],
+        [1.429727, 1.169777, 0.695543, 0.459555, 0.378457, 0.236102, 0.249855, 0.334222],
+        [1.072295, 0.735288, 0.467911, 0.402111, 0.317717, 0.247453, 0.227744, 0.279729],
+        [0.525206, 0.402111, 0.329937, 0.295806, 0.249855, 0.212687, 0.214459, 0.254803],
+        [0.357432, 0.279729, 0.270896, 0.262603, 0.229778, 0.257351, 0.249855, 0.259950],
+    ],
+    dtype=torch.float64,
+)
+# 8x8 per-coefficient masking weights: _maskeff_batch uses them to weight the
+# squared AC coefficients, and psnr_hvsm divides the per-block mask by them to
+# get each coefficient's threshold
+_MASKCOF = torch.tensor(
+    [
+        [0.390625, 0.826446, 1.000000, 0.390625, 0.173611, 0.062500, 0.038447, 0.026874],
+        [0.694444, 0.694444, 0.510204, 0.277008, 0.147929, 0.029727, 0.027778, 0.033058],
+        [0.510204, 0.591716, 0.390625, 0.173611, 0.062500, 0.030779, 0.021004, 0.031888],
+        [0.510204, 0.346021, 0.206612, 0.118906, 0.038447, 0.013212, 0.015625, 0.026015],
+        [0.308642, 0.206612, 0.073046, 0.031888, 0.021626, 0.008417, 0.009426, 0.016866],
+        [0.173611, 0.081633, 0.033058, 0.024414, 0.015242, 0.009246, 0.007831, 0.011815],
+        [0.041649, 0.024414, 0.016437, 0.013212, 0.009426, 0.006830, 0.006944, 0.009803],
+        [0.019290, 0.011815, 0.011080, 0.010412, 0.007972, 0.010000, 0.009426, 0.010203],
+    ],
+    dtype=torch.float64,
+)
+# Boolean selector over an 8x8 coefficient block:
+# True everywhere except the DC coefficient at (0, 0)
+_AC_MASK = torch.ones((_N, _N), dtype=torch.bool)
+_AC_MASK[0, 0] = False
+def _vari_batch(blocks: torch.Tensor) -> torch.Tensor:
+    """Unbiased variance * N for a batch of blocks. (B, H, W) -> (B,)"""
+    flat = blocks.reshape(blocks.shape[0], -1)
+    return flat.var(dim=-1, correction=1) * flat.shape[-1]
+def _maskeff_batch(blocks: torch.Tensor, dct_blocks: torch.Tensor) -> torch.Tensor:
+    """Perceptual masking strength for a batch of 8x8 blocks. Returns (B,)."""
+    dev = blocks.device
+    ac = _AC_MASK.to(dev)
+    mc = _MASKCOF.to(dev)
+    m = (dct_blocks[:, ac] ** 2 * mc[ac]).sum(dim=-1)  # (B,)
+    pop = _vari_batch(blocks)
+    quad = (
+        _vari_batch(blocks[:, :4, :4])
+        + _vari_batch(blocks[:, :4, 4:])
+        + _vari_batch(blocks[:, 4:, :4])
+        + _vari_batch(blocks[:, 4:, 4:])
+    )
+    pop_ratio = torch.where(pop > 0, quad / pop, torch.zeros_like(pop))
+    return torch.sqrt(m * pop_ratio) / 32.0
+def psnr_hvsm(img1: np.ndarray, img2: np.ndarray) -> tuple[float, float]:
+    """Return (PSNR-HVS-M, PSNR-HVS) for two uint8 grayscale arrays.
+    Direct translation of the MATLAB reference (Ponomarenko et al.).
+    Partial edge blocks are skipped (truncate to nearest multiple of 8).
+    """
+    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
+    D = _DCT8.to(device)
+    csf = _CSF.to(device)
+    maskcof = _MASKCOF.to(device)
+    ac_mask = _AC_MASK.to(device)
+    a = torch.from_numpy(img1.astype(np.float64)).to(device)
+    b = torch.from_numpy(img2.astype(np.float64)).to(device)
+    h, w = a.shape
+    h = (h // 8) * 8
+    w = (w // 8) * 8
+    a = a[:h, :w]
+    b = b[:h, :w]
+    num_blocks = (h // 8) * (w // 8)
+    if num_blocks == 0:
+        return 100000.0, 100000.0
+    # Extract all non-overlapping 8x8 blocks: (B, 8, 8)
+    ba = a.unfold(0, 8, 8).unfold(1, 8, 8).contiguous().reshape(-1, 8, 8)
+    bb = b.unfold(0, 8, 8).unfold(1, 8, 8).contiguous().reshape(-1, 8, 8)
+    # 2D DCT-II (ortho) via separable matrix product: D @ block @ D.T
+    da = D @ ba @ D.t()
+    db = D @ bb @ D.t()
+    mask = torch.maximum(_maskeff_batch(ba, da), _maskeff_batch(bb, db))  # (B,)
+    diff = torch.abs(da - db)  # (B, 8, 8)
+    # PSNR-HVS: CSF-weighted squared error (no masking)
+    S2 = float(((diff * csf) ** 2).sum())
+    # PSNR-HVS-M: soft-threshold AC coefficients by local mask, keep DC as-is
+    thresh = mask[:, None, None] / maskcof[None, :, :]
+    u = torch.where(ac_mask[None, :, :], torch.clamp(diff - thresh, min=0.0), diff)
+    S1 = float(((u * csf) ** 2).sum())
+    denom = num_blocks * 64
+    S1 /= denom
+    S2 /= denom
+    p_hvs_m = 100000.0 if S1 == 0 else float(10.0 * np.log10(255.0**2 / S1))
+    p_hvs = 100000.0 if S2 == 0 else float(10.0 * np.log10(255.0**2 / S2))
+    return p_hvs_m, p_hvs

model.safetensors ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:2b83231048fc47de658665425368daadce6791bfd95456397b8b595aa0e5d05d
+size 7751105712

nisaba_relief/__init__.py ADDED Viewed

	@@ -0,0 +1,7 @@

+"""NisabaRelief: Transform cuneiform tablet photos into MSII relief visualizations."""
+__version__ = "0.1.0"
+from .model import NisabaRelief
+__all__ = ["NisabaRelief"]

nisaba_relief/constants.py ADDED Viewed

	@@ -0,0 +1,42 @@

+"""Named constants for NisabaRelief magic numbers."""
+# Flux model processes images in 16×16 pixel patches
+PATCH_SIZE = 16
+# Tile size bounds, expressed in whole patches: 12 patches (192px) to 64 patches (1024px)
+MIN_TILE = PATCH_SIZE * 12  # 192
+MAX_TILE = PATCH_SIZE * 64  # 1024
+# Aims for ~4 tiles along the longest axis when computing tile size
+TARGET_TILES_PER_SIDE = 4
+# Overlap is 1/8 of the tile size, giving a smooth cosine blend region
+TILE_OVERLAP_DIVISOR = 8
+# Smallest accepted input side in pixels (two minimum-size tiles)
+MIN_IMAGE_DIMENSION = MIN_TILE * 2  # 384
+# Maximum allowed aspect ratio (width:height or height:width)
+MAX_ASPECT_RATIO = 8.0
+# Maximum size (px) for the global context thumbnail
+MAX_GLOBAL_CONTEXT_SIZE = 128
+# Positional sequence ID for conditioning tokens (image being processed)
+COND_SEQ_ID = 10
+# Positional sequence ID for global context tokens (thumbnail overview)
+GLOBAL_CTX_ID = 20
+# Number of latent channels in the Flux model's latent space
+LATENT_CHANNELS = 128
+# Dynamic batch_size constants. Determined empirically on an RTX 3090.
+# Bounds for the computed batch size
+MAX_BATCH_SIZE = 16
+MIN_BATCH_SIZE = 1
+# VRAM cost model in MB — presumably a per-pixel cost plus a fixed overhead,
+# with headroom left free; TODO confirm against the batch-size computation
+VRAM_MB_PER_PIXEL = 0.0035
+VRAM_FIXED_OVERHEAD_MB = 15.0
+VRAM_HEADROOM_MB = 1024.0
+# Divisor for AE decoder sub-batching (decoder needs more VRAM than denoiser)
+DECODE_BATCH_SIZE_DIVISOR = 5

nisaba_relief/flux/__init__.py ADDED Viewed

File without changes

nisaba_relief/flux/autoencoder.py ADDED Viewed

	@@ -0,0 +1,351 @@

+import math
+from dataclasses import dataclass, field
+import torch
+import torch.nn.functional as F
+from einops import rearrange
+from torch import Tensor, nn
+@dataclass
+class AutoEncoderParams:
+    """Hyperparameters used to build the AutoEncoder's Encoder and Decoder."""
+    # Nominal input resolution; the Decoder derives its z_shape from it
+    resolution: int = 256
+    # Channels of the image fed to the encoder's first convolution
+    in_channels: int = 3
+    # Base channel width; each level uses ch * ch_mult[level] channels
+    ch: int = 128
+    # Channels of the image produced by the decoder's last convolution
+    out_ch: int = 3
+    # Per-level channel multipliers; its length sets the number of resolution levels
+    ch_mult: list[int] = field(default_factory=lambda: [1, 2, 4, 4])
+    # ResnetBlocks per encoder level (the decoder builds one more per level)
+    num_res_blocks: int = 2
+    # Latent channels: the encoder emits 2 * z_channels, the decoder consumes z_channels
+    z_channels: int = 32
+class AttnBlock(nn.Module):
+    def __init__(self, in_channels: int):
+        super().__init__()
+        self.in_channels = in_channels
+        self.norm = nn.GroupNorm(
+            num_groups=32, num_channels=in_channels, eps=1e-6, affine=True
+        )
+        self.q = nn.Conv2d(in_channels, in_channels, kernel_size=1)
+        self.k = nn.Conv2d(in_channels, in_channels, kernel_size=1)
+        self.v = nn.Conv2d(in_channels, in_channels, kernel_size=1)
+        self.proj_out = nn.Conv2d(in_channels, in_channels, kernel_size=1)
+    def attention(self, h_: Tensor) -> Tensor:
+        h_ = self.norm(h_)
+        q = self.q(h_)
+        k = self.k(h_)
+        v = self.v(h_)
+        b, c, h, w = q.shape
+        q = rearrange(q, "b c h w -> b 1 (h w) c").contiguous()
+        k = rearrange(k, "b c h w -> b 1 (h w) c").contiguous()
+        v = rearrange(v, "b c h w -> b 1 (h w) c").contiguous()
+        h_ = nn.functional.scaled_dot_product_attention(q, k, v)
+        return rearrange(h_, "b 1 (h w) c -> b c h w", h=h, w=w, c=c, b=b)
+    def forward(self, x: Tensor) -> Tensor:
+        return x + self.proj_out(self.attention(x))
+class ResnetBlock(nn.Module):
+    def __init__(self, in_channels: int, out_channels: int):
+        super().__init__()
+        self.in_channels = in_channels
+        out_channels = in_channels if out_channels is None else out_channels
+        self.out_channels = out_channels
+        self.norm1 = nn.GroupNorm(
+            num_groups=32, num_channels=in_channels, eps=1e-6, affine=True
+        )
+        self.conv1 = nn.Conv2d(
+            in_channels, out_channels, kernel_size=3, stride=1, padding=1
+        )
+        self.norm2 = nn.GroupNorm(
+            num_groups=32, num_channels=out_channels, eps=1e-6, affine=True
+        )
+        self.conv2 = nn.Conv2d(
+            out_channels, out_channels, kernel_size=3, stride=1, padding=1
+        )
+        if self.in_channels != self.out_channels:
+            self.nin_shortcut = nn.Conv2d(
+                in_channels, out_channels, kernel_size=1, stride=1, padding=0
+            )
+    def forward(self, x: Tensor) -> Tensor:
+        h = x
+        h = self.norm1(h)
+        h = F.silu(h)
+        h = self.conv1(h)
+        h = self.norm2(h)
+        h = F.silu(h)
+        h = self.conv2(h)
+        if self.in_channels != self.out_channels:
+            x = self.nin_shortcut(x)
+        return x + h
+class Downsample(nn.Module):
+    def __init__(self, in_channels: int):
+        super().__init__()
+        # no asymmetric padding in torch conv, must do it ourselves
+        self.conv = nn.Conv2d(
+            in_channels, in_channels, kernel_size=3, stride=2, padding=0
+        )
+    def forward(self, x: Tensor) -> Tensor:
+        pad = (0, 1, 0, 1)
+        x = nn.functional.pad(x, pad, mode="constant", value=0)
+        x = self.conv(x)
+        return x
+class Upsample(nn.Module):
+    def __init__(self, in_channels: int):
+        super().__init__()
+        self.conv = nn.Conv2d(
+            in_channels, in_channels, kernel_size=3, stride=1, padding=1
+        )
+    def forward(self, x: Tensor) -> Tensor:
+        x = nn.functional.interpolate(x, scale_factor=2.0, mode="nearest")
+        x = self.conv(x)
+        return x
+class Encoder(nn.Module):
+    def __init__(
+        self,
+        resolution: int,
+        in_channels: int,
+        ch: int,
+        ch_mult: list[int],
+        num_res_blocks: int,
+        z_channels: int,
+    ):
+        super().__init__()
+        self.quant_conv = torch.nn.Conv2d(2 * z_channels, 2 * z_channels, 1)
+        self.ch = ch
+        self.num_resolutions = len(ch_mult)
+        self.num_res_blocks = num_res_blocks
+        self.resolution = resolution
+        self.in_channels = in_channels
+        # downsampling
+        self.conv_in = nn.Conv2d(in_channels, self.ch, kernel_size=3, stride=1, padding=1)
+        curr_res = resolution
+        in_ch_mult = (1,) + tuple(ch_mult)
+        self.in_ch_mult = in_ch_mult
+        self.down = nn.ModuleList()
+        block_in = self.ch
+        for i_level in range(self.num_resolutions):
+            block = nn.ModuleList()
+            attn = nn.ModuleList()
+            block_in = ch * in_ch_mult[i_level]
+            block_out = ch * ch_mult[i_level]
+            for _ in range(self.num_res_blocks):
+                block.append(ResnetBlock(in_channels=block_in, out_channels=block_out))
+                block_in = block_out
+            down = nn.Module()
+            down.block = block
+            down.attn = attn
+            if i_level != self.num_resolutions - 1:
+                down.downsample = Downsample(block_in)
+                curr_res = curr_res // 2
+            self.down.append(down)
+        # middle
+        self.mid = nn.Module()
+        self.mid.block_1 = ResnetBlock(in_channels=block_in, out_channels=block_in)
+        self.mid.attn_1 = AttnBlock(block_in)
+        self.mid.block_2 = ResnetBlock(in_channels=block_in, out_channels=block_in)
+        # end
+        self.norm_out = nn.GroupNorm(
+            num_groups=32, num_channels=block_in, eps=1e-6, affine=True
+        )
+        self.conv_out = nn.Conv2d(
+            block_in, 2 * z_channels, kernel_size=3, stride=1, padding=1
+        )
+    def forward(self, x: Tensor) -> Tensor:
+        # downsampling
+        h = self.conv_in(x)
+        for i_level in range(self.num_resolutions):
+            for i_block in range(self.num_res_blocks):
+                h = self.down[i_level].block[i_block](h)
+                if len(self.down[i_level].attn) > 0:
+                    h = self.down[i_level].attn[i_block](h)
+            if i_level != self.num_resolutions - 1:
+                h = self.down[i_level].downsample(h)
+        # middle
+        h = self.mid.block_1(h)
+        h = self.mid.attn_1(h)
+        h = self.mid.block_2(h)
+        # end
+        h = self.norm_out(h)
+        h = F.silu(h)
+        h = self.conv_out(h)
+        h = self.quant_conv(h)
+        return h
+class Decoder(nn.Module):
+    def __init__(
+        self,
+        ch: int,
+        out_ch: int,
+        ch_mult: list[int],
+        num_res_blocks: int,
+        in_channels: int,
+        resolution: int,
+        z_channels: int,
+    ):
+        super().__init__()
+        self.post_quant_conv = torch.nn.Conv2d(z_channels, z_channels, 1)
+        self.ch = ch
+        self.num_resolutions = len(ch_mult)
+        self.num_res_blocks = num_res_blocks
+        self.resolution = resolution
+        self.in_channels = in_channels
+        self.ffactor = 2 ** (self.num_resolutions - 1)
+        # compute in_ch_mult, block_in and curr_res at lowest res
+        block_in = ch * ch_mult[self.num_resolutions - 1]
+        curr_res = resolution // 2 ** (self.num_resolutions - 1)
+        self.z_shape = (1, z_channels, curr_res, curr_res)
+        # z to block_in
+        self.conv_in = nn.Conv2d(z_channels, block_in, kernel_size=3, stride=1, padding=1)
+        # middle
+        self.mid = nn.Module()
+        self.mid.block_1 = ResnetBlock(in_channels=block_in, out_channels=block_in)
+        self.mid.attn_1 = AttnBlock(block_in)
+        self.mid.block_2 = ResnetBlock(in_channels=block_in, out_channels=block_in)
+        # upsampling
+        self.up = nn.ModuleList()
+        for i_level in reversed(range(self.num_resolutions)):
+            block = nn.ModuleList()
+            attn = nn.ModuleList()
+            block_out = ch * ch_mult[i_level]
+            for _ in range(self.num_res_blocks + 1):
+                block.append(ResnetBlock(in_channels=block_in, out_channels=block_out))
+                block_in = block_out
+            up = nn.Module()
+            up.block = block
+            up.attn = attn
+            if i_level != 0:
+                up.upsample = Upsample(block_in)
+                curr_res = curr_res * 2
+            self.up.insert(0, up)  # prepend to get consistent order
+        # end
+        self.norm_out = nn.GroupNorm(
+            num_groups=32, num_channels=block_in, eps=1e-6, affine=True
+        )
+        self.conv_out = nn.Conv2d(block_in, out_ch, kernel_size=3, stride=1, padding=1)
+    def forward(self, z: Tensor) -> Tensor:
+        z = self.post_quant_conv(z)
+        # get dtype for proper tracing
+        upscale_dtype = next(self.up.parameters()).dtype
+        # z to block_in
+        h = self.conv_in(z)
+        # middle
+        h = self.mid.block_1(h)
+        h = self.mid.attn_1(h)
+        h = self.mid.block_2(h)
+        # cast to proper dtype
+        h = h.to(upscale_dtype)
+        # upsampling
+        for i_level in reversed(range(self.num_resolutions)):
+            for i_block in range(self.num_res_blocks + 1):
+                h = self.up[i_level].block[i_block](h)
+                if len(self.up[i_level].attn) > 0:
+                    h = self.up[i_level].attn[i_block](h)
+            if i_level != 0:
+                h = self.up[i_level].upsample(h)
+        # end
+        h = self.norm_out(h)
+        h = F.silu(h)
+        h = self.conv_out(h)
+        return h
+class AutoEncoder(nn.Module):
+    def __init__(self, params: AutoEncoderParams = AutoEncoderParams()):
+        super().__init__()
+        self.params = params
+        self.encoder = Encoder(
+            resolution=params.resolution,
+            in_channels=params.in_channels,
+            ch=params.ch,
+            ch_mult=params.ch_mult,
+            num_res_blocks=params.num_res_blocks,
+            z_channels=params.z_channels,
+        )
+        self.decoder = Decoder(
+            resolution=params.resolution,
+            in_channels=params.in_channels,
+            ch=params.ch,
+            out_ch=params.out_ch,
+            ch_mult=params.ch_mult,
+            num_res_blocks=params.num_res_blocks,
+            z_channels=params.z_channels,
+        )
+        self.bn_eps = 1e-4
+        self.bn_momentum = 0.1
+        self.ps = [2, 2]
+        self.bn = torch.nn.BatchNorm2d(
+            math.prod(self.ps) * params.z_channels,
+            eps=self.bn_eps,
+            momentum=self.bn_momentum,
+            affine=False,
+            track_running_stats=True,
+        )
+    def normalize(self, z: Tensor) -> Tensor:
+        return self.bn(z)
+    def inv_normalize(self, z: Tensor) -> Tensor:
+        s = torch.sqrt(self.bn.running_var.view(1, -1, 1, 1) + self.bn_eps)
+        m = self.bn.running_mean.view(1, -1, 1, 1)
+        return z * s + m
+    def encode(self, x: Tensor) -> Tensor:
+        moments = self.encoder(x)
+        mean = torch.chunk(moments, 2, dim=1)[0]
+        z = rearrange(
+            mean,
+            "... c (i pi) (j pj)  -> ... (c pi pj) i j",
+            pi=self.ps[0],
+            pj=self.ps[1],
+        )
+        z = self.normalize(z)
+        return z
+    def decode(self, z: Tensor) -> Tensor:
+        z = self.inv_normalize(z)
+        z = rearrange(
+            z,
+            "... (c pi pj) i j -> ... c (i pi) (j pj)",
+            pi=self.ps[0],
+            pj=self.ps[1],
+        )
+        dec = self.decoder(z.to(next(self.decoder.parameters()).dtype))
+        return dec

nisaba_relief/flux/layers.py ADDED Viewed

	@@ -0,0 +1,341 @@

+"""Building-block nn.Module primitives and standalone functions for Flux2."""
+import math
+import torch
+from einops import rearrange
+from torch import Tensor, nn
+def timestep_embedding(t: Tensor, dim, max_period=10000, time_factor: float = 1000.0):
+    t = time_factor * t
+    half = dim // 2
+    freqs = torch.exp(
+        -math.log(max_period)
+        * torch.arange(start=0, end=half, device=t.device, dtype=torch.float32)
+        / half
+    )
+    args = t[:, None].float() * freqs[None]
+    embedding = torch.cat([torch.cos(args), torch.sin(args)], dim=-1)
+    if dim % 2:
+        embedding = torch.cat([embedding, torch.zeros_like(embedding[:, :1])], dim=-1)
+    if torch.is_floating_point(t):
+        embedding = embedding.to(t)
+    return embedding
+def attention(q: Tensor, k: Tensor, v: Tensor, pe: Tensor) -> Tensor:
+    q, k = apply_rope(q, k, pe)
+    x = torch.nn.functional.scaled_dot_product_attention(q, k, v)
+    x = rearrange(x, "B H L D -> B L (H D)")
+    return x
+def rope(pos: Tensor, dim: int, theta: int) -> Tensor:
+    assert dim % 2 == 0
+    scale = torch.arange(0, dim, 2, dtype=torch.float32, device=pos.device) / dim
+    omega = 1.0 / (theta**scale)
+    out = torch.einsum("...n,d->...nd", pos.float(), omega)
+    out = torch.stack(
+        [torch.cos(out), -torch.sin(out), torch.sin(out), torch.cos(out)],
+        dim=-1,
+    )
+    out = rearrange(out, "b n d (i j) -> b n d i j", i=2, j=2)
+    return out
+def apply_rope(xq: Tensor, xk: Tensor, freqs_cis: Tensor) -> tuple[Tensor, Tensor]:
+    xq_ = xq.reshape(*xq.shape[:-1], -1, 1, 2)
+    xk_ = xk.reshape(*xk.shape[:-1], -1, 1, 2)
+    freqs_cis = freqs_cis.to(xq.dtype)
+    xq_out = freqs_cis[..., 0] * xq_[..., 0] + freqs_cis[..., 1] * xq_[..., 1]
+    xk_out = freqs_cis[..., 0] * xk_[..., 0] + freqs_cis[..., 1] * xk_[..., 1]
+    return xq_out.reshape(*xq.shape), xk_out.reshape(*xk.shape)
+class SelfAttention(nn.Module):
+    def __init__(
+        self,
+        dim: int,
+        num_heads: int = 8,
+    ):
+        super().__init__()
+        self.num_heads = num_heads
+        head_dim = dim // num_heads
+        self.qkv = nn.Linear(dim, dim * 3, bias=False)
+        self.norm = QKNorm(head_dim)
+        self.proj = nn.Linear(dim, dim, bias=False)
+class SiLUActivation(nn.Module):
+    def __init__(self):
+        super().__init__()
+        self.gate_fn = nn.SiLU()
+    def forward(self, x: Tensor) -> Tensor:
+        x1, x2 = x.chunk(2, dim=-1)
+        return self.gate_fn(x1) * x2
+class Modulation(nn.Module):
+    def __init__(self, dim: int, double: bool, disable_bias: bool = False):
+        super().__init__()
+        self.is_double = double
+        self.multiplier = 6 if double else 3
+        self.lin = nn.Linear(dim, self.multiplier * dim, bias=not disable_bias)
+    def forward(self, vec: torch.Tensor):
+        out = self.lin(nn.functional.silu(vec))
+        if out.ndim == 2:
+            out = out[:, None, :]
+        out = out.chunk(self.multiplier, dim=-1)
+        return out[:3], out[3:] if self.is_double else None
+class LastLayer(nn.Module):
+    def __init__(
+        self,
+        hidden_size: int,
+        out_channels: int,
+    ):
+        super().__init__()
+        self.norm_final = nn.LayerNorm(hidden_size, elementwise_affine=False, eps=1e-6)
+        self.linear = nn.Linear(hidden_size, out_channels, bias=False)
+        self.adaLN_modulation = nn.Sequential(
+            nn.SiLU(), nn.Linear(hidden_size, 2 * hidden_size, bias=False)
+        )
+    def forward(self, x: torch.Tensor, vec: torch.Tensor) -> torch.Tensor:
+        mod = self.adaLN_modulation(vec)
+        shift, scale = mod.chunk(2, dim=-1)
+        if shift.ndim == 2:
+            shift = shift[:, None, :]
+            scale = scale[:, None, :]
+        x = (1 + scale) * self.norm_final(x) + shift
+        x = self.linear(x)
+        return x
+class SingleStreamBlock(nn.Module):
+    def __init__(
+        self,
+        hidden_size: int,
+        num_heads: int,
+        mlp_ratio: float = 4.0,
+    ):
+        super().__init__()
+        self.hidden_dim = hidden_size
+        self.num_heads = num_heads
+        head_dim = hidden_size // num_heads
+        self.scale = head_dim**-0.5
+        self.mlp_hidden_dim = int(hidden_size * mlp_ratio)
+        self.mlp_mult_factor = 2
+        self.linear1 = nn.Linear(
+            hidden_size,
+            hidden_size * 3 + self.mlp_hidden_dim * self.mlp_mult_factor,
+            bias=False,
+        )
+        self.linear2 = nn.Linear(
+            hidden_size + self.mlp_hidden_dim, hidden_size, bias=False
+        )
+        self.norm = QKNorm(head_dim)
+        self.hidden_size = hidden_size
+        self.pre_norm = nn.LayerNorm(hidden_size, elementwise_affine=False, eps=1e-6)
+        self.mlp_act = SiLUActivation()
+    def forward(
+        self,
+        x: Tensor,
+        pe: Tensor,
+        mod: tuple[Tensor, Tensor],
+    ) -> Tensor:
+        mod_shift, mod_scale, mod_gate = mod
+        x_mod = (1 + mod_scale) * self.pre_norm(x) + mod_shift
+        qkv, mlp = torch.split(
+            self.linear1(x_mod),
+            [3 * self.hidden_size, self.mlp_hidden_dim * self.mlp_mult_factor],
+            dim=-1,
+        )
+        q, k, v = rearrange(qkv, "B L (K H D) -> K B H L D", K=3, H=self.num_heads)
+        q, k = self.norm(q, k, v)
+        attn = attention(q, k, v, pe)
+        # compute activation in mlp stream, cat again and run second linear layer
+        output = self.linear2(torch.cat((attn, self.mlp_act(mlp)), 2))
+        return x + mod_gate * output
+class DoubleStreamBlock(nn.Module):
+    def __init__(
+        self,
+        hidden_size: int,
+        num_heads: int,
+        mlp_ratio: float,
+    ):
+        super().__init__()
+        mlp_hidden_dim = int(hidden_size * mlp_ratio)
+        self.num_heads = num_heads
+        assert hidden_size % num_heads == 0, (
+            f"{hidden_size=} must be divisible by {num_heads=}"
+        )
+        self.hidden_size = hidden_size
+        self.img_norm1 = nn.LayerNorm(hidden_size, elementwise_affine=False, eps=1e-6)
+        self.mlp_mult_factor = 2
+        self.img_attn = SelfAttention(
+            dim=hidden_size,
+            num_heads=num_heads,
+        )
+        self.img_norm2 = nn.LayerNorm(hidden_size, elementwise_affine=False, eps=1e-6)
+        self.img_mlp = nn.Sequential(
+            nn.Linear(hidden_size, mlp_hidden_dim * self.mlp_mult_factor, bias=False),
+            SiLUActivation(),
+            nn.Linear(mlp_hidden_dim, hidden_size, bias=False),
+        )
+        self.txt_norm1 = nn.LayerNorm(hidden_size, elementwise_affine=False, eps=1e-6)
+        self.txt_attn = SelfAttention(
+            dim=hidden_size,
+            num_heads=num_heads,
+        )
+        self.txt_norm2 = nn.LayerNorm(hidden_size, elementwise_affine=False, eps=1e-6)
+        self.txt_mlp = nn.Sequential(
+            nn.Linear(
+                hidden_size,
+                mlp_hidden_dim * self.mlp_mult_factor,
+                bias=False,
+            ),
+            SiLUActivation(),
+            nn.Linear(mlp_hidden_dim, hidden_size, bias=False),
+        )
+    def forward(
+        self,
+        img: Tensor,
+        txt: Tensor,
+        pe: Tensor,
+        pe_ctx: Tensor,
+        mod_img: tuple[Tensor, Tensor],
+        mod_txt: tuple[Tensor, Tensor],
+    ) -> tuple[Tensor, Tensor]:
+        """Run joint attention over text and image tokens, then per-stream MLPs.
+        Args:
+            img: Image-stream tokens.
+            txt: Text-stream tokens.
+            pe: Positional embedding for the image tokens.
+            pe_ctx: Positional embedding for the text tokens.
+            mod_img: Pair of modulation entries for the image stream (attention, MLP).
+            mod_txt: Pair of modulation entries for the text stream (attention, MLP).
+        Returns:
+            The updated (img, txt) token tensors.
+        """
+        img_mod1, img_mod2 = mod_img
+        txt_mod1, txt_mod2 = mod_txt
+        # NOTE(review): each modulation entry is unpacked into three values
+        # (shift, scale, gate) although the signature annotates it as a Tensor;
+        # confirm the actual container type against Modulation.
+        img_mod1_shift, img_mod1_scale, img_mod1_gate = img_mod1
+        img_mod2_shift, img_mod2_scale, img_mod2_gate = img_mod2
+        txt_mod1_shift, txt_mod1_scale, txt_mod1_gate = txt_mod1
+        txt_mod2_shift, txt_mod2_scale, txt_mod2_gate = txt_mod2
+        # prepare image for attention
+        img_modulated = self.img_norm1(img)
+        img_modulated = (1 + img_mod1_scale) * img_modulated + img_mod1_shift
+        img_qkv = self.img_attn.qkv(img_modulated)
+        # split the fused projection into q, k, v, each laid out as (B, H, L, D)
+        img_q, img_k, img_v = rearrange(
+            img_qkv, "B L (K H D) -> K B H L D", K=3, H=self.num_heads
+        )
+        img_q, img_k = self.img_attn.norm(img_q, img_k, img_v)
+        # prepare txt for attention
+        txt_modulated = self.txt_norm1(txt)
+        txt_modulated = (1 + txt_mod1_scale) * txt_modulated + txt_mod1_shift
+        txt_qkv = self.txt_attn.qkv(txt_modulated)
+        txt_q, txt_k, txt_v = rearrange(
+            txt_qkv, "B L (K H D) -> K B H L D", K=3, H=self.num_heads
+        )
+        txt_q, txt_k = self.txt_attn.norm(txt_q, txt_k, txt_v)
+        # joint attention: concatenate along the sequence axis (dim 2), text first
+        q = torch.cat((txt_q, img_q), dim=2)
+        k = torch.cat((txt_k, img_k), dim=2)
+        v = torch.cat((txt_v, img_v), dim=2)
+        # positional embeddings are concatenated in the same text-then-image order
+        pe = torch.cat((pe_ctx, pe), dim=2)
+        attn = attention(q, k, v, pe)
+        # split the result back into text and image parts by text length
+        # (assumes attention() returns the sequence on dim 1 -- TODO confirm)
+        txt_attn, img_attn = (
+            attn[:, : txt_q.shape[2]],
+            attn[:, txt_q.shape[2] :],
+        )
+        # calculate the img blocks
+        img = img + img_mod1_gate * self.img_attn.proj(img_attn)
+        img = img + img_mod2_gate * self.img_mlp(
+            (1 + img_mod2_scale) * (self.img_norm2(img)) + img_mod2_shift
+        )
+        # calculate the txt blocks
+        txt = txt + txt_mod1_gate * self.txt_attn.proj(txt_attn)
+        txt = txt + txt_mod2_gate * self.txt_mlp(
+            (1 + txt_mod2_scale) * (self.txt_norm2(txt)) + txt_mod2_shift
+        )
+        return img, txt
+class MLPEmbedder(nn.Module):
+    def __init__(self, in_dim: int, hidden_dim: int, disable_bias: bool = False):
+        super().__init__()
+        self.in_layer = nn.Linear(in_dim, hidden_dim, bias=not disable_bias)
+        self.silu = nn.SiLU()
+        self.out_layer = nn.Linear(hidden_dim, hidden_dim, bias=not disable_bias)
+    def forward(self, x: Tensor) -> Tensor:
+        return self.out_layer(self.silu(self.in_layer(x)))
+class EmbedND(nn.Module):
+    def __init__(self, dim: int, theta: int, axes_dim: list[int]):
+        super().__init__()
+        self.dim = dim
+        self.theta = theta
+        self.axes_dim = axes_dim
+    def forward(self, ids: Tensor) -> Tensor:
+        emb = torch.cat(
+            [
+                rope(ids[..., i], self.axes_dim[i], self.theta)
+                for i in range(len(self.axes_dim))
+            ],
+            dim=-3,
+        )
+        return emb.unsqueeze(1)
+class RMSNorm(torch.nn.Module):
+    def __init__(self, dim: int):
+        super().__init__()
+        self.scale = nn.Parameter(torch.ones(dim))
+    def forward(self, x: Tensor):
+        x_dtype = x.dtype
+        x = x.float()
+        rrms = torch.rsqrt(torch.mean(x**2, dim=-1, keepdim=True) + 1e-6)
+        return (x * rrms).to(dtype=x_dtype) * self.scale
+class QKNorm(torch.nn.Module):
+    def __init__(self, dim: int):
+        super().__init__()
+        self.query_norm = RMSNorm(dim)
+        self.key_norm = RMSNorm(dim)
+    def forward(self, q: Tensor, k: Tensor, v: Tensor) -> tuple[Tensor, Tensor]:
+        q = self.query_norm(q)
+        k = self.key_norm(k)
+        return q.to(v), k.to(v)

nisaba_relief/flux/model.py ADDED Viewed

	@@ -0,0 +1,147 @@

+from dataclasses import dataclass, field
+import torch
+from torch import Tensor, nn
+from .layers import (
+    DoubleStreamBlock,
+    EmbedND,
+    LastLayer,
+    MLPEmbedder,
+    Modulation,
+    SingleStreamBlock,
+    timestep_embedding,
+)
+@dataclass
+class Klein4BParams:
+    """Architecture hyper-parameters consumed by Flux2.__init__."""
+    # width of the image tokens fed to img_in; also used as the output width
+    in_channels: int = 128
+    # width of the context tokens fed to txt_in
+    context_in_dim: int = 7680
+    # token width inside the transformer; must be divisible by num_heads
+    hidden_size: int = 3072
+    # number of attention heads
+    num_heads: int = 24
+    # number of DoubleStreamBlock layers
+    depth: int = 5
+    # number of SingleStreamBlock layers
+    depth_single_blocks: int = 20
+    # per-axis positional sizes; their sum must equal hidden_size // num_heads
+    axes_dim: list[int] = field(default_factory=lambda: [32, 32, 32, 32])
+    # theta value handed to the EmbedND positional embedder
+    theta: int = 2000
+    # MLP ratio handed to every block
+    mlp_ratio: float = 3.0
+class Flux2(nn.Module):
+    def __init__(self, params: Klein4BParams = Klein4BParams()):
+        super().__init__()
+        self.in_channels = params.in_channels
+        self.out_channels = params.in_channels
+        if params.hidden_size % params.num_heads != 0:
+            raise ValueError(
+                f"Hidden size {params.hidden_size} must be divisible by num_heads {params.num_heads}"
+            )
+        pe_dim = params.hidden_size // params.num_heads
+        if sum(params.axes_dim) != pe_dim:
+            raise ValueError(
+                f"Got {params.axes_dim} but expected positional dim {pe_dim}"
+            )
+        self.hidden_size = params.hidden_size
+        self.num_heads = params.num_heads
+        self.pe_embedder = EmbedND(
+            dim=pe_dim, theta=params.theta, axes_dim=params.axes_dim
+        )
+        self.img_in = nn.Linear(self.in_channels, self.hidden_size, bias=False)
+        self.time_in = MLPEmbedder(
+            in_dim=256, hidden_dim=self.hidden_size, disable_bias=True
+        )
+        self.txt_in = nn.Linear(params.context_in_dim, self.hidden_size, bias=False)
+        self.double_blocks = nn.ModuleList(
+            [
+                DoubleStreamBlock(
+                    self.hidden_size,
+                    self.num_heads,
+                    mlp_ratio=params.mlp_ratio,
+                )
+                for _ in range(params.depth)
+            ]
+        )
+        self.single_blocks = nn.ModuleList(
+            [
+                SingleStreamBlock(
+                    self.hidden_size,
+                    self.num_heads,
+                    mlp_ratio=params.mlp_ratio,
+                )
+                for _ in range(params.depth_single_blocks)
+            ]
+        )
+        self.double_stream_modulation_img = Modulation(
+            self.hidden_size,
+            double=True,
+            disable_bias=True,
+        )
+        self.double_stream_modulation_txt = Modulation(
+            self.hidden_size,
+            double=True,
+            disable_bias=True,
+        )
+        self.single_stream_modulation = Modulation(
+            self.hidden_size, double=False, disable_bias=True
+        )
+        self.final_layer = LastLayer(
+            self.hidden_size,
+            self.out_channels,
+        )
+    def forward(
+        self,
+        x: Tensor,
+        x_ids: Tensor,
+        timesteps: Tensor,
+        ctx: Tensor,
+        ctx_ids: Tensor,
+        pe_x: Tensor | None = None,
+        pe_ctx: Tensor | None = None,
+    ) -> Tensor:
+        num_txt_tokens = ctx.shape[1]
+        timestep_emb = timestep_embedding(timesteps, 256)
+        vec = self.time_in(timestep_emb)
+        double_block_mod_img = self.double_stream_modulation_img(vec)
+        double_block_mod_txt = self.double_stream_modulation_txt(vec)
+        single_block_mod, _ = self.single_stream_modulation(vec)
+        img = self.img_in(x)
+        txt = self.txt_in(ctx)
+        if pe_x is None:
+            pe_x = self.pe_embedder(x_ids)
+        if pe_ctx is None:
+            pe_ctx = self.pe_embedder(ctx_ids)
+        for block in self.double_blocks:
+            img, txt = block(
+                img,
+                txt,
+                pe_x,
+                pe_ctx,
+                double_block_mod_img,
+                double_block_mod_txt,
+            )
+        img = torch.cat((txt, img), dim=1)
+        pe = torch.cat((pe_ctx, pe_x), dim=2)
+        for block in self.single_blocks:
+            img = block(
+                img,
+                pe,
+                single_block_mod,
+            )
+        img = img[:, num_txt_tokens:, ...]
+        img = self.final_layer(img, vec)
+        return img

nisaba_relief/flux/sampling.py ADDED Viewed

	@@ -0,0 +1,92 @@

+import math
+import torch
+from einops import rearrange
+from torch import Tensor
+from .model import Flux2
+def prc_img_batch(x: Tensor) -> tuple[Tensor, Tensor]:
+    b, _, h, w = x.shape
+    x_ids = torch.cartesian_prod(
+        torch.arange(1),
+        torch.arange(h),
+        torch.arange(w),
+        torch.arange(1),
+    )
+    x_ids = x_ids.unsqueeze(0).expand(b, -1, -1)
+    x = rearrange(x, "b c h w -> b (h w) c")
+    return x, x_ids.to(x.device)
+def generalized_time_snr_shift(t: Tensor, mu: float, sigma: float) -> Tensor:
+    return math.exp(mu) / (math.exp(mu) + (1 / t - 1) ** sigma)
+def compute_empirical_mu(image_seq_len: int, num_steps: int) -> float:
+    a1, b1 = 8.73809524e-05, 1.89833333
+    a2, b2 = 0.00016927, 0.45666666
+    if image_seq_len > 4300:
+        mu = a2 * image_seq_len + b2
+        return float(mu)
+    m_200 = a2 * image_seq_len + b2
+    m_10 = a1 * image_seq_len + b1
+    a = (m_200 - m_10) / 190.0
+    b = m_200 - 200.0 * a
+    mu = a * num_steps + b
+    return float(mu)
+def get_schedule(num_steps: int, image_seq_len: int) -> list[float]:
+    mu = compute_empirical_mu(image_seq_len, num_steps)
+    timesteps = torch.linspace(1, 0, num_steps + 1)
+    timesteps = generalized_time_snr_shift(timesteps, mu, 1.0)
+    return timesteps.tolist()
+def denoise(
+    model: Flux2,
+    img: Tensor,
+    img_ids: Tensor,
+    txt: Tensor,
+    txt_ids: Tensor,
+    timesteps: list[float],
+    img_cond_seq: Tensor | None = None,
+    img_cond_seq_ids: Tensor | None = None,
+) -> Tensor:
+    if img_cond_seq is not None:
+        assert img_cond_seq_ids is not None, (
+            "You need to provide either both or neither of the sequence conditioning"
+        )
+        combined_ids = torch.cat((img_ids, img_cond_seq_ids), dim=1)
+    else:
+        combined_ids = img_ids
+    # Pre-compute positional embeddings once (constant across all timesteps)
+    pe_x = model.pe_embedder(combined_ids)
+    pe_ctx = model.pe_embedder(txt_ids)
+    for t_curr, t_prev in zip(timesteps[:-1], timesteps[1:]):
+        t_vec = torch.full((img.shape[0],), t_curr, dtype=img.dtype, device=img.device)
+        img_input = img
+        if img_cond_seq is not None:
+            img_input = torch.cat((img_input, img_cond_seq), dim=1)
+        pred = model(
+            x=img_input,
+            x_ids=combined_ids,
+            timesteps=t_vec,
+            ctx=txt,
+            ctx_ids=txt_ids,
+            pe_x=pe_x,
+            pe_ctx=pe_ctx,
+        )
+        pred = pred[:, : img.shape[1]]
+        img = img + (t_prev - t_curr) * pred
+    return img

nisaba_relief/image_utils.py ADDED Viewed

	@@ -0,0 +1,153 @@

+"""Pure image and tensor helper functions for NisabaRelief."""
+import math
+import numpy as np
+import torch
+from PIL import Image
+from torchvision import transforms
+from .constants import (
+    MAX_TILE,
+    MIN_TILE,
+    PATCH_SIZE,
+    TARGET_TILES_PER_SIDE,
+    TILE_OVERLAP_DIVISOR,
+)
+# Shared torchvision ToTensor transform, reused by image_to_tensor below.
+_to_tensor = transforms.ToTensor()
+def round_to_patch(value: float) -> int:
+    """Round a pixel value to the nearest multiple of PATCH_SIZE (minimum PATCH_SIZE)."""
+    return max(PATCH_SIZE, PATCH_SIZE * round(value / PATCH_SIZE))
+def ceil_to_patch(value: float) -> int:
+    """Ceil a pixel value to the next multiple of PATCH_SIZE (minimum PATCH_SIZE)."""
+    return max(PATCH_SIZE, PATCH_SIZE * math.ceil(value / PATCH_SIZE))
+def compute_tile_size(max_side: int) -> int:
+    """Compute the optimal square tile side length for a given image maximum side."""
+    raw = ceil_to_patch(
+        max_side
+        * TILE_OVERLAP_DIVISOR
+        / (TARGET_TILES_PER_SIDE * (TILE_OVERLAP_DIVISOR - 1) + 1)
+    )
+    return max(min(raw, MAX_TILE), MIN_TILE)
+def compute_tile_grid(
+    orig_w: int, orig_h: int, tile_size: int
+) -> tuple[int, int, int, int, int, int, int, int]:
+    """Compute tiled grid layout for an image.
+    Returns (n_cols, n_rows, padded_w, padded_h, pad_left, pad_top, overlap, stride).
+    """
+    overlap = tile_size // TILE_OVERLAP_DIVISOR
+    stride = tile_size - overlap
+    n_cols = max(1, math.ceil((orig_w - overlap) / stride))
+    n_rows = max(1, math.ceil((orig_h - overlap) / stride))
+    padded_w = tile_size + (n_cols - 1) * stride
+    padded_h = tile_size + (n_rows - 1) * stride
+    pad_left = (padded_w - orig_w) // 2
+    pad_top = (padded_h - orig_h) // 2
+    return n_cols, n_rows, padded_w, padded_h, pad_left, pad_top, overlap, stride
+def image_to_tensor(image: Image.Image, device: str) -> torch.Tensor:
+    """Convert a PIL image to a normalised [-1, 1] float tensor on device."""
+    return (2 * _to_tensor(image) - 1).to(device)
+def tensor_to_image(tensor: torch.Tensor) -> Image.Image:
+    """Convert a normalised [-1, 1] CHW tensor to a PIL RGB image."""
+    img = (tensor.clamp(-1, 1) + 1) / 2
+    img = img.permute(1, 2, 0).float().cpu().numpy()
+    return Image.fromarray((img * 255).astype("uint8"))
+def pad_to_patch_multiple(image: Image.Image) -> Image.Image:
+    """Pad image width and height up to the next multiple of PATCH_SIZE."""
+    w, h = image.size
+    pad_w = (PATCH_SIZE - w % PATCH_SIZE) % PATCH_SIZE
+    pad_h = (PATCH_SIZE - h % PATCH_SIZE) % PATCH_SIZE
+    if pad_w == 0 and pad_h == 0:
+        return image
+    padded = Image.new("RGB", (w + pad_w, h + pad_h), (0, 0, 0))
+    padded.paste(image, (0, 0))
+    return padded
+def postprocess(image: Image.Image, shadow_strength: float = 0.7) -> Image.Image:
+    """Apply adaptive gamma correction and convert to grayscale."""
+    arr = np.array(image, dtype=np.float32) / 255.0
+    gamma = 1.0 + shadow_strength * (1.0 - arr)
+    arr = np.power(arr, gamma)
+    return Image.fromarray((arr * 255).clip(0, 255).astype(np.uint8)).convert("L")
+def draw_tile_indicator(
+    tensor: torch.Tensor,
+    full_w: int,
+    full_h: int,
+    tile_x: int,
+    tile_y: int,
+    tile_w: int,
+    tile_h: int,
+    line_width: int = 1,
+) -> torch.Tensor:
+    """Draw a red rectangle on a CHW tensor to mark the current tile position."""
+    C, H, W = tensor.shape
+    result = tensor.clone()
+    scale_x = W / full_w
+    scale_y = H / full_h
+    x1 = max(0, min(int(tile_x * scale_x), W - 1))
+    y1 = max(0, min(int(tile_y * scale_y), H - 1))
+    x2 = max(0, min(int((tile_x + tile_w) * scale_x), W))
+    y2 = max(0, min(int((tile_y + tile_h) * scale_y), H))
+    red = torch.tensor([1.0, -1.0, -1.0], device=tensor.device, dtype=tensor.dtype)
+    for dy in range(line_width):
+        if y1 + dy < H:
+            result[:, y1 + dy, x1:x2] = red.view(3, 1)
+        if 0 <= y2 - 1 - dy < H:
+            result[:, y2 - 1 - dy, x1:x2] = red.view(3, 1)
+    for dx in range(line_width):
+        if x1 + dx < W:
+            result[:, y1:y2, x1 + dx] = red.view(3, 1)
+        if 0 <= x2 - 1 - dx < W:
+            result[:, y1:y2, x2 - 1 - dx] = red.view(3, 1)
+    return result
+def create_blend_weights(
+    tile_size: int,
+    overlap: int,
+    is_top: bool = False,
+    is_bottom: bool = False,
+    is_left: bool = False,
+    is_right: bool = False,
+    device: str = "cpu",
+) -> torch.Tensor:
+    """Create cosine blend weights for a tile, ramping down at non-edge overlaps."""
+    weights = torch.ones(tile_size, tile_size, device=device)
+    if overlap > 0:
+        ramp = 0.5 * (1 - torch.cos(torch.linspace(0, torch.pi, overlap, device=device)))
+        if not is_top:
+            weights[:overlap, :] *= ramp.view(-1, 1)
+        if not is_bottom:
+            weights[-overlap:, :] *= ramp.flip(0).view(-1, 1)
+        if not is_left:
+            weights[:, :overlap] *= ramp.view(1, -1)
+        if not is_right:
+            weights[:, -overlap:] *= ramp.flip(0).view(1, -1)
+    return weights

nisaba_relief/model.py ADDED Viewed

	@@ -0,0 +1,474 @@

+"""
+NisabaRelief inference model.
+Transforms cuneiform tablet images into MSII visualizations.
+"""
+import contextlib
+import logging
+from os import PathLike
+from pathlib import Path
+import gc
+import torch
+from einops import rearrange
+from PIL import Image
+from tqdm.auto import tqdm
+from safetensors.torch import load_file
+from .constants import (
+    COND_SEQ_ID,
+    DECODE_BATCH_SIZE_DIVISOR,
+    GLOBAL_CTX_ID,
+    LATENT_CHANNELS,
+    MAX_ASPECT_RATIO,
+    MAX_GLOBAL_CONTEXT_SIZE,
+    MIN_IMAGE_DIMENSION,
+    VRAM_FIXED_OVERHEAD_MB,
+    VRAM_HEADROOM_MB,
+    VRAM_MB_PER_PIXEL,
+    MIN_BATCH_SIZE,
+    MAX_BATCH_SIZE,
+)
+from .image_utils import (
+    _to_tensor,
+    compute_tile_grid,
+    compute_tile_size,
+    create_blend_weights,
+    draw_tile_indicator,
+    image_to_tensor,
+    pad_to_patch_multiple,
+    postprocess,
+    round_to_patch,
+    tensor_to_image,
+)
+from .weights import WEIGHT_FILES, download_weights
+from .flux.autoencoder import AutoEncoder
+from .flux.model import Flux2
+from .flux.sampling import (
+    denoise,
+    get_schedule,
+    prc_img_batch,
+)
+logger = logging.getLogger(__name__)
+class NisabaRelief:
+    """Transform cuneiform tablet images into MSII relief visualizations.
+    Args:
+        device: Device to run inference on (default "cuda" if available).
+        num_steps: Number of denoising steps (default 2).
+        weights_dir: Optional local weights directory. If None, uses HuggingFace Hub (boatbomber/NisabaRelief).
+        batch_size: Batch size for processing tiles during inference.
+            None (default) = auto-select based on available GPU memory each call.
+            Set an explicit int to override.
+        seed: Optional random seed for reproducible noise generation (default None).
+        compile: Whether to use torch.compile for faster repeated inference (default True).
+            Requires Triton. Set to False if Triton is not installed or for one-off runs.
+    """
+    def __init__(
+        self,
+        device: str = "cuda" if torch.cuda.is_available() else "cpu",
+        num_steps: int = 2,
+        weights_dir: PathLike | None = None,
+        batch_size: int | None = None,
+        seed: int | None = None,
+        compile: bool = True,
+    ):
+        """Resolve the weight files and load the autoencoder, transformer and prompt embedding.
+        Raises:
+            ValueError: If batch_size is given and is smaller than 1.
+            FileNotFoundError: If weights_dir is not a directory or lacks a
+                file listed in WEIGHT_FILES.
+        """
+        if batch_size is not None and batch_size < 1:
+            raise ValueError(f"batch_size must be >= 1 or None, got {batch_size}")
+        self.num_steps = num_steps
+        # self.device must be set before self.dtype / self.device_type are read below
+        self.device = device
+        self.batch_size = batch_size
+        self.seed = seed
+        if weights_dir is not None:
+            # local directory: every expected file must already be present
+            weights_dir = Path(weights_dir)
+            if not weights_dir.is_dir():
+                raise FileNotFoundError(f"weights_dir does not exist: {weights_dir}")
+            missing = [f for f in WEIGHT_FILES if not (weights_dir / f).exists()]
+            if missing:
+                raise FileNotFoundError(
+                    f"Missing weight files in {weights_dir}: {missing}"
+                )
+            weight_paths = {f: str(weights_dir / f) for f in WEIGHT_FILES}
+        else:
+            logger.info("Downloading weights from HuggingFace Hub...")
+            weight_paths = download_weights()
+        # Load AutoEncoder
+        logger.debug("Loading AutoEncoder...")
+        # Build the module on the meta device, then attach the checkpoint
+        # tensors directly via load_state_dict(assign=True).
+        with torch.device("meta"):
+            self.ae = AutoEncoder()
+        ae_weights = load_file(weight_paths["ae.safetensors"], device=device)
+        self.ae.load_state_dict(ae_weights, strict=True, assign=True)
+        # only the decoder is cast to the inference dtype
+        self.ae.decoder = self.ae.decoder.to(self.dtype)
+        self.ae.eval()
+        # Load finetuned FLUX.2 model (merged weights)
+        logger.debug("Loading Transformer...")
+        with torch.device("meta"):
+            self.model = Flux2().to(self.dtype)
+        model_weights = load_file(weight_paths["model.safetensors"], device=device)
+        self.model.load_state_dict(model_weights, strict=True, assign=True)
+        self.model = self.model.to(device=device, dtype=self.dtype).eval()
+        # Load pre-computed text embedding
+        logger.debug("Loading text embedding...")
+        text_data = load_file(weight_paths["prompt_embedding.safetensors"], device=device)
+        self.prompt_embedding = text_data["prompt_embedding"].to(self.dtype)
+        self.ctx_ids = text_data["ctx_ids"]
+        # compilation is attempted on CUDA only; a failure is logged and
+        # whatever was assigned before the error stays in place
+        if compile and self.device_type == "cuda":
+            try:
+                self.model = torch.compile(self.model)
+                self.ae = torch.compile(self.ae)
+                logger.debug(
+                    "Model compile mode enabled. First run will be slow, but subsequent runs will be faster."
+                )
+            except Exception as e:
+                logger.error("Error compiling model: %s", e, exc_info=True)
+                logger.warning("Falling back to non-compiled model")
+        logger.info("NisabaRelief model loaded and ready")
+    @property
+    def device_type(self) -> str:
+        return self.device.split(":")[0]
+    @property
+    def dtype(self) -> torch.dtype:
+        if self.device_type == "cuda":
+            return torch.bfloat16
+        return torch.float32
+    def _pick_batch_size(self, tile_size: int) -> int:
+        """Estimate the largest safe batch size for a given tile size."""
+        if self.device_type != "cuda":
+            return MIN_BATCH_SIZE
+        gc.collect()
+        torch.cuda.empty_cache()
+        try:
+            device_idx = torch.device(self.device).index or 0
+            free_vram_mb = (
+                torch.cuda.get_device_properties(device_idx).total_memory
+                - torch.cuda.memory_allocated(device_idx)
+            ) / (1024**2)
+            available = free_vram_mb - VRAM_HEADROOM_MB
+            per_tile = VRAM_MB_PER_PIXEL * tile_size**2 + VRAM_FIXED_OVERHEAD_MB
+            batch = max(MIN_BATCH_SIZE, min(MAX_BATCH_SIZE, int(available / per_tile)))
+            logger.debug(
+                "Auto batch_size=%d (tile=%d, free=%.0f MB, per_tile=%.0f MB)",
+                batch,
+                tile_size,
+                free_vram_mb,
+                per_tile,
+            )
+            return batch
+        except Exception as e:
+            logger.error("Error picking batch size: %s", e, exc_info=True)
+            return MIN_BATCH_SIZE
+    def __repr__(self) -> str:
+        return (
+            f"NisabaRelief(device={self.device!r}, num_steps={self.num_steps}, "
+            f"batch_size={self.batch_size}, seed={self.seed!r})"
+        )
+    def process(
+        self,
+        image: PathLike | Image.Image,
+        show_pbar: bool | None = None,
+    ) -> Image.Image:
+        """Transform a cuneiform tablet image into MSII visualization.
+        Args:
+            image: Input image (path or PIL Image).
+            show_pbar: Whether to show a progress bar during tiled inference.
+                If None (default), shows the bar only when there are at least 2 batches to run.
+        Returns:
+            PIL Image (grayscale) with MSII visualization.
+        """
+        if isinstance(image, (str, PathLike)):
+            image = Image.open(image)
+        if image.mode != "RGB":
+            image = image.convert("RGB")
+        w, h = image.size
+        max_side = max(w, h)
+        min_side = min(w, h)
+        if min_side < MIN_IMAGE_DIMENSION:
+            raise ValueError(
+                f"Image too small: {min_side}px minimum side (need >= {MIN_IMAGE_DIMENSION}px)"
+            )
+        if max_side / min_side > MAX_ASPECT_RATIO:
+            raise ValueError(
+                f"Aspect ratio too extreme: {max_side / min_side:.1f}:1 (max {MAX_ASPECT_RATIO:.0f}:1)"
+            )
+        tile_size = compute_tile_size(max_side)
+        output_image = self._process_tiled(
+            image, tile_size=tile_size, show_pbar=show_pbar
+        )
+        return postprocess(output_image)
+    def _prepare_global_context_tensor(
+        self,
+        image: Image.Image,
+        max_size: int = MAX_GLOBAL_CONTEXT_SIZE,
+    ) -> torch.Tensor:
+        w, h = image.size
+        scale = min(max_size / w, max_size / h)
+        new_w = round_to_patch(w * scale)
+        new_h = round_to_patch(h * scale)
+        resized = image.resize((new_w, new_h), Image.Resampling.LANCZOS)
+        return image_to_tensor(resized, self.device)
+    def _encode_global_context_batch(
+        self,
+        img_tensors: list[torch.Tensor],
+    ) -> tuple[torch.Tensor, torch.Tensor]:
+        batch = torch.stack(img_tensors)
+        with torch.inference_mode():
+            global_latent = self.ae.encode(batch)
+            global_tokens, global_ids = prc_img_batch(global_latent)
+            global_ids[..., 0] = GLOBAL_CTX_ID
+        return global_tokens.to(self.dtype), global_ids
+    def _process_tile_batch(
+        self,
+        tiles: list[Image.Image],
+        global_ctx_tokens: torch.Tensor,
+        global_ctx_ids: torch.Tensor,
+        tile_index_offset: int = 0,
+    ) -> list[Image.Image]:
+        """Encode, denoise and decode one batch of tiles.
+        Args:
+            tiles: Tile images; they are stacked, so they must share one size.
+            global_ctx_tokens: Global-context tokens, one row per tile.
+            global_ctx_ids: Position ids matching global_ctx_tokens.
+            tile_index_offset: Index of the first tile of this batch; combined
+                with self.seed to derive one noise seed per tile.
+        Returns:
+            One output image per input tile, cropped to the tile's original size.
+        """
+        b = len(tiles)
+        original_sizes = [tile.size for tile in tiles]
+        padded_tiles = [pad_to_patch_multiple(tile) for tile in tiles]
+        img_tensors = torch.stack(
+            [image_to_tensor(tile, self.device) for tile in padded_tiles]
+        )
+        with torch.inference_mode():
+            input_latent = self.ae.encode(img_tensors)
+            input_tokens, input_ids = prc_img_batch(input_latent)
+            # conditioning copy of the tile ids, tagged in the first id column
+            input_ids_cond = input_ids.clone()
+            input_ids_cond[..., 0] = COND_SEQ_ID
+            # conditioning sequence: tile tokens followed by global-context tokens
+            cond_tokens = torch.cat([input_tokens, global_ctx_tokens], dim=1)
+            cond_ids = torch.cat([input_ids_cond, global_ctx_ids], dim=1)
+            latent_h = input_latent.shape[2]
+            latent_w = input_latent.shape[3]
+            if self.seed is None:
+                noise = torch.randn(
+                    b,
+                    LATENT_CHANNELS,
+                    latent_h,
+                    latent_w,
+                    device=self.device,
+                    dtype=self.dtype,
+                )
+            else:
+                # one generator per tile, seeded from the tile's global index,
+                # so a tile's noise does not depend on how tiles are batched
+                noise_list = []
+                for i in range(b):
+                    tile_seed = self.seed ^ (tile_index_offset + i)
+                    generator = torch.Generator(device=self.device).manual_seed(tile_seed)
+                    noise_list.append(
+                        torch.randn(
+                            LATENT_CHANNELS,
+                            latent_h,
+                            latent_w,
+                            device=self.device,
+                            dtype=self.dtype,
+                            generator=generator,
+                        )
+                    )
+                noise = torch.stack(noise_list)
+            noise_tokens, _ = prc_img_batch(noise)
+            # the noise tokens reuse the untagged tile position ids
+            noise_ids = input_ids
+            seq_len = noise_tokens.shape[1]
+            timesteps = get_schedule(self.num_steps, seq_len)
+            # broadcast the single prompt embedding and its ids over the batch
+            ctx = self.prompt_embedding.unsqueeze(0).expand(b, -1, -1)
+            ctx_ids = self.ctx_ids.unsqueeze(0).expand(b, -1, -1)
+            # autocast is only enabled on CUDA
+            autocast_ctx = (
+                torch.autocast(device_type=self.device_type, dtype=self.dtype)
+                if self.device_type == "cuda"
+                else contextlib.nullcontext()
+            )
+            with autocast_ctx:
+                output_tokens = denoise(
+                    model=self.model,
+                    img=noise_tokens.to(self.dtype),
+                    img_ids=noise_ids,
+                    txt=ctx.to(self.dtype),
+                    txt_ids=ctx_ids,
+                    timesteps=timesteps,
+                    img_cond_seq=cond_tokens.to(self.dtype),
+                    img_cond_seq_ids=cond_ids,
+                )
+            # back from token sequence to a (b, c, h, w) latent grid
+            output_latent = rearrange(
+                output_tokens,
+                "b (h w) c -> b c h w",
+                h=latent_h,
+                w=latent_w,
+            )
+            # Free tensors from encode/denoise phases before AE decode to
+            # avoid CUDA memory fragmentation (the decoder needs large
+            # full-resolution float32 allocations that differ in shape from
+            # the transformer's cached blocks).
+            del img_tensors, input_latent, input_tokens, input_ids
+            del input_ids_cond, cond_tokens, cond_ids
+            del noise, noise_tokens, noise_ids
+            del output_tokens, ctx, ctx_ids
+            if self.device_type == "cuda":
+                torch.cuda.empty_cache()
+            # The AE decoder operates at full pixel resolution in float32,
+            # requiring much more VRAM per tile than the latent-space denoiser.
+            # Sub-batch to avoid overflowing into shared memory.
+            decode_bs = max(1, b // DECODE_BATCH_SIZE_DIVISOR)
+            if decode_bs >= b:
+                output_imgs = self.ae.decode(output_latent)
+            else:
+                chunks = []
+                for i in range(0, b, decode_bs):
+                    chunks.append(self.ae.decode(output_latent[i : i + decode_bs]))
+                    if self.device_type == "cuda":
+                        torch.cuda.empty_cache()
+                output_imgs = torch.cat(chunks, dim=0)
+        results = []
+        for i, (orig_w, orig_h) in enumerate(original_sizes):
+            result = tensor_to_image(output_imgs[i])
+            # undo the right/bottom padding added by pad_to_patch_multiple
+            if padded_tiles[i].size != (orig_w, orig_h):
+                result = result.crop((0, 0, orig_w, orig_h))
+            results.append(result)
+        return results
+    def _process_tiled(
+        self, image: Image.Image, tile_size: int, show_pbar: bool | None = None
+    ) -> Image.Image:
+        orig_w, orig_h = image.size
+        n_cols, n_rows, w, h, pad_left, pad_top, overlap, stride = compute_tile_grid(
+            orig_w, orig_h, tile_size
+        )
+        # Pad canvas so tiles land at exact stride positions with uniform overlap.
+        # Center the image so padding is distributed evenly on all sides.
+        padded = Image.new("RGB", (w, h), (0, 0, 0))
+        padded.paste(image, (pad_left, pad_top))
+        image = padded
+        global_base_tensor = self._prepare_global_context_tensor(image)
+        output = torch.zeros(3, h, w, device=self.device)
+        weights = torch.zeros(1, h, w, device=self.device)
+        tile_specs = [
+            (row, col, col * stride, row * stride)
+            for row in range(n_rows)
+            for col in range(n_cols)
+        ]
+        blend_cache: dict[tuple[bool, bool, bool, bool], torch.Tensor] = {}
+        batch_size = (
+            self.batch_size
+            if self.batch_size is not None
+            else self._pick_batch_size(tile_size)
+        )
+        if show_pbar is None:
+            show_pbar = len(tile_specs) >= 2 * batch_size
+        pbar = tqdm(
+            total=len(tile_specs),
+            desc=f"Processing {orig_w}x{orig_h} px image with {n_cols}x{n_rows} tiles ({tile_size} px each, {overlap} px overlap)",
+            unit="tile",
+            leave=False,
+            disable=not show_pbar,
+        )
+        for batch_start in range(0, len(tile_specs), batch_size):
+            batch_specs = tile_specs[batch_start : batch_start + batch_size]
+            pbar.clear()
+            logger.debug(
+                "Processing %d batched tiles: %s",
+                len(batch_specs),
+                " + ".join([f"({row},{col})" for row, col, _, _ in batch_specs]),
+            )
+            pbar.refresh()
+            ctx_tensors = [
+                draw_tile_indicator(global_base_tensor, w, h, x, y, tile_size, tile_size)
+                for (row, col, x, y) in batch_specs
+            ]
+            global_tokens, global_ids = self._encode_global_context_batch(ctx_tensors)
+            tiles = [
+                image.crop((x, y, x + tile_size, y + tile_size))
+                for (row, col, x, y) in batch_specs
+            ]
+            result_tiles = self._process_tile_batch(
+                tiles, global_tokens, global_ids, tile_index_offset=batch_start
+            )
+            for i, (row, col, x, y) in enumerate(batch_specs):
+                edge_key = (
+                    row == 0,
+                    row == n_rows - 1,
+                    col == 0,
+                    col == n_cols - 1,
+                )
+                if edge_key not in blend_cache:
+                    blend_cache[edge_key] = create_blend_weights(
+                        tile_size,
+                        overlap,
+                        is_top=edge_key[0],
+                        is_bottom=edge_key[1],
+                        is_left=edge_key[2],
+                        is_right=edge_key[3],
+                        device=self.device,
+                    )
+                blend = blend_cache[edge_key]
+                result_tensor = _to_tensor(result_tiles[i]).to(self.device)
+                output[:, y : y + tile_size, x : x + tile_size] += result_tensor * blend
+                weights[:, y : y + tile_size, x : x + tile_size] += blend
+            if self.device_type == "cuda":
+                torch.cuda.empty_cache()
+            pbar.update(len(batch_specs))
+        pbar.close()
+        output = output / weights.clamp(min=1e-6)
+        output = output.permute(1, 2, 0).cpu().numpy()
+        output = (output * 255).clip(0, 255).astype("uint8")
+        return Image.fromarray(output).crop(
+            (pad_left, pad_top, pad_left + orig_w, pad_top + orig_h)
+        )

nisaba_relief/py.typed ADDED Viewed

File without changes

nisaba_relief/weights.py ADDED Viewed

	@@ -0,0 +1,23 @@

+"""HuggingFace Hub weight downloading for NisabaRelief."""
+from huggingface_hub import hf_hub_download
+HF_REPO_ID = "boatbomber/NisabaRelief"
+WEIGHT_FILES = [
+    "ae.safetensors",
+    "model.safetensors",
+    "prompt_embedding.safetensors",
+]
+def download_weights(repo_id: str = HF_REPO_ID) -> dict[str, str]:
+    """Download all weight files from HF Hub, returning {filename: local_path}."""
+    paths = {}
+    for filename in WEIGHT_FILES:
+        try:
+            paths[filename] = hf_hub_download(repo_id=repo_id, filename=filename)
+        except Exception as e:
+            raise RuntimeError(
+                f"Failed to download {filename} from {repo_id}: {e}"
+            ) from e
+    return paths

prompt_embedding.safetensors ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:bc9b70751370039f6af10f5c803f9854354f7029f7d9521c6a4ee7c5ae28f999
+size 7880872

pyproject.toml ADDED Viewed

	@@ -0,0 +1,69 @@

+[build-system]
+requires = ["hatchling"]
+build-backend = "hatchling.build"
+[tool.hatch.build.targets.sdist]
+exclude = ["*.safetensors", "assets/**", "data/**", "dev_scripts/**", "uv.lock"]
+[tool.hatch.build.targets.wheel]
+packages = ["nisaba_relief"]
+[project]
+name = "nisaba-relief"
+version = "0.1.0"
+description = "Transform cuneiform tablet photos into MSII relief visualizations"
+readme = { file = "README.md", content-type = "text/markdown" }
+license = "Apache-2.0"
+requires-python = ">=3.10,<3.14"
+authors = [{ name = "Zack Williams", email = "zack@boatbomber.com" }]
+keywords = ["cuneiform", "msii", "relief", "ocr", "flux", "deep-learning"]
+classifiers = [
+    "Development Status :: 4 - Beta",
+    "Intended Audience :: Science/Research",
+    "License :: OSI Approved :: Apache Software License",
+    "Programming Language :: Python :: 3",
+    "Programming Language :: Python :: 3.10",
+    "Programming Language :: Python :: 3.11",
+    "Programming Language :: Python :: 3.12",
+    "Programming Language :: Python :: 3.13",
+    "Topic :: Scientific/Engineering :: Image Processing",
+    "Topic :: Scientific/Engineering :: Artificial Intelligence",
+]
+dependencies = [
+    "einops>=0.8.2",
+    "safetensors",
+    "numpy",
+    "pillow",
+    "huggingface-hub",
+    "tqdm",
+]
+[project.urls]
+Homepage = "https://huggingface.co/boatbomber/NisabaRelief"
+Repository = "https://huggingface.co/boatbomber/NisabaRelief"
+Issues = "https://huggingface.co/boatbomber/NisabaRelief/discussions"
+[dependency-groups]
+dev = [
+    "ruff>=0.15.4",
+    "scikit-image>=0.25.2",
+    "scipy>=1.15.3",
+    "image-similarity-measures[speedups]>=0.3.5",
+    "pytorch-msssim>=1.0.0",
+    "rich>=14.3.3",
+    "datasets>=4.6.1",
+]
+[[tool.uv.index]]
+name = "pytorch-cu128"
+url = "https://download.pytorch.org/whl/cu128"
+explicit = true
+[tool.uv.sources]
+torch = { index = "pytorch-cu128" }
+torchvision = { index = "pytorch-cu128" }
+triton = { index = "pytorch-cu128" }
+[tool.ruff]
+line-length = 90

uv.lock ADDED Viewed

The diff for this file is too large to render. See raw diff