Upload 549 files
This view is limited to 50 files because it contains too many changes. See the raw diff for the full change set.
- .gitattributes +5 -0
- CONTRIBUTING.md +32 -0
- IOU_test.py +21 -0
- LICENSE +202 -0
- README.md +217 -0
- __pycache__/owlv2_helper.cpython-310.pyc +0 -0
- __pycache__/owlv2_helper_functions.cpython-310.pyc +0 -0
- auto_bbox.py +266 -0
- big_vision/.gitignore +1 -0
- big_vision/CONTRIBUTING.md +26 -0
- big_vision/LICENSE +201 -0
- big_vision/README.md +499 -0
- big_vision/__init__.py +0 -0
- big_vision/__pycache__/__init__.cpython-310.pyc +0 -0
- big_vision/__pycache__/utils.cpython-310.pyc +0 -0
- big_vision/configs/__init__.py +0 -0
- big_vision/configs/bit_i1k.py +102 -0
- big_vision/configs/bit_i21k.py +85 -0
- big_vision/configs/common.py +188 -0
- big_vision/configs/common_fewshot.py +60 -0
- big_vision/configs/load_and_eval.py +143 -0
- big_vision/configs/mlp_mixer_i1k.py +120 -0
- big_vision/configs/transfer.py +186 -0
- big_vision/configs/vit_i1k.py +177 -0
- big_vision/configs/vit_i21k.py +145 -0
- big_vision/configs/vit_s16_i1k.py +105 -0
- big_vision/datasets/ai2d/ai2d.py +0 -0
- big_vision/datasets/aokvqa/aokvqa.py +0 -0
- big_vision/datasets/chartqa/chartqa.py +0 -0
- big_vision/datasets/coco35l/coco35l.py +0 -0
- big_vision/datasets/core.py +77 -0
- big_vision/datasets/countbenchqa/countbenchqa.py +0 -0
- big_vision/datasets/docvqa/docvqa.py +0 -0
- big_vision/datasets/gqa/gqa.py +0 -0
- big_vision/datasets/imagenet/class_names.py +0 -0
- big_vision/datasets/infovqa/infovqa.py +0 -0
- big_vision/datasets/jsonl.py +177 -0
- big_vision/datasets/nocaps/nocaps.py +0 -0
- big_vision/datasets/okvqa/okvqa.py +0 -0
- big_vision/datasets/pope/pope.py +0 -0
- big_vision/datasets/refcoco/refcoco.py +0 -0
- big_vision/datasets/rsvqa_hr/rsvqa_hr.py +0 -0
- big_vision/datasets/rsvqa_lr/rsvqa_lr.py +0 -0
- big_vision/datasets/scicap/scicap.py +0 -0
- big_vision/datasets/science_qa/science_qa.py +0 -0
- big_vision/datasets/screen2words/screen2words.py +0 -0
- big_vision/datasets/sequence_packing.py +77 -0
- big_vision/datasets/stvqa/stvqa.py +0 -0
- big_vision/datasets/tallyqa/tallyqa.py +0 -0
- big_vision/datasets/textcaps/textcaps.py +0 -0
.gitattributes
CHANGED
@@ -33,3 +33,8 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
 *.zip filter=lfs diff=lfs merge=lfs -text
 *.zst filter=lfs diff=lfs merge=lfs -text
 *tfevents* filter=lfs diff=lfs merge=lfs -text
+ckpts/clip_vit_l14_with_masks_6c17944 filter=lfs diff=lfs merge=lfs -text
+ckpts/owl2-b16-960-st-ngrams-curated-ft-lvisbase-ens-cold-weight-05_209b65b filter=lfs diff=lfs merge=lfs -text
+ckpts/owl2-l14-1008-st-ngrams-ft-lvisbase-ens-cold-weight-04_8ca674c filter=lfs diff=lfs merge=lfs -text
+images/scenic_design.jpg filter=lfs diff=lfs merge=lfs -text
+images/scenic_logo.jpg filter=lfs diff=lfs merge=lfs -text
CONTRIBUTING.md
ADDED
# How to Contribute

Scenic is a platform used for developing new methods and ideas by Google
researchers, mostly around attention-based models for computer vision or
multi-modal applications. We encourage forking the repository and continued
development. We welcome suggestions and contributions to improving Scenic.
There are a few small guidelines you need to follow.

## Contributor License Agreement

Contributions to this project must be accompanied by a Contributor License
Agreement (CLA). You (or your employer) retain the copyright to your
contribution; this simply gives us permission to use and redistribute your
contributions as part of the project. Head over to
<https://cla.developers.google.com/> to see your current agreements on file or
to sign a new one.

You generally only need to submit a CLA once, so if you've already submitted one
(even if it was for a different project), you probably don't need to do it
again.

## Code Reviews

All submissions, including submissions by project members, require review. We
use GitHub pull requests for this purpose. Consult
[GitHub Help](https://help.github.com/articles/about-pull-requests/) for more
information on using pull requests.

## Community Guidelines

This project follows
[Google's Open Source Community Guidelines](https://opensource.google/conduct/).
IOU_test.py
ADDED
from owlv2_helper_functions import get_iou, boxes_filter

# Example boxes in (x1, y1, x2, y2) pixel coordinates; the second, third and
# fourth boxes are identical, so the filter should keep only one of them.
boxes = [
    (128.56, 4.57, 732.52, 476.05),
    (569.65, 185.71, 740.31, 244.76),
    (569.65, 185.71, 740.31, 244.76),
    (569.65, 185.71, 740.31, 244.76),
    (101.99, 99.00, 720.12, 88.63),
]

scores = [1.0, 0.99, 0.89, 1.0, 0.99]

instances = ['cat', 'dog', 'dog', 'tiger', 'cat']


# Filter out overlapping duplicate boxes and inspect what survives.
pred_bboxes, pred_scores, instances = boxes_filter(boxes, scores, instances)

print(pred_bboxes)
print(pred_scores)
print(instances)
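For context, the helpers imported above (`get_iou`, `boxes_filter`) live in `owlv2_helper_functions.py`, which is not among the files shown in this view. The sketch below only illustrates the kind of IoU computation and duplicate-box suppression this test exercises; the function behaviour and the IoU threshold are assumptions, not the actual implementation.

```python
# Minimal sketch (assumed behaviour, not the real owlv2_helper_functions code).
def iou(a, b):
    """Intersection-over-union of two (x1, y1, x2, y2) boxes."""
    ix1, iy1 = max(a[0], b[0]), max(a[1], b[1])
    ix2, iy2 = min(a[2], b[2]), min(a[3], b[3])
    inter = max(0.0, ix2 - ix1) * max(0.0, iy2 - iy1)
    area_a = (a[2] - a[0]) * (a[3] - a[1])
    area_b = (b[2] - b[0]) * (b[3] - b[1])
    union = area_a + area_b - inter
    return inter / union if union > 0 else 0.0


def filter_boxes(boxes, scores, labels, iou_thresh=0.9):
    """Greedy NMS-style filter: keep the highest-scoring box of each duplicate group."""
    order = sorted(range(len(boxes)), key=lambda i: scores[i], reverse=True)
    keep = []
    for i in order:
        if all(iou(boxes[i], boxes[j]) < iou_thresh for j in keep):
            keep.append(i)
    keep.sort()
    return ([boxes[i] for i in keep],
            [scores[i] for i in keep],
            [labels[i] for i in keep])
```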
LICENSE
ADDED
                                 Apache License
                           Version 2.0, January 2004
                        http://www.apache.org/licenses/

   TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION

   1. Definitions.

      "License" shall mean the terms and conditions for use, reproduction,
      and distribution as defined by Sections 1 through 9 of this document.

      "Licensor" shall mean the copyright owner or entity authorized by
      the copyright owner that is granting the License.

      "Legal Entity" shall mean the union of the acting entity and all
      other entities that control, are controlled by, or are under common
      control with that entity. For the purposes of this definition,
      "control" means (i) the power, direct or indirect, to cause the
      direction or management of such entity, whether by contract or
      otherwise, or (ii) ownership of fifty percent (50%) or more of the
      outstanding shares, or (iii) beneficial ownership of such entity.

      "You" (or "Your") shall mean an individual or Legal Entity
      exercising permissions granted by this License.

      "Source" form shall mean the preferred form for making modifications,
      including but not limited to software source code, documentation
      source, and configuration files.

      "Object" form shall mean any form resulting from mechanical
      transformation or translation of a Source form, including but
      not limited to compiled object code, generated documentation,
      and conversions to other media types.

      "Work" shall mean the work of authorship, whether in Source or
      Object form, made available under the License, as indicated by a
      copyright notice that is included in or attached to the work
      (an example is provided in the Appendix below).

      "Derivative Works" shall mean any work, whether in Source or Object
      form, that is based on (or derived from) the Work and for which the
      editorial revisions, annotations, elaborations, or other modifications
      represent, as a whole, an original work of authorship. For the purposes
      of this License, Derivative Works shall not include works that remain
      separable from, or merely link (or bind by name) to the interfaces of,
      the Work and Derivative Works thereof.

      "Contribution" shall mean any work of authorship, including
      the original version of the Work and any modifications or additions
      to that Work or Derivative Works thereof, that is intentionally
      submitted to Licensor for inclusion in the Work by the copyright owner
      or by an individual or Legal Entity authorized to submit on behalf of
      the copyright owner. For the purposes of this definition, "submitted"
      means any form of electronic, verbal, or written communication sent
      to the Licensor or its representatives, including but not limited to
      communication on electronic mailing lists, source code control systems,
      and issue tracking systems that are managed by, or on behalf of, the
      Licensor for the purpose of discussing and improving the Work, but
      excluding communication that is conspicuously marked or otherwise
      designated in writing by the copyright owner as "Not a Contribution."

      "Contributor" shall mean Licensor and any individual or Legal Entity
      on behalf of whom a Contribution has been received by Licensor and
      subsequently incorporated within the Work.

   2. Grant of Copyright License. Subject to the terms and conditions of
      this License, each Contributor hereby grants to You a perpetual,
      worldwide, non-exclusive, no-charge, royalty-free, irrevocable
      copyright license to reproduce, prepare Derivative Works of,
      publicly display, publicly perform, sublicense, and distribute the
      Work and such Derivative Works in Source or Object form.

   3. Grant of Patent License. Subject to the terms and conditions of
      this License, each Contributor hereby grants to You a perpetual,
      worldwide, non-exclusive, no-charge, royalty-free, irrevocable
      (except as stated in this section) patent license to make, have made,
      use, offer to sell, sell, import, and otherwise transfer the Work,
      where such license applies only to those patent claims licensable
      by such Contributor that are necessarily infringed by their
      Contribution(s) alone or by combination of their Contribution(s)
      with the Work to which such Contribution(s) was submitted. If You
      institute patent litigation against any entity (including a
      cross-claim or counterclaim in a lawsuit) alleging that the Work
      or a Contribution incorporated within the Work constitutes direct
      or contributory patent infringement, then any patent licenses
      granted to You under this License for that Work shall terminate
      as of the date such litigation is filed.

   4. Redistribution. You may reproduce and distribute copies of the
      Work or Derivative Works thereof in any medium, with or without
      modifications, and in Source or Object form, provided that You
      meet the following conditions:

      (a) You must give any other recipients of the Work or
          Derivative Works a copy of this License; and

      (b) You must cause any modified files to carry prominent notices
          stating that You changed the files; and

      (c) You must retain, in the Source form of any Derivative Works
          that You distribute, all copyright, patent, trademark, and
          attribution notices from the Source form of the Work,
          excluding those notices that do not pertain to any part of
          the Derivative Works; and

      (d) If the Work includes a "NOTICE" text file as part of its
          distribution, then any Derivative Works that You distribute must
          include a readable copy of the attribution notices contained
          within such NOTICE file, excluding those notices that do not
          pertain to any part of the Derivative Works, in at least one
          of the following places: within a NOTICE text file distributed
          as part of the Derivative Works; within the Source form or
          documentation, if provided along with the Derivative Works; or,
          within a display generated by the Derivative Works, if and
          wherever such third-party notices normally appear. The contents
          of the NOTICE file are for informational purposes only and
          do not modify the License. You may add Your own attribution
          notices within Derivative Works that You distribute, alongside
          or as an addendum to the NOTICE text from the Work, provided
          that such additional attribution notices cannot be construed
          as modifying the License.

      You may add Your own copyright statement to Your modifications and
      may provide additional or different license terms and conditions
      for use, reproduction, or distribution of Your modifications, or
      for any such Derivative Works as a whole, provided Your use,
      reproduction, and distribution of the Work otherwise complies with
      the conditions stated in this License.

   5. Submission of Contributions. Unless You explicitly state otherwise,
      any Contribution intentionally submitted for inclusion in the Work
      by You to the Licensor shall be under the terms and conditions of
      this License, without any additional terms or conditions.
      Notwithstanding the above, nothing herein shall supersede or modify
      the terms of any separate license agreement you may have executed
      with Licensor regarding such Contributions.

   6. Trademarks. This License does not grant permission to use the trade
      names, trademarks, service marks, or product names of the Licensor,
      except as required for reasonable and customary use in describing the
      origin of the Work and reproducing the content of the NOTICE file.

   7. Disclaimer of Warranty. Unless required by applicable law or
      agreed to in writing, Licensor provides the Work (and each
      Contributor provides its Contributions) on an "AS IS" BASIS,
      WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
      implied, including, without limitation, any warranties or conditions
      of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A
      PARTICULAR PURPOSE. You are solely responsible for determining the
      appropriateness of using or redistributing the Work and assume any
      risks associated with Your exercise of permissions under this License.

   8. Limitation of Liability. In no event and under no legal theory,
      whether in tort (including negligence), contract, or otherwise,
      unless required by applicable law (such as deliberate and grossly
      negligent acts) or agreed to in writing, shall any Contributor be
      liable to You for damages, including any direct, indirect, special,
      incidental, or consequential damages of any character arising as a
      result of this License or out of the use or inability to use the
      Work (including but not limited to damages for loss of goodwill,
      work stoppage, computer failure or malfunction, or any and all
      other commercial damages or losses), even if such Contributor
      has been advised of the possibility of such damages.

   9. Accepting Warranty or Additional Liability. While redistributing
      the Work or Derivative Works thereof, You may choose to offer,
      and charge a fee for, acceptance of support, warranty, indemnity,
      or other liability obligations and/or rights consistent with this
      License. However, in accepting such obligations, You may act only
      on Your own behalf and on Your sole responsibility, not on behalf
      of any other Contributor, and only if You agree to indemnify,
      defend, and hold each Contributor harmless for any liability
      incurred by, or claims asserted against, such Contributor by reason
      of your accepting any such warranty or additional liability.

   END OF TERMS AND CONDITIONS

   APPENDIX: How to apply the Apache License to your work.

      To apply the Apache License to your work, attach the following
      boilerplate notice, with the fields enclosed by brackets "[]"
      replaced with your own identifying information. (Don't include
      the brackets!) The text should be enclosed in the appropriate
      comment syntax for the file format. We also recommend that a
      file or class name and description of purpose be included on the
      same "printed page" as the copyright notice for easier
      identification within third-party archives.

   Copyright [yyyy] [name of copyright owner]

   Licensed under the Apache License, Version 2.0 (the "License");
   you may not use this file except in compliance with the License.
   You may obtain a copy of the License at

       http://www.apache.org/licenses/LICENSE-2.0

   Unless required by applicable law or agreed to in writing, software
   distributed under the License is distributed on an "AS IS" BASIS,
   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
   See the License for the specific language governing permissions and
   limitations under the License.
README.md
ADDED
# Scenic
<div style="text-align: left">
<img align="right" src="https://raw.githubusercontent.com/google-research/scenic/main/images/scenic_logo.png" width="200" alt="scenic logo"></img>
</div>

*Scenic* is a codebase with a focus on research around attention-based models
for computer vision. Scenic has been successfully used to develop
classification, segmentation, and detection models for multiple modalities
including images, video, audio, and multimodal combinations of them.

More precisely, *Scenic* is (i) a set of shared light-weight libraries solving
tasks commonly encountered when training large-scale (i.e. multi-device,
multi-host) vision models; and (ii) several *projects* containing fully
fleshed out problem-specific training and evaluation loops using these
libraries.

Scenic is developed in [JAX](https://github.com/jax-ml/jax) and uses
[Flax](https://github.com/google/flax).

### Contents
* [What we offer](#what-we-offer)
* [SOTA models and baselines in Scenic](#sota-models-and-baselines-in-scenic)
* [Philosophy](#philosophy)
* [Getting started](#getting-started)
* [Scenic component design](#scenic-component-design)
* [Citing Scenic](#citing-scenic)

## What we offer
Among other things, *Scenic* provides

* Boilerplate code for launching experiments, summary writing, logging,
  profiling, etc.;
* Optimized training and evaluation loops, losses, metrics, bi-partite
  matchers, etc.;
* Input pipelines for popular vision datasets;
* [Baseline models](https://github.com/google-research/scenic/tree/main/scenic/projects/baselines#scenic-baseline-models),
  including strong non-attentional baselines.


## SOTA models and baselines in *Scenic*
Scenic hosts a number of SOTA models and baselines that were either developed
using Scenic or have been reimplemented in it.

Projects that were developed in Scenic or used it for their experiments:

* [ViViT: A Video Vision Transformer](https://arxiv.org/abs/2103.15691)
* [OmniNet: Omnidirectional Representations from Transformers](https://arxiv.org/abs/2103.01075)
* [Attention Bottlenecks for Multimodal Fusion](https://arxiv.org/abs/2107.00135)
* [TokenLearner: What Can 8 Learned Tokens Do for Images and Videos?](https://arxiv.org/abs/2106.11297)
* [Exploring the Limits of Large Scale Pre-training](https://arxiv.org/abs/2110.02095)
* [The Efficiency Misnomer](https://arxiv.org/abs/2110.12894)
* [Discrete Representations Strengthen Vision Transformer Robustness](https://arxiv.org/abs/2111.10493)
* [Pyramid Adversarial Training Improves ViT Performance](https://arxiv.org/abs/2111.15121)
* [VUT: Versatile UI Transformer for Multi-Modal Multi-Task User Interface Modeling](https://arxiv.org/abs/2112.05692)
* [CLAY: Learning to Denoise Raw Mobile UI Layouts for Improving Datasets at Scale](https://arxiv.org/abs/2201.04100)
* [Zero-Shot Text-Guided Object Generation with Dream Fields](https://arxiv.org/abs/2112.01455)
* [Multiview Transformers for Video Recognition](https://arxiv.org/abs/2201.04288)
* [PolyViT: Co-training Vision Transformers on Images, Videos and Audio](https://arxiv.org/abs/2111.12993)
* [Simple Open-Vocabulary Object Detection with Vision Transformers](https://arxiv.org/abs/2205.06230)
* [Learning with Neighbor Consistency for Noisy Labels](https://arxiv.org/abs/2202.02200)
* [Token Turing Machines](https://arxiv.org/pdf/2211.09119.pdf)
* [Vid2Seq: Large-Scale Pretraining of a Visual Language Model for Dense Video Captioning](https://arxiv.org/pdf/2302.14115.pdf)
* [AVATAR: Unconstrained Audiovisual Speech Recognition](https://arxiv.org/abs/2206.07684)
* [Adaptive Computation with Elastic Input Sequence](https://arxiv.org/abs/2301.13195)
* [Location-Aware Self-Supervised Transformers for Semantic Segmentation](https://arxiv.org/abs/2212.02400)
* [How can objects help action recognition?](https://openaccess.thecvf.com/content/CVPR2023/html/Zhou_How_Can_Objects_Help_Action_Recognition_CVPR_2023_paper.html)
* [Verbs in Action: Improving verb understanding in video-language models](https://arxiv.org/abs/2304.06708)
* [Unified Visual Relationship Detection with Vision and Language Models](https://arxiv.org/abs/2303.08998)
* [UnLoc: A Unified Framework for Video Localization Tasks](https://arxiv.org/abs/2308.11062)
* [REVEAL: Retrieval-Augmented Visual-Language Pre-Training with Multi-Source Multimodal Knowledge Memory](https://arxiv.org/abs/2212.05221)
* [Audiovisual Masked Autoencoders](https://arxiv.org/abs/2212.05922)
* [MatFormer: Nested Transformer for Elastic Inference](https://arxiv.org/abs/2310.07707)
* [Pixel Aligned Language Models](https://arxiv.org/abs/2312.09237)
* [A Generative Approach for Wikipedia-Scale Visual Entity Recognition](https://arxiv.org/abs/2403.02041)
* [Streaming Dense Video Captioning](https://arxiv.org/abs/2404.01297)
* [Dense Video Object Captioning from Disjoint Supervision](https://arxiv.org/abs/2306.11729)

More information can be found in [projects](https://github.com/google-research/scenic/tree/main/scenic/projects#list-of-projects-hosted-in-scenic).

Baselines that were reproduced in Scenic:

* [(ViT) An Image is Worth 16x16 Words: Transformers for Image Recognition at Scale](https://arxiv.org/abs/2010.11929)
* [(DETR) End-to-End Object Detection with Transformers](https://arxiv.org/abs/2005.12872)
* [Deformable DETR: Deformable Transformers for End-to-End Object Detection](https://arxiv.org/abs/2010.04159)
* [(CLIP) Learning Transferable Visual Models From Natural Language Supervision](https://arxiv.org/abs/2103.00020)
* [MLP-Mixer: An all-MLP Architecture for Vision](https://arxiv.org/abs/2105.01601)
* [BERT: Pre-training of Deep Bidirectional Transformers for Language Understanding](https://arxiv.org/abs/1810.04805)
* [How to train your ViT? Data, Augmentation, and Regularization in Vision Transformers](https://arxiv.org/abs/2106.10270)
* [Big Transfer (BiT): General Visual Representation Learning](https://arxiv.org/abs/1912.11370)
* [Deep Residual Learning for Image Recognition](https://arxiv.org/abs/1512.03385)
* [U-Net: Convolutional Networks for Biomedical Image Segmentation](https://arxiv.org/abs/1505.04597)
* [PCT: Point Cloud Transformer](https://arxiv.org/abs/2012.09688)
* [Universal Transformers](https://arxiv.org/abs/1807.03819)
* [PonderNet](https://arxiv.org/abs/2107.05407)
* [Masked Autoencoders Are Scalable Vision Learners](https://arxiv.org/abs/2111.06377)
* [Rethinking Attention with Performers](https://arxiv.org/abs/2009.14794)
* [(CenterNet) Objects as Points](https://arxiv.org/abs/1904.07850)
* [(SAM) Segment Anything](https://arxiv.org/abs/2304.02643)

More information can be found in [baseline models](https://github.com/google-research/scenic/tree/main/scenic/projects/baselines#scenic-baseline-models).

<a name="philosophy"></a>
## Philosophy
*Scenic* aims to facilitate rapid prototyping of large-scale vision models. To
keep the code simple to understand and extend, we prefer *forking and
copy-pasting over adding complexity or increasing abstraction*. Only when
functionality proves to be widely useful across many models and tasks may it
be upstreamed to Scenic's shared libraries.


<a name="getting_start"></a>
## Getting started
* See `projects/baselines/README.md` for a walk-through of baseline models and
  instructions on how to run the code.
* If you would like to contribute to *Scenic*, please check out the
  [Philosophy](#philosophy), [Code structure](#code_structure) and
  [Contributing](CONTRIBUTING.md) sections.
  Should your contribution be a part of the shared libraries, please send us a
  pull request!


### Quickstart
You will need Python 3.9 or later. Download the code from GitHub:

```shell
$ git clone https://github.com/google-research/scenic.git
$ cd scenic
$ pip install .
```

and run training for ViT on ImageNet:

```shell
$ python scenic/main.py -- \
  --config=scenic/projects/baselines/configs/imagenet/imagenet_vit_config.py \
  --workdir=./
```

Note that for specific projects and baselines, you might need to install extra
packages that are mentioned in their `README.md` or `requirements.txt` files.

[Here](https://colab.research.google.com/github/google-research/scenic/blob/main/scenic/common_lib/colabs/scenic_playground.ipynb)
is also a minimal colab to train a simple feed-forward model using Scenic.

<a name="code_structure"></a>
## Scenic component design
Scenic offers different levels of abstraction so that it can host projects that
only require changing hyper-parameters by defining config files, as well as
projects that need customization of the input pipeline, model architecture,
losses and metrics, and the training loop. To make this happen, the code in
Scenic is organized as either _project-level_ code, which refers to customized
code for specific projects or baselines, or _library-level_ code, which refers
to common functionality and general patterns that are adopted by the majority
of projects. The project-level code lives in the `projects` directory.

<div align="center">
<img src="https://raw.githubusercontent.com/google-research/scenic/main/images/scenic_design.jpg" width="900" alt="scenic design"></img>
</div>

### Library-level code
The goal is to keep the library-level code minimal and well-tested and to avoid
introducing extra abstractions to support minor use-cases. Shared libraries
provided by *Scenic* are split into:

* `dataset_lib`: Implements IO pipelines for loading and pre-processing data
  for common computer vision tasks and benchmarks (see "Tasks and Datasets"
  section). All pipelines are designed to be scalable and support multi-host
  and multi-device setups, taking care of dividing data among multiple hosts,
  incomplete batches, caching, pre-fetching, etc.
* `model_lib`: Provides
  * several abstract model interfaces (e.g. `ClassificationModel` or
    `SegmentationModel` in `model_lib.base_models`) with task-specific
    losses and metrics;
  * neural network layers in `model_lib.layers`, focusing on efficient
    implementation of attention and transformer layers;
  * accelerator-friendly implementations of bipartite matching
    algorithms in `model_lib.matchers`.
* `train_lib`: Provides tools for constructing training loops and implements
  several optimized trainers (classification trainer and segmentation trainer)
  that can be forked for customization (see the train-step sketch below).
* `common_lib`: General utilities, like logging and debugging modules,
  functionalities for processing raw data, etc.

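The trainers in `train_lib` are, at their core, JAX/Flax training loops. The following is a minimal sketch of the kind of jitted train step such trainers wrap with metrics, logging, checkpointing and multi-host input handling. It is generic JAX/Flax/Optax code, not taken from the Scenic codebase; the model and batch fields are placeholders.

```python
import jax
import jax.numpy as jnp
import optax
from flax import linen as nn


class TinyClassifier(nn.Module):
  """Placeholder model; a real project would use a Scenic base model."""
  num_classes: int = 10

  @nn.compact
  def __call__(self, x):
    x = nn.Dense(64)(x)
    x = nn.relu(x)
    return nn.Dense(self.num_classes)(x)


model = TinyClassifier()
params = model.init(jax.random.PRNGKey(0), jnp.ones((1, 32)))
tx = optax.adam(1e-3)
opt_state = tx.init(params)


@jax.jit
def train_step(params, opt_state, batch):
  """One gradient step: loss, grads, optimizer update."""
  def loss_fn(p):
    logits = model.apply(p, batch['x'])
    labels = jax.nn.one_hot(batch['y'], 10)
    return optax.softmax_cross_entropy(logits, labels).mean()

  loss, grads = jax.value_and_grad(loss_fn)(params)
  updates, new_opt_state = tx.update(grads, opt_state)
  new_params = optax.apply_updates(params, updates)
  return new_params, new_opt_state, loss
```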
### Project-level code
Scenic supports the development of customized solutions for custom tasks and
data via the concept of a "project". There is no one-size-fits-all recipe for
how much code a project should re-use. Projects can consist of only configs,
using the common models, trainers, and tasks/data that live in the
library-level code, or they can fork any of the mentioned functionalities and
redefine layers, losses, metrics, logging methods, tasks, architectures, as
well as training and evaluation loops. The modularity of the library-level
code lets projects fall anywhere on the "run-as-is" to "fully customized"
spectrum (an illustrative config sketch is given at the end of this section).

Common baselines such as a ResNet and Vision Transformer (ViT) are implemented
in the [`projects/baselines`](https://github.com/google-research/scenic/tree/main/scenic/projects/baselines)
project. Forking models in this directory is a good starting point for new
projects.

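To make the "configs only" end of the spectrum concrete, here is an illustrative sketch of a Scenic-style config file. It is not a real Scenic config: the field names below are invented for illustration, and the exact fields a trainer expects depend on the project (see the configs under `projects/baselines` for real examples).

```python
import ml_collections


def get_config():
  """Hypothetical experiment config; field names are for illustration only."""
  config = ml_collections.ConfigDict()
  config.experiment_name = 'my_vit_run'
  config.dataset_name = 'imagenet'
  config.batch_size = 512
  config.num_training_epochs = 90

  config.model = ml_collections.ConfigDict()
  config.model.num_layers = 12
  config.model.hidden_size = 768

  config.lr = 1e-3
  return config
```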
## Citing Scenic
If you use Scenic, you can cite our [white paper](https://openaccess.thecvf.com/content/CVPR2022/html/Dehghani_Scenic_A_JAX_Library_for_Computer_Vision_Research_and_Beyond_CVPR_2022_paper.html).
Here is an example BibTeX entry:

```bibtex
@InProceedings{dehghani2021scenic,
    author    = {Dehghani, Mostafa and Gritsenko, Alexey and Arnab, Anurag and Minderer, Matthias and Tay, Yi},
    title     = {Scenic: A JAX Library for Computer Vision Research and Beyond},
    booktitle = {Proceedings of the IEEE/CVF Conference on Computer Vision and Pattern Recognition (CVPR)},
    year      = {2022},
    pages     = {21393-21398}
}
```

_Disclaimer: This is not an official Google product._
__pycache__/owlv2_helper.cpython-310.pyc
ADDED
Binary file (4.22 kB).
__pycache__/owlv2_helper_functions.cpython-310.pyc
ADDED
Binary file (7.51 kB).
auto_bbox.py
ADDED
import os
import sys
import cv2
import json
import glob
import argparse
import subprocess
from typing import List, Tuple, Dict, Any

import numpy as np
from tqdm import tqdm


# ----------------- Args -----------------
def parse_args():
    ap = argparse.ArgumentParser("OWLv2 detection on JPG folders (Top-K per image), multi-GPU.")
    ap.add_argument("--input_dir", type=str, required=True, help="Root that contains subfolders of JPGs; if JPGs are directly under input_dir, it will be treated as a single set.")
    ap.add_argument("--startswith", type=str, default="", help="Filter folder name prefix (or input_dir basename if no subfolders).")
    ap.add_argument("--output_dir", type=str, required=True)
    ap.add_argument("--frame_stride", type=int, default=1, help="Sample every N-th image within a folder.")
    ap.add_argument("--top_k", type=int, default=5)
    ap.add_argument("--max_frames", type=int, default=0, help="Max processed images per folder; 0 means no limit.")
    ap.add_argument("--num_workers", type=int, default=1, help="#GPUs/#workers")
    ap.add_argument("--worker_idx", type=int, default=-1, help="internal; >=0 means child worker")
    ap.add_argument("--shard_file", type=str, default="", help="internal; JSON with folder paths for this worker")
    ap.add_argument("--scenic_root", type=str, default="/home/ubuntu/rs/JiT/VisionModels/Scenic_OWLv2/big_vision")
    return ap.parse_args()


# ----------------- Utils -----------------
def _has_jpgs(path: str) -> bool:
    exts = ("*.jpg", "*.jpeg", "*.JPG", "*.JPEG")
    for pat in exts:
        if glob.glob(os.path.join(path, pat)):
            return True
    return False


def iter_image_dirs(input_dir: str, startswith: str) -> List[str]:
    """
    Returns a list of directories to process.
    - If input_dir contains subfolders: return subfolders that contain JPGs and match startswith.
    - Else if input_dir itself contains JPGs and its basename matches startswith: return [input_dir].
    """
    input_dir = os.path.abspath(input_dir)
    subs = sorted([p for p in glob.glob(os.path.join(input_dir, "*")) if os.path.isdir(p)])
    # Prefer subfolders if any exist and contain jpgs
    dirs = [d for d in subs if os.path.basename(d).startswith(startswith) and _has_jpgs(d)]
    if dirs:
        return dirs

    # Fallback: treat input_dir itself as one set if it has jpgs
    base_ok = os.path.basename(os.path.normpath(input_dir)).startswith(startswith)
    if base_ok and _has_jpgs(input_dir):
        return [input_dir]
    return []


def ensure_dir(p: str):
    os.makedirs(p, exist_ok=True)


def draw_single_box(frame_bgr: np.ndarray, box: List[float], color=(0, 255, 0), thickness=2) -> np.ndarray:
    x1, y1, x2, y2 = map(int, box)
    out = frame_bgr.copy()
    cv2.rectangle(out, (x1, y1), (x2, y2), color, thickness)
    return out


def list_images_sorted(folder: str) -> List[str]:
    pats = ["*.jpg", "*.jpeg", "*.JPG", "*.JPEG"]
    files = []
    for pat in pats:
        files.extend(glob.glob(os.path.join(folder, pat)))
    # Sort by natural file name order
    return sorted(files)


# ----------------- Worker logic (imports JAX/Scenic inside) -----------------
def worker_run(args, dir_paths: List[str]):
    import sys as _sys
    if args.scenic_root not in _sys.path:
        _sys.path.append(args.scenic_root)

    # Free TF GPU to JAX in this process (why: avoid TF reserving VRAM)
    import tensorflow as tf
    tf.config.experimental.set_visible_devices([], "GPU")

    from scenic.projects.owl_vit import configs
    from scenic.projects.owl_vit import models
    import jax
    import functools
    import owlv2_helper as helper  # must be available in PYTHONPATH

    class OWLv2Objectness:
        def __init__(self, top_k: int = 5):
            self.top_k = top_k
            self.config = configs.owl_v2_clip_b16.get_config(init_mode="canonical_checkpoint")
            self.module = models.TextZeroShotDetectionModule(
                body_configs=self.config.model.body,
                objectness_head_configs=self.config.model.objectness_head,
                normalize=self.config.model.normalize,
                box_bias=self.config.model.box_bias,
            )
            self.variables = self.module.load_variables(self.config.init_from.checkpoint_path)

            self.image_embedder = jax.jit(
                functools.partial(self.module.apply, self.variables, train=False, method=self.module.image_embedder)
            )
            self.objectness_predictor = jax.jit(
                functools.partial(self.module.apply, self.variables, method=self.module.objectness_predictor)
            )
            self.box_predictor = jax.jit(
                functools.partial(self.module.apply, self.variables, method=self.module.box_predictor)
            )

        def detect(self, image_bgr: np.ndarray) -> List[Tuple[List[float], float]]:
            image_rgb = cv2.cvtColor(image_bgr, cv2.COLOR_BGR2RGB)
            processed = helper.preprocess_images([image_rgb], self.config.dataset_configs.input_size)[0]
            feature_map = self.image_embedder(processed[None, ...])
            b, h, w, d = feature_map.shape
            image_features = feature_map.reshape(b, h * w, d)

            obj_logits = self.objectness_predictor(image_features)["objectness_logits"]
            raw_boxes = self.box_predictor(image_features=image_features, feature_map=feature_map)["pred_boxes"]

            obj = np.array(obj_logits[0], dtype=np.float32)
            raw_boxes = np.array(raw_boxes[0], dtype=np.float32)
            boxes = helper.rescale_detection_box(raw_boxes, image_rgb)

            if len(obj) == 0:
                return []

            # Threshold at the k-th largest objectness score (Top-K selection).
            k = min(self.top_k, len(obj))
            thresh = np.partition(obj, -k)[-k]

            filtered: List[Tuple[List[float], float]] = []
            H, W = image_rgb.shape[:2]
            for box, score in zip(boxes, obj):
                if score < thresh:
                    continue
                if helper.too_small(box) or helper.too_large(box, image_rgb):
                    continue
                # Clamp box coordinates to the image bounds.
                x1, y1, x2, y2 = box
                x1 = max(0, min(float(x1), W - 1))
                y1 = max(0, min(float(y1), H - 1))
                x2 = max(0, min(float(x2), W - 1))
                y2 = max(0, min(float(y2), H - 1))
                filtered.append(([x1, y1, x2, y2], float(score)))

            kept_boxes = helper.remove_overlapping_bboxes([b for b, _ in filtered])

            def _match_score(bb: List[float]) -> float:
                # Recover the score of a kept box by nearest match against the filtered list.
                arr = np.array([b for b, _ in filtered], dtype=np.float32)
                idx = int(np.argmin(np.abs(arr - np.array(bb, dtype=np.float32)).sum(axis=1)))
                return filtered[idx][1]

            return [(bb, _match_score(bb)) for bb in kept_boxes]

    detector = OWLv2Objectness(top_k=args.top_k)

    for dpath in tqdm(dir_paths, desc=f"Worker{args.worker_idx}", unit="set"):
        stem = os.path.basename(os.path.normpath(dpath))
        images = list_images_sorted(dpath)
        if not images:
            print(f"[WARN][w{args.worker_idx}] No JPGs under: {dpath}")
            continue

        saved_cnt = 0
        pbar = tqdm(total=len(images), desc=f"{stem}[w{args.worker_idx}]", unit="img", leave=False)

        for idx, ipath in enumerate(images):
            pbar.update(1)
            if args.frame_stride > 1 and (idx % args.frame_stride) != 0:
                continue

            frame = cv2.imread(ipath, cv2.IMREAD_COLOR)
            if frame is None:
                print(f"[WARN][w{args.worker_idx}] Cannot read: {ipath}")
                continue

            boxes_scores = detector.detect(frame)
            if boxes_scores:
                boxes_scores = sorted(boxes_scores, key=lambda x: x[1], reverse=True)[:args.top_k]

            # Save one visualization per detected object, in a per-object subfolder.
            fname = os.path.basename(ipath)
            for i, (box, score) in enumerate(boxes_scores):
                out_dir = os.path.join(args.output_dir, stem, f"object_{i}")
                ensure_dir(out_dir)
                vis = draw_single_box(frame, box, color=(0, 255, 0), thickness=2)
                cv2.imwrite(os.path.join(out_dir, fname), vis)

            saved_cnt += 1
            if args.max_frames and saved_cnt >= args.max_frames:
                break

        pbar.close()


# ----------------- Master -----------------
def main():
    args = parse_args()

    # Child worker path
    if args.worker_idx >= 0:
        if not args.shard_file or not os.path.exists(args.shard_file):
            raise RuntimeError("Worker requires --shard_file with JSON list of folder paths.")
        with open(args.shard_file, "r", encoding="utf-8") as f:
            dir_paths = json.load(f)
        worker_run(args, dir_paths)
        return

    # Master path
    dir_paths = iter_image_dirs(args.input_dir, args.startswith)
    if not dir_paths:
        print(f"[INFO] No JPG folders (or JPGs) starting with '{args.startswith}' under {args.input_dir}")
        return

    num_workers = max(1, int(args.num_workers))
    shards: List[List[str]] = [[] for _ in range(num_workers)]
    for i, d in enumerate(dir_paths):
        shards[i % num_workers].append(d)

    procs = []
    tmp_dir = os.path.join(args.output_dir, "_shards_tmp")
    ensure_dir(tmp_dir)

    for w in range(num_workers):
        shard_path = os.path.join(tmp_dir, f"shard_{w}.json")
        with open(shard_path, "w", encoding="utf-8") as f:
            json.dump(shards[w], f, ensure_ascii=False, indent=0)

        # Bind GPU: cycle through available GPU ids [0..num_workers-1]
        env = os.environ.copy()
        env["CUDA_VISIBLE_DEVICES"] = str(w)  # one GPU per worker

        cmd = [
            sys.executable, __file__,
            "--input_dir", args.input_dir,
            "--startswith", args.startswith,
            "--output_dir", args.output_dir,
            "--frame_stride", str(args.frame_stride),
            "--top_k", str(args.top_k),
            "--max_frames", str(args.max_frames),
            "--num_workers", str(num_workers),
            "--worker_idx", str(w),
            "--shard_file", shard_path,
            "--scenic_root", args.scenic_root,
        ]
        print(f"[Master] Launch worker {w}: GPU={env['CUDA_VISIBLE_DEVICES']} folders={len(shards[w])}")
        procs.append(subprocess.Popen(cmd, env=env))

    # wait
    rc = 0
    for p in procs:
        p.wait()
        rc |= p.returncode

    if rc != 0:
        print("[Master] Some workers failed. Return code:", rc)
    else:
        print("[Master] All workers done. Output:", args.output_dir)


if __name__ == "__main__":
    main()
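One detail of `OWLv2Objectness.detect()` above that is easy to misread is the Top-K selection: `np.partition(obj, -k)[-k]` returns the k-th largest objectness score, which is then used as a threshold. A small self-contained illustration (plain NumPy, independent of the script):

```python
import numpy as np

obj = np.array([0.1, 0.9, 0.3, 0.7, 0.5], dtype=np.float32)
k = 3
# np.partition places the k-th largest value at index -k; here that is 0.5.
thresh = np.partition(obj, -k)[-k]
top_k_mask = obj >= thresh
print(thresh, top_k_mask)  # 0.5 [False  True False  True  True]
```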
big_vision/.gitignore
ADDED
__pycache__
big_vision/CONTRIBUTING.md
ADDED
# How to Contribute

At this time we do not plan to accept non-trivial contributions. The main
purpose of this codebase is to allow the community to reproduce results from our
publications.

You are however free to start a fork of the project for your purposes as
permitted by the license.

## Contributor License Agreement

Contributions to this project must be accompanied by a Contributor License
Agreement (CLA). You (or your employer) retain the copyright to your
contribution; this simply gives us permission to use and redistribute your
contributions as part of the project. Head over to
<https://cla.developers.google.com/> to see your current agreements on file or
to sign a new one.

You generally only need to submit a CLA once, so if you've already submitted one
(even if it was for a different project), you probably don't need to do it
again.

## Community Guidelines

This project follows
[Google's Open Source Community Guidelines](https://opensource.google/conduct/).
big_vision/LICENSE
ADDED
                                 Apache License
                           Version 2.0, January 2004
                        http://www.apache.org/licenses/

   TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION

   1. Definitions.

      "License" shall mean the terms and conditions for use, reproduction,
      and distribution as defined by Sections 1 through 9 of this document.

      "Licensor" shall mean the copyright owner or entity authorized by
      the copyright owner that is granting the License.

      "Legal Entity" shall mean the union of the acting entity and all
      other entities that control, are controlled by, or are under common
      control with that entity. For the purposes of this definition,
      "control" means (i) the power, direct or indirect, to cause the
      direction or management of such entity, whether by contract or
      otherwise, or (ii) ownership of fifty percent (50%) or more of the
      outstanding shares, or (iii) beneficial ownership of such entity.

      "You" (or "Your") shall mean an individual or Legal Entity
      exercising permissions granted by this License.

      "Source" form shall mean the preferred form for making modifications,
      including but not limited to software source code, documentation
      source, and configuration files.

      "Object" form shall mean any form resulting from mechanical
      transformation or translation of a Source form, including but
      not limited to compiled object code, generated documentation,
      and conversions to other media types.

      "Work" shall mean the work of authorship, whether in Source or
      Object form, made available under the License, as indicated by a
      copyright notice that is included in or attached to the work
      (an example is provided in the Appendix below).

      "Derivative Works" shall mean any work, whether in Source or Object
      form, that is based on (or derived from) the Work and for which the
      editorial revisions, annotations, elaborations, or other modifications
      represent, as a whole, an original work of authorship. For the purposes
      of this License, Derivative Works shall not include works that remain
      separable from, or merely link (or bind by name) to the interfaces of,
      the Work and Derivative Works thereof.

      "Contribution" shall mean any work of authorship, including
      the original version of the Work and any modifications or additions
      to that Work or Derivative Works thereof, that is intentionally
      submitted to Licensor for inclusion in the Work by the copyright owner
      or by an individual or Legal Entity authorized to submit on behalf of
      the copyright owner. For the purposes of this definition, "submitted"
      means any form of electronic, verbal, or written communication sent
      to the Licensor or its representatives, including but not limited to
      communication on electronic mailing lists, source code control systems,
      and issue tracking systems that are managed by, or on behalf of, the
      Licensor for the purpose of discussing and improving the Work, but
      excluding communication that is conspicuously marked or otherwise
      designated in writing by the copyright owner as "Not a Contribution."

      "Contributor" shall mean Licensor and any individual or Legal Entity
      on behalf of whom a Contribution has been received by Licensor and
      subsequently incorporated within the Work.

   2. Grant of Copyright License. Subject to the terms and conditions of
      this License, each Contributor hereby grants to You a perpetual,
      worldwide, non-exclusive, no-charge, royalty-free, irrevocable
      copyright license to reproduce, prepare Derivative Works of,
      publicly display, publicly perform, sublicense, and distribute the
      Work and such Derivative Works in Source or Object form.

   3. Grant of Patent License. Subject to the terms and conditions of
      this License, each Contributor hereby grants to You a perpetual,
      worldwide, non-exclusive, no-charge, royalty-free, irrevocable
      (except as stated in this section) patent license to make, have made,
      use, offer to sell, sell, import, and otherwise transfer the Work,
      where such license applies only to those patent claims licensable
      by such Contributor that are necessarily infringed by their
      Contribution(s) alone or by combination of their Contribution(s)
      with the Work to which such Contribution(s) was submitted. If You
      institute patent litigation against any entity (including a
      cross-claim or counterclaim in a lawsuit) alleging that the Work
      or a Contribution incorporated within the Work constitutes direct
      or contributory patent infringement, then any patent licenses
      granted to You under this License for that Work shall terminate
      as of the date such litigation is filed.

   4. Redistribution. You may reproduce and distribute copies of the
      Work or Derivative Works thereof in any medium, with or without
      modifications, and in Source or Object form, provided that You
      meet the following conditions:

      (a) You must give any other recipients of the Work or
          Derivative Works a copy of this License; and

      (b) You must cause any modified files to carry prominent notices
          stating that You changed the files; and

      (c) You must retain, in the Source form of any Derivative Works
          that You distribute, all copyright, patent, trademark, and
          attribution notices from the Source form of the Work,
          excluding those notices that do not pertain to any part of
          the Derivative Works; and

      (d) If the Work includes a "NOTICE" text file as part of its
          distribution, then any Derivative Works that You distribute must
          include a readable copy of the attribution notices contained
          within such NOTICE file, excluding those notices that do not
          pertain to any part of the Derivative Works, in at least one
          of the following places: within a NOTICE text file distributed
          as part of the Derivative Works; within the Source form or
          documentation, if provided along with the Derivative Works; or,
          within a display generated by the Derivative Works, if and
          wherever such third-party notices normally appear. The contents
          of the NOTICE file are for informational purposes only and
          do not modify the License. You may add Your own attribution
          notices within Derivative Works that You distribute, alongside
          or as an addendum to the NOTICE text from the Work, provided
          that such additional attribution notices cannot be construed
          as modifying the License.

      You may add Your own copyright statement to Your modifications and
      may provide additional or different license terms and conditions
      for use, reproduction, or distribution of Your modifications, or
      for any such Derivative Works as a whole, provided Your use,
      reproduction, and distribution of the Work otherwise complies with
      the conditions stated in this License.

   5. Submission of Contributions. Unless You explicitly state otherwise,
      any Contribution intentionally submitted for inclusion in the Work
      by You to the Licensor shall be under the terms and conditions of
      this License, without any additional terms or conditions.
      Notwithstanding the above, nothing herein shall supersede or modify
      the terms of any separate license agreement you may have executed
      with Licensor regarding such Contributions.

   6. Trademarks. This License does not grant permission to use the trade
|
| 139 |
+
names, trademarks, service marks, or product names of the Licensor,
|
| 140 |
+
except as required for reasonable and customary use in describing the
|
| 141 |
+
origin of the Work and reproducing the content of the NOTICE file.
|
| 142 |
+
|
| 143 |
+
7. Disclaimer of Warranty. Unless required by applicable law or
|
| 144 |
+
agreed to in writing, Licensor provides the Work (and each
|
| 145 |
+
Contributor provides its Contributions) on an "AS IS" BASIS,
|
| 146 |
+
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
|
| 147 |
+
implied, including, without limitation, any warranties or conditions
|
| 148 |
+
of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A
|
| 149 |
+
PARTICULAR PURPOSE. You are solely responsible for determining the
|
| 150 |
+
appropriateness of using or redistributing the Work and assume any
|
| 151 |
+
risks associated with Your exercise of permissions under this License.
|
| 152 |
+
|
| 153 |
+
8. Limitation of Liability. In no event and under no legal theory,
|
| 154 |
+
whether in tort (including negligence), contract, or otherwise,
|
| 155 |
+
unless required by applicable law (such as deliberate and grossly
|
| 156 |
+
negligent acts) or agreed to in writing, shall any Contributor be
|
| 157 |
+
liable to You for damages, including any direct, indirect, special,
|
| 158 |
+
incidental, or consequential damages of any character arising as a
|
| 159 |
+
result of this License or out of the use or inability to use the
|
| 160 |
+
Work (including but not limited to damages for loss of goodwill,
|
| 161 |
+
work stoppage, computer failure or malfunction, or any and all
|
| 162 |
+
other commercial damages or losses), even if such Contributor
|
| 163 |
+
has been advised of the possibility of such damages.
|
| 164 |
+
|
| 165 |
+
9. Accepting Warranty or Additional Liability. While redistributing
|
| 166 |
+
the Work or Derivative Works thereof, You may choose to offer,
|
| 167 |
+
and charge a fee for, acceptance of support, warranty, indemnity,
|
| 168 |
+
or other liability obligations and/or rights consistent with this
|
| 169 |
+
License. However, in accepting such obligations, You may act only
|
| 170 |
+
on Your own behalf and on Your sole responsibility, not on behalf
|
| 171 |
+
of any other Contributor, and only if You agree to indemnify,
|
| 172 |
+
defend, and hold each Contributor harmless for any liability
|
| 173 |
+
incurred by, or claims asserted against, such Contributor by reason
|
| 174 |
+
of your accepting any such warranty or additional liability.
|
| 175 |
+
|
| 176 |
+
END OF TERMS AND CONDITIONS
|
| 177 |
+
|
| 178 |
+
APPENDIX: How to apply the Apache License to your work.
|
| 179 |
+
|
| 180 |
+
To apply the Apache License to your work, attach the following
|
| 181 |
+
boilerplate notice, with the fields enclosed by brackets "[]"
|
| 182 |
+
replaced with your own identifying information. (Don't include
|
| 183 |
+
the brackets!) The text should be enclosed in the appropriate
|
| 184 |
+
comment syntax for the file format. We also recommend that a
|
| 185 |
+
file or class name and description of purpose be included on the
|
| 186 |
+
same "printed page" as the copyright notice for easier
|
| 187 |
+
identification within third-party archives.
|
| 188 |
+
|
| 189 |
+
Copyright [yyyy] [name of copyright owner]
|
| 190 |
+
|
| 191 |
+
Licensed under the Apache License, Version 2.0 (the "License");
|
| 192 |
+
you may not use this file except in compliance with the License.
|
| 193 |
+
You may obtain a copy of the License at
|
| 194 |
+
|
| 195 |
+
http://www.apache.org/licenses/LICENSE-2.0
|
| 196 |
+
|
| 197 |
+
Unless required by applicable law or agreed to in writing, software
|
| 198 |
+
distributed under the License is distributed on an "AS IS" BASIS,
|
| 199 |
+
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
| 200 |
+
See the License for the specific language governing permissions and
|
| 201 |
+
limitations under the License.
|
big_vision/README.md
ADDED
@@ -0,0 +1,499 @@
# Big Vision

This codebase is designed for training large-scale vision models using
[Cloud TPU VMs](https://cloud.google.com/blog/products/compute/introducing-cloud-tpu-vms)
or GPU machines. It is based on the [Jax](https://github.com/google/jax)/[Flax](https://github.com/google/flax)
libraries, and uses [tf.data](https://www.tensorflow.org/guide/data) and
[TensorFlow Datasets](https://www.tensorflow.org/datasets) for scalable and
reproducible input pipelines.

The open-sourcing of this codebase has two main purposes:
1. Publishing the code of research projects developed in this codebase (see the
   list below).
2. Providing a strong starting point for running large-scale vision experiments
   on GPU machines and Google Cloud TPUs, which should scale seamlessly and
   out-of-the-box from a single TPU core to a distributed setup with up to 2048
   TPU cores.

`big_vision` aims to support research projects at Google. We are unlikely to
work on feature requests or accept external contributions, unless they were
pre-approved (ask in an issue first). For a well-supported transfer-only
codebase, see also [vision_transformer](https://github.com/google-research/vision_transformer).

Note that `big_vision` is quite a dynamic codebase and, while we intend to keep
the core code fully functional at all times, we cannot guarantee timely updates
of the project-specific code that lives in the `.../proj/...` subfolders.
However, we provide a [table](#project-specific-commits) with the last known
commits where specific projects were known to work.

The following research projects were originally conducted in the `big_vision`
codebase:

### Architecture research

- [An Image is Worth 16x16 Words: Transformers for Image Recognition at Scale](https://arxiv.org/abs/2010.11929), by
  Alexey Dosovitskiy*, Lucas Beyer*, Alexander Kolesnikov*, Dirk Weissenborn*,
  Xiaohua Zhai*, Thomas Unterthiner, Mostafa Dehghani, Matthias Minderer,
  Georg Heigold, Sylvain Gelly, Jakob Uszkoreit, and Neil Houlsby*
- [Scaling Vision Transformers](https://arxiv.org/abs/2106.04560), by
  Xiaohua Zhai*, Alexander Kolesnikov*, Neil Houlsby, and Lucas Beyer*\
  Resources: [config](big_vision/configs/proj/scaling_laws/train_vit_g.py).
- [How to train your ViT? Data, Augmentation, and Regularization in Vision Transformers](https://arxiv.org/abs/2106.10270), by
  Andreas Steiner*, Alexander Kolesnikov*, Xiaohua Zhai*, Ross Wightman,
  Jakob Uszkoreit, and Lucas Beyer*
- [MLP-Mixer: An all-MLP Architecture for Vision](https://arxiv.org/abs/2105.01601), by
  Ilya Tolstikhin*, Neil Houlsby*, Alexander Kolesnikov*, Lucas Beyer*,
  Xiaohua Zhai, Thomas Unterthiner, Jessica Yung, Andreas Steiner,
  Daniel Keysers, Jakob Uszkoreit, Mario Lucic, Alexey Dosovitskiy\
  Resources: [config](big_vision/configs/mlp_mixer_i1k.py).
- [Better plain ViT baselines for ImageNet-1k](https://arxiv.org/abs/2205.01580), by
  Lucas Beyer, Xiaohua Zhai, Alexander Kolesnikov\
  Resources: [config](big_vision/configs/vit_s16_i1k.py)
- [UViM: A Unified Modeling Approach for Vision with Learned Guiding Codes](https://arxiv.org/abs/2205.10337), by
  Alexander Kolesnikov^*, André Susano Pinto^*, Lucas Beyer*, Xiaohua Zhai*, Jeremiah Harmsen*, Neil Houlsby*\
  Resources: [readme](big_vision/configs/proj/uvim/README.md), [configs](big_vision/configs/proj/uvim), [colabs](big_vision/configs/proj/uvim).
- [FlexiViT: One Model for All Patch Sizes](https://arxiv.org/abs/2212.08013), by
  Lucas Beyer*, Pavel Izmailov*, Alexander Kolesnikov*, Mathilde Caron*, Simon
  Kornblith*, Xiaohua Zhai*, Matthias Minderer*, Michael Tschannen*, Ibrahim
  Alabdulmohsin*, Filip Pavetic*\
  Resources: [readme](big_vision/configs/proj/flexivit/README.md), [configs](big_vision/configs/proj/flexivit).
- [Dual PatchNorm](https://arxiv.org/abs/2302.01327), by Manoj Kumar, Mostafa Dehghani, Neil Houlsby.
- [Getting ViT in Shape: Scaling Laws for Compute-Optimal Model Design](https://arxiv.org/abs/2305.13035), by
  Ibrahim Alabdulmohsin*, Xiaohua Zhai*, Alexander Kolesnikov, Lucas Beyer*.
- (partial) [Scaling Vision Transformers to 22 Billion Parameters](https://arxiv.org/abs/2302.05442), by
  Mostafa Dehghani*, Josip Djolonga*, Basil Mustafa*, Piotr Padlewski*, Jonathan Heek*, *wow many middle authors*, Neil Houlsby*.
- (partial) [Finite Scalar Quantization: VQ-VAE Made Simple](https://arxiv.org/abs/2309.15505), by
  Fabian Mentzer, David Minnen, Eirikur Agustsson, Michael Tschannen.
- [GIVT: Generative Infinite-Vocabulary Transformers](https://arxiv.org/abs/2312.02116), by
  Michael Tschannen, Cian Eastwood, Fabian Mentzer.\
  Resources: [readme](big_vision/configs/proj/givt/README.md), [config](big_vision/configs/proj/givt/givt_imagenet2012.py), [colab](https://colab.research.google.com/github/google-research/big_vision/blob/main/big_vision/configs/proj/givt/givt_demo_colab.ipynb).
- [Unified Auto-Encoding with Masked Diffusion](https://arxiv.org/abs/2406.17688), by
  Philippe Hansen-Estruch, Sriram Vishwanath, Amy Zhang, Manan Tomar.

### Multimodal research

- [LiT: Zero-Shot Transfer with Locked-image Text Tuning](https://arxiv.org/abs/2111.07991), by
  Xiaohua Zhai*, Xiao Wang*, Basil Mustafa*, Andreas Steiner*, Daniel Keysers,
  Alexander Kolesnikov, and Lucas Beyer*\
  Resources: [trainer](big_vision/trainers/proj/image_text/contrastive.py), [config](big_vision/configs/proj/image_text/lit_coco.py), [colab](https://colab.research.google.com/github/google-research/big_vision/blob/main/big_vision/configs/proj/image_text/lit.ipynb).
- [Image-and-Language Understanding from Pixels Only](https://arxiv.org/abs/2212.08045), by
  Michael Tschannen, Basil Mustafa, Neil Houlsby\
  Resources: [readme](big_vision/configs/proj/clippo/README.md), [config](big_vision/configs/proj/clippo/train_clippo.py), [colab](https://colab.research.google.com/github/google-research/big_vision/blob/main/big_vision/configs/proj/clippo/clippo_colab.ipynb).
- [Sigmoid Loss for Language Image Pre-Training](https://arxiv.org/abs/2303.15343), by
  Xiaohua Zhai*, Basil Mustafa, Alexander Kolesnikov, Lucas Beyer*\
  Resources: [colab and models](https://colab.research.google.com/github/google-research/big_vision/blob/main/big_vision/configs/proj/image_text/SigLIP_demo.ipynb), code TODO.
- [A Study of Autoregressive Decoders for Multi-Tasking in Computer Vision](https://arxiv.org/abs/2303.17376), by
  Lucas Beyer*, Bo Wan*, Gagan Madan*, Filip Pavetic*, Andreas Steiner*, Alexander Kolesnikov, André Susano Pinto, Emanuele Bugliarello, Xiao Wang, Qihang Yu, Liang-Chieh Chen, Xiaohua Zhai*.
- [Image Captioners Are Scalable Vision Learners Too](https://arxiv.org/abs/2306.07915), by
  Michael Tschannen*, Manoj Kumar*, Andreas Steiner*, Xiaohua Zhai, Neil Houlsby, Lucas Beyer*.\
  Resources: [readme](big_vision/configs/proj/cappa/README.md), [config](big_vision/configs/proj/cappa/pretrain.py), [model](big_vision/models/proj/cappa/cappa.py).
- [Three Towers: Flexible Contrastive Learning with Pretrained Image Models](https://arxiv.org/abs/2305.16999), by Jannik Kossen, Mark Collier, Basil Mustafa, Xiao Wang, Xiaohua Zhai, Lucas Beyer, Andreas Steiner, Jesse Berent, Rodolphe Jenatton, Efi Kokiopoulou.
- (partial) [PaLI: A Jointly-Scaled Multilingual Language-Image Model](https://arxiv.org/abs/2209.06794), by Xi Chen, Xiao Wang, Soravit Changpinyo, *wow so many middle authors*, Anelia Angelova, Xiaohua Zhai, Neil Houlsby, Radu Soricut.
- (partial) [PaLI-3 Vision Language Models: Smaller, Faster, Stronger](https://arxiv.org/abs/2310.09199), by Xi Chen, Xiao Wang, Lucas Beyer, Alexander Kolesnikov, Jialin Wu, Paul Voigtlaender, Basil Mustafa, Sebastian Goodman, Ibrahim Alabdulmohsin, Piotr Padlewski, Daniel Salz, Xi Xiong, Daniel Vlasic, Filip Pavetic, Keran Rong, Tianli Yu, Daniel Keysers, Xiaohua Zhai, Radu Soricut.
- [LocCa](https://arxiv.org/abs/2403.19596), by
  Bo Wan, Michael Tschannen, Yongqin Xian, Filip Pavetic, Ibrahim Alabdulmohsin, Xiao Wang, André Susano Pinto, Andreas Steiner, Lucas Beyer, Xiaohua Zhai.
- [PaliGemma](https://arxiv.org/abs/2407.07726),
  [PaliGemma 2](https://arxiv.org/abs/2412.03555), by *wow many authors*.\
  Resources: [readme](big_vision/configs/proj/paligemma/README.md),
  [model](big_vision/models/proj/paligemma/paligemma.py),
  [transfer configs](big_vision/configs/proj/paligemma/transfers),
  [datasets](big_vision/datasets),
  [CountBenchQA](big_vision/datasets/countbenchqa/data/countbench_paired_questions.json).

### Training

- [Knowledge distillation: A good teacher is patient and consistent](https://arxiv.org/abs/2106.05237), by
  Lucas Beyer*, Xiaohua Zhai*, Amélie Royer*, Larisa Markeeva*, Rohan Anil,
  and Alexander Kolesnikov*\
  Resources: [README](big_vision/configs/proj/distill/README.md), [trainer](big_vision/trainers/proj/distill/distill.py), [colab](https://colab.research.google.com/drive/1nMykzUzsfQ_uAxfj3k35DYsATnG_knPl?usp=sharing).
- [Sharpness-Aware Minimization for Efficiently Improving Generalization](https://arxiv.org/abs/2010.01412), by
  Pierre Foret, Ariel Kleiner, Hossein Mobahi, Behnam Neyshabur
- [Surrogate Gap Minimization Improves Sharpness-Aware Training](https://arxiv.org/abs/2203.08065), by Juntang Zhuang, Boqing Gong, Liangzhe Yuan, Yin Cui, Hartwig Adam, Nicha Dvornek, Sekhar Tatikonda, James Duncan and Ting Liu\
  Resources: [trainer](big_vision/trainers/proj/gsam/gsam.py), [config](big_vision/configs/proj/gsam/vit_i1k_gsam_no_aug.py), [reproduced results](https://github.com/google-research/big_vision/pull/8#pullrequestreview-1078557411)
- [Tuning computer vision models with task rewards](https://arxiv.org/abs/2302.08242), by
  André Susano Pinto*, Alexander Kolesnikov*, Yuge Shi, Lucas Beyer, Xiaohua Zhai.
- (partial) [VeLO: Training Versatile Learned Optimizers by Scaling Up](https://arxiv.org/abs/2211.09760), by
  Luke Metz, James Harrison, C. Daniel Freeman, Amil Merchant, Lucas Beyer, James Bradbury, Naman Agrawal, Ben Poole, Igor Mordatch, Adam Roberts, Jascha Sohl-Dickstein.

### Misc

- [Are we done with ImageNet?](https://arxiv.org/abs/2006.07159), by
  Lucas Beyer*, Olivier J. Hénaff*, Alexander Kolesnikov*, Xiaohua Zhai*, Aäron van den Oord*.
- [No Filter: Cultural and Socioeconomic Diversity in Contrastive Vision-Language Models](https://arxiv.org/abs/2405.13777), by
  Angéline Pouget, Lucas Beyer, Emanuele Bugliarello, Xiao Wang, Andreas Peter Steiner, Xiaohua Zhai, Ibrahim Alabdulmohsin.

# Codebase high-level organization and principles in a nutshell

The main entry point is a trainer module, which typically does all the
boilerplate related to creating a model and an optimizer, loading the data,
checkpointing and training/evaluating the model inside a loop. We provide the
canonical trainer `train.py` in the root folder. Normally, individual projects
within `big_vision` fork and customize this trainer.

All models, evaluators and preprocessing operations live in the corresponding
subdirectories and can often be reused between different projects. We encourage
compatible APIs within these directories to facilitate reusability, but this is
not strictly enforced, as individual projects may need to introduce their own
custom APIs.

We have a powerful configuration system, with the configs living in the
`configs/` directory. Custom trainers and modules can directly extend/modify
the configuration options.
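
For illustration, here is a minimal sketch of the pattern such a config file follows (this is not a file from this repo, and the field names are placeholders); any field defined this way can additionally be overridden at launch time with command-line flags such as `--config.lr=0.03`:

```
# Minimal illustrative sketch of a big_vision-style config file; the specific
# fields below are placeholders, not taken from an actual config in this repo.
import ml_collections as mlc


def get_config():
  config = mlc.ConfigDict()
  config.total_epochs = 90
  config.lr = 0.01  # Can be overridden at launch, e.g. --config.lr=0.03.
  return config
```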

Project-specific code resides in the `.../proj/...` namespace. It is not always
possible to keep the project-specific code in sync with the core `big_vision`
libraries. Below we provide the [last known commit](#project-specific-commits)
for each project where the project code is expected to work.

Training jobs are robust to interruptions and will resume seamlessly from the
last saved checkpoint (assuming a user provides the correct `--workdir` path).

Each configuration file contains a comment at the top with a `COMMAND` snippet
to run it, and some hint of expected runtime and results. See below for more
details, but generally speaking, running on a GPU machine involves calling
`python -m COMMAND`, while running on TPUs, including multi-host, involves

```
gcloud compute tpus tpu-vm ssh $NAME --zone=$ZONE --worker=all --command "bash big_vision/run_tpu.sh COMMAND"
```

See the instructions below for more details on how to run `big_vision` code on a
GPU machine or Google Cloud TPU.

By default we write checkpoints and logfiles. The logfiles are a list of JSON
objects, and we provide a short and straightforward [example colab to read
and display the logs and checkpoints](https://colab.research.google.com/drive/1R_lvV542WUp8Q2y8sbyooZOGCplkn7KI?usp=sharing).
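
If you prefer plain Python over the colab, a minimal sketch for loading such a logfile is shown below; it assumes the common one-JSON-object-per-line layout, and the filename is a placeholder for whatever logfile your `--workdir` contains:

```
# Minimal sketch: read a logfile written as one JSON object per line.
# The path below is a placeholder, not a fixed big_vision filename.
import json

with open('workdir/logfile.jsonl') as f:
  records = [json.loads(line) for line in f if line.strip()]

print(f'{len(records)} log entries; first entry: {records[0] if records else None}')
```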

# Current and future contents

The first release contains the core part of pre-training, transferring, and
evaluating classification models at scale on Cloud TPU VMs.

We have since added the following key features and projects:
- Contrastive Image-Text model training and evaluation as in LiT and CLIP.
- Patient and consistent distillation.
- Scaling ViT.
- MLP-Mixer.
- UViM.

Features and projects we plan to release in the near future, in no particular
order:
- ImageNet-21k in TFDS.
- Loading misc public models used in our publications (NFNet, MoCov3, DINO).
- Memory-efficient Polyak-averaging implementation.
- Advanced JAX compute and memory profiling. We are using internal tools for
  this, but may eventually add support for the publicly available ones.

We will continue releasing code of our future publications developed within
`big_vision` here.

### Non-content

The following exist in the internal variant of this codebase, and there is no
plan for their release:
- Regular regression tests for both quality and speed. They rely heavily on
  internal infrastructure.
- Advanced logging, monitoring, and plotting of experiments. This also relies
  heavily on internal infrastructure. However, we are open to ideas on this
  and may add some in the future, especially if implemented in a
  self-contained manner.
- Not yet published, ongoing research projects.


# GPU Setup

We first discuss how to set up and run `big_vision` on a (local) GPU machine,
and then discuss the setup for Cloud TPUs. Note that the data preparation step
for the (local) GPU setup can be largely reused for the Cloud TPU setup. While
the instructions skip this for brevity, we highly recommend using a
[virtual environment](https://docs.python.org/3/library/venv.html) when
installing python dependencies.

## Setting up python packages

The first step is to checkout `big_vision` and install the relevant python
dependencies:

```
git clone https://github.com/google-research/big_vision
cd big_vision/
pip3 install --upgrade pip
pip3 install -r big_vision/requirements.txt
```

The latest version of the `jax` library can be fetched as

```
pip3 install --upgrade "jax[cuda]" -f https://storage.googleapis.com/jax-releases/jax_cuda_releases.html
```

You may need a different `jax` package, depending on the CUDA and cuDNN libraries
installed on your machine. Please consult the
[official jax documentation](https://github.com/google/jax#pip-installation-gpu-cuda)
for more information.

## Preparing tfds data

For unified and reproducible access to standard datasets we opted to use the
`tensorflow_datasets` (`tfds`) library. It requires each dataset to be
downloaded, preprocessed and then stored on a hard drive (or, if you use
Google Cloud, preferably stored in a GCP bucket).

Many datasets can be downloaded and preprocessed automatically when used
for the first time. Nevertheless, we intentionally disable this feature and
recommend doing the dataset preparation step separately, ahead of the first run.
It will make debugging easier if problems arise, and some datasets, like
`imagenet2012`, require manually downloaded data.

Most of the datasets, e.g. `cifar100`, `oxford_iiit_pet` or `imagenet_v2`,
can be fully automatically downloaded and prepared by running

```
cd big_vision/
python3 -m big_vision.tools.download_tfds_datasets cifar100 oxford_iiit_pet imagenet_v2
```

A full list of datasets is available at [this link](https://www.tensorflow.org/datasets/catalog/overview#all_datasets).

Some datasets, like `imagenet2012` or `imagenet2012_real`, require the data to
be downloaded manually and placed into `$TFDS_DATA_DIR/downloads/manual/`,
which defaults to `~/tensorflow_datasets/downloads/manual/`. For example, for
`imagenet2012` and `imagenet2012_real` one needs to place the official
`ILSVRC2012_img_train.tar` and `ILSVRC2012_img_val.tar` files in that directory
and then run
`python3 -m big_vision.tools.download_tfds_datasets imagenet2012 imagenet2012_real`
(which may take ~1 hour).

If you use Google Cloud, and TPUs in particular, you can then upload
the preprocessed data (stored in `$TFDS_DATA_DIR`) to a
Google Cloud bucket and use the bucket on any of your (TPU) virtual
machines to access the data.

## Running on a GPU machine

Finally, after installing all python dependencies and preparing `tfds` data,
you can run the job using a config of your choice, e.g. to train the `ViT-S/16`
model on ImageNet data, run the following command:

```
python3 -m big_vision.train --config big_vision/configs/vit_s16_i1k.py --workdir workdirs/`date '+%m-%d_%H%M'`
```

or to train MLP-Mixer-B/16, run (note the `gpu8` config param that reduces the default batch size and epoch count):

```
python3 -m big_vision.train --config big_vision/configs/mlp_mixer_i1k.py:gpu8 --workdir workdirs/`date '+%m-%d_%H%M'`
```

# Cloud TPU VM setup

## Create TPU VMs

To create a single machine with 8 TPU cores, follow this Cloud TPU JAX
document:
https://cloud.google.com/tpu/docs/run-calculation-jax

To support large-scale vision research, more cores with multiple hosts are
recommended. Below we provide instructions on how to do it.

First, create some useful variables, which will be reused:

```
export NAME=<a name of the TPU deployment, e.g. my-tpu-machine>
export ZONE=<GCP geographical zone, e.g. europe-west4-a>
export GS_BUCKET_NAME=<Name of the storage bucket, e.g. my_bucket>
```

The following command line will create TPU VMs with 32 cores spread across
4 hosts.

```
gcloud compute tpus tpu-vm create $NAME --zone $ZONE --accelerator-type v3-32 --version tpu-ubuntu2204-base
```

## Install `big_vision` on TPU VMs

Fetch the `big_vision` repository, copy it to all TPU VM hosts, and install
dependencies.

```
git clone https://github.com/google-research/big_vision
gcloud compute tpus tpu-vm scp --recurse big_vision/big_vision $NAME: --zone=$ZONE --worker=all
gcloud compute tpus tpu-vm ssh $NAME --zone=$ZONE --worker=all --command "bash big_vision/run_tpu.sh"
```

## Download and prepare TFDS datasets

We recommend preparing `tfds` data locally as described above and then uploading
the data to a Google Cloud bucket. However, if you prefer, the datasets which
do not require manual downloads can be prepared automatically using a TPU
machine as described below. Note that TPU machines have only 100 GB of disk
space, and multihost TPU slices do not allow external disks to be attached
in write mode, so the instructions below may not work for preparing large
datasets. As yet another alternative, we provide instructions
[on how to prepare `tfds` data on a standalone CPU-only GCP machine](#preparing-tfds-data-on-a-standalone-gcp-cpu-machine).

Specifically, the seven TFDS datasets used during evaluations will be generated
under `~/tensorflow_datasets` on the TPU machine with this command:

```
gcloud compute tpus tpu-vm ssh $NAME --zone=$ZONE --worker=0 --command "TFDS_DATA_DIR=~/tensorflow_datasets bash big_vision/run_tpu.sh big_vision.tools.download_tfds_datasets cifar10 cifar100 oxford_iiit_pet oxford_flowers102 cars196 dtd uc_merced"
```

You can then copy the datasets to the GS bucket, to make them accessible to all TPU workers.

```
gcloud compute tpus tpu-vm ssh $NAME --zone=$ZONE --worker=0 --command "rm -r ~/tensorflow_datasets/downloads && gsutil cp -r ~/tensorflow_datasets gs://$GS_BUCKET_NAME"
```

If you want to integrate other public or custom datasets, e.g. `imagenet2012`,
please follow [the official guideline](https://www.tensorflow.org/datasets/catalog/overview).

## Pre-trained models

For the full list of pre-trained models check out the `load` function defined in
the same module as the model code. And for an example config on how to use these
models, see `configs/transfer.py`.

## Run the transfer script on TPU VMs

The following command line fine-tunes a pre-trained `vit-i21k-augreg-b/32` model
on the `cifar10` dataset.

```
gcloud compute tpus tpu-vm ssh $NAME --zone=$ZONE --worker=all --command "TFDS_DATA_DIR=gs://$GS_BUCKET_NAME/tensorflow_datasets bash big_vision/run_tpu.sh big_vision.train --config big_vision/configs/transfer.py:model=vit-i21k-augreg-b/32,dataset=cifar10,crop=resmall_crop --workdir gs://$GS_BUCKET_NAME/big_vision/workdir/`date '+%m-%d_%H%M'` --config.lr=0.03"
```

## Run the train script on TPU VMs

To train your own big_vision models on a large dataset,
e.g. `imagenet2012` ([prepare the TFDS dataset](https://www.tensorflow.org/datasets/catalog/imagenet2012)),
run the following command line.

```
gcloud compute tpus tpu-vm ssh $NAME --zone=$ZONE --worker=all --command "TFDS_DATA_DIR=gs://$GS_BUCKET_NAME/tensorflow_datasets bash big_vision/run_tpu.sh big_vision.train --config big_vision/configs/bit_i1k.py --workdir gs://$GS_BUCKET_NAME/big_vision/workdir/`date '+%m-%d_%H%M'`"
```

## FSDP training

`big_vision` supports flexible parameter and model sharding strategies.
Currently, we support popular FSDP sharding via a simple config change, see [this config example](big_vision/configs/transfer.py).
For example, to run FSDP finetuning of a pretrained ViT-L model, run the following command (possibly adjusting the batch size depending on your hardware):

```
gcloud compute tpus tpu-vm ssh $NAME --zone=$ZONE --worker=all --command "TFDS_DATA_DIR=gs://$GS_BUCKET_NAME/tensorflow_datasets bash big_vision/run_tpu.sh big_vision.train --config big_vision/configs/transfer.py:model=vit-i21k-augreg-l/16,dataset=oxford_iiit_pet,crop=resmall_crop,fsdp=True,batch_size=256 --workdir gs://$GS_BUCKET_NAME/big_vision/workdir/`date '+%m-%d_%H%M'` --config.lr=0.03"
```

## Image-text training with SigLIP

A minimal example that uses public `coco` captions data:

```
gcloud compute tpus tpu-vm ssh $NAME --zone=$ZONE --worker=all --command "TFDS_DATA_DIR=gs://$GS_BUCKET_NAME/tensorflow_datasets bash big_vision/run_tpu.sh big_vision.trainers.proj.image_text.siglip --config big_vision/configs/proj/image_text/siglip_lit_coco.py --workdir gs://$GS_BUCKET_NAME/big_vision/`date '+%Y-%m-%d_%H%M'`"
```

## Sometimes useful gcloud commands

- Destroy the TPU machines: `gcloud compute tpus tpu-vm delete $NAME --zone $ZONE`
- Remove all big_vision-related folders on all hosts: `gcloud compute tpus tpu-vm ssh $NAME --zone $ZONE --worker=all --command 'rm -rf ~/big_vision ~/bv_venv'`

## Preparing `tfds` data on a standalone GCP CPU machine

First create a new machine and a disk (feel free to adjust the exact machine type and disk settings/capacity):

```
export NAME_CPU_HOST=<A name of a CPU-only machine>
export NAME_DISK=<A name of a disk>
gcloud compute instances create $NAME_CPU_HOST --machine-type c3-standard-22 --zone $ZONE --image-family ubuntu-2204-lts --image-project ubuntu-os-cloud
gcloud compute disks create $NAME_DISK --size 1000GB --zone $ZONE --type pd-balanced
```

Now attach the disk to the newly created machine:

```
gcloud compute instances attach-disk $NAME_CPU_HOST --disk $NAME_DISK --zone $ZONE
```

Next, `ssh` into the machine (`gcloud compute ssh $NAME_CPU_HOST --zone=$ZONE`) and
[follow the instructions to format and mount the disk](https://cloud.google.com/compute/docs/disks/format-mount-disk-linux).
Let's assume it was mounted to `/mnt/disks/tfds`.

Almost there, now clone and set up `big_vision`:

```
gcloud compute ssh $NAME_CPU_HOST --zone=$ZONE --command "git clone https://github.com/google-research/big_vision.git && cd big_vision && sh big_vision/run_tpu.sh"
```

Finally, prepare the dataset (e.g. `coco_captions`) using the utility script and
copy the result to your Google Cloud bucket:

```
gcloud compute ssh $NAME_CPU_HOST --zone=$ZONE --command "cd big_vision && TFDS_DATA_DIR=/mnt/disks/tfds/tensorflow_datasets bash big_vision/run_tpu.sh big_vision.tools.download_tfds_datasets coco_captions"
gcloud compute ssh $NAME_CPU_HOST --zone=$ZONE --command "rm -rf /mnt/disks/tfds/tensorflow_datasets/downloads && gsutil cp -r /mnt/disks/tfds/tensorflow_datasets gs://$GS_BUCKET_NAME"
```


# ViT baseline

We provide a well-tuned ViT-S/16 baseline in the config file named
`vit_s16_i1k.py`. It achieves 76.5% accuracy on the ImageNet validation split in
90 epochs of training, being a strong and simple starting point for research
on ViT models.

Please see our [arXiv note](https://arxiv.org/abs/2205.01580) for more details
and, if this baseline happens to be useful for your research, consider citing

```
@article{vit_baseline,
  url = {https://arxiv.org/abs/2205.01580},
  author = {Beyer, Lucas and Zhai, Xiaohua and Kolesnikov, Alexander},
  title = {Better plain ViT baselines for ImageNet-1k},
  journal = {arXiv preprint arXiv:2205.01580},
  year = {2022},
}
```

# Project specific commits

The last known commit where the specific project code is expected to work. The
core code and configs are expected to work at head.

| Project    | Commit                                                                                         |
|------------|------------------------------------------------------------------------------------------------|
| UViM       | https://github.com/google-research/big_vision/commit/21bd6ebe253f070f584d8b777ad76f4abce51bef |
| image_text | https://github.com/google-research/big_vision/commit/8921d5141504390a8a4f7b2dacb3b3c042237290 |
| distill    | https://github.com/google-research/big_vision/commit/2f3f493af048dbfd97555ff6060f31a0e686f17f |
| GSAM       | WIP                                                                                            |
| CLIPPO     | https://github.com/google-research/big_vision/commit/fd2d3bd2efc9d89ea959f16cd2f58ae8a495cd44 |
| CapPa      | https://github.com/google-research/big_vision/commit/7ace659452dee4b68547575352c022a2eef587a5 |
| GIVT       | https://github.com/google-research/big_vision/commit/0cb70881dd33b3343b769347dc19793c4994b8cb |

# Citing the codebase

If you found this codebase useful for your research, please consider using
the following BibTeX entry to cite it:

```
@misc{big_vision,
  author = {Beyer, Lucas and Zhai, Xiaohua and Kolesnikov, Alexander},
  title = {Big Vision},
  year = {2022},
  publisher = {GitHub},
  journal = {GitHub repository},
  howpublished = {\url{https://github.com/google-research/big_vision}}
}
```

# Disclaimer

This is not an official Google Product.

# License

Unless explicitly noted otherwise, everything in the big_vision codebase
(including models and colabs) is released under the Apache2 license.
See the LICENSE file for the full license text.
big_vision/__init__.py
ADDED
File without changes

big_vision/__pycache__/__init__.cpython-310.pyc
ADDED
Binary file (179 Bytes)

big_vision/__pycache__/utils.cpython-310.pyc
ADDED
Binary file (52.6 kB)

big_vision/configs/__init__.py
ADDED
File without changes
big_vision/configs/bit_i1k.py
ADDED
@@ -0,0 +1,102 @@
# Copyright 2024 Big Vision Authors.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

# pylint: disable=line-too-long
r"""Pre-training BiT on ILSVRC-2012 as in https://arxiv.org/abs/1912.11370

Run training of a BiT-ResNet-50x1 variant, which takes ~32min on v3-128:

big_vision.train \
    --config big_vision/configs/bit_i1k.py \
    --workdir gs://[your_bucket]/big_vision/`date '+%m-%d_%H%M'` \
    --config.model.depth 50 --config.model.width 1
"""

# from big_vision.configs.common_fewshot import get_fewshot_lsr
import ml_collections as mlc


def get_config(runlocal=False):
  """Config for training on ImageNet-1k."""
  config = mlc.ConfigDict()

  config.seed = 0
  config.total_epochs = 90
  config.num_classes = 1000
  config.loss = 'softmax_xent'

  config.input = dict()
  config.input.data = dict(
      name='imagenet2012',
      split='train[:99%]',
  )
  config.input.batch_size = 4096
  config.input.cache_raw = True  # Needs up to 120GB of RAM!
  config.input.shuffle_buffer_size = 250_000  # Per host.

  pp_common = '|onehot(1000, key="{lbl}", key_result="labels")'
  pp_common += '|value_range(-1, 1)|keep("image", "labels")'
  config.input.pp = 'decode_jpeg_and_inception_crop(224)|flip_lr' + pp_common.format(lbl='label')
  pp_eval = 'decode|resize_small(256)|central_crop(224)' + pp_common

  config.log_training_steps = 50
  config.ckpt_steps = 1000

  # Model section
  config.model_name = 'bit'
  config.model = dict(
      depth=50,  # You can also pass e.g. [3, 5, 10, 2]
      width=1.0,
  )

  # Optimizer section
  config.optax_name = 'big_vision.momentum_hp'
  config.grad_clip_norm = 1.0

  # linear scaling rule. Don't forget to sweep if sweeping batch_size.
  config.wd = (1e-4 / 256) * config.input.batch_size
  config.lr = (0.1 / 256) * config.input.batch_size
  config.schedule = dict(decay_type='cosine', warmup_steps=1000)

  # Eval section
  def get_eval(split, dataset='imagenet2012'):
    return dict(
        type='classification',
        data=dict(name=dataset, split=split),
        pp_fn=pp_eval.format(lbl='label'),
        loss_name=config.loss,
        log_steps=1000,  # Very fast O(seconds) so it's fine to run it often.
        cache='final_data',
    )
  config.evals = {}
  config.evals.train = get_eval('train[:2%]')
  config.evals.minival = get_eval('train[99%:]')
  config.evals.val = get_eval('validation')
  config.evals.v2 = get_eval('test', dataset='imagenet_v2')
  config.evals.real = get_eval('validation', dataset='imagenet2012_real')
  config.evals.real.pp_fn = pp_eval.format(lbl='real_label')

  # config.evals.fewshot = get_fewshot_lsr(runlocal=runlocal)
  # config.evals.fewshot.log_steps = 1000

  if runlocal:
    config.input.batch_size = 32
    config.input.cache_raw = False
    config.input.shuffle_buffer_size = 100

    local_eval = config.evals.val
    config.evals = {'val': local_eval}
    config.evals.val.cache = 'none'

  return config
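
As a quick sanity check of the linear scaling rule used in the optimizer section above, the defaults resolve as follows (illustrative arithmetic only, not code from the repo):

```
# Linear scaling rule from bit_i1k.py, evaluated at the default batch size.
batch_size = 4096
lr = (0.1 / 256) * batch_size    # = 1.6
wd = (1e-4 / 256) * batch_size   # = 0.0016
print(lr, wd)
```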
big_vision/configs/bit_i21k.py
ADDED
@@ -0,0 +1,85 @@
# Copyright 2024 Big Vision Authors.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

# pylint: disable=line-too-long
r"""A config for pre-training BiT on ImageNet-21k.

This config relies on the Imagenet-21k tfds dataset, which is not yet
available publicly in TFDS. We intend to add the dataset to public TFDS soon,
and this config will then be runnable.
"""

from big_vision.configs.common_fewshot import get_fewshot_lsr
import ml_collections as mlc


def get_config():
  """Config for training on imagenet-21k."""
  config = mlc.ConfigDict()

  config.seed = 0
  config.total_epochs = 90
  config.num_classes = 21843
  config.init_head_bias = -10.0
  config.loss = 'sigmoid_xent'

  config.input = dict()
  config.input.data = dict(
      name='imagenet21k',
      split='full[51200:]',
  )
  config.input.batch_size = 4096
  config.input.shuffle_buffer_size = 250_000  # Per host, so small-ish is ok.

  pp_common = '|value_range(-1, 1)|onehot({onehot_args})|keep("image", "labels")'
  pp_common_i21k = pp_common.format(onehot_args=f'{config.num_classes}')
  pp_common_i1k = pp_common.format(onehot_args='1000, key="label", key_result="labels"')
  config.input.pp = 'decode_jpeg_and_inception_crop(224)|flip_lr' + pp_common_i21k
  pp_eval = 'decode|resize_small(256)|central_crop(224)'

  config.log_training_steps = 50
  config.ckpt_steps = 1000

  # Model section
  config.model_name = 'bit_paper'
  config.model = dict(depth=50, width=1.0)

  # Optimizer section
  config.optax_name = 'big_vision.momentum_hp'
  config.grad_clip_norm = 1.0

  # linear scaling rule. Don't forget to sweep if sweeping batch_size.
  config.lr = (0.03 / 256) * config.input.batch_size
  config.wd = (3e-5 / 256) * config.input.batch_size
  config.schedule = dict(decay_type='cosine', warmup_steps=5000)

  # Evaluations on i21k itself.
  def eval_i21k(split):
    return dict(
        type='classification',
        data={**config.input.data, 'split': split},
        pp_fn=pp_eval + pp_common_i21k,
        loss_name=config.loss,
        log_steps=1000,  # Very fast O(seconds) so it's fine to run it often.
    )
  config.evals = {}
  config.evals.test = eval_i21k('full[:25_600]')
  config.evals.val = eval_i21k('full[25_600:51_200]')
  config.evals.train = eval_i21k('full[51_200:76_800]')

  # Few-shot evaluators
  config.evals.fewshot = get_fewshot_lsr()
  config.evals.fewshot.log_steps = 25_000

  return config
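
One way to read the `init_head_bias = -10.0` setting above, paired with the `sigmoid_xent` loss: it starts every class logit low enough that the initial per-class probability is close to the base rate of a roughly 21k-way problem. A small illustrative check (not code from the repo):

```
# sigmoid(-10) ~ 4.5e-5, close to the 1/21843 ~ 4.6e-5 per-class base rate,
# so the head starts out predicting roughly the prior probability per class.
import math

base_rate = 1 / 21843
init_prob = 1 / (1 + math.exp(10))  # sigmoid(-10)
print(f'{base_rate:.2e} vs {init_prob:.2e}')
```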
big_vision/configs/common.py
ADDED
@@ -0,0 +1,188 @@
# Copyright 2024 Big Vision Authors.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

"""A few things commonly used across A LOT of config files."""

import string

import ml_collections as mlc


def input_for_quicktest(config_input, quicktest):
  if quicktest:
    config_input.batch_size = 8
    config_input.shuffle_buffer_size = 10
    config_input.cache_raw = False


def parse_arg(arg, lazy=False, **spec):
  """Makes ConfigDict's get_config single-string argument more usable.

  Example use in the config file:

    import big_vision.configs.common as bvcc
    def get_config(arg):
      arg = bvcc.parse_arg(arg,
          res=(224, int),
          runlocal=False,
          schedule='short',
      )

      # ...

      config.shuffle_buffer = 250_000 if not arg.runlocal else 50

  Ways that values can be passed when launching:

    --config amazing.py:runlocal,schedule=long,res=128
    --config amazing.py:res=128
    --config amazing.py:runlocal  # A boolean needs no value for "true".
    --config amazing.py:runlocal=False  # Explicit false boolean.
    --config amazing.py:128  # The first spec entry may be passed unnamed alone.

  Uses strict bool conversion (converting 'True', 'true' to True, and 'False',
  'false', '' to False).

  Args:
    arg: the string argument that's passed to get_config.
    lazy: allow lazy parsing of arguments, which are not in spec. For these,
      the type is auto-extracted in dependence of most complex possible type.
    **spec: the name and default values of the expected options.
      If the value is a tuple, the value's first element is the default value,
      and the second element is a function called to convert the string.
      Otherwise the type is automatically extracted from the default value.

  Returns:
    ConfigDict object with extracted type-converted values.
  """
  # Normalize arg and spec layout.
  arg = arg or ''  # Normalize None to empty string
  spec = {k: get_type_with_default(v) for k, v in spec.items()}

  result = mlc.ConfigDict(type_safe=False)  # For convenient dot-access only.

  # Expand convenience-cases for a single parameter without = sign.
  if arg and ',' not in arg and '=' not in arg:
    # (think :runlocal) If it's the name of sth in the spec (or there is no
    # spec), it's that in bool.
    if arg in spec or not spec:
      arg = f'{arg}=True'
    # Otherwise, it is the value for the first entry in the spec.
    else:
      arg = f'{list(spec.keys())[0]}={arg}'
      # Yes, we rely on Py3.7 insertion order!

  # Now, expand the `arg` string into a dict of keys and values:
  raw_kv = {raw_arg.split('=')[0]:
                raw_arg.split('=', 1)[-1] if '=' in raw_arg else 'True'
            for raw_arg in arg.split(',') if raw_arg}

  # And go through the spec, using provided or default value for each:
  for name, (default, type_fn) in spec.items():
    val = raw_kv.pop(name, None)
    result[name] = type_fn(val) if val is not None else default

  if raw_kv:
    if lazy:  # Process args which are not in spec.
      for k, v in raw_kv.items():
        result[k] = autotype(v)
    else:
      raise ValueError(f'Unhandled config args remain: {raw_kv}')

  return result


def get_type_with_default(v):
  """Returns (v, string_to_v_type) with lenient bool parsing."""
  # For bool, do safe string conversion.
  if isinstance(v, bool):
    def strict_bool(x):
      assert x.lower() in {'true', 'false', ''}
      return x.lower() == 'true'
    return (v, strict_bool)
  # If already a (default, type) tuple, use that.
  if isinstance(v, (tuple, list)):
    assert len(v) == 2 and isinstance(v[1], type), (
        'List or tuple types are currently not supported because we use `,` as'
        ' dumb delimiter. Contributions (probably using ast) welcome. You can'
        ' unblock by using a string with eval(s.replace(";", ",")) or similar')
    return (v[0], v[1])
  # Otherwise, derive the type from the default value.
  return (v, type(v))


def autotype(x):
  """Auto-converts string to bool/int/float if possible."""
  assert isinstance(x, str)
  if x.lower() in {'true', 'false'}:
    return x.lower() == 'true'  # Returns as bool.
  try:
    return int(x)  # Returns as int.
  except ValueError:
    try:
      return float(x)  # Returns as float.
    except ValueError:
      return x  # Returns as str.


def pack_arg(**kw):
  """Packs key-word args as a string to be parsed by `parse_arg()`."""
  for v in kw.values():
    assert ',' not in f'{v}', f"Can't use `,` in config_arg value: {v}"
  return ','.join([f'{k}={v}' for k, v in kw.items()])


def arg(**kw):
  """Use like `add(**bvcc.arg(res=256, foo=bar), lr=0.1)` to pass config_arg."""
  return {'config_arg': pack_arg(**kw), **kw}


def _get_field_ref(config_dict, field_name):
  path = field_name.split('.')
  for field in path[:-1]:
    config_dict = getattr(config_dict, field)
config_dict = getattr(config_dict, field)
|
| 155 |
+
return config_dict.get_ref(path[-1])
|
| 156 |
+
|
| 157 |
+
|
| 158 |
+
def format_str(format_string, config):
|
| 159 |
+
"""Format string with reference fields from config.
|
| 160 |
+
|
| 161 |
+
This makes it easy to build preprocess strings that contain references to
|
| 162 |
+
fields tha are edited after. E.g.:
|
| 163 |
+
|
| 164 |
+
```
|
| 165 |
+
config = mlc.ConficDict()
|
| 166 |
+
config.res = (256, 256)
|
| 167 |
+
config.pp = bvcc.format_str('resize({res})', config)
|
| 168 |
+
...
|
| 169 |
+
# if config.res is modified (e.g. via sweeps) it will propagate to pp field:
|
| 170 |
+
config.res = (512, 512)
|
| 171 |
+
assert config.pp == 'resize((512, 512))'
|
| 172 |
+
```
|
| 173 |
+
|
| 174 |
+
Args:
|
| 175 |
+
format_string: string to format with references.
|
| 176 |
+
config: ConfigDict to get references to format the string.
|
| 177 |
+
|
| 178 |
+
Returns:
|
| 179 |
+
A reference field which renders a string using references to config fields.
|
| 180 |
+
"""
|
| 181 |
+
output = ''
|
| 182 |
+
parts = string.Formatter().parse(format_string)
|
| 183 |
+
for (literal_text, field_name, format_spec, conversion) in parts:
|
| 184 |
+
assert not format_spec and not conversion
|
| 185 |
+
output += literal_text
|
| 186 |
+
if field_name:
|
| 187 |
+
output += _get_field_ref(config, field_name).to_str()
|
| 188 |
+
return output
|
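A minimal sketch (not part of the diff) of how the launcher-side helpers above fit together; the option names and values are made up for illustration:

```python
# parse_arg() turns the single get_config string into a typed ConfigDict,
# and pack_arg() builds such a string back from keyword arguments.
import big_vision.configs.common as bvcc

cfg = bvcc.parse_arg('runlocal,res=128',
                     res=(224, int), runlocal=False, schedule='short')
assert (cfg.res, cfg.runlocal, cfg.schedule) == (128, True, 'short')

# Round-trip through pack_arg, e.g. when one config programmatically
# includes another one.
assert bvcc.parse_arg(bvcc.pack_arg(res=192), res=(224, int)).res == 192
```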
big_vision/configs/common_fewshot.py
ADDED
|
@@ -0,0 +1,60 @@
# Copyright 2024 Big Vision Authors.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

"""Most common few-shot eval configuration."""

import ml_collections as mlc


def get_fewshot_lsr(target_resolution=224, resize_resolution=256,
                    runlocal=False, pp=None, **kw):
  """Returns a standard-ish fewshot eval configuration."""
  kw.setdefault('representation_layer', 'pre_logits')
  kw.setdefault('shots', (1, 5, 10, 25))
  kw.setdefault('l2_reg', 2.0 ** 10)
  kw.setdefault('num_seeds', 3)
  kw.setdefault('prefix', '')  # No prefix as we already use a/ z/ and zz/

  # Backward-compatible default:
  if not any(f'log_{x}' in kw for x in ['steps', 'percent', 'examples', 'epochs']):  # pylint: disable=line-too-long
    kw['log_steps'] = 25_000

  config = mlc.ConfigDict(kw)
  config.type = 'fewshot_lsr'
  config.datasets = {
      'caltech': ('caltech101', 'train', 'test'),  # copybara:strip
      'cars': ('cars196:2.1.0', 'train', 'test'),
      'cifar100': ('cifar100', 'train', 'test'),
      'dtd': ('dtd', 'train', 'test'),
      # The first 65000 ImageNet samples have at least 30 shots per any class.
      # Commented out by default because needs manual download.
      # 'imagenet': ('imagenet2012', 'train[:65000]', 'validation'),
      'pets': ('oxford_iiit_pet', 'train', 'test'),
      'uc_merced': ('uc_merced', 'train[:1000]', 'train[1000:]'),
  } if not runlocal else {
      'pets': ('oxford_iiit_pet', 'train', 'test'),
  }

  pp = pp or '|'.join([
      'decode',
      f'resize({resize_resolution})',
      f'central_crop({target_resolution})',
      'value_range(-1,1)'
  ])
  pp += '|keep("image", "label")'
  config.pp_train = pp
  config.pp_eval = pp
  config.display_first = [('imagenet', 10)] if not runlocal else [('pets', 10)]

  return config
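A hedged usage sketch (not from the diff): how a training config can attach the evaluator above; the shot counts are illustrative only.

```python
import ml_collections as mlc
from big_vision.configs.common_fewshot import get_fewshot_lsr

config = mlc.ConfigDict()
config.evals = dict(fewshot=get_fewshot_lsr(runlocal=True, shots=(1, 5)))
assert config.evals.fewshot.type == 'fewshot_lsr'
assert list(config.evals.fewshot.datasets) == ['pets']  # runlocal keeps only pets.
```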
big_vision/configs/load_and_eval.py
ADDED
|
@@ -0,0 +1,143 @@
# Copyright 2024 Big Vision Authors.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

# pytype: disable=not-writable,attribute-error
# pylint: disable=line-too-long,missing-function-docstring
r"""A config to load and eval key models using the core train.py.

The runtime varies widely depending on the model, but each one should reproduce
the corresponding paper's numbers.
This configuration makes use of the "arg" to get_config to select which model
to run, so a few examples are given below:

Run and evaluate a BiT-M ResNet-50x1 model that was transferred to i1k:

big_vision.train \
    --config big_vision/configs/load_and_eval.py:name=bit_paper,batch_size=8 \
    --config.model_init M-imagenet2012 --config.model.width 1 --config.model.depth 50

Run and evaluate the recommended ViT-B/32 from "how to train your vit" paper:

big_vision.train \
    --config big_vision/configs/load_and_eval.py:name=vit_i21k,batch_size=8 \
    --config.model.variant B/32 --config.model_init howto-i21k-B/32
"""

import big_vision.configs.common as bvcc
from big_vision.configs.common_fewshot import get_fewshot_lsr


def eval_only(config, batch_size, spec_for_init):
  """Set a few configs that turn trainer into (almost) eval-only."""
  config.total_steps = 0
  config.input = {}
  config.input.batch_size = batch_size
  config.input.data = dict(name='bv:dummy', spec=spec_for_init)
  config.optax_name = 'identity'
  config.lr = 0.0

  config.mesh = [('data', -1)]
  config.sharding_strategy = [('params/.*', 'fsdp(axis="data")')]
  config.sharding_rules = [('act_batch', ('data',))]

  return config


def get_config(arg=''):
  config = bvcc.parse_arg(arg, name='bit_paper', batch_size=4)

  # Make the config eval-only by setting some dummies.
  eval_only(config, config.batch_size, spec_for_init=dict(
      image=dict(shape=(224, 224, 3), dtype='float32'),
  ))

  config.evals = dict(fewshot=get_fewshot_lsr())

  # Just calls the function with the name given as `config`.
  # Could also be a giant if-block if you're into that kind of thing.
  globals()[config.name](config)
  return config


def bit_paper(config):
  config.num_classes = 1000

  config.model_name = 'bit_paper'
  config.model_init = 'M-imagenet2012'  # M = i21k, -imagenet2012 = fine-tuned
  config.model = dict(width=1, depth=50)

  def get_eval(split, lbl, dataset='imagenet2012_real'):
    return dict(
        type='classification',
        data=dict(name=dataset, split=split),
        loss_name='softmax_xent',
        cache='none',  # Only run once, on low-mem machine.
        pp_fn=(
            'decode|resize(384)|value_range(-1, 1)'
            f'|onehot(1000, key="{lbl}", key_result="labels")'
            '|keep("image", "labels")'
        ),
    )
  config.evals.test = get_eval('validation', 'original_label')
  config.evals.real = get_eval('validation', 'real_label')
  config.evals.v2 = get_eval('test', 'label', 'imagenet_v2')


def vit_i1k(config):
  config.num_classes = 1000

  config.model_name = 'vit'
  config.model_init = ''  # Will be set in sweep.
  config.model = dict(variant='S/16', pool_type='gap', posemb='sincos2d',
                      rep_size=True)

  config.evals.val = dict(
      type='classification',
      data=dict(name='imagenet2012', split='validation'),
      pp_fn='decode|resize_small(256)|central_crop(224)|value_range(-1, 1)|onehot(1000, key="label", key_result="labels")|keep("image", "labels")',
      loss_name='softmax_xent',
      cache='none',  # Only run once, on low-mem machine.
  )


def mlp_mixer_i1k(config):
  config.num_classes = 1000

  config.model_name = 'mlp_mixer'
  config.model_init = ''  # Will be set in sweep.
  config.model = dict(variant='L/16')

  config.evals.val = dict(
      type='classification',
      data=dict(name='imagenet2012', split='validation'),
      pp_fn='decode|resize_small(256)|central_crop(224)|value_range(-1, 1)|onehot(1000, key="label", key_result="labels")|keep("image", "labels")',
      loss_name='softmax_xent',
      cache='none',  # Only run once, on low-mem machine.
  )


def vit_i21k(config):
  config.num_classes = 21843

  config.model_name = 'vit'
  config.model_init = ''  # Will be set in sweep.
  config.model = dict(variant='B/32', pool_type='tok')

  config.evals.val = dict(
      type='classification',
      data=dict(name='imagenet21k', split='full[:51200]'),
      pp_fn='decode|resize_small(256)|central_crop(224)|value_range(-1, 1)|onehot(21843)|keep("image", "labels")',
      loss_name='sigmoid_xent',
      cache='none',  # Only run once, on low-mem machine.
  )
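The `globals()[config.name](config)` line above is just dispatch-by-name; a self-contained sketch of the same pattern (hypothetical names, illustration only):

```python
def cifar(config):           # One "model preset" per module-level function.
  config['dataset'] = 'cifar10'

def get_config(name='cifar'):
  config = {'name': name}
  globals()[name](config)    # Look up the preset function by its string name.
  return config

assert get_config('cifar')['dataset'] == 'cifar10'
```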
big_vision/configs/mlp_mixer_i1k.py
ADDED
|
@@ -0,0 +1,120 @@
# Copyright 2024 Big Vision Authors.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

# pylint: disable=line-too-long
r"""A config for training MLP-Mixer-B/16 model on ILSVRC-2012 ("ImageNet-1k").

Achieves 76.3% top-1 accuracy on the test split in 2h11m on TPU v3-128
with 300 epochs. A shorter 60 epochs run is expected to get to 70.5% in 27m.

big_vision.train \
    --config big_vision/configs/mlp_mixer_i1k.py \
    --workdir gs://[your_bucket]/big_vision/`date '+%m-%d_%H%M'` \
"""

from big_vision.configs.common_fewshot import get_fewshot_lsr
import ml_collections as mlc


def get_config(mode=None):
  """Config for training Mixer on i1k."""
  config = mlc.ConfigDict()

  config.seed = 0
  config.total_epochs = 300
  config.num_classes = 1000
  config.loss = 'sigmoid_xent'
  config.init_head_bias = -6.9

  config.input = dict()
  config.input.data = dict(
      name='imagenet2012',
      split='train[:99%]',
  )
  config.input.batch_size = 4096
  config.input.cache_raw = True  # Needs up to 120GB of RAM!
  config.input.shuffle_buffer_size = 250_000

  config.input.pp = (
      'decode_jpeg_and_inception_crop(224)'
      '|flip_lr'
      '|randaug(2,15)'
      '|value_range(-1, 1)'
      '|onehot(1000, key="label", key_result="labels")'
      '|keep("image", "labels")'
  )
  pp_eval = (
      'decode'
      '|resize_small(256)|central_crop(224)'
      '|value_range(-1, 1)'
      '|onehot(1000, key="{lbl}", key_result="labels")'
      '|keep("image", "labels")'
  )

  # To continue using the near-defunct randaug op.
  config.pp_modules = ['ops_general', 'ops_image', 'ops_text', 'archive.randaug']

  config.log_training_steps = 50
  config.ckpt_steps = 1000

  config.prefetch_to_device = 2

  # Model section
  config.model_name = 'mlp_mixer'
  config.model = dict()
  config.model.variant = 'B/16'
  config.model.stoch_depth = 0.1

  config.mixup = dict(fold_in=None, p=0.5)

  # Optimizer section
  config.optax_name = 'scale_by_adam'
  config.grad_clip_norm = 1.

  config.lr = 0.001
  config.wd = 1e-4
  config.schedule = dict(
      decay_type='linear',
      warmup_steps=10_000,
      linear_end=1e-5,
  )

  # Eval section
  def get_eval(split, dataset='imagenet2012'):
    return dict(
        type='classification',
        data=dict(name=dataset, split=split),
        pp_fn=pp_eval.format(lbl='label'),
        loss_name=config.loss,
        log_steps=2500,  # Very fast O(seconds) so it's fine to run it often.
        cache_final=mode != 'gpu8',
    )
  config.evals = {}
  config.evals.train = get_eval('train[:2%]')
  config.evals.minival = get_eval('train[99%:]')
  config.evals.val = get_eval('validation')
  config.evals.v2 = get_eval('test', dataset='imagenet_v2')
  config.evals.real = get_eval('validation', dataset='imagenet2012_real')
  config.evals.real.pp_fn = pp_eval.format(lbl='real_label')

  config.fewshot = get_fewshot_lsr()

  if mode == 'gpu8':
    config.total_epochs = 60
    config.input.batch_size = 512
    config.input.cache_raw = False
  if mode == 'regression_test':
    config.total_epochs = 60

  return config
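For clarity, a tiny sketch of the `{lbl}` templating that `pp_eval` above relies on (the string is shortened for illustration):

```python
pp_eval = ('decode|resize_small(256)|central_crop(224)|value_range(-1, 1)'
           '|onehot(1000, key="{lbl}", key_result="labels")|keep("image", "labels")')
# The same eval pipeline is reused for ReaL labels by swapping the label key.
assert 'key="real_label"' in pp_eval.format(lbl='real_label')
```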
big_vision/configs/transfer.py
ADDED
|
@@ -0,0 +1,186 @@
# Copyright 2024 Big Vision Authors.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

# pylint: disable=line-too-long,missing-function-docstring
r"""A config for transferring vit-augreg.

Best HP selected on (mini)val, expected test results (repeated 5 times):

ViT-Augreg-B/32:
  Dataset, crop, learning rate, mean (%), range (%)
  - ImageNet, inception_crop, 0.03, 83.27, [83.22...83.33]
  - Cifar10, resmall_crop, 0.003, 98.55, [98.46...98.6]
  - Cifar100, resmall_crop, 0.01, 91.35, [91.09...91.62]
  - Pets, inception_crop, 0.003, 93.78, [93.62...94.00]
  - Flowers, inception_crop, 0.003, 99.43, [99.42...99.45]


Command to run:
big_vision.train \
    --config big_vision/configs/transfer.py:model=vit-i21k-augreg-b/32,dataset=cifar10,crop=resmall_crop \
    --workdir gs://$GS_BUCKET_NAME/big_vision/workdir/`date '+%m-%d_%H%M'` --config.lr=0.03
"""

import big_vision.configs.common as bvcc
import ml_collections as mlc


def _set_model(config, model):
  """Load pre-trained models: vit or bit."""
  # Reset the head to init (of zeros) when transferring.
  config.model_load = dict(dont_load=['head/kernel', 'head/bias'])

  if model == 'vit-i21k-augreg-b/32':
    # Load "recommended" upstream B/32 from https://arxiv.org/abs/2106.10270
    config.model_name = 'vit'
    config.model_init = 'howto-i21k-B/32'
    config.model = dict(variant='B/32', pool_type='tok')
  elif model == 'vit-i21k-augreg-l/16':
    config.model_name = 'vit'
    config.model_init = 'howto-i21k-L/16'
    config.model = dict(variant='L/16', pool_type='tok')
  elif model == 'vit-s16':
    config.model_name = 'vit'
    config.model_init = 'i1k-s16-300ep'
    config.model = dict(variant='S/16', pool_type='gap', posemb='sincos2d',
                        rep_size=True)
  elif model == 'bit-m-r50x1':
    config.model_name = 'bit_paper'
    config.model_init = 'M'
    config.model = dict(depth=50, width=1)
  else:
    raise ValueError(f'Unknown model: {model}, please define customized model.')


def _set_dataset(config, dataset, crop='inception_crop', h_res=448, l_res=384):
  if dataset == 'cifar10':
    _set_task(config, 'cifar10', 'train[:98%]', 'train[98%:]', 'test', 10, steps=10_000, warmup=500, crop=crop, h_res=h_res, l_res=l_res)
  elif dataset == 'cifar100':
    _set_task(config, 'cifar100', 'train[:98%]', 'train[98%:]', 'test', 100, steps=10_000, warmup=500, crop=crop, h_res=h_res, l_res=l_res)
  elif dataset == 'imagenet2012':
    _set_task(config, 'imagenet2012', 'train[:99%]', 'train[99%:]', 'validation', 1000, steps=20_000, warmup=500, crop=crop, h_res=h_res, l_res=l_res)
    _set_imagenet_variants(config)
  elif dataset == 'oxford_iiit_pet':
    _set_task(config, 'oxford_iiit_pet', 'train[:90%]', 'train[90%:]', 'test', 37, steps=500, warmup=100, crop=crop, h_res=h_res, l_res=l_res)
  elif dataset == 'oxford_flowers102':
    _set_task(config, 'oxford_flowers102', 'train[:90%]', 'train[90%:]', 'test', 102, steps=500, warmup=100, crop=crop, h_res=h_res, l_res=l_res)
  else:
    raise ValueError(
        f'Unknown dataset: {dataset}, please define customized dataset.')


def _set_task(config, dataset, train, val, test, n_cls,
              steps=20_000, warmup=500, lbl='label', crop='resmall_crop',
              flip=True, h_res=448, l_res=384):
  """Vision task with val and test splits."""
  config.total_steps = steps
  config.schedule = dict(
      warmup_steps=warmup,
      decay_type='cosine',
  )

  config.input.data = dict(name=dataset, split=train)
  pp_common = (
      '|value_range(-1, 1)|'
      f'onehot({n_cls}, key="{lbl}", key_result="labels")|'
      'keep("image", "labels")'
  )

  if crop == 'inception_crop':
    pp_train = f'decode|inception_crop({l_res})'
  elif crop == 'resmall_crop':
    pp_train = f'decode|resize_small({h_res})|random_crop({l_res})'
  elif crop == 'resize_crop':
    pp_train = f'decode|resize({h_res})|random_crop({l_res})'
  else:
    raise ValueError(f'Unknown crop: {crop}. Must be one of: '
                     'inception_crop, resmall_crop, resize_crop')
  if flip:
    pp_train += '|flip_lr'
  config.input.pp = pp_train + pp_common

  pp = f'decode|resize_small({h_res})|central_crop({l_res})' + pp_common
  config.num_classes = n_cls

  def get_eval(split):
    return dict(
        type='classification',
        data=dict(name=dataset, split=split),
        loss_name='softmax_xent',
        log_steps=100,
        pp_fn=pp,
    )
  config.evals = dict(val=get_eval(val), test=get_eval(test))


def _set_imagenet_variants(config, h_res=448, l_res=384):
  """Evaluation tasks on ImageNet variants: v2 and real."""
  pp = (f'decode|resize_small({h_res})|central_crop({l_res})'
        '|value_range(-1, 1)|onehot(1000, key="{lbl}", key_result="labels")|'
        'keep("image", "labels")'
  )

  # Special-case rename for i1k (val+test -> minival+val)
  config.evals.minival = config.evals.val
  config.evals.val = config.evals.test
  # NOTE: keep test == val for convenience in subsequent analysis.

  config.evals.real = dict(type='classification')
  config.evals.real.data = dict(name='imagenet2012_real', split='validation')
  config.evals.real.pp_fn = pp.format(lbl='real_label')
  config.evals.real.loss_name = config.loss
  config.evals.real.log_steps = 100

  config.evals.v2 = dict(type='classification')
  config.evals.v2.data = dict(name='imagenet_v2', split='test')
  config.evals.v2.pp_fn = pp.format(lbl='label')
  config.evals.v2.loss_name = config.loss
  config.evals.v2.log_steps = 100


def get_config(arg=None):
  """Config for adaptation."""
  arg = bvcc.parse_arg(arg, model='vit', dataset='cifar10', crop='resmall_crop',
                       h_res=448, l_res=384, batch_size=512, fsdp=False,
                       runlocal=False)
  config = mlc.ConfigDict()

  config.input = {}
  config.input.batch_size = arg.batch_size if not arg.runlocal else 8
  config.input.shuffle_buffer_size = 50_000 if not arg.runlocal else 100

  config.log_training_steps = 10
  config.ckpt_steps = 1000
  config.ckpt_timeout = 600

  # Optimizer section
  config.optax_name = 'big_vision.momentum_hp'
  config.grad_clip_norm = 1.0
  config.wd = None  # That's our default, but just being explicit here!
  config.loss = 'softmax_xent'
  config.lr = 0.01
  config.mixup = dict(p=0.0)

  config.seed = 0

  _set_dataset(config, arg.dataset, arg.crop, arg.h_res, arg.l_res)

  _set_model(config, arg.model)
  if arg.fsdp:
    config.mesh = [('data', -1)]
    config.sharding_strategy = [('.*', 'fsdp(axis="data")')]
    config.sharding_rules = [('act_batch', ('data',))]
    config.model.scan = True

  return config
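A small sketch of the training preprocessing string that `_set_task` builds for the default resolutions (h_res=448, l_res=384) with `crop='resmall_crop'` and flipping enabled; plain string manipulation, no big_vision imports needed:

```python
h_res, l_res = 448, 384
pp_train = f'decode|resize_small({h_res})|random_crop({l_res})' + '|flip_lr'
assert pp_train == 'decode|resize_small(448)|random_crop(384)|flip_lr'
```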
big_vision/configs/vit_i1k.py
ADDED
|
@@ -0,0 +1,177 @@
# Copyright 2024 Big Vision Authors.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

# pylint: disable=line-too-long
r"""Pre-training ViT on ILSVRC-2012 as in https://arxiv.org/abs/2106.10270

This config does NOT include regularization (dropout, stochastic depth), which
was shown to help with B/32, B/16, L/16 models in the paper (Figure 4).

This configuration makes use of the "arg" to get_config to select which model
to run, so a few examples are given below:

Run training of a B/16 model:

big_vision.train \
    --config big_vision/configs/vit_i1k.py:variant=B/16 \
    --workdir gs://[your_bucket]/big_vision/`date '+%m-%d_%H%M'`

Run training of a B/32 model with custom aug-strength and 300ep:

big_vision.train \
    --config big_vision/configs/vit_i1k.py:variant=B/32,aug=light1 \
    --workdir gs://[your_bucket]/big_vision/`date '+%m-%d_%H%M'` \
    --config.total_epochs 300
"""

import big_vision.configs.common as bvcc
from big_vision.configs.common_fewshot import get_fewshot_lsr
import ml_collections as mlc

MIXUP_DEF = {
    'none': dict(p=0.0, fold_in=None),
    'light1': dict(p=0.0, fold_in=None),
    'light2': dict(p=0.2, fold_in=None),
    'medium1': dict(p=0.2, fold_in=None),
    'medium2': dict(p=0.5, fold_in=None),
    'strong1': dict(p=0.5, fold_in=None),
    'strong2': dict(p=0.8, fold_in=None),
}

RANDAUG_DEF = {
    'none': '',
    'light1': 'randaug(2,0)',  # Actually not nothing!
    'light2': 'randaug(2,10)',
    'medium1': 'randaug(2,15)',
    'medium2': 'randaug(2,15)',
    'strong1': 'randaug(2,20)',
    'strong2': 'randaug(2,20)',
}


def get_config(arg=None):
  """Config for training."""
  arg = bvcc.parse_arg(arg, variant='B/16', runlocal=False, aug='')
  config = mlc.ConfigDict()

  config.seed = 0
  config.total_epochs = 300
  config.num_classes = 1000
  config.loss = 'sigmoid_xent'
  config.init_head_bias = -6.9

  # If this gives a KeyError, lookup Fig4 of the paper and add an entry.
  # Note, this here is a good average between 30ep and 300ep, sometimes you could
  # find a slightly better setting for either of them.
  aug_setting = arg.aug or {
      'Ti/16': 'light1',
      'S/32': 'medium1',
      'S/16': 'medium2',
      'B/32': 'medium2',
      'B/16': 'medium2',
      'L/16': 'medium2',
  }[arg.variant]

  config.input = dict()
  config.input.data = dict(
      name='imagenet2012',
      split='train[:99%]',
  )
  config.input.batch_size = 4096
  config.input.cache = 'raw_data' if arg.runlocal else 'none'  # Needs up to 120GB of RAM!
  config.input.shuffle_buffer_size = 250_000

  pp_common = (
      '|value_range(-1, 1)'
      '|onehot(1000, key="{lbl}", key_result="labels")'
      '|keep("image", "labels")'
  )
  config.input.pp = (
      'decode_jpeg_and_inception_crop(224)|flip_lr|' +
      RANDAUG_DEF[aug_setting] +
      pp_common.format(lbl='label')
  )
  pp_eval = 'decode|resize_small(256)|central_crop(224)' + pp_common

  # To continue using the near-defunct randaug op.
  config.pp_modules = ['ops_general', 'ops_image', 'ops_text', 'archive.randaug']

  # Aggressive pre-fetching because our models here are small, so we not only
  # can afford it, but we also need it for the smallest models to not be
  # bottle-necked by the input pipeline. Play around with it for -L models tho.
  config.input.prefetch = 8
  config.prefetch_to_device = 4

  config.log_training_steps = 50
  config.ckpt_steps = 1000

  # Model section
  config.model_name = 'vit'
  config.model = dict(
      variant=arg.variant,
      rep_size=True,
      pool_type='tok',
  )

  # Optimizer section
  config.grad_clip_norm = 1.0
  config.optax_name = 'scale_by_adam'
  config.optax = dict(mu_dtype='bfloat16')
  # The modified AdaFactor we introduced in https://arxiv.org/abs/2106.04560
  # almost always behaves exactly like adam, but at a fraction of the memory
  # cost (specifically, adam_bf16 = +1.5M, adafactor = +0.5M), hence it is a
  # good idea to try it when you are memory-bound!
  # config.optax_name = 'big_vision.scale_by_adafactor'
  # A good flag to play with when hitting instabilities, is the following:
  # config.optax = dict(beta2_cap=0.95)

  config.lr = 0.001
  config.wd = 0.0001
  config.schedule = dict(warmup_steps=10_000, decay_type='cosine')

  config.mixup = MIXUP_DEF[aug_setting]

  # Eval section
  def get_eval(split, dataset='imagenet2012'):
    return dict(
        type='classification',
        data=dict(name=dataset, split=split),
        pp_fn=pp_eval.format(lbl='label'),
        loss_name=config.loss,
        log_steps=2500,  # Very fast O(seconds) so it's fine to run it often.
        cache='final_data' if arg.runlocal else 'none',
    )
  config.evals = {}
  config.evals.train = get_eval('train[:2%]')
  config.evals.minival = get_eval('train[99%:]')
  config.evals.val = get_eval('validation')
  config.evals.v2 = get_eval('test', dataset='imagenet_v2')
  config.evals.real = get_eval('validation', dataset='imagenet2012_real')
  config.evals.real.pp_fn = pp_eval.format(lbl='real_label')

  config.fewshot = get_fewshot_lsr(runlocal=arg.runlocal)
  config.fewshot.log_steps = 10_000

  # Make a few things much smaller for quick local debugging testruns.
  if arg.runlocal:
    config.input.shuffle_buffer_size = 10
    config.input.batch_size = 8
    config.input.cache_raw = False
    config.evals.train.data.split = 'train[:16]'
    config.evals.minival.data.split = 'train[:16]'
    config.evals.val.data.split = 'validation[:16]'
    config.evals.v2.data.split = 'test[:16]'
    config.evals.real.data.split = 'validation[:16]'

  return config
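Illustration of how the two lookup tables above are consumed for the default B/16 setting ('medium2'); the entries are copied from the tables above, not new values:

```python
MIXUP_DEF = {'medium2': dict(p=0.5, fold_in=None)}
RANDAUG_DEF = {'medium2': 'randaug(2,15)'}
aug_setting = 'medium2'                                      # Default for variant='B/16'.
assert RANDAUG_DEF[aug_setting] == 'randaug(2,15)'           # Goes into config.input.pp.
assert MIXUP_DEF[aug_setting] == dict(p=0.5, fold_in=None)   # Goes into config.mixup.
```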
big_vision/configs/vit_i21k.py
ADDED
|
@@ -0,0 +1,145 @@
# Copyright 2024 Big Vision Authors.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

# pylint: disable=line-too-long
r"""Pre-training ViT on ImageNet-21k as in https://arxiv.org/abs/2106.10270

This config relies on the Imagenet-21k tfds dataset, which is not yet
available publicly in TFDS. We intend to add the dataset to public TFDS soon,
and this config will then be runnable.

Note that regularization (dropout, stochastic depth) is not currently
implemented. This was not beneficial for ImageNet-21k pre-training.
"""

import big_vision.configs.common as bvcc
from big_vision.configs.common_fewshot import get_fewshot_lsr
import ml_collections as mlc

MIXUP_DEF = {
    'none': dict(p=0.0, fold_in=None),
    'light1': dict(p=0.0, fold_in=None),
    'light2': dict(p=0.2, fold_in=None),
    'medium1': dict(p=0.2, fold_in=None),
    'medium2': dict(p=0.5, fold_in=None),
    'strong1': dict(p=0.5, fold_in=None),
    'strong2': dict(p=0.8, fold_in=None),
}

RANDAUG_DEF = {
    'none': '',
    'light1': 'randaug(2,0)',  # Actually not nothing!
    'light2': 'randaug(2,10)',
    'medium1': 'randaug(2,15)',
    'medium2': 'randaug(2,15)',
    'strong1': 'randaug(2,20)',
    'strong2': 'randaug(2,20)',
}


def get_config(arg=None):
  """Config for training."""
  arg = bvcc.parse_arg(arg, variant='B/16', runlocal=False, aug=None)
  config = mlc.ConfigDict()

  config.seed = 0
  config.total_epochs = 300
  config.num_classes = 21843
  config.init_head_bias = -10.0
  config.loss = 'sigmoid_xent'

  # If this gives a KeyError, lookup Fig4 of the paper and add an entry.
  # Note, this here is a good average between 30ep and 300ep, sometimes you could
  # find a slightly better setting for either of them.
  aug_setting = {
      'Ti/16': 'none',
      'S/32': 'none',
      'S/16': 'light1',
      'B/32': 'light2',
      'B/16': 'light2',
      'L/16': 'medium2',
  }[arg.variant]

  config.input = dict()
  config.input.data = dict(
      name='imagenet21k',
      split='full[51200:]',
  )
  config.input.batch_size = 4096
  config.input.shuffle_buffer_size = 250_000  # Per host, so small-ish is ok.

  pp_common = '|value_range(-1, 1)|onehot({onehot_args})|keep("image", "labels")'
  pp_common_i21k = pp_common.format(onehot_args=f'{config.num_classes}')
  pp_common_i1k = pp_common.format(onehot_args='1000, key="label", key_result="labels"')
  config.input.pp = f'decode_jpeg_and_inception_crop(224)|flip_lr|{RANDAUG_DEF[aug_setting]}' + pp_common_i21k
  pp_eval = 'decode|resize_small(256)|central_crop(224)'

  # To continue using the near-defunct randaug op.
  config.pp_modules = ['ops_general', 'ops_image', 'ops_text', 'archive.randaug']

  # Aggressive pre-fetching because our models here are small, so we not only
  # can afford it, but we also need it for the smallest models to not be
  # bottle-necked by the input pipeline. Play around with it for -L models tho.
  config.input.prefetch = 8
  config.prefetch_to_device = 4

  config.log_training_steps = 50
  config.ckpt_steps = 1000

  # Model section
  config.model_name = 'vit'
  config.model = dict(variant=arg.variant, pool_type='gap', posemb='learn')

  # Optimizer section
  config.optax_name = 'scale_by_adam'
  config.optax = dict(mu_dtype='bfloat16')
  config.grad_clip_norm = 1.0

  config.lr = 0.001
  config.wd = 0.0001
  config.schedule = dict(warmup_steps=10_000, decay_type='cosine')

  config.mixup = MIXUP_DEF[aug_setting]

  # Evaluations on i21k itself.
  def eval_i21k(split):
    return dict(
        type='classification',
        data={**config.input.data, 'split': split},
        pp_fn=pp_eval + pp_common_i21k,
        loss_name=config.loss,
        log_steps=1000,  # Very fast O(seconds) so it's fine to run it often.
    )
  config.evals = {}
  config.evals.test = eval_i21k('full[:25_600]')
  config.evals.val = eval_i21k('full[25_600:51_200]')
  config.evals.train = eval_i21k('full[51_200:76_800]')

  # Few-shot evaluators
  config.evals.fewshot = get_fewshot_lsr(runlocal=arg.runlocal)
  config.evals.fewshot.log_steps = 25_000

  # Make a few things much smaller for quick local debugging testruns.
  if arg.runlocal:
    config.input.shuffle_buffer_size = 10
    config.input.batch_size = 8
    config.evals.test.data.split = 'full[:16]'
    config.evals.train.data.split = 'full[:16]'
    config.evals.val.data.split = 'full[:16]'
    config.evals.i1k_val.data.split = 'validation[:16]'
    config.evals.i1k_v2.data.split = 'test[:16]'
    config.evals.i1k_a.data.split = 'test[:16]'
    config.evals.i1k_r.data.split = 'test[:16]'

  return config
big_vision/configs/vit_s16_i1k.py
ADDED
|
@@ -0,0 +1,105 @@
# Copyright 2024 Big Vision Authors.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

# pylint: disable=line-too-long
r"""Pre-training ViT-S/16 on ILSVRC-2012 following https://arxiv.org/abs/2205.01580.

This should take 6-7h to finish 90ep on a TPU-v3-8 and reach 76.5%,
see the tech report for more details.

Command to run:

big_vision.train \
    --config big_vision/configs/vit_s16_i1k.py \
    --workdir gs://[your_bucket]/big_vision/`date '+%m-%d_%H%M'`

To run for 300ep, add `--config.total_epochs 300` to the command.
"""

import ml_collections as mlc


def get_config():
  """Config for training."""
  config = mlc.ConfigDict()

  config.seed = 0
  config.total_epochs = 90
  config.num_classes = 1000
  config.loss = 'softmax_xent'

  config.input = {}
  config.input.data = dict(
      name='imagenet2012',
      split='train[:99%]',
  )
  config.input.batch_size = 1024
  config.input.cache_raw = True  # Needs up to 120GB of RAM!
  config.input.shuffle_buffer_size = 250_000

  pp_common = (
      '|value_range(-1, 1)'
      '|onehot(1000, key="{lbl}", key_result="labels")'
      '|keep("image", "labels")'
  )
  config.input.pp = (
      'decode_jpeg_and_inception_crop(224)|flip_lr|randaug(2,10)' +
      pp_common.format(lbl='label')
  )
  pp_eval = 'decode|resize_small(256)|central_crop(224)' + pp_common

  # To continue using the near-defunct randaug op.
  config.pp_modules = ['ops_general', 'ops_image', 'ops_text', 'archive.randaug']

  config.log_training_steps = 50
  config.ckpt_steps = 1000

  # Model section
  config.model_name = 'vit'
  config.model = dict(
      variant='S/16',
      rep_size=True,
      pool_type='gap',
      posemb='sincos2d',
  )

  # Optimizer section
  config.grad_clip_norm = 1.0
  config.optax_name = 'scale_by_adam'
  config.optax = dict(mu_dtype='bfloat16')

  config.lr = 0.001
  config.wd = 0.0001
  config.schedule = dict(warmup_steps=10_000, decay_type='cosine')

  config.mixup = dict(p=0.2, fold_in=None)

  # Eval section
  def get_eval(split, dataset='imagenet2012'):
    return dict(
        type='classification',
        data=dict(name=dataset, split=split),
        pp_fn=pp_eval.format(lbl='label'),
        loss_name=config.loss,
        log_steps=2500,  # Very fast O(seconds) so it's fine to run it often.
    )
  config.evals = {}
  config.evals.train = get_eval('train[:2%]')
  config.evals.minival = get_eval('train[99%:]')
  config.evals.val = get_eval('validation')
  config.evals.v2 = get_eval('test', dataset='imagenet_v2')
  config.evals.real = get_eval('validation', dataset='imagenet2012_real')
  config.evals.real.pp_fn = pp_eval.format(lbl='real_label')

  return config
big_vision/datasets/ai2d/ai2d.py
ADDED
File without changes
big_vision/datasets/aokvqa/aokvqa.py
ADDED
File without changes
big_vision/datasets/chartqa/chartqa.py
ADDED
File without changes
big_vision/datasets/coco35l/coco35l.py
ADDED
File without changes
big_vision/datasets/core.py
ADDED
|
@@ -0,0 +1,77 @@
# Copyright 2024 Big Vision Authors.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

"""Core data functions, dispatch calls to the requested dataset."""
import importlib


# Note: intentionally not using ABC to avoid forcing implementation of every
# method, since one can imagine train-only datasets for example.
class DataSource:
  """The API that any data source should implement."""

  def get_tfdata(self, ordered, *, process_split=True, allow_cache=True):
    """Creates this data object as a tf.data.Dataset.

    This will be called separately in each process, and it is up to the dataset
    implementation to shard it accordingly if desired!

    Args:
      ordered: if True, the dataset should use deterministic ordering, if False
        it may have undefined ordering. Think of True == val, False == train.
      process_split: if False then every process receives the entire dataset
        (e.g. for evaluators running in a single process).
      allow_cache: whether to allow caching the opened data or not.

    Returns:
      A tf.data.Dataset object.

    Raises:
      RuntimeError: if not implemented by the dataset, but called.
    """
    raise RuntimeError(f"not implemented for {self.__class__.__name__}")

  @property
  def total_examples(self):
    """Returns number of examples in the dataset, regardless of sharding."""
    raise RuntimeError(f"not implemented for {self.__class__.__name__}")

  def num_examples_per_process(self):
    """Returns a list of the number of examples for each process.

    This is only needed for datasets that should go through make_for_inference.

    Returns:
      Returns a list of the number of examples for each process.

    Ideally, this would always be `[total() / nprocess] * nprocess`, but in
    reality we can almost never perfectly shard a dataset across arbitrary
    number of processes.

    One alternative option that can work in some cases is to not even shard
    the dataset and thus return `[num_examples()] * nprocess`.

    Raises:
      RuntimeError: if not implemented by the dataset, but called.
    """
    raise RuntimeError(f"not implemented for {self.__class__.__name__}")


def get(name, **kw):
  if name.startswith("bv:"):
    mod = importlib.import_module(f"big_vision.datasets.{name[3:]}")
    return mod.DataSource(**kw)
  else:
    mod = importlib.import_module("big_vision.datasets.tfds")
    return mod.DataSource(name, **kw)
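A standalone sketch of the `bv:` dispatch rule that `get()` implements above, reduced to the module-name mapping so it runs without importing TensorFlow:

```python
def module_for(name):
  if name.startswith("bv:"):
    return f"big_vision.datasets.{name[3:]}"  # e.g. "bv:jsonl" -> big_vision.datasets.jsonl
  return "big_vision.datasets.tfds"           # Plain names go through the TFDS wrapper.

assert module_for("bv:jsonl") == "big_vision.datasets.jsonl"
assert module_for("imagenet2012") == "big_vision.datasets.tfds"
```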
big_vision/datasets/countbenchqa/countbenchqa.py
ADDED
File without changes
big_vision/datasets/docvqa/docvqa.py
ADDED
File without changes
big_vision/datasets/gqa/gqa.py
ADDED
File without changes
big_vision/datasets/imagenet/class_names.py
ADDED
File without changes
big_vision/datasets/infovqa/infovqa.py
ADDED
File without changes
big_vision/datasets/jsonl.py
ADDED
|
@@ -0,0 +1,177 @@
# Copyright 2024 Big Vision Authors.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

"""Simple data input from .jsonl files."""

import hashlib
import json
from multiprocessing.pool import ThreadPool
import os
import tempfile
import urllib.request

from absl import logging
import big_vision.datasets.core as ds_core
import jax
import numpy as np
import overrides
import tensorflow as tf


def cached_download(url, dest=None, verbose=True):
  """Download `url` to local file and return path to that, but with caching."""
  # NOTE: there is a small chance of saving corrupted data if the process is
  # interrupted in the middle of writing the file. Then, reading in the input
  # pipeline will fail, and the fix is to nuke the temp folder.

  # Compute a temp name based on the URL, so we can check if we already
  # downloaded it before.
  dest = dest or os.path.join(tempfile.gettempdir(), "bv")
  os.makedirs(dest, exist_ok=True)
  dest = os.path.join(dest, hashlib.md5(url.encode()).hexdigest())

  # NOTE: we should use last-modified header to know whether to re-download.
  if os.path.isfile(dest):
    return dest

  if verbose:
    print(f"\rRetrieving {url} into {dest}", end="", flush=True)

  with urllib.request.urlopen(url) as f:
    data = f.read()
  with open(dest, "wb+") as f:
    f.write(data)
  return dest


class DataSource(ds_core.DataSource):
  """.jsonl DataSource."""

  def __init__(self, fname, *, fopen_keys=(), download_keys=(),
               start=0, stop=float("inf")):
    """Create data-source that's jsonl + data files (eg images).

    This correctly supports multi-host in that each host only reads a subset of
    the dataset automatically. However, currently, all hosts download all items
    if `download_keys` is specified. TODO: b/lbeyer - This can be improved.

    Args:
      fname: str, the path to the jsonl file that holds the dataset.
      fopen_keys: collection of str or dict, the keys in the dataset whose
        string value actually is a file-path that should be opened and read,
        and its content is what goes into the batch (eg image filenames,
        commonly ["image"]).
        If a dict, the values are folders prefixed to the filenames.
        Supports gs:// for reading from buckets.
      download_keys: collection of str, the keys in the dataset whose string
        value actually is a URL from which the file should be downloaded first.
        Files are downloaded to a persistent tmp folder using the URL hash as
        filename. If the file already exists, the download is skipped.
        Must be a subset of `fopen_keys`.
      start: int, index of the first row to use; use for slicing the data.
      stop: int or inf, index of the row after the last one to use.

    Note:
      This simple data input does not allow for nested/hierarchical values,
      or in any way more complicated values like vectors. Use TFDS for that.

      The way start/stop arguments are used is as in list slicing [start:stop].
    """
    self.examples = []

    with tf.io.gfile.GFile(fname) as f:
      for i, line in enumerate(f):
        if (start or 0) <= i < (stop or float("inf")):
          try:
            self.examples.append(json.loads(line))
          except json.decoder.JSONDecodeError as e:
            raise ValueError(f"Invalid JSON in line {i}:\n{line}") from e

    if download_keys:
      for k in download_keys:
        assert k in fopen_keys, (
            f"{k} in download_keys but missing from fopen_keys {fopen_keys}")

      # TODO: b/lbeyer - use info from trainer instead, move that to utils.
      logging.info(  # pylint: disable=logging-fstring-interpolation
          f"\u001b[33mNOTE\u001b[0m: Downloading {download_keys} "
          f"for dataset {fname} ({len(self.examples)} examples) ...")

      def _dl_one(ex):
        for k in download_keys:
          ex[k] = cached_download(ex[k])

      ThreadPool(100).map(_dl_one, self.examples)
      print("Done")
      logging.info("\u001b[33mNOTE\u001b[0m: Done downloading.")

    # Normalize.
    if isinstance(fopen_keys, (list, tuple)):
      self.fopen_keys = {k: "" for k in fopen_keys}
    else:
      self.fopen_keys = fopen_keys or {}

    # We need to apply the fopen path prefix here already, because when
    # actually reading the files in TF, things are symbolic :(
    for ex in self.examples:
      for k, dirname in self.fopen_keys.items():
        ex[k] = os.path.join(dirname, ex[k])

  def _indices(self, *, process_split=True, process_index=None):
    indices = np.arange(len(self.examples))

    if not process_split:
      return list(indices)

    pid = jax.process_index() if process_index is None else process_index
    return list(np.array_split(indices, jax.process_count())[pid])

  @overrides.overrides
  def get_tfdata(self, ordered=False, *, process_split=True, allow_cache=True):
    del allow_cache  # We don't cache anything anyways.
    assert not process_split or len(self.examples) >= jax.process_count(), (
        "Process splitting the data with fewer examples than processes!?")

    my_idxs = self._indices(process_split=process_split)
    if not ordered:
      np.random.shuffle(my_idxs)

    dataset = tf.data.Dataset.from_generator(
        generator=lambda: ({"id": str(i), **self.examples[i]} for i in my_idxs),
        output_signature={
            "id": _guess_signature("0"),
            **{k: _guess_signature(v) for k, v in self.examples[0].items()},
        })

    def _read_files(example):
      for k in self.fopen_keys:
        example[k] = tf.io.read_file(example[k])
      return example
    dataset = dataset.map(_read_files)

    return dataset

  @property
  @overrides.overrides
  def total_examples(self):
    return len(self.examples)

  @overrides.overrides
  def num_examples_per_process(self):
    return [len(self._indices(process_index=pid))
            for pid in range(jax.process_count())]


def _guess_signature(value):
  return tf.TensorSpec.from_tensor(tf.constant(value))
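A usage sketch for this source; the jsonl file, the image folder, and their contents below are made up for illustration. Each line of the file is one JSON object, and every key listed in `fopen_keys` is treated as a (prefixed) file path whose raw bytes end up in the example.

# /data/captions.jsonl (hypothetical), one JSON object per line:
#   {"image": "img0.jpg", "caption": "a dog"}
#   {"image": "img1.jpg", "caption": "a cat"}
source = DataSource(
    "/data/captions.jsonl",
    fopen_keys={"image": "/data/images"},  # becomes /data/images/img0.jpg, ...
)
ds = source.get_tfdata(ordered=True, process_split=False)
for ex in ds.take(2):
  print(ex["id"].numpy(), ex["caption"].numpy(), len(ex["image"].numpy()))

The same source can also be obtained through the registry in core.py, e.g. `get("bv:jsonl", fname=..., fopen_keys=...)`, since the `bv:` prefix resolves to `big_vision.datasets.jsonl.DataSource`.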
big_vision/datasets/nocaps/nocaps.py
ADDED
File without changes

big_vision/datasets/okvqa/okvqa.py
ADDED
File without changes

big_vision/datasets/pope/pope.py
ADDED
File without changes

big_vision/datasets/refcoco/refcoco.py
ADDED
File without changes

big_vision/datasets/rsvqa_hr/rsvqa_hr.py
ADDED
File without changes

big_vision/datasets/rsvqa_lr/rsvqa_lr.py
ADDED
File without changes

big_vision/datasets/scicap/scicap.py
ADDED
File without changes

big_vision/datasets/science_qa/science_qa.py
ADDED
File without changes

big_vision/datasets/screen2words/screen2words.py
ADDED
File without changes
big_vision/datasets/sequence_packing.py
ADDED
@@ -0,0 +1,77 @@
# Copyright 2024 Big Vision Authors.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

"""Packed Sequence Op."""

# Forked from
# https://github.com/google/maxtext/blob/main/MaxText/sequence_packing.py.


from typing import Dict, Optional, List, Union

from flax import traverse_util
import tensorflow as tf

AUTOTUNE = tf.data.experimental.AUTOTUNE
FLATTEN_SEPARATOR = "<|sep|>"


def pack_dataset(
    dataset: tf.data.Dataset,
    batch_size: int | None,
    key2length: Union[int, Dict[str, int]],
    keys: Optional[List[str | tuple[str, ...]]] = None) -> tf.data.Dataset:
  """Creates a 'packed' version of a dataset on-the-fly.

  Wrap `tensorflow.grain` ops.

  This is meant to replace the irritation of having to create a separate
  "packed" version of a dataset to train efficiently on TPU.
  Each example in the output dataset represents several examples in the
  input dataset.

  For each key in the input dataset, two additional keys are created:
    <key>_segment_ids: an int32 tensor identifying the parts
      representing the original example.
    <key>_positions: an int32 tensor identifying the position within the
      original example.

  Example:
    Two input examples get combined to form an output example.
    The input examples are:
      {"inputs": [8, 7, 1, 0], "targets": [4, 1, 0]}
      {"inputs": [2, 3, 4, 1], "targets": [5, 6, 1]}
    The output example is:
      {
        "inputs": [8, 7, 1, 2, 3, 4, 1, 0, 0, 0]
        "inputs_seg": [1, 1, 1, 2, 2, 2, 2, 0, 0, 0]
        "inputs_pos": [0, 1, 2, 0, 1, 2, 3, 0, 0, 0]
        "targets": [4, 1, 5, 6, 1, 0, 0, 0, 0, 0]
        "targets_seg": [1, 1, 2, 2, 2, 0, 0, 0, 0, 0]
        "targets_pos": [0, 1, 0, 1, 2, 0, 0, 0, 0, 0]
      }
    0 represents padding in both the inputs and the outputs.
    Sequences in the incoming examples are truncated to length "length", and
    the sequences in the output examples all have fixed (padded) length
    "length".

  Args:
    dataset: A `tf.data.Dataset`.
    batch_size: Batch size of the packed dataset.
    key2length: An integer, or a dict from feature-key to integer.
    keys: A list of strings (e.g. ["inputs", "targets"]).

  Returns:
    A `tf.data.Dataset`.
  """
  raise ValueError("Not implemented in OSS yet.")
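Since the OSS version stubs out the implementation, here is a small plain-Python sketch of the packing semantics the docstring describes; `pack_examples`, its greedy one-pass strategy, and the fixed "inputs"/"targets" key set are made up for illustration and are not the actual op. It reproduces the two-example case from the docstring.

def pack_examples(examples, length):
  """Greedily concatenates examples and emits _seg / _pos companions."""
  packed = {"inputs": [], "inputs_seg": [], "inputs_pos": [],
            "targets": [], "targets_seg": [], "targets_pos": []}
  for seg, ex in enumerate(examples, start=1):
    for key in ("inputs", "targets"):
      toks = [t for t in ex[key] if t != 0][:length]  # drop padding, truncate
      packed[key] += toks
      packed[key + "_seg"] += [seg] * len(toks)
      packed[key + "_pos"] += list(range(len(toks)))
  for key in packed:  # pad every sequence out to the fixed length
    packed[key] = (packed[key] + [0] * length)[:length]
  return packed


packed = pack_examples(
    [{"inputs": [8, 7, 1, 0], "targets": [4, 1, 0]},
     {"inputs": [2, 3, 4, 1], "targets": [5, 6, 1]}],
    length=10)
# packed["inputs"]     == [8, 7, 1, 2, 3, 4, 1, 0, 0, 0]
# packed["inputs_seg"] == [1, 1, 1, 2, 2, 2, 2, 0, 0, 0]
# packed["inputs_pos"] == [0, 1, 2, 0, 1, 2, 3, 0, 0, 0]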
big_vision/datasets/stvqa/stvqa.py
ADDED
File without changes

big_vision/datasets/tallyqa/tallyqa.py
ADDED
File without changes

big_vision/datasets/textcaps/textcaps.py
ADDED
File without changes